# Data Cleaning

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
%matplotlib inline

In [2]:
# Directory holding every input CSV; change it here to relocate the data
RESOURCES_DIR = Path("Resources")

# Set the file paths
inflation_data = RESOURCES_DIR / "annual_inflation.csv"
mortgage_rate_data = RESOURCES_DIR / "mortgage30yr.csv"
median_income_data = RESOURCES_DIR / "median_h_inc.csv"
median_house_cost_data = RESOURCES_DIR / "median_house_price.csv"
us_debt_data = RESOURCES_DIR / "us_debt.csv"

# Paths for common commodities
chicken_lb_data = RESOURCES_DIR / "chicken_lb.csv"
coffee_lb_data = RESOURCES_DIR / "coffee_lb.csv"
# Original (misspelled) name kept as an alias: the coffee loading cell reads it
coffe_lb_data = coffee_lb_data
electricity_data = RESOURCES_DIR / "electricity_per_kwh.csv"
gasoline_data = RESOURCES_DIR / "gasoline_gal.csv"
bread_data = RESOURCES_DIR / "white_bread_lb.csv"


## Read in as DataFrames

In [3]:
# Load the annual inflation CSV and preview the first rows
inflation_df = pd.read_csv(filepath_or_buffer=inflation_data)
inflation_df.head(n=5)

Unnamed: 0,Year,Annual
0,1980,12.4
1,1981,10.4
2,1982,7.4
3,1983,4.0
4,1984,5.0


In [4]:
# Index the inflation table by its Year column
inflation_df = inflation_df.set_index(keys='Year')
inflation_df.head(n=5)

Unnamed: 0_level_0,Annual
Year,Unnamed: 1_level_1
1980,12.4
1981,10.4
1982,7.4
1983,4.0
1984,5.0


In [5]:
# Count missing values per column
inflation_df.isna().sum()

Annual    0
dtype: int64

In [6]:
# Load the 30-year mortgage rate CSV and preview the first rows
mortgage_df = pd.read_csv(filepath_or_buffer=mortgage_rate_data)
mortgage_df.head(n=5)

Unnamed: 0,DATE,MORTGAGE30US
0,1971-04-02,7.33
1,1971-04-09,7.31
2,1971-04-16,7.31
3,1971-04-23,7.31
4,1971-04-30,7.29


In [7]:
# Inspect the Python type of the DATE value in the row labelled 0
type(mortgage_df.loc[0, 'DATE'])

str

In [8]:
# Parse DATE directly to datetime values. Formatting them back to
# "YYYY/MM/DD" strings here is unnecessary: the next cell converts the column
# to datetime again, and only the year is extracted before DATE is dropped.
mortgage_df["DATE"] = pd.to_datetime(mortgage_df["DATE"])

In [9]:
# Convert the DATE column to datetime, reading it with the Y/m/d layout
mortgage_df = mortgage_df.assign(
    DATE=pd.to_datetime(mortgage_df['DATE'], format="%Y/%m/%d")
)

In [10]:
# Derive a calendar-year column from the parsed dates
mortgage_df = mortgage_df.assign(Year=mortgage_df['DATE'].dt.year)
mortgage_df.head(n=5)

Unnamed: 0,DATE,MORTGAGE30US,Year
0,1971-04-02,7.33,1971
1,1971-04-09,7.31,1971
2,1971-04-16,7.31,1971
3,1971-04-23,7.31,1971
4,1971-04-30,7.29,1971


In [11]:
# Confirm the DATE value in the row labelled 0 is now a pandas Timestamp
type(mortgage_df.loc[0, 'DATE'])

pandas._libs.tslibs.timestamps.Timestamp

In [12]:
# DATE is no longer needed once Year has been extracted
mortgage_df = mortgage_df.drop(columns='DATE')
mortgage_df.head(n=5)

Unnamed: 0,MORTGAGE30US,Year
0,7.33,1971
1,7.31,1971
2,7.31,1971
3,7.31,1971
4,7.29,1971


In [13]:
# With DATE gone, use Year as the index
mortgage_df = mortgage_df.set_index(keys='Year')
mortgage_df.head(n=5)

Unnamed: 0_level_0,MORTGAGE30US
Year,Unnamed: 1_level_1
1971,7.33
1971,7.31
1971,7.31
1971,7.31
1971,7.29


In [14]:
# Average all mortgage-rate observations that share the same year.
# Year is the index at this point, so the grouping is done on that index level.
mort_avg_df = mortgage_df['MORTGAGE30US'].groupby(level='Year').mean()
mort_avg_df.head(n=5)

Year
1971    7.541750
1972    7.383269
1973    8.044808
1974    9.187115
1975    9.047115
Name: MORTGAGE30US, dtype: float64

In [15]:
# Wrap the yearly-average Series in a single-column DataFrame.
# The Series name (MORTGAGE30US) becomes the column label and Year stays the index.
mort_avg_df = pd.DataFrame(mort_avg_df)
mort_avg_df.head()

Unnamed: 0_level_0,MORTGAGE30US
Year,Unnamed: 1_level_1
1971,7.54175
1972,7.383269
1973,8.044808
1974,9.187115
1975,9.047115


In [16]:
# Count missing values in the yearly mortgage averages
mort_avg_df.isna().sum()

MORTGAGE30US    0
dtype: int64

In [17]:
# Load the median household income CSV and preview the first rows
median_income_df = pd.read_csv(filepath_or_buffer=median_income_data)
median_income_df.head(n=5)

Unnamed: 0,DATE,MEHOINUSA646N
0,1984-01-01,22420
1,1985-01-01,23620
2,1986-01-01,24900
3,1987-01-01,26060
4,1988-01-01,27230


In [18]:
# Count missing values per column
median_income_df.isna().sum()

DATE             0
MEHOINUSA646N    0
dtype: int64

In [19]:
# Parse DATE directly to datetime values. Formatting them back to
# "YYYY/MM/DD" strings here is unnecessary: the next cell converts the column
# to datetime again, and only the year is extracted before DATE is dropped.
median_income_df["DATE"] = pd.to_datetime(median_income_df["DATE"])

In [20]:
# Convert the DATE column to datetime, reading it with the Y/m/d layout
median_income_df = median_income_df.assign(
    DATE=pd.to_datetime(median_income_df['DATE'], format="%Y/%m/%d")
)

In [21]:
# Derive a calendar-year column from the parsed dates
median_income_df = median_income_df.assign(Year=median_income_df['DATE'].dt.year)
median_income_df.head(n=5)

Unnamed: 0,DATE,MEHOINUSA646N,Year
0,1984-01-01,22420,1984
1,1985-01-01,23620,1985
2,1986-01-01,24900,1986
3,1987-01-01,26060,1987
4,1988-01-01,27230,1988


In [22]:
# DATE is no longer needed once Year has been extracted
median_income_df = median_income_df.drop(columns='DATE')
median_income_df.head(n=5)

Unnamed: 0,MEHOINUSA646N,Year
0,22420,1984
1,23620,1985
2,24900,1986
3,26060,1987
4,27230,1988


In [23]:
# Index the income table by Year
median_income_df = median_income_df.set_index(keys='Year')
median_income_df.head(n=5)

Unnamed: 0_level_0,MEHOINUSA646N
Year,Unnamed: 1_level_1
1984,22420
1985,23620
1986,24900
1987,26060
1988,27230


In [24]:
# Load the median house sale price CSV and preview the first rows
median_house_df = pd.read_csv(filepath_or_buffer=median_house_cost_data)
median_house_df.head(n=5)

Unnamed: 0,DATE,MSPUS
0,1963-01-01,17800.0
1,1963-04-01,18000.0
2,1963-07-01,17900.0
3,1963-10-01,18500.0
4,1964-01-01,18500.0


In [25]:
# Count missing values per column
median_house_df.isna().sum()

DATE     0
MSPUS    0
dtype: int64

In [26]:
# Parse DATE directly to datetime values. Formatting them back to
# "YYYY/MM/DD" strings here is unnecessary: the next cell converts the column
# to datetime again, and only the year is extracted before DATE is dropped.
median_house_df["DATE"] = pd.to_datetime(median_house_df["DATE"])

In [27]:
# Convert the DATE column to datetime, reading it with the Y/m/d layout
median_house_df = median_house_df.assign(
    DATE=pd.to_datetime(median_house_df['DATE'], format="%Y/%m/%d")
)

In [28]:
# Derive a calendar-year column from the parsed dates
median_house_df = median_house_df.assign(Year=median_house_df['DATE'].dt.year)
median_house_df.head(n=5)

Unnamed: 0,DATE,MSPUS,Year
0,1963-01-01,17800.0,1963
1,1963-04-01,18000.0,1963
2,1963-07-01,17900.0,1963
3,1963-10-01,18500.0,1963
4,1964-01-01,18500.0,1964


In [29]:
# DATE is no longer needed once Year has been extracted
median_house_df = median_house_df.drop(columns='DATE')
median_house_df.head(n=5)

Unnamed: 0,MSPUS,Year
0,17800.0,1963
1,18000.0,1963
2,17900.0,1963
3,18500.0,1963
4,18500.0,1964


In [30]:
# Use Year as the index of the house-price table
median_house_df = median_house_df.set_index(keys='Year')
median_house_df.head(n=5)

Unnamed: 0_level_0,MSPUS
Year,Unnamed: 1_level_1
1963,17800.0
1963,18000.0
1963,17900.0
1963,18500.0
1964,18500.0


In [31]:
# Collapse the entries that share a year into their mean.
# Year is the index at this point, so the grouping is done on that index level.
median_house_df = median_house_df['MSPUS'].groupby(level='Year').mean()
median_house_df.head(n=5)

Year
1963    18050.0
1964    18925.0
1965    20125.0
1966    21500.0
1967    22750.0
Name: MSPUS, dtype: float64

In [32]:
# Wrap the yearly-mean Series in a single-column DataFrame.
# The Series name (MSPUS) becomes the column label and Year stays the index.
median_house_df = pd.DataFrame(median_house_df)
median_house_df.head()

Unnamed: 0_level_0,MSPUS
Year,Unnamed: 1_level_1
1963,18050.0
1964,18925.0
1965,20125.0
1966,21500.0
1967,22750.0


In [33]:
# Load the US debt CSV and preview the first rows
us_debt_df = pd.read_csv(filepath_or_buffer=us_debt_data)
us_debt_df.head(n=5)

Unnamed: 0,Record Date,Debt Outstanding Amount,Source Line Number,Fiscal Year,Fiscal Quarter Number,Calendar Year,Calendar Quarter Number,Calendar Month Number,Calendar Day Number
0,2023-09-30,33167330000000.0,1,2023,4,2023,3,9,30
1,2022-09-30,30928910000000.0,1,2022,4,2022,3,9,30
2,2021-09-30,28428920000000.0,1,2021,4,2021,3,9,30
3,2020-09-30,26945390000000.0,1,2020,4,2020,3,9,30
4,2019-09-30,22719400000000.0,1,2019,4,2019,3,9,30


In [34]:
# Rename 'Calendar Year' to 'Year', then discard the columns that are not
# used, leaving the debt amount and its year
us_debt_df = (
    us_debt_df
    .rename(columns={'Calendar Year': "Year"})
    .drop(columns=[
        'Record Date', 'Source Line Number',
        'Fiscal Year', 'Fiscal Quarter Number',
        'Calendar Quarter Number', 'Calendar Month Number',
        'Calendar Day Number',
    ])
)
us_debt_df.head(n=5)

Unnamed: 0,Debt Outstanding Amount,Year
0,33167330000000.0,2023
1,30928910000000.0,2022
2,28428920000000.0,2021
3,26945390000000.0,2020
4,22719400000000.0,2019


In [35]:
# Index the debt table by Year
us_debt_df = us_debt_df.set_index(keys='Year')
us_debt_df.head(n=5)

Unnamed: 0_level_0,Debt Outstanding Amount
Year,Unnamed: 1_level_1
2023,33167330000000.0
2022,30928910000000.0
2021,28428920000000.0
2020,26945390000000.0
2019,22719400000000.0


In [36]:
# Count missing values per column
us_debt_df.isna().sum()

Debt Outstanding Amount    0
dtype: int64

In [37]:
# Load the chicken price-per-pound CSV and preview the last rows
chicken_df = pd.read_csv(filepath_or_buffer=chicken_lb_data)
chicken_df.tail(n=5)

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
39,2019,1.473,1.486,1.468,1.463,1.479,1.59,1.562,1.514,1.487,1.538,1.43,1.45
40,2020,1.409,1.362,1.4,1.571,,1.747,1.712,1.609,1.542,1.58,1.639,1.621
41,2021,1.595,1.583,1.543,1.515,1.486,1.474,1.435,1.472,1.504,1.523,1.583,1.606
42,2022,1.622,1.632,1.724,1.794,1.824,1.826,1.88,1.879,1.891,1.863,1.843,1.83
43,2023,1.855,1.894,1.868,1.873,1.92,1.953,1.891,1.958,1.901,1.926,1.976,1.955


In [38]:
# Count missing values per column
chicken_df.isna().sum()

Year    0
Jan     0
Feb     0
Mar     0
Apr     0
May     1
Jun     0
Jul     0
Aug     0
Sep     0
Oct     0
Nov     0
Dec     0
dtype: int64

In [39]:
# Index the chicken table by Year
chicken_df = chicken_df.set_index(keys='Year')

In [40]:
# Preview the most recent years, where the missing May value appears
chicken_df.tail(n=5)

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019,1.473,1.486,1.468,1.463,1.479,1.59,1.562,1.514,1.487,1.538,1.43,1.45
2020,1.409,1.362,1.4,1.571,,1.747,1.712,1.609,1.542,1.58,1.639,1.621
2021,1.595,1.583,1.543,1.515,1.486,1.474,1.435,1.472,1.504,1.523,1.583,1.606
2022,1.622,1.632,1.724,1.794,1.824,1.826,1.88,1.879,1.891,1.863,1.843,1.83
2023,1.855,1.894,1.868,1.873,1.92,1.953,1.891,1.958,1.901,1.926,1.976,1.955


In [41]:
# Fill each missing month with the mean of that year's available months,
# then round every value in the row to three decimals
chicken_df = chicken_df.apply(
    lambda months: months.fillna(months.mean()).round(3),
    axis="columns",
)
chicken_df.tail(n=5)

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019,1.473,1.486,1.468,1.463,1.479,1.59,1.562,1.514,1.487,1.538,1.43,1.45
2020,1.409,1.362,1.4,1.571,1.563,1.747,1.712,1.609,1.542,1.58,1.639,1.621
2021,1.595,1.583,1.543,1.515,1.486,1.474,1.435,1.472,1.504,1.523,1.583,1.606
2022,1.622,1.632,1.724,1.794,1.824,1.826,1.88,1.879,1.891,1.863,1.843,1.83
2023,1.855,1.894,1.868,1.873,1.92,1.953,1.891,1.958,1.901,1.926,1.976,1.955


In [42]:
# Verify that no missing values remain after filling
chicken_df.isna().sum()

Jan    0
Feb    0
Mar    0
Apr    0
May    0
Jun    0
Jul    0
Aug    0
Sep    0
Oct    0
Nov    0
Dec    0
dtype: int64

In [43]:
# Yearly average: mean across the columns of each row, rounded to three decimals
chicken_df = chicken_df.assign(
    **{'Average Chicken Price per Pound': chicken_df.mean(axis="columns").round(3)}
)
chicken_df.head(n=5)

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Average Chicken Price per Pound
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1980,0.699,0.673,0.655,0.638,0.628,0.64,0.71,0.751,0.791,0.792,0.766,0.76,0.709
1981,0.749,0.759,0.755,0.734,0.706,0.717,0.767,0.752,0.728,0.712,0.702,0.7,0.732
1982,0.712,0.726,0.715,0.712,0.72,0.732,0.74,0.717,0.713,0.7,0.694,0.682,0.714
1983,0.689,0.699,0.698,0.676,0.687,0.699,0.724,0.74,0.769,0.737,0.766,0.812,0.725
1984,0.839,0.867,0.848,0.844,0.811,0.817,0.829,0.788,0.79,0.764,0.771,0.753,0.81


In [44]:
# Keep only the yearly average. Selecting the result column, instead of
# dropping the twelve month columns in place, yields the same frame and lets
# this cell be re-run without a KeyError for already-removed columns.
chicken_df = chicken_df[['Average Chicken Price per Pound']]

chicken_df.head(10)

Unnamed: 0_level_0,Average Chicken Price per Pound
Year,Unnamed: 1_level_1
1980,0.709
1981,0.732
1982,0.714
1983,0.725
1984,0.81
1985,0.763
1986,0.835
1987,0.785
1988,0.854
1989,0.927


In [45]:
# Load the coffee price-per-pound CSV, index it by Year, and preview it
coffee_df = pd.read_csv(filepath_or_buffer=coffe_lb_data).set_index(keys='Year')
coffee_df.head(n=5)

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1980,3.208,3.258,3.248,3.209,3.2,3.181,3.233,3.23,3.159,3.053,2.913,2.818
1981,2.777,2.595,2.557,2.562,2.538,2.517,2.512,2.492,2.46,2.451,2.408,2.457
1982,2.475,2.552,2.564,2.585,2.567,2.527,2.502,2.515,2.517,2.5,2.481,2.501
1983,2.528,2.519,2.478,2.466,2.459,2.47,2.454,2.451,2.438,2.432,2.454,2.437
1984,2.495,2.546,2.546,2.549,2.567,2.598,2.609,2.606,2.617,2.616,2.594,2.593


In [46]:
# Count missing values per month column
coffee_df.isna().sum()

Jan    2
Feb    2
Mar    2
Apr    2
May    2
Jun    3
Jul    3
Aug    3
Sep    3
Oct    3
Nov    2
Dec    1
dtype: int64

In [47]:
# Fill each missing month with the mean of that year's available months;
# only the fill value is rounded (to three decimals), existing values are kept
coffee_df = coffee_df.apply(
    lambda months: months.fillna(months.mean().round(3)),
    axis="columns",
)
coffee_df.head(n=5)

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1980,3.208,3.258,3.248,3.209,3.2,3.181,3.233,3.23,3.159,3.053,2.913,2.818
1981,2.777,2.595,2.557,2.562,2.538,2.517,2.512,2.492,2.46,2.451,2.408,2.457
1982,2.475,2.552,2.564,2.585,2.567,2.527,2.502,2.515,2.517,2.5,2.481,2.501
1983,2.528,2.519,2.478,2.466,2.459,2.47,2.454,2.451,2.438,2.432,2.454,2.437
1984,2.495,2.546,2.546,2.549,2.567,2.598,2.609,2.606,2.617,2.616,2.594,2.593


In [48]:
coffee_df.tail(n=15)
# Every month of 2009 reads 3.669 because December was the only month with
# an entry that year, so the row mean used for filling is that single value

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2009,3.669,3.669,3.669,3.669,3.669,3.669,3.669,3.669,3.669,3.669,3.669,3.669
2010,3.811,3.736,3.565,3.641,3.664,3.697,3.857,3.935,4.174,4.175,4.467,4.146
2011,4.417,4.218,4.642,5.101,5.129,5.234,5.547,5.766,5.651,5.511,5.636,5.437
2012,5.497,5.382,5.558,5.513,5.596,5.582,5.723,5.693,5.693,5.888,6.066,5.921
2013,5.902,5.742,6.014,5.674,5.678,5.588,5.394,5.214,5.091,5.149,5.04,4.948
2014,5.025,5.002,5.005,5.204,5.153,4.67,5.099,5.167,5.215,5.032,4.713,4.59
2015,4.738,4.91,4.827,4.99,4.715,4.686,4.79,4.808,4.669,4.609,4.412,4.486
2016,4.498,4.447,4.405,4.428,4.443,4.481,4.428,4.316,4.372,4.309,4.306,4.281
2017,4.468,4.583,4.65,4.622,4.597,4.545,4.335,4.373,4.323,4.327,4.324,4.285
2018,4.291,4.267,4.343,4.313,4.294,4.302,4.302,4.302,4.306,4.302,4.302,4.302


In [49]:
# Verify that no missing values remain after filling
coffee_df.isna().sum()

Jan    0
Feb    0
Mar    0
Apr    0
May    0
Jun    0
Jul    0
Aug    0
Sep    0
Oct    0
Nov    0
Dec    0
dtype: int64

In [50]:
# Yearly average: mean across the columns of each row, rounded to three decimals
coffee_df = coffee_df.assign(
    **{'Average Coffee Price': coffee_df.mean(axis="columns").round(3)}
)
coffee_df.head(n=5)

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Average Coffee Price
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1980,3.208,3.258,3.248,3.209,3.2,3.181,3.233,3.23,3.159,3.053,2.913,2.818,3.142
1981,2.777,2.595,2.557,2.562,2.538,2.517,2.512,2.492,2.46,2.451,2.408,2.457,2.527
1982,2.475,2.552,2.564,2.585,2.567,2.527,2.502,2.515,2.517,2.5,2.481,2.501,2.524
1983,2.528,2.519,2.478,2.466,2.459,2.47,2.454,2.451,2.438,2.432,2.454,2.437,2.466
1984,2.495,2.546,2.546,2.549,2.567,2.598,2.609,2.606,2.617,2.616,2.594,2.593,2.578


In [51]:
# Keep only the yearly average. Selecting the result column, instead of
# dropping the twelve month columns in place, yields the same frame and lets
# this cell be re-run without a KeyError for already-removed columns.
coffee_df = coffee_df[['Average Coffee Price']]
coffee_df.head()

Unnamed: 0_level_0,Average Coffee Price
Year,Unnamed: 1_level_1
1980,3.142
1981,2.527
1982,2.524
1983,2.466
1984,2.578


In [52]:
# Load the electricity price-per-kWh CSV, index it by Year, and preview it
electricity_df = pd.read_csv(filepath_or_buffer=electricity_data).set_index(keys='Year')
electricity_df.head(n=5)

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1980,0.053,0.055,0.056,0.056,0.058,0.063,0.064,0.064,0.065,0.063,0.061,0.062
1981,0.063,0.064,0.065,0.066,0.067,0.071,0.073,0.075,0.074,0.072,0.071,0.071
1982,0.073,0.073,0.075,0.075,0.075,0.079,0.079,0.079,0.079,0.077,0.074,0.075
1983,0.075,0.075,0.076,0.075,0.077,0.081,0.082,0.082,0.082,0.08,0.077,0.077
1984,0.078,0.079,0.079,0.08,0.081,0.086,0.087,0.089,0.084,0.081,0.079,0.078


In [53]:
# Count missing values per month column
electricity_df.isna().sum()

Jan    0
Feb    0
Mar    0
Apr    0
May    0
Jun    0
Jul    0
Aug    0
Sep    1
Oct    0
Nov    0
Dec    0
dtype: int64

In [54]:
# Fill each missing month with the mean of that year's available months,
# then round every value in the row to three decimals
electricity_df = electricity_df.apply(
    lambda months: months.fillna(months.mean()).round(3),
    axis="columns",
)
electricity_df.head(n=5)

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1980,0.053,0.055,0.056,0.056,0.058,0.063,0.064,0.064,0.065,0.063,0.061,0.062
1981,0.063,0.064,0.065,0.066,0.067,0.071,0.073,0.075,0.074,0.072,0.071,0.071
1982,0.073,0.073,0.075,0.075,0.075,0.079,0.079,0.079,0.079,0.077,0.074,0.075
1983,0.075,0.075,0.076,0.075,0.077,0.081,0.082,0.082,0.082,0.08,0.077,0.077
1984,0.078,0.079,0.079,0.08,0.081,0.086,0.087,0.089,0.084,0.081,0.079,0.078


In [55]:
# Verify that no missing values remain after filling
electricity_df.isna().sum()

Jan    0
Feb    0
Mar    0
Apr    0
May    0
Jun    0
Jul    0
Aug    0
Sep    0
Oct    0
Nov    0
Dec    0
dtype: int64

In [56]:
# Yearly average: mean across the columns of each row, rounded to three decimals
electricity_df = electricity_df.assign(
    **{'Average Electricity Price': electricity_df.mean(axis="columns").round(3)}
)
electricity_df.head(n=5)

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Average Electricity Price
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1980,0.053,0.055,0.056,0.056,0.058,0.063,0.064,0.064,0.065,0.063,0.061,0.062,0.06
1981,0.063,0.064,0.065,0.066,0.067,0.071,0.073,0.075,0.074,0.072,0.071,0.071,0.069
1982,0.073,0.073,0.075,0.075,0.075,0.079,0.079,0.079,0.079,0.077,0.074,0.075,0.076
1983,0.075,0.075,0.076,0.075,0.077,0.081,0.082,0.082,0.082,0.08,0.077,0.077,0.078
1984,0.078,0.079,0.079,0.08,0.081,0.086,0.087,0.089,0.084,0.081,0.079,0.078,0.082


In [57]:
# Keep only the yearly average. Selecting the result column, instead of
# dropping the twelve month columns in place, yields the same frame and lets
# this cell be re-run without a KeyError for already-removed columns.
electricity_df = electricity_df[['Average Electricity Price']]
electricity_df.head()

Unnamed: 0_level_0,Average Electricity Price
Year,Unnamed: 1_level_1
1980,0.06
1981,0.069
1982,0.076
1983,0.078
1984,0.082


In [58]:
# Load the gasoline price-per-gallon CSV, index it by Year, and preview it
gasoline_df = pd.read_csv(filepath_or_buffer=gasoline_data).set_index(keys='Year')
gasoline_df.head(n=5)

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1980,1.11,1.186,1.23,1.242,1.244,1.246,1.247,1.243,1.231,1.223,1.222,1.231
1981,1.269,1.353,1.388,1.381,1.37,1.362,1.353,1.348,1.358,1.353,1.351,1.348
1982,1.341,1.318,1.268,1.21,1.224,1.296,1.318,1.31,1.292,1.28,1.268,1.244
1983,1.214,1.17,1.135,1.198,1.243,1.261,1.272,1.269,1.257,1.239,1.224,1.215
1984,1.2,1.193,1.194,1.211,1.221,1.214,1.197,1.184,1.189,1.195,1.193,1.179


In [59]:
# Count missing values per month column
gasoline_df.isna().sum()

Jan    0
Feb    0
Mar    0
Apr    0
May    0
Jun    0
Jul    0
Aug    0
Sep    0
Oct    0
Nov    0
Dec    0
dtype: int64

In [60]:
# Yearly average: mean across the columns of each row, rounded to three decimals
gasoline_df = gasoline_df.assign(
    **{'Average Gasoline Price': gasoline_df.mean(axis="columns").round(3)}
)
gasoline_df.head(n=5)

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Average Gasoline Price
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1980,1.11,1.186,1.23,1.242,1.244,1.246,1.247,1.243,1.231,1.223,1.222,1.231,1.221
1981,1.269,1.353,1.388,1.381,1.37,1.362,1.353,1.348,1.358,1.353,1.351,1.348,1.353
1982,1.341,1.318,1.268,1.21,1.224,1.296,1.318,1.31,1.292,1.28,1.268,1.244,1.281
1983,1.214,1.17,1.135,1.198,1.243,1.261,1.272,1.269,1.257,1.239,1.224,1.215,1.225
1984,1.2,1.193,1.194,1.211,1.221,1.214,1.197,1.184,1.189,1.195,1.193,1.179,1.198


In [61]:
# Keep only the yearly average. Selecting the result column, instead of
# dropping the twelve month columns in place, yields the same frame and lets
# this cell be re-run without a KeyError for already-removed columns.
gasoline_df = gasoline_df[['Average Gasoline Price']]
gasoline_df.head()

Unnamed: 0_level_0,Average Gasoline Price
Year,Unnamed: 1_level_1
1980,1.221
1981,1.353
1982,1.281
1983,1.225
1984,1.198


In [62]:
# Load the white bread price-per-pound CSV, index it by Year, and preview it
bread_df = pd.read_csv(filepath_or_buffer=bread_data).set_index(keys='Year')
bread_df.head(n=5)

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1980,0.501,0.507,0.502,0.507,0.504,0.503,0.511,0.507,0.511,0.514,0.519,0.519
1981,0.531,0.533,0.538,0.519,0.525,0.523,0.521,0.519,0.524,0.521,0.527,0.521
1982,0.537,0.534,0.526,0.526,0.529,0.525,0.534,0.534,0.536,0.534,0.534,0.537
1983,0.541,0.544,0.544,0.538,0.542,0.542,0.544,0.539,0.536,0.535,0.547,0.547
1984,0.543,0.541,0.542,0.536,0.541,0.542,0.541,0.543,0.539,0.54,0.543,0.544


In [63]:
# Count missing values per month column
bread_df.isna().sum()

Jan    0
Feb    0
Mar    0
Apr    0
May    0
Jun    0
Jul    0
Aug    0
Sep    0
Oct    0
Nov    0
Dec    0
dtype: int64

In [64]:
# Yearly average: mean across the columns of each row, rounded to three decimals
bread_df = bread_df.assign(
    **{'Average Bread Price': bread_df.mean(axis="columns").round(3)}
)
bread_df.head(n=5)

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Average Bread Price
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1980,0.501,0.507,0.502,0.507,0.504,0.503,0.511,0.507,0.511,0.514,0.519,0.519,0.509
1981,0.531,0.533,0.538,0.519,0.525,0.523,0.521,0.519,0.524,0.521,0.527,0.521,0.525
1982,0.537,0.534,0.526,0.526,0.529,0.525,0.534,0.534,0.536,0.534,0.534,0.537,0.532
1983,0.541,0.544,0.544,0.538,0.542,0.542,0.544,0.539,0.536,0.535,0.547,0.547,0.542
1984,0.543,0.541,0.542,0.536,0.541,0.542,0.541,0.543,0.539,0.54,0.543,0.544,0.541


In [65]:
# Keep only the yearly average. Selecting the result column, instead of
# dropping the twelve month columns in place, yields the same frame and lets
# this cell be re-run without a KeyError for already-removed columns.
bread_df = bread_df[['Average Bread Price']]
bread_df.head()

Unnamed: 0_level_0,Average Bread Price
Year,Unnamed: 1_level_1
1980,0.509
1981,0.525
1982,0.532
1983,0.542
1984,0.541
