In [1]:
#Import dependencies
import pandas as pd
from datetime import datetime
from sqlalchemy import create_engine
import numpy as np

In [2]:
#Load GDP data file
gdp_file = '../Resources/US-Monthly-GDP-History-Data.csv'


In [3]:
# Read the CSV in pandas
gdp_df = pd.read_csv(gdp_file)
gdp_df


Unnamed: 0.1,Unnamed: 0,Monthly Nominal GDP Index,Monthly Real GDP Index
0,1992 - Jan,6315.826,9480.081
1,1992 - Feb,6356.427,9521.412
2,1992 - Mar,6417.053,9600.052
3,1992 - Apr,6443.275,9606.322
4,1992 - May,6433.980,9588.478
...,...,...,...
341,2020 - Jun,20564.740,18160.219
342,2020 - Jul,20976.163,18453.268
343,2020 - Aug,21151.857,18576.420
344,2020 - Sep,21378.293,18726.640


In [4]:
gdp_df.columns

Index(['Unnamed: 0', 'Monthly Nominal GDP Index', 'Monthly Real GDP Index'], dtype='object')

In [5]:
gdp_df = gdp_df.rename(columns={"Unnamed: 0": "Date"})
gdp_df

Unnamed: 0,Date,Monthly Nominal GDP Index,Monthly Real GDP Index
0,1992 - Jan,6315.826,9480.081
1,1992 - Feb,6356.427,9521.412
2,1992 - Mar,6417.053,9600.052
3,1992 - Apr,6443.275,9606.322
4,1992 - May,6433.980,9588.478
...,...,...,...
341,2020 - Jun,20564.740,18160.219
342,2020 - Jul,20976.163,18453.268
343,2020 - Aug,21151.857,18576.420
344,2020 - Sep,21378.293,18726.640


In [6]:
# Split the year into month and year; drop the 'Date' and 'Day' columns
gdp_df[['Year','Month']] = gdp_df.Date.str.split(" - ",expand=True)
gdp_df= gdp_df.drop(['Date'], axis = 1) 
gdp_df= gdp_df.rename(columns={"Monthly Nominal GDP Index": "Nominal-GDP-Index",
                               "Monthly Real GDP Index": "Real-GDP-Index"})
gdp_df

Unnamed: 0,Nominal-GDP-Index,Real-GDP-Index,Year,Month
0,6315.826,9480.081,1992,Jan
1,6356.427,9521.412,1992,Feb
2,6417.053,9600.052,1992,Mar
3,6443.275,9606.322,1992,Apr
4,6433.980,9588.478,1992,May
...,...,...,...,...
341,20564.740,18160.219,2020,Jun
342,20976.163,18453.268,2020,Jul
343,21151.857,18576.420,2020,Aug
344,21378.293,18726.640,2020,Sep


In [7]:
# Change the month name to a two-digit month code
# Create a dictionary of month_code keys & values
month_code_dict = {'Jan': '01',
                   'Feb': '02',
                   'Mar': '03',
                   'Apr': '04',
                   'May': '05',
                   'Jun': '06',
                   'Jul': '07',
                   'Aug': '08',
                   'Sep': '09',
                   'Oct': '10',
                   'Nov': '11',
                   'Dec': '12'
                  }

# Remap the values of the new month_code field using the dictionary
gdp_df['Month'] = gdp_df['Month'].map(month_code_dict)

gdp_df

Unnamed: 0,Nominal-GDP-Index,Real-GDP-Index,Year,Month
0,6315.826,9480.081,1992,01
1,6356.427,9521.412,1992,02
2,6417.053,9600.052,1992,03
3,6443.275,9606.322,1992,04
4,6433.980,9588.478,1992,05
...,...,...,...,...
341,20564.740,18160.219,2020,06
342,20976.163,18453.268,2020,07
343,21151.857,18576.420,2020,08
344,21378.293,18726.640,2020,09


In [8]:
# Re-order the columns in the new_hpi_df dataframe
gdp_df = gdp_df[['Year', 'Month', 'Nominal-GDP-Index', 'Real-GDP-Index']]
gdp_df

Unnamed: 0,Year,Month,Nominal-GDP-Index,Real-GDP-Index
0,1992,01,6315.826,9480.081
1,1992,02,6356.427,9521.412
2,1992,03,6417.053,9600.052
3,1992,04,6443.275,9606.322
4,1992,05,6433.980,9588.478
...,...,...,...,...
341,2020,06,20564.740,18160.219
342,2020,07,20976.163,18453.268
343,2020,08,21151.857,18576.420
344,2020,09,21378.293,18726.640


In [9]:
# Create a new field, year_month, containing month_name and year, for ease of sorting during plotting and charting
gdp_df['Year_Month'] = gdp_df['Year'] + '-' + gdp_df['Month']
gdp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdp_df['Year_Month'] = gdp_df['Year'] + '-' + gdp_df['Month']


Unnamed: 0,Year,Month,Nominal-GDP-Index,Real-GDP-Index,Year_Month
0,1992,01,6315.826,9480.081,1992-01
1,1992,02,6356.427,9521.412,1992-02
2,1992,03,6417.053,9600.052,1992-03
3,1992,04,6443.275,9606.322,1992-04
4,1992,05,6433.980,9588.478,1992-05
...,...,...,...,...,...
341,2020,06,20564.740,18160.219,2020-06
342,2020,07,20976.163,18453.268,2020-07
343,2020,08,21151.857,18576.420,2020-08
344,2020,09,21378.293,18726.640,2020-09


In [10]:
# Convert 'Year' data type to numeric to select records for years in or after 2010
gdp_df['Year'] = gdp_df['Year'].apply(pd.to_numeric)
gdp_df  = gdp_df.loc[gdp_df["Year"] >= 2010]
gdp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdp_df['Year'] = gdp_df['Year'].apply(pd.to_numeric)


Unnamed: 0,Year,Month,Nominal-GDP-Index,Real-GDP-Index,Year_Month
216,2010,01,14670.122,15363.578,2010-01
217,2010,02,14691.394,15394.092,2010-02
218,2010,03,14802.534,15487.863,2010-03
219,2010,04,14899.668,15553.910,2010-04
220,2010,05,14899.436,15521.934,2010-05
...,...,...,...,...,...
341,2020,06,20564.740,18160.219,2020-06
342,2020,07,20976.163,18453.268,2020-07
343,2020,08,21151.857,18576.420,2020-08
344,2020,09,21378.293,18726.640,2020-09


In [11]:
# Write dataframe to csv file for plotting data in Tableau
gdp_df.to_csv(r'../data/gdprecords.csv')