In [1]:
#Import dependencies
import pandas as pd
from datetime import datetime
from sqlalchemy import create_engine
import numpy as np

In [2]:
# Input CSV locations, one per economic indicator.
# All are relative paths, so they resolve against the current working directory.
# NOTE(review): 'newmorrtgageraterecords.csv' is spelled with a double "r";
# presumably this matches the file name on disk — confirm before correcting.

housing_inventory_recs = '../data/housingsupplyrecords.csv'    # real estate housing inventory
mortgage_rate_recs = '../data/newmorrtgageraterecords.csv'     # mortgage interest rates
hpi_recs = '../data/newhpirecords.csv'                         # housing price index
employment_recs = '../data/newemploymentrecords.csv'           # unemployment and hourly wages
gdp_recs = '../data/gdprecords.csv'                            # US gross domestic product


In [3]:
# Load every source CSV into its own DataFrame using read_csv defaults.
# The files are read in the order listed, one frame per path.
housing_df, mortgage_rate_df, hpi_df, employment_df, gdp_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        housing_inventory_recs,
        mortgage_rate_recs,
        hpi_recs,
        employment_recs,
        gdp_recs,
    )
)


In [4]:
# Preview the Real Estate Housing Inventory records as loaded from CSV.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the output are
# presumably row indexes persisted by earlier CSV exports — confirm; the model
# frame built below selects only Year_Month, Housing_SA and Housing_Classification_SA.
housing_df

Unnamed: 0.1,Unnamed: 0,Year,Month,Housing_SA,Housing_NSA,Housing_MOM_SA,Housing_MOM_NSA,Housing_YOY_SA,Housing_YOY_NSA,Housing_Classification_SA,Housing_Classification_NSA,Year_Month
0,0,2012,2,5.4,6.8,-0.4,-0.6,,-2.6,Buyer's Market,Buyer's Market,2012-02
1,1,2012,3,5.6,5.4,0.2,-1.4,,-1.5,Buyer's Market,Buyer's Market,2012-03
2,2,2012,4,5.5,5.3,-0.1,-0.1,,-1.6,Buyer's Market,Buyer's Market,2012-04
3,3,2012,5,5.3,4.7,-0.2,-0.7,,-1.9,Buyer's Market,Balanced Market,2012-05
4,4,2012,6,5.2,4.4,-0.2,-0.3,,-1.4,Buyer's Market,Balanced Market,2012-06
...,...,...,...,...,...,...,...,...,...,...,...,...
101,101,2020,7,1.9,1.7,-0.6,-0.4,-1.0,-0.9,Seller's Market,Seller's Market,2020-07
102,102,2020,8,1.8,1.7,-0.1,0.0,-1.1,-0.9,Seller's Market,Seller's Market,2020-08
103,103,2020,9,1.6,1.7,-0.1,0.0,-1.3,-1.4,Seller's Market,Seller's Market,2020-09
104,104,2020,10,1.5,1.6,-0.1,-0.1,-1.4,-1.4,Seller's Market,Seller's Market,2020-10


In [5]:
# Seed the model frame from the housing data. Three columns are carried over:
# the Year_Month key (used for every later merge), the seasonally adjusted
# housing value (Housing_SA) and the seasonally adjusted market classification.
data_model_df = housing_df.loc[
    :, ['Year_Month', 'Housing_SA', 'Housing_Classification_SA']
]
data_model_df

Unnamed: 0,Year_Month,Housing_SA,Housing_Classification_SA
0,2012-02,5.4,Buyer's Market
1,2012-03,5.6,Buyer's Market
2,2012-04,5.5,Buyer's Market
3,2012-05,5.3,Buyer's Market
4,2012-06,5.2,Buyer's Market
...,...,...,...
101,2020-07,1.9,Seller's Market
102,2020-08,1.8,Seller's Market
103,2020-09,1.6,Seller's Market
104,2020-10,1.5,Seller's Market


In [6]:
# Preview the Mortgage Interest Rate records as loaded from CSV.
# The displayed rows run newest-first (2021-01 at the top, 2010-02 at the bottom).
mortgage_rate_df

Unnamed: 0.1,Unnamed: 0,Year,Month,Rate,Rate_Change,Percent_Rate_Change,Year_Month
0,0,2021,1,2.65,-0.02,-0.007491,2021-01
1,1,2020,12,2.67,-0.05,-0.018382,2020-12
2,2,2020,11,2.72,-0.09,-0.032028,2020-11
3,3,2020,10,2.81,-0.09,-0.031034,2020-10
4,4,2020,9,2.90,-0.01,-0.003436,2020-09
...,...,...,...,...,...,...,...
128,128,2010,5,4.78,-0.28,-0.055336,2010-05
129,129,2010,4,5.06,0.07,0.014028,2010-04
130,130,2010,3,4.99,-0.06,-0.011881,2010-03
131,131,2010,2,5.05,0.07,0.014056,2010-02


In [7]:
# Take the Year_Month key and the Rate field from the mortgage rate data,
# relabel Rate as Mortgage_Rate, and left-join the result onto data_model_df
# so that every housing row is kept.
columns_to_keep = ['Year_Month', 'Rate']

new_mortgage_rate_df = (
    mortgage_rate_df[columns_to_keep]
    .rename(columns={"Rate": "Mortgage_Rate"})
)

data_model_df = data_model_df.merge(new_mortgage_rate_df, how='left', on='Year_Month')
data_model_df

Unnamed: 0,Year_Month,Housing_SA,Housing_Classification_SA,Mortgage_Rate
0,2012-02,5.4,Buyer's Market,3.95
1,2012-03,5.6,Buyer's Market,3.99
2,2012-04,5.5,Buyer's Market,3.88
3,2012-05,5.3,Buyer's Market,3.75
4,2012-06,5.2,Buyer's Market,3.66
...,...,...,...,...
101,2020-07,1.9,Seller's Market,2.99
102,2020-08,1.8,Seller's Market,2.91
103,2020-09,1.6,Seller's Market,2.90
104,2020-10,1.5,Seller's Market,2.81


In [8]:
# Preview the Housing Price Index records as loaded from CSV.
# NOTE(review): in the displayed rows the Year column disagrees with Year_Month
# (e.g. Year 2021.0 / Month 11 next to Year_Month 2020-11, and Year 2010.0 /
# Month 12 next to 2009-12) — confirm upstream. Only Year_Month and SA_Value
# are used when this frame is merged into the model data.
hpi_df

Unnamed: 0.1,Unnamed: 0,Year,Month,NSA_Value,SA_Value,NSA_Change,SA_Change,NSA_PctChange,SA_PctChange,Year_Month
0,0,2021.0,11,216.362121,216.317308,2.156922,3.177881,0.010069,0.014910,2020-11
1,1,2020.0,10,214.205199,213.139427,2.167205,3.091581,0.010221,0.014718,2020-10
2,2,2020.0,9,212.037994,210.047846,2.336903,3.124809,0.011144,0.015101,2020-09
3,3,2020.0,8,209.701092,206.923037,2.583641,3.019234,0.012474,0.014807,2020-08
4,4,2020.0,7,207.117450,203.903803,2.636928,2.580970,0.012896,0.012820,2020-07
...,...,...,...,...,...,...,...,...,...,...
128,128,2010.0,3,130.753131,131.372276,0.546432,-0.105877,0.004197,-0.000805,2010-03
129,129,2010.0,2,130.206699,131.478154,-0.059507,-0.087918,-0.000457,-0.000668,2010-02
130,130,2010.0,1,130.266205,131.566072,-0.869659,-0.128171,-0.006632,-0.000973,2010-01
131,131,2010.0,12,131.135864,131.694242,-1.040088,-0.130920,-0.007869,-0.000993,2009-12


In [9]:
# Take the Year_Month key and the seasonally adjusted HPI value (SA_Value)
# from the hpi data, relabel it as Housing_Price_Index_SA, and left-join the
# result onto data_model_df.
columns_to_keep = ['Year_Month', 'SA_Value']

new_hpi_df = (
    hpi_df[columns_to_keep]
    .rename(columns={"SA_Value": "Housing_Price_Index_SA"})
)

data_model_df = data_model_df.merge(new_hpi_df, how='left', on='Year_Month')
data_model_df

Unnamed: 0,Year_Month,Housing_SA,Housing_Classification_SA,Mortgage_Rate,Housing_Price_Index_SA
0,2012-02,5.4,Buyer's Market,3.95,121.795906
1,2012-03,5.6,Buyer's Market,3.99,122.349176
2,2012-04,5.5,Buyer's Market,3.88,123.066985
3,2012-05,5.3,Buyer's Market,3.75,123.850443
4,2012-06,5.2,Buyer's Market,3.66,124.472366
...,...,...,...,...,...
101,2020-07,1.9,Seller's Market,2.99,203.903803
102,2020-08,1.8,Seller's Market,2.91,206.923037
103,2020-09,1.6,Seller's Market,2.90,210.047846
104,2020-10,1.5,Seller's Market,2.81,213.139427


In [10]:
# Preview the Unemployment and Hourly Wages records as loaded from CSV.
# Column names here are lower-case (year_month, unemployment_rate, hourly_wage),
# unlike the Title_Case names used by the other source frames.
employment_df

Unnamed: 0.1,Unnamed: 0,year,month_code,month_name,unemployment_rate,hourly_wage,year_month
0,0,2020,12,December,6.7,29.81,2020-12
1,1,2020,11,November,6.7,29.58,2020-11
2,2,2020,10,October,6.9,29.49,2020-10
3,3,2020,9,September,7.8,29.47,2020-09
4,4,2020,8,August,8.4,29.45,2020-08
...,...,...,...,...,...,...,...
115,115,2011,5,May,9.0,22.99,2011-05
116,116,2011,4,April,9.1,22.92,2011-04
117,117,2011,3,March,9.0,22.87,2011-03
118,118,2011,2,February,9.0,22.87,2011-02


In [11]:
# Bring the employment column names in line with the other frames, then keep
# the Year_Month key plus the unemployment rate and hourly wage, and left-join
# them onto data_model_df.
# Note: employment_df itself is rebound to the renamed frame.
employment_df = employment_df.rename(
    columns={
        "year_month": "Year_Month",
        "unemployment_rate": "Unemployment_Rate",
        "hourly_wage": "Hourly_Wage",
    }
)

columns_to_keep = ['Year_Month', 'Unemployment_Rate', 'Hourly_Wage']
new_employment_df = employment_df.loc[:, columns_to_keep]

data_model_df = data_model_df.merge(new_employment_df, how='left', on='Year_Month')
data_model_df

Unnamed: 0,Year_Month,Housing_SA,Housing_Classification_SA,Mortgage_Rate,Housing_Price_Index_SA,Unemployment_Rate,Hourly_Wage
0,2012-02,5.4,Buyer's Market,3.95,121.795906,8.3,23.27
1,2012-03,5.6,Buyer's Market,3.99,122.349176,8.2,23.36
2,2012-04,5.5,Buyer's Market,3.88,123.066985,8.2,23.39
3,2012-05,5.3,Buyer's Market,3.75,123.850443,8.2,23.39
4,2012-06,5.2,Buyer's Market,3.66,124.472366,8.2,23.46
...,...,...,...,...,...,...,...
101,2020-07,1.9,Seller's Market,2.99,203.903803,10.2,29.35
102,2020-08,1.8,Seller's Market,2.91,206.923037,8.4,29.45
103,2020-09,1.6,Seller's Market,2.90,210.047846,7.8,29.47
104,2020-10,1.5,Seller's Market,2.81,213.139427,6.9,29.49


In [12]:
# Preview the US Gross Domestic Product records as loaded from CSV.
# The GDP value columns use hyphenated names (Nominal-GDP-Index, Real-GDP-Index).
gdp_df

Unnamed: 0.1,Unnamed: 0,Year,Month,Nominal-GDP-Index,Real-GDP-Index,Year_Month
0,216,2010,1.0,14670.122,15363.578,2010-01
1,217,2010,2.0,14691.394,15394.092,2010-02
2,218,2010,3.0,14802.534,15487.863,2010-03
3,219,2010,4.0,14899.668,15553.910,2010-04
4,220,2010,5.0,14899.436,15521.934,2010-05
...,...,...,...,...,...,...
125,341,2020,6.0,20564.740,18160.219,2020-06
126,342,2020,7.0,20976.163,18453.268,2020-07
127,343,2020,8.0,21151.857,18576.420,2020-08
128,344,2020,9.0,21378.293,18726.640,2020-09


In [13]:
# Nominal GDP is the market value of goods and services produced in an economy, unadjusted for inflation.
# Real GDP is nominal GDP, adjusted for inflation. Select Real GDP for the model.
# Extract just the Year-Month field, and unemployment_rate field, and the hourly_wage field from the employment data and merge with the data_model_df
columns_to_keep=['Year_Month','Real-GDP-Index']
new_gdp_df = gdp_df[columns_to_keep]

gdp_df = gdp_df.rename(columns={"Real-GDP-Index": "Real_GDP"})

data_model_df = pd.merge(data_model_df, new_gdp_df, on=['Year_Month'], how='left')
data_model_df

Unnamed: 0,Year_Month,Housing_SA,Housing_Classification_SA,Mortgage_Rate,Housing_Price_Index_SA,Unemployment_Rate,Hourly_Wage,Real-GDP-Index
0,2012-02,5.4,Buyer's Market,3.95,121.795906,8.3,23.27,16269.715
1,2012-03,5.6,Buyer's Market,3.99,122.349176,8.2,23.36,16083.637
2,2012-04,5.5,Buyer's Market,3.88,123.066985,8.2,23.39,16183.036
3,2012-05,5.3,Buyer's Market,3.75,123.850443,8.2,23.39,16222.008
4,2012-06,5.2,Buyer's Market,3.66,124.472366,8.2,23.46,16191.336
...,...,...,...,...,...,...,...,...
101,2020-07,1.9,Seller's Market,2.99,203.903803,10.2,29.35,18453.268
102,2020-08,1.8,Seller's Market,2.91,206.923037,8.4,29.45,18576.420
103,2020-09,1.6,Seller's Market,2.90,210.047846,7.8,29.47,18726.640
104,2020-10,1.5,Seller's Market,2.81,213.139427,6.9,29.49,18862.228


In [14]:
# Write the assembled model dataframe to a CSV file for use in the machine learning model.
# No index argument is passed, so to_csv's default applies and the row index is
# written as a leading unnamed column.
data_model_df.to_csv(r'../data/machinelearningdata.csv')