In [None]:
#import all the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

## Datasets
To predict house prices using supply-demand features, three main datasets have been used.
1. Federal Reserve Interest Rates (1954 - 2017)
2. Unemployment Rate by County in the USA (2000 - 2018)
3. Zillow economics data (County_time_series and Crosswalk)

### Zillow economics data

In [None]:
# Load the four raw CSV inputs; all paths are relative to the notebook's working directory.
# Zillow county-level time series and the county crosswalk table
county_time_series = pd.read_csv("../input/zecon/County_time_series.csv")
crosswalk = pd.read_csv("../input/zecon/CountyCrossWalk_Zillow.csv")
# Unemployment and interest-rate tables, used in their own sections further down
unemployment = pd.read_csv("../input/2018-unemployment-rate-by-county/GeoFRED_Unemployment_Rate_by_County_Percent.csv")
interest_rates = pd.read_csv("../input/interest-rates/index.csv")

In [None]:
#displaying the first few rows of the county time-series dataset
county_time_series.head()

In [None]:
#selecting and displaying three columns from the dataset
# The selection and the int64 cast of FIPS happen in one expression, so the result
# is a new frame. The original assigned a column onto a `[[...]]` selection, which
# raises SettingWithCopyWarning.
crosswalk = crosswalk[['FIPS', 'CountyName', 'StateName']].astype({'FIPS': 'int64'})
crosswalk.head()

#### Only the rows dated '2000-01-01' or later are kept, along with the Date, RegionName and house price (ZHVI_AllHomes) columns

In [None]:
# Keep rows from 2000 onward and only the date, region and house-price columns.
# Dates are compared as strings, matching how they are stored in the CSV.
new = county_time_series.loc[
    county_time_series['Date'] >= '2000-01-01',
    ['Date', 'RegionName', 'ZHVI_AllHomes'],
]

In [None]:
# Preview the filtered frame
new.head()

In [None]:
# Rename the columns to the names used in the rest of the notebook; 'FIPS' is the
# key shared with the crosswalk table.
# rename() returns a new frame, which avoids in-place mutation of a filtered
# selection. The original `index=str` is dropped: it only converted the row labels
# to strings, which nothing downstream uses.
new = new.rename(columns={"RegionName": "FIPS", "ZHVI_AllHomes": "AverageHousePrice"})
new.head()

#### The datasets Crosswalk and County_time_Series have a common column, which is 'FIPS'/'RegionName', and both of them can be merged on that basis.

In [None]:
# Join the price series with the county/state names on the shared FIPS code
# (default inner join: rows without a crosswalk match are dropped)
new101 = pd.merge(new, crosswalk, on="FIPS")
new101.head()

#### To line up with the interest rate data, every date is moved to the first day of its month, as shown below.

In [None]:
# Replace the day component of every 'YYYY-MM-DD' date with '01'.
# Vectorised string operations give the same strings as the original per-row
# `.iloc` loop, without building one Series per row.
# `temp` stays a plain list because the next cell assigns it to new101['Date'].
temp = (new101['Date'].str.rsplit('-', n=1).str[0] + '-01').tolist()

In [None]:
# Overwrite the original dates with the month-start strings built in the previous cell
new101['Date'] = temp 

#### Below is the merged data along with changed dates

In [None]:
# Preview the merged frame with the adjusted dates
new101.head()

### Unemployment Data
Merging the unemployment rate (demand factor) into our data.

In [None]:
# (rows, columns) of the unemployment table
unemployment.shape

In [None]:
# Preview the unemployment table
unemployment.head()

In [None]:
#create a column named 'UnemploymentRate'
# Start with NaN rather than 0. The column is later filled with float rates, so a
# float placeholder avoids the int-to-float upcast on assignment, and a row that
# is never filled shows up as missing instead of looking like a real 0% rate.
new101['UnemploymentRate'] = np.nan

In [None]:
#get state avg for missing unemployment rates
def getStateAvg(fips, date):
    """Average the unemployment rate over a set of counties for one column.

    Parameters
    ----------
    fips : iterable
        Region codes to look up in ``unemployment['Region Code']``.
    date : str
        Name of the ``unemployment`` column to read (the caller passes a year).

    Returns
    -------
    The mean of the first matching row's value for every code that was found.
    Codes with no matching row are skipped.

    Raises
    ------
    ValueError
        If no code could be looked up (no matching rows, or the column is missing).
    """
    running_unempRate, considered = 0, 0
    for code in fips:
        try:
            rate = unemployment.loc[unemployment['Region Code'] == code, date].iloc[0]
        except (KeyError, IndexError):
            # KeyError: the column does not exist; IndexError: no row for this code.
            # Only these two are skipped, so other errors and Ctrl-C still surface.
            continue
        running_unempRate += rate
        considered += 1
    if considered == 0:
        # The original `raise('...')` raised a plain string, which itself fails with
        # "TypeError: exceptions must derive from BaseException".
        raise ValueError(
            'Issue in calculating the getStateAvg: no unemployment rate found for column {0!r}'.format(date)
        )
    return running_unempRate / considered

In [None]:
#merge unemployment data with zecon data
# For every row, look up the county's unemployment rate for the row's year. When
# the county (or the year column) is missing, fall back to the average over the
# counties of the same state.

# Resolve the target column by name instead of relying on it being the last column.
rate_col = new101.columns.get_loc('UnemploymentRate')
# The fallback depends only on (state, year), so compute each one once.
state_avg_cache = {}

for i in tqdm(range(len(new101))):
    row = new101.iloc[i]  # build the row once instead of three times
    regionCode = row['FIPS']
    state = row['StateName']
    year = row['Date'].split('-')[0]
    try:
        unempRate = unemployment[unemployment['Region Code'] == regionCode][year].iloc[0]
    except (KeyError, IndexError):
        # KeyError: no column for this year; IndexError: county not in the table.
        # A bare `except` would also swallow KeyboardInterrupt and real bugs.
        if (state, year) not in state_avg_cache:
            # Put the average of that state: collect all of its FIPS numbers
            fips = list(set(new101[new101['StateName'] == state]['FIPS']))
            state_avg_cache[(state, year)] = round(getStateAvg(fips, year), 1)
        unempRate = state_avg_cache[(state, year)]

    new101.iloc[i, rate_col] = unempRate

#### The below data shows UnemploymentRate feature added to the dataset

In [None]:
# Preview the frame with the UnemploymentRate column filled in
new101.head()

### Interest Rates
Merging Real GDP (percent change) and the inflation rate (supply factors) into our data.

In [None]:
# (rows, columns) of the interest-rate table
interest_rates.shape

In [None]:
# Preview the interest-rate table
interest_rates.head()

In [None]:
#keep only the date parts and the two rate columns
# .copy() makes this an independent frame, so adding the 'Date' column later does
# not raise SettingWithCopyWarning.
# The original also called `interest_rates.dropna()` without assigning the result,
# so no rows were ever removed. That no-op is dropped rather than activated:
# the missing values are imputed with column means after the merge further down.
interest_rates = interest_rates[['Year','Month','Day', 'Real GDP (Percent Change)', 'Inflation Rate']].copy()
interest_rates.head()

In [None]:
#build a zero-padded 'YYYY-MM-DD' string per row, to be used as the merge key 'Date'
# Vectorised equivalent of the original per-row loop: cast each part to int and
# then to str, and left-pad month and day to two digits.
date_parts = interest_rates[['Year', 'Month', 'Day']].astype(int).astype(str)
# `temp` stays a plain list because the next cell assigns it to interest_rates['Date'].
temp = (
    date_parts['Year']
    + '-' + date_parts['Month'].str.zfill(2)
    + '-' + date_parts['Day'].str.zfill(2)
).tolist()

In [None]:
# Attach the date strings built in the previous cell as the 'Date' column
interest_rates['Date'] = temp

In [None]:
# Preview the interest-rate table with the new 'Date' column
interest_rates.head()

In [None]:
# Keep the two rate columns plus the 'Date' merge key; Year/Month/Day are no longer needed
interest_rates = interest_rates.loc[
    :, ['Real GDP (Percent Change)', 'Inflation Rate', 'Date']
]
interest_rates.head()

In [None]:
# Join the county-level frame with the rates on 'Date'
# (default inner join: dates missing from either side are dropped)
new102 = pd.merge(new101, interest_rates, on='Date')
new102.head()

#### The dataset below is a compilation of the zecon, unemployment and interest rate data, along with 'FIPS' (which represents the location) and 'Date', ranging from 2000 to 2017

In [None]:
# (rows, columns) of the fully merged frame
new102.shape

In [None]:
#checking missing values: number of nulls per column
new102.isnull().sum()

#### Tackling missing values in data columns by calculating the mean of the column and replacing missing values with it.

In [None]:
# Replace missing house prices with the column mean.
# The result is assigned back to the column. `new102[col].fillna(..., inplace=True)`
# works on a column selection: this chained in-place form is deprecated and
# modifies nothing under pandas Copy-on-Write.
# NOTE(review): this column is the regression target later in the notebook, and
# mean-imputing it fabricates labels. Consider dropping those rows instead.
mean_avghouseprice = new102['AverageHousePrice'].mean()
new102['AverageHousePrice'] = new102['AverageHousePrice'].fillna(mean_avghouseprice)

In [None]:
# Replace missing Real GDP values with the column mean.
# The result is assigned back, because in-place fillna on a column selection is
# deprecated chained assignment and a no-op under pandas Copy-on-Write.
mean_gdp = new102['Real GDP (Percent Change)'].mean()
new102['Real GDP (Percent Change)'] = new102['Real GDP (Percent Change)'].fillna(mean_gdp)

In [None]:
# Replace missing inflation rates with the column mean.
# The result is assigned back, because in-place fillna on a column selection is
# deprecated chained assignment and a no-op under pandas Copy-on-Write.
mean_irate = new102['Inflation Rate'].mean()
new102['Inflation Rate'] = new102['Inflation Rate'].fillna(mean_irate)

In [None]:
# Preview the last rows after imputation
new102.tail()

## Final Data

#### 'Date' is not a numerical feature the model can train on, and the data already covers the whole 2000 to 2017 period, so we drop it and create the final dataset shown below.

In [None]:
# Final modelling table: the location code, the target price and the three economic features
final_data = new102.loc[
    :,
    ['FIPS', 'AverageHousePrice', 'UnemploymentRate', 'Real GDP (Percent Change)', 'Inflation Rate'],
]
final_data.head()

## Data Visualisation

In [None]:
#Plotting histograms for all features in the data set
for column in final_data.columns:
    # `plt.figsize = (5, 5)` only set an attribute on the pyplot module and had no
    # effect; the size has to be passed when the figure is created.
    plt.figure(figsize=(5, 5))
    # Plot from final_data itself, the frame whose columns are being iterated
    plt.hist(final_data[column])
    plt.title(column)
    plt.show()

In [None]:
# Heatmap of absolute pairwise correlations between all columns of the final dataset
corrmat = final_data.corr()
fig, ax = plt.subplots(figsize=(6, 6))
# the colour scale is capped at 0.4
sns.heatmap(corrmat.abs(), vmax=.4, square=True, ax=ax)

In [None]:
# Report how each feature correlates with the house price
price_corr = corrmat.loc["AverageHousePrice"]  # the house-price row of the correlation matrix
print ("The correlation between a house price and unemployment rate is: {0}".format(price_corr["UnemploymentRate"]))
print ("The correlation between a house price and Real GDP is: {0}".format(price_corr["Real GDP (Percent Change)"]))
print ("The correlation between a house price and Inflation Rate is: {0}".format(price_corr["Inflation Rate"]))
print ("The correlation between a house price and FIPS is: {0}".format(price_corr["FIPS"]))

## Model

In [None]:
#Model
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Gradient-boosted regressor with squared-error loss.
# 'reg:squarederror' is the current name of the objective formerly called
# 'reg:linear'; the old name is deprecated and may be rejected by newer releases.
# `reg_alpha` is the documented scikit-learn-API name for the L1 term (`alpha`).
# NOTE(review): the variable name `xgboost` shadows the package name. It is kept
# because later cells refer to it.
xgboost = XGBRegressor(objective='reg:squarederror',
                       colsample_bytree=0.3,
                       learning_rate=0.1,
                       max_depth=5,
                       reg_alpha=10,
                       random_state=777,
                       n_estimators=100)

In [None]:
#shuffling data before splitting into train and test
# sample() returns a new frame. The original discarded it, so nothing was shuffled
# and the whole frame was echoed as cell output. The result is now assigned back,
# with a fixed seed so that the shuffle is reproducible.
final_data = final_data.sample(frac=1, random_state=777)
final_data.head()

In [None]:
# Separate the features from the target, then hold out 20% of the rows for testing
feature_columns = ['FIPS', 'UnemploymentRate', 'Real GDP (Percent Change)', 'Inflation Rate']
x = final_data[feature_columns]
y = final_data[['AverageHousePrice']]

x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Fit on the training split, predict the held-out split and report the mean squared error
xgboost.fit(x_train, y_train)
y_pred = xgboost.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print("mse of the model : ", mse)

In [None]:
# Importance scores of the fitted model, one value per feature
print("This is the importance of each feature: {0}".format(xgboost.feature_importances_))

In [None]:
#Plot showing feature importance
# The import still resolves to the xgboost package, even though the name `xgboost`
# is bound to the fitted model in this notebook.
from xgboost import plot_importance
plot_importance(xgboost)
plt.show()

#### Checking how the model works.

In [None]:
# Spot check: show the first five test rows next to the predicted and the actual prices
print ("These are the first five entries")
print (x_test.head(5))
print ("These are the model prediction for these entries")
print (xgboost.predict(x_test.head(5)))
print ("These are the actual prices")
print (y_test.head(5))