In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse,rmse
from sklearn.linear_model import LinearRegression,LassoCV,RidgeCV,ElasticNetCV

import warnings
warnings.filterwarnings('ignore')

postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'

In [12]:
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
houseprices_df = pd.read_sql_query('select * from houseprices',con=engine)


# no need for an open connection, as we're only doing a single query
engine.dispose()

AttributeError: 'OptionEngine' object has no attribute 'execute'

In [None]:
house_prices_df.head()

In [None]:
house_prices_df.shape

In [None]:
house_prices_df.info()

In [None]:
house_prices_df.describe()

In [None]:
# Some columns are numerical & some are non-numerical
non_numeric_columns = house_prices_df.select_dtypes(['object']).columns
print(non_numeric_columns)
print('The number of non-numeric columns is {}'.format(len(non_numeric_columns)))

In [None]:
numeric_columns = house_prices_df.select_dtypes(['int64','float']).columns
print(numeric_columns)
print('The number of numeric columns is {}'.format(len(numeric_columns)))

In [None]:
# Missing Data

total_missing = house_prices_df.isnull().sum().sort_values(ascending = False)
percent_missing = (house_prices_df.isnull().sum()/house_prices_df.isnull().count()).sort_values(ascending = False)
missing_data = pd.concat([total_missing,percent_missing], axis =1, keys = ['Total','Percent'])
missing_data.head(25)

In [None]:
# We see 19 features has missing values

In [None]:
# Exploratory data analysis and feature selection
plt.hist(house_prices_df.saleprice)
plt.title('The distribution of sale prices')
plt.xlabel('sale prices')
plt.ylabel('number of occurrence')
plt.show()

In [None]:
# We derive correlations of the features with our target variable irrespective of their signs by taking absolute values
np.abs(house_prices_df[numeric_columns].iloc[:,1:].corr().loc[:,'saleprice']).sort_values(ascending = False)

In [None]:
# To understand the univariate relation between our target variable and non-numerical features
plt.figure(figsize = (40,60))
for index, column in enumerate(non_numeric_columns):
    plt.subplot(11,4,index+1)
    plt.bar(house_prices_df.groupby(column)['saleprice'].mean().index,house_prices_df.groupby(column)["saleprice"].mean(),
           color=("grey","green"))
    plt.title("Average saleprice wrt. {}".format(column))
    plt.ylabel("Average sale price")
    plt.xlabel(column)
    plt.xticks(rotation='vertical')

plt.tight_layout()
plt.show()

In [None]:
# As our model, we choose the 5 numerical variables that are most correlated with the saleprice. We also add 2 non-numerical features.

house_prices_df = pd.concat([house_prices_df,pd.get_dummies(house_prices_df.mszoning,prefix = 'mszoning',drop_first = True)],axis =1)
house_prices_df = pd.concat([house_prices_df,pd.get_dummies(house_prices_df.street,prefix ='street',drop_first = True)],axis =1)
dummy_column_names = list(pd.get_dummies(house_prices_df.mszoning,prefix = 'mszoning',drop_first = True).columns)
dummy_column_names = dummy_column_names + list(pd.get_dummies(house_prices_df.street, prefix = 'street',drop_first = True).columns)


In [None]:
# Regression Model
house_prices_df['totalsf'] = house_prices_df['totalbsmtsf'] + house_prices_df['firstflrsf']+ house_prices_df['secondflrsf']
house_prices_df['int_over_sf']= house_prices_df['totalsf']*house_prices_df['overallqual']
# Y is the target variable
Y = np.log1p(house_prices_df['saleprice'])
# X is the feature
X = house_prices_df[['overallqual','grlivarea','garagecars','garagearea','totalsf','int_over_sf'] + dummy_column_names]
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2,random_state = 465)
alphas = [np.power(10.0,p) for p in np.arange(-10,40,1)]

In [None]:
lrm = LinearRegression()
lrm.fit(X_train,y_train)
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)
print("R-squared of the model in training set is: {}".format(lrm.score(X_train,y_train)))
print("-----Test set statistics----")
print("R-squared of the model in test set is: {}".format(lrm.score(X_test,y_test)))
print("Mean absolute error of the prediction is : {}".format(mean_absolute_error(y_test,y_preds_test)))
print("Mean squared error of the prediction is : {}".format(mse(y_test,y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test,y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test-y_preds_test)/y_test))*100))

In [None]:
lasso_cv = LassoCV(alphas= alphas, cv =5)
lasso_cv.fit(X_train,y_train)
# We are making predictions
y_preds_train = lasso_cv.predict(X_train)
y_preds_test = lasso_cv.predict(X_test)
print("Best alpha value is: {}".format(lasso_cv.alpha_))
print("R-squared of the model is : {}".format(lasso_cv.score(X_train,y_train)))
print("----Test set statistics----")
print("R-squared of the model in test set is : {}".format(lasso_cv.score(X_test,y_test)))
print(" Mean absolute error of the prediction is: {}".format(mse(y_test,y_preds_test)))
print("Root mean  squared error of the prediction is: {}".format(rmse(y_test,y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test-y_preds_test)/y_test))*100))

In [None]:
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X_train,y_train)
# We are making predictions
y_preds_train = ridge_cv.predict(X_train)
y_preds_test = ridge_cv.predict(X_test)
print("Best alpha value is : {}".format(ridge_cv.alpha_))
print("R-squared of the model in training set is : {}".format(ridge_cv.score(X_train,y_train)))
print("----Test set statistics----")
print("R-squared of the model in test set is: {}".format(ridge_cv.score(X_test,y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test,y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test,y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test,y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test-y_preds_test)/y_test))*100))

In [None]:
elasticnet_cv = ElasticNetCV(alphas = alphas, cv =5)
elasticnet_cv.fit(X_train,y_train)
# We are making predictions here
y_preds_train = elasticnet_cv.predict(X_train)
y_preds_test = elasticnet_cv.predict(X_test)
print("Best alpha value is: {}".format(elasticnet_cv.alpha_))
print("R-sqaured of the model in training set is: {}".format(elasticnet_cv.score(X_train,y_train)))
print("---Test set statistics---")
print("R-squared of the model in test set is: {}".format(elasticnet_cv.score(X_test,y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test,y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test,y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test,y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test-y_preds_test)/y_test))*100))

In [None]:
# From the prediction results we find that the best model is the OLS regression model

In [None]:
# Integrating new factors that affect the target variable but are not there in the dataset 

In [None]:
yr_sold= pd.DataFrame(house_prices_df.yrsold)
yr_sold

In [None]:
yr_sold.describe()

In [14]:
# Economic factors affecting house prices: mortgage rate, unemployment rate,gdp, population growth, federal funds rate, cpi

mortgage_rate= pd.read_csv('C:/Users/chowd/OneDrive/Desktop/Thinkful assignments/MORTGAGE30US.csv')
unemployment_rate = pd.read_csv('C:/Users/chowd/OneDrive/Desktop/Thinkful assignments/UNEMPLOYMENTRATE.csv')
gdp= pd.read_csv('C:/Users/chowd/OneDrive/Desktop/Thinkful assignments/GDP.csv')
pop_growth = pd.read_csv('C:/Users/chowd/OneDrive/Desktop/Thinkful assignments/SPPOPGROWUSA.csv')
federal_funds_rate = pd.read_csv('C:/Users/chowd/OneDrive/Desktop/Thinkful assignments/Federal_funds_rate.csv')
cpi = pd.read_csv('C:/Users/chowd/OneDrive/Desktop/Thinkful assignments/CPI.csv')

In [15]:
mortgage_rate['DATE'] = pd.to_datetime(mortgage_rate['DATE'])
unemployment_rate['DATE']= pd.to_datetime(unemployment_rate['DATE'])
gdp['DATE'] = pd.to_datetime(gdp['DATE'])
pop_growth['DATE'] = pd.to_datetime(pop_growth['DATE'])
federal_funds_rate['DATE']= pd.to_datetime(federal_funds_rate['DATE'])
cpi['DATE'] = pd.to_datetime(cpi['DATE'])

In [16]:
mortgage_rate= mortgage_rate[mortgage_rate['DATE'].dt.year >= 2006]
unemployment_rate = unemployment_rate[unemployment_rate['DATE'].dt.year >= 2006]
gdp = gdp[gdp['DATE'].dt.year >= 2006]
federal_funds_rate= federal_funds_rate[federal_funds_rate['DATE'].dt.year >= 2006]
cpi = cpi[cpi['DATE'].dt.year >= 2006]

In [None]:
house_prices_df.yrsold.head(10)

In [None]:
mortgage_rate.head(10)

In [19]:
mortgage_df= mortgage_rate.groupby([mortgage_rate['DATE'].dt.year]).mean()
unemployment_df = unemployment_rate.groupby([unemployment_rate['DATE'].dt.year]).mean()
gdp_df = gdp.groupby([gdp['DATE'].dt.year]).mean()
federal_funds_df = federal_funds_rate.groupby([federal_funds_rate['DATE'].dt.year]).mean()
cpi_df = cpi.groupby([cpi['DATE'].dt.year]).mean()

In [None]:
mortgage_df.index

In [None]:
merge_df = pd.merge(house_prices_df,mortgage_df,left_on= 'yrsold',right_index= True,how = 'left')
house_prices_df['Final_mortgage_rate'] = merge_df['MORTGAGE30US']

In [None]:
merge_df = pd.merge(house_prices_df,unemployment_df,left_on= 'yrsold',right_index= True,how = 'left')
house_prices_df['Unemployment Rate'] = merge_df['UNRATE']

In [None]:
merge_df = pd.merge(house_prices_df,gdp_df,left_on= 'yrsold',right_index= True,how = 'left')
house_prices_df['Gdp'] = merge_df['GDP']

In [None]:
merge_df = pd.merge(house_prices_df,federal_funds_df,left_on= 'yrsold',right_index = True ,how = 'left')
house_prices_df['Federal funds rate'] = merge_df['DFF']

In [None]:
merge_df = pd.merge(house_prices_df,cpi_df,left_on= 'yrsold',right_index= True,how = 'left')
house_prices_df['Cpi'] = merge_df['FPCPITOTLZGUSA']
house_prices_df

In [None]:
house_prices_df['Federal funds rate'].head(3)

In [20]:
federal_funds_df.head(3)

Unnamed: 0_level_0,DFF
DATE,Unnamed: 1_level_1
2019,2.129
2020,0.372
2021,0.079


In [21]:
gdp_df.head(3)

Unnamed: 0_level_0,GDP
DATE,Unnamed: 1_level_1
2006,13815.583
2007,14474.227
2008,14769.862
