In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import features
import numpy as np
import matplotlib.pyplot as plt

In [3]:
ames = pd.read_csv("../data/engineered.csv")
ames.drop("Unnamed: 0", axis=1, inplace=True)

In [4]:
model = LinearRegression()

# Select target and features to regress on
target = ames.LogSalePrice
X = ames[[
    # Size
    'GrLivArea', 'AllBathrooms', 'BedroomAbvGr',
    # Type
    'MSSubClass', 'MSZoning',
    # Niceness
    'OverallQual', 'OverallCond', 'Neighborhood', 'KitchenQual',
    # Sale
    'SaleCondition', 'YrSold',
    # Features
    'CentralAir', 'Fireplaces',
    # Others
    'IsNearNegativeCondition', 'LandContour'
]]
X

Unnamed: 0,GrLivArea,AllBathrooms,BedroomAbvGr,MSSubClass,MSZoning,OverallQual,OverallCond,Neighborhood,KitchenQual,SaleCondition,YrSold,CentralAir,Fireplaces,IsNearNegativeCondition,LandContour
0,856,2.0,2,Dwelling_30,RL,6,6,SWISU,3,Normal,Yr_2010,Y,1,False,Lvl
1,1049,3.0,2,Dwelling_120,RL,5,5,Edwards,4,Normal,Yr_2009,Y,0,False,Lvl
2,1001,1.0,2,Dwelling_30,C (all),5,9,IDOTRR,4,Normal,Yr_2007,Y,0,False,Lvl
3,1039,1.0,2,Dwelling_70,RL,4,8,OldTown,3,Normal,Yr_2009,Y,0,False,Lvl
4,1665,3.5,3,Dwelling_60,RL,8,6,NWAmes,4,Normal,Yr_2009,Y,0,False,Lvl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2572,952,1.0,2,Dwelling_30,RL,6,6,BrkSide,2,Normal,Yr_2009,N,1,False,Lvl
2573,1733,2.0,4,Dwelling_20,RL,3,5,Edwards,3,Normal,Yr_2009,Y,1,False,Lvl
2574,2002,2.0,4,Dwelling_90,RH,5,6,Crawfor,3,Normal,Yr_2007,N,0,False,HLS
2575,1842,3.5,3,Dwelling_60,RL,7,5,CollgCr,4,Normal,Yr_2007,Y,1,False,Lvl


In [5]:
# Dummify categorical features from all features selected for our model
all_categorical_features = set(features.get_categorical_features())
X_categorical_features = list(all_categorical_features.intersection(set(X.columns)))
X = pd.get_dummies(X, columns=X_categorical_features, prefix=X_categorical_features, drop_first=True)
X

Unnamed: 0,GrLivArea,AllBathrooms,BedroomAbvGr,OverallQual,OverallCond,KitchenQual,Fireplaces,IsNearNegativeCondition,MSZoning_C (all),MSZoning_FV,...,MSSubClass_Dwelling_85,MSSubClass_Dwelling_90,LandContour_HLS,LandContour_Low,LandContour_Lvl,YrSold_Yr_2007,YrSold_Yr_2008,YrSold_Yr_2009,YrSold_Yr_2010,CentralAir_Y
0,856,2.0,2,6,6,3,1,False,0,0,...,0,0,0,0,1,0,0,0,1,1
1,1049,3.0,2,5,5,4,0,False,0,0,...,0,0,0,0,1,0,0,1,0,1
2,1001,1.0,2,5,9,4,0,False,1,0,...,0,0,0,0,1,1,0,0,0,1
3,1039,1.0,2,4,8,3,0,False,0,0,...,0,0,0,0,1,0,0,1,0,1
4,1665,3.5,3,8,6,4,0,False,0,0,...,0,0,0,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2572,952,1.0,2,6,6,2,1,False,0,0,...,0,0,0,0,1,0,0,1,0,0
2573,1733,2.0,4,3,5,3,1,False,0,0,...,0,0,0,0,1,0,0,1,0,1
2574,2002,2.0,4,5,6,3,0,False,0,0,...,0,1,1,0,0,1,0,0,0,0
2575,1842,3.5,3,7,5,4,1,False,0,0,...,0,0,0,0,1,1,0,0,0,1


In [6]:
# Create training set and holdout
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.3, random_state=42)

In [7]:
model.fit(X=X_train, y=y_train)
y_predict = model.predict(X_test)


In [8]:
# Score model
print(f"Coefficient of Determination training set: {model.score(X=X_train, y=y_train)}")
print(f"Coefficient of Determination holdout set: {model.score(X=X_test, y=y_test)}")
print(f"MSE: {mean_squared_error(y_true=y_test, y_pred=y_predict)}")

Coefficient of Determination training set: 0.9106266488474948
Coefficient of Determination holdout set: 0.8896179322540221
MSE: 0.01691762133398028


In [9]:
# Beta coefficients
beta_names = np.insert(model.feature_names_in_, 0, "__Intercept__")
log_coefficients = np.insert(model.coef_, 0, model.intercept_)
unlogged_coefficients = np.insert(np.exp(model.coef_), 0, np.exp(model.intercept_))
coeff_array = np.vstack([beta_names, log_coefficients, unlogged_coefficients]).T
pd.DataFrame(coeff_array, columns=['Features', 'Coefficients (log)', 'Coefficients'])

Unnamed: 0,Features,Coefficients (log),Coefficients
0,__Intercept__,10.199127,26879.702386
1,GrLivArea,0.00031,1.00031
2,AllBathrooms,0.065081,1.067245
3,BedroomAbvGr,-0.025245,0.975071
4,OverallQual,0.078074,1.081202
...,...,...,...
65,YrSold_Yr_2007,-0.017802,0.982356
66,YrSold_Yr_2008,-0.017457,0.982695
67,YrSold_Yr_2009,-0.032092,0.968417
68,YrSold_Yr_2010,-0.010296,0.989757
