In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

import statsmodels.api as sm

In [2]:
df = pd.read_csv('eda_data')

In [3]:
df.columns

Index(['BOROUGH', 'NEIGHBORHOOD', 'BUILDING CLASS CATEGORY',
       'TAX CLASS AT PRESENT', 'BLOCK', 'LOT', 'BUILDING CLASS AT PRESENT',
       'ZIP CODE', 'RESIDENTIAL UNITS', 'COMMERCIAL UNITS', 'TOTAL UNITS',
       'LAND SQUARE FEET', 'GROSS SQUARE FEET', 'YEAR BUILT',
       'TAX CLASS AT TIME OF SALE', 'BUILDING CLASS AT TIME OF SALE',
       'SALE PRICE', 'SALE DATE', 'age', 'price_m', 'MONTH SOLD'],
      dtype='object')

In [4]:
df_model = df[['SALE PRICE', 'age', 'MONTH SOLD', 'LAND SQUARE FEET', 'TOTAL UNITS', 'BUILDING CLASS CATEGORY', 
               'BOROUGH', 'NEIGHBORHOOD', 'TAX CLASS AT PRESENT', 'BUILDING CLASS AT PRESENT']]

In [5]:
df.shape

(26841, 21)

In [6]:
df_dummy = pd.get_dummies(df_model)

In [7]:
df_dummy.shape

(26841, 375)

In [8]:
X = df_dummy.drop('SALE PRICE', axis=1)
y = df_dummy['SALE PRICE'].values

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Stats model linreg

In [10]:
X_sm = X = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [11]:
model = sm.OLS(y,X_sm)
model.fit().summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.626
Model:,OLS,Adj. R-squared:,0.621
Method:,Least Squares,F-statistic:,127.4
Date:,"Mon, 16 Jan 2023",Prob (F-statistic):,0.0
Time:,11:55:31,Log-Likelihood:,-380010.0
No. Observations:,26841,AIC:,760700.0
Df Residuals:,26492,BIC:,763600.0
Df Model:,348,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.504e+06,2.32e+05,10.816,0.000,2.05e+06,2.96e+06
age,-1659.2708,92.147,-18.007,0.000,-1839.884,-1478.658
MONTH SOLD,-1340.1451,607.872,-2.205,0.027,-2531.607,-148.684
LAND SQUARE FEET,80.4802,1.877,42.876,0.000,76.801,84.159
TOTAL UNITS,36.7661,171.208,0.215,0.830,-298.812,372.344
BOROUGH,-5.817e+05,7.87e+04,-7.391,0.000,-7.36e+05,-4.27e+05
BUILDING CLASS CATEGORY_01 ONE FAMILY DWELLINGS,-4.271e+05,1.59e+05,-2.689,0.007,-7.38e+05,-1.16e+05
BUILDING CLASS CATEGORY_02 TWO FAMILY DWELLINGS,-3.188e+05,1.44e+05,-2.212,0.027,-6.01e+05,-3.63e+04
BUILDING CLASS CATEGORY_03 THREE FAMILY DWELLINGS,-2.489e+04,2.04e+05,-0.122,0.903,-4.24e+05,3.74e+05

0,1,2,3
Omnibus:,6449.972,Durbin-Watson:,1.776
Prob(Omnibus):,0.0,Jarque-Bera (JB):,150228.727
Skew:,0.605,Prob(JB):,0.0
Kurtosis:,14.527,Cond. No.,6.25e+19


Random Forest Model

In [12]:
rf = RandomForestRegressor()

In [13]:
np.mean(cross_val_score(rf,X_train,y_train, scoring = 'neg_mean_absolute_error', cv=3))

-215668.0679085122

Grid search tuning

In [40]:
parameters = {'criterion':('mse', 'mae'), 'max_features':('auto', 'sqrt')}

In [41]:
gs = GridSearchCV(rf,parameters,scoring='neg_mean_absolute_error',cv=3, n_jobs=-1)

In [42]:
gs.fit(X_train,y_train)

GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ('mse', 'mae'),
                         'max_features': ('auto', 'sqrt')},
             scoring='neg_mean_absolute_error')

In [43]:
gs.best_score_

-212397.92211896225

In [44]:
gs.best_estimator_

RandomForestRegressor(criterion='mae', max_features='sqrt')

Test model

In [45]:
tpred_rf = gs.best_estimator_.predict(X_test)

In [47]:
mean_absolute_error(y_test, tpred_rf)

213391.6244924567