In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error



In [154]:
df=pd.read_csv("C:/Users/Cemil Turhan/Desktop/MachineLearning_FinalProject/Numerical_train.csv", index_col=0)

In [155]:
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandSlope', 'OverallQual', 'OverallCond',
       ...
       'MiscFeature_Shed', 'MiscFeature_TenC', 'SaleType_CWD', 'SaleType_Con',
       'SaleType_ConLD', 'SaleType_ConLI', 'SaleType_ConLw', 'SaleType_New',
       'SaleType_Oth', 'SaleType_WD'],
      dtype='object', length=190)

In [156]:
corr_list = sorted(df.corr().to_dict()['SalePrice'].items(), key=lambda x: x[1], reverse=True)
corr_list

[('SalePrice', 1.0),
 ('OverallQual', 0.7910686627656242),
 ('GrLivArea', 0.708617613678389),
 ('ExterQual', 0.6826773104249984),
 ('KitchenQual', 0.6598139270753497),
 ('GarageCars', 0.6404729026424087),
 ('GarageArea', 0.6234228990077125),
 ('TotalBsmtSF', 0.6139049674835603),
 ('1stFlrSF', 0.6059678567747742),
 ('BsmtQual', 0.5853487025846937),
 ('FullBath', 0.5608806241605117),
 ('GarageFinish', 0.5498090492718033),
 ('TotRmsAbvGrd', 0.5337788652255514),
 ('YearBuilt', 0.5232730605856432),
 ('FireplaceQu', 0.5205072958450804),
 ('YearRemodAdd', 0.5074302189394639),
 ('Foundation_PConc', 0.49808300465143923),
 ('GarageType', 0.48992659823626533),
 ('MasVnrArea', 0.47260577576654095),
 ('Fireplaces', 0.4669680455516374),
 ('HeatingQC', 0.42763871622441324),
 ('Neighborhood_NridgHt', 0.40213287534154485),
 ('BsmtFinSF1', 0.3864362820990178),
 ('BsmtExposure', 0.37467626563117373),
 ('SaleType_New', 0.3574890978618135),
 ('MasVnrType_Stone', 0.33045385618379947),
 ('Neighborhood_NoRidg

In [157]:
df = df.drop(labels = ['Condition1_RRAn', 'RoofMatl_Tar&Grv','Condition1_RRNe','BldgType_TwnhsE','Condition2_RRAe','Foundation_Wood', 'RoofMatl_Metal', 'RoofStyle_Mansard', 'BsmtFinType2', 'LotConfig_FR2'] , axis=1).astype("float64")

In [158]:
target=df.SalePrice

In [159]:
logged_target=np.log(target)

In [160]:
df = df.drop(labels = 'SalePrice', axis=1).astype("float64")

In [161]:
type(target)

pandas.core.series.Series

In [162]:
X_train, X_test, y_train, y_test = train_test_split(df, logged_target)

In [163]:
linear=LinearRegression()

In [164]:
linear.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [165]:
y_pred=linear.predict(X_test)

In [166]:
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_pred)

In [167]:
print(test_mse, test_rmse, test_mae)

0.020106376120225294 0.141796953846778 0.09894366976856446


In [168]:
cross_v = cross_val_score(linear, X_train, y_train, cv=5, 
 scoring="neg_mean_squared_error")

In [169]:
cross_v

array([-0.02138479, -0.10690081, -0.04879636, -0.01788371, -0.02913326])

In [170]:
linear.score(X_train, y_train)

0.9273341894176079

In [172]:
linear.score(X_test, y_test)

0.8578371289219523

In [97]:
lasso=Lasso()

In [98]:
alpha=[0.00001, 0.0001, 0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1]


In [99]:
param_grid = dict(alpha=alpha)

In [100]:
lasso_cv = GridSearchCV(estimator = lasso, param_grid = param_grid, cv=5)

In [101]:
lasso_cv.fit(X_train, y_train)

  positive)
  positive)
  positive)
  positive)


GridSearchCV(cv=5, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 0.25, 0.5,
                                   0.75, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [102]:
lasso_cv.best_params_

{'alpha': 0.001}

In [103]:
lasso_best=Lasso(alpha=0.001)

In [104]:
print(cross_val_score(lasso_best, X_train, y_train, cv=5))

[0.90565221 0.90984829 0.90279822 0.91762847 0.82784557]


In [105]:
lasso_best.fit(X_train, y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [107]:
lasso_best.score(X_test, y_test)

0.7685471319056038

In [108]:
ridge=Ridge()

In [116]:
alpha=[0.05, 0.1, 1, 5, 10, 20, 50, 100, 150, 200, 250]

In [117]:
param_grid = dict(alpha=alpha)

In [118]:
ridge_cv = GridSearchCV(estimator = ridge, param_grid = param_grid, cv=5)

In [119]:
ridge_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0.05, 0.1, 1, 5, 10, 20, 50, 100, 150, 200,
                                   250]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [120]:
ridge_cv.best_params_

{'alpha': 10}

In [121]:
ridge_best=Ridge(alpha=10)

In [122]:
print(cross_val_score(ridge_best, X_train, y_train, cv=5))

[0.90541508 0.90619252 0.8988905  0.92125688 0.82885194]


In [123]:
ridge_best.fit(X_train, y_train)

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

In [124]:
ridge_best.score(X_test, y_test)

0.7854776682503812

In [125]:
rf=RandomForestRegressor()

In [126]:
n_estimators = [100, 300, 500, 750, 1000, 1500]
max_depth = [2, 5, 10, 30, 50]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 5, 10]

In [127]:
param_grid = dict(n_estimators = n_estimators,
                  max_depth = max_depth,
                  min_samples_split = min_samples_split,
                  min_samples_leaf = min_samples_leaf)

In [128]:
rf_cv = GridSearchCV(estimator = rf, param_grid = param_grid, cv=5)

In [129]:
rf_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [130]:
best_params= rf_cv.best_params_

In [131]:
best_params

{'max_depth': 30,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [133]:

rf_best = RandomForestRegressor(**best_params)

In [134]:
print(cross_val_score(rf_best, X_train, y_train, cv=5))

[0.87253915 0.88464844 0.86471996 0.86586554 0.85999522]


In [135]:
rf_best.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=30, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [136]:
rf_best.score(X_test, y_test)

0.8644029519687522

 ## Now, try models on the subset with only top 4 variables accounted for most of the accuracy in the model (obtained before)

In [144]:
feature_importances = sorted(zip(X_train.columns, rf_best.feature_importances_), reverse=True, key=lambda x: x[1])

In [141]:
feature_df = pd.DataFrame(feature_importances, columns=['feature', 'importance'])

In [142]:
feature_df

Unnamed: 0,feature,importance
0,OverallQual,0.550778
1,GrLivArea,0.106880
2,TotalBsmtSF,0.042295
3,GarageCars,0.032372
4,GarageArea,0.027959
...,...,...
174,RoofMatl_Membran,0.000000
175,Exterior1st_AsphShn,0.000000
176,Exterior1st_ImStucc,0.000000
177,MiscFeature_Othr,0.000000


In [173]:
subset=df[["OverallQual", "GrLivArea", "TotalBsmtSF", "GarageCars"]]

In [174]:
 X_train, X_test, y_train, y_test = train_test_split(subset, logged_target)

### Linear

In [146]:
linear.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [147]:
y_pred=linear.predict(X_test)

In [148]:
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_pred)

In [149]:
print(test_mse, test_rmse, test_mae)

0.025576190407701194 0.15992557771570248 0.12083370443836111


In [150]:
cross_v = cross_val_score(linear, X_train, y_train, cv=5, 
 scoring="neg_mean_squared_error")

In [151]:
cross_v

array([-0.03527024, -0.05559637, -0.03197567, -0.02148517, -0.04258881])

In [152]:
linear.score(X_train, y_train)

0.7840073913001724

In [153]:
linear.score(X_test, y_test)

0.8227846901508706

### Lasso

In [175]:
alpha=[0.00001, 0.0001, 0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1]

In [176]:
param_grid = dict(alpha=alpha)

In [177]:
lasso_cv = GridSearchCV(estimator = lasso, param_grid = param_grid, cv=5)

In [178]:
lasso_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 0.25, 0.5,
                                   0.75, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [180]:
best_param=lasso_cv.best_params_

In [181]:
best_param

{'alpha': 1e-05}

In [182]:
lasso_best=Lasso(**best_param)

In [183]:
print(cross_val_score(lasso_best, X_train, y_train, cv=5))

[0.84599    0.58825006 0.82552892 0.81682916 0.8049107 ]


In [184]:
lasso_best.fit(X_train, y_train)

Lasso(alpha=1e-05, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [185]:
lasso_best.score(X_test, y_test)

0.7834689190324025

### Ridge

In [186]:
alpha=[0.05, 0.1, 1, 5, 10, 20, 50, 100, 150, 200, 250]

In [187]:
param_grid = dict(alpha=alpha)

In [188]:
ridge_cv = GridSearchCV(estimator = ridge, param_grid = param_grid, cv=5)

In [189]:
ridge_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0.05, 0.1, 1, 5, 10, 20, 50, 100, 150, 200,
                                   250]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [190]:
best_param=ridge_cv.best_params_

In [191]:
best_param

{'alpha': 0.05}

In [192]:
ridge_best=Ridge(**best_param)

In [193]:
print(cross_val_score(ridge_best, X_train, y_train, cv=5))

[0.84598911 0.58825851 0.82552876 0.81682838 0.80491151]


In [194]:
ridge_best.fit(X_train, y_train)

Ridge(alpha=0.05, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [195]:
ridge_best.score(X_test, y_test)

0.7834677689573312