In [28]:
import shap
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from catboost import Pool
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
import warnings
warnings.filterwarnings('ignore')

In [29]:
train_test_dummy=pd.read_csv('train_test_dummy.csv')
train_test=pd.read_csv('train_test.csv')

In [30]:
train_test.iloc[3999:4005]

Unnamed: 0.1,Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SqFtPerRoom,Total_Home_Quality,Total_Bathrooms,HighQualSF,renovated
3999,1459,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,...,0,6,2008,WD,Normal,139.555556,56,2.5,1256,3930
4000,0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,6,2010,WD,Normal,128.0,56,1.0,896,3922
4001,1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,12500,6,2010,WD,Normal,147.666667,66,1.5,1329,3916
4002,2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,3,2010,WD,Normal,162.9,55,2.5,1629,3995
4003,3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,6,2010,WD,Normal,145.818182,66,2.5,1604,3996
4004,4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,0,1,2010,WD,Normal,160.0,85,2.0,1280,3984


In [31]:
# Train-Test separation

X_train = train_test_dummy[0:4000]
X_test = train_test_dummy[4000:]

# Creation of the RMSE metric:
    
def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cv_rmse(model):
    rmse = np.sqrt(-cross_val_score(model, X_train, target_log, scoring="neg_mean_squared_error", cv=kf))
    return (rmse)

In [32]:
target_log=pd.read_csv('target_log.csv')

In [33]:
target_log.head()

Unnamed: 0.1,Unnamed: 0,SalePrice
0,0,12.278398
1,1,11.561725
2,2,12.055256
3,3,12.404928
4,4,12.154258


In [34]:
target_log=target_log['SalePrice']

In [35]:
#checking for nan values in training set
nan=pd.DataFrame(X_train.isna().sum(),columns=['Nan_sum'])
nan['feat']=nan.index
nan=nan[nan['Nan_sum']>0]
nan['Percentage']=(nan['Nan_sum']/1460)*100
nan['Perc']=(nan['Nan_sum']/4000)*100
nan=nan.sort_values(by=['Nan_sum'])
nan.insert(0,'Serial No.',range(1,len(nan)+1))
nan

Unnamed: 0,Serial No.,Nan_sum,feat,Percentage,Perc


In [36]:
#checking for nan values in test set
nan=pd.DataFrame(X_test.isna().sum(),columns=['Nan_sum'])
nan['feat']=nan.index
nan=nan[nan['Nan_sum']>0]
nan['Percentage']=(nan['Nan_sum']/1460)*100
nan['Perc']=(nan['Nan_sum']/2919)*100
nan=nan.sort_values(by=['Nan_sum'])
nan.insert(0,'Serial No.',range(1,len(nan)+1))
nan

Unnamed: 0,Serial No.,Nan_sum,feat,Percentage,Perc


In [37]:
# 10 Fold Cross validation

kf = KFold(n_splits=10, random_state=42, shuffle=True) # any integer

# K-Folds cross-validator: Provides train/test indices to split data in train/test sets. 
# please read the material: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

baseline_models = ['Linear_Reg.','Bayesian_Ridge_Reg.','LGBM_Reg.','SVR',
                   'Dec_Tree_Reg.','Random_Forest_Reg.',
                   'Grad_Boost_Reg.','Cat_Boost_Reg.','XGB_Reg.', 'ridge','lasso']

In [38]:
# Linear Regression

lreg = LinearRegression()
score_lreg = cv_rmse(lreg)
print(score_lreg.mean())
print(score_lreg.std())

397.234802346573
487.286060841141


In [39]:
# every time you build a model, please make prediction on test dataset and make a submission.
submission = pd.read_csv("sample_submission.csv")
lreg_model=lreg.fit(X_train,target_log)
submission.iloc[:,1] = np.floor(np.expm1(lreg_model.predict(X_test)))
print(submission.head())
submission.to_csv("sub_lreg.csv", index=False)  #0.11396

     Id  SalePrice
0  1461   121527.0
1  1462   167530.0
2  1463   183443.0
3  1464   201251.0
4  1465   193112.0


In [40]:
# Bayesian Ridge Regression

brr = BayesianRidge(compute_score=True)
score_brr = cv_rmse(brr)
#https://scikit-learn.org/stable/auto_examples/linear_model/plot_bayesian_ridge.html
print(score_brr.mean())
print(score_brr.std())

0.1178617846171008
0.01880477771254799


In [41]:
submission = pd.read_csv("sample_submission.csv")
brr_model=brr.fit(X_train,target_log)
submission.iloc[:,1] = np.floor(np.expm1(brr_model.predict(X_test)))
print(submission.head())
submission.to_csv("sub_brr.csv", index=False) 

     Id  SalePrice
0  1461   118189.0
1  1462   145098.0
2  1463   180003.0
3  1464   198524.0
4  1465   193294.0


In [42]:
# try ridge and lasso regression, use grid search to find the best alpha value for the two regression
# print the cv mean and std
# make submission on test dataset, see the submission socre.

In [43]:
# Light Gradient Boost Regressor
#https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html

l_gbm = LGBMRegressor(objective='regression')
score_l_gbm = cv_rmse(l_gbm)
print(score_l_gbm.mean())
print(score_l_gbm.std())

nan
nan


In [44]:
# bayesian optimization
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_squared_error
from math import sqrt
import lightgbm as lgb

ModuleNotFoundError: No module named 'bayes_opt'

In [None]:
# you can change values for learning rate (float), bagging_freq (any integer), bagging_seed (any integer) and feature_fraction_seed (any integer)
# you can control init_round (any integer) and opt_round (any integer)
def bayes_parameter_opt_lgb(X, y, init_round=3, opt_round=17, n_folds=10, random_seed=42, n_estimators=5000, learning_rate=0.01,bagging_freq=5,bagging_seed=7,feature_fraction_seed=7):
    # prepare data
    train_data = lgb.Dataset(data=X, label=y)
    # parameters
    def lgb_eval(num_leaves, max_depth, bagging_fraction, feature_fraction,min_data_in_leaf):
        params = {'objective':'regression','boosting_type': 'gbdt', 'verbose': -1,\
                  'num_boost_round': n_estimators, 'learning_rate':learning_rate,
                  'bagging_freq':bagging_freq,'bagging_seed':bagging_seed, 'feature_fraction_seed':feature_fraction_seed}
        params["num_leaves"] = int(round(num_leaves))
        params['max_depth']=int(round(max_depth))
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params["min_data_in_leaf"] = int(round(min_data_in_leaf))
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=False, verbose_eval=500, metrics=['rmse'],early_stopping_rounds=50)
        return -1.0 * np.min(cv_result['rmse-mean'])
    # range 
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (1, 200),  # you can try different value range 
                                            'max_depth':(1,100),  # you can try different value range 
                                            'min_data_in_leaf':(1,100),  # you can try different value range 
                                            'bagging_fraction' : (0.1,0.9), # you can try different value range 
                                            'feature_fraction':(0.1,0.9)}) # you can try different value range 
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

opt_params = bayes_parameter_opt_lgb(X_train, target_log, init_round=3, opt_round=17, n_folds=10, random_seed=42, n_estimators=5000, learning_rate=0.01,
                                    bagging_freq=5,bagging_seed=7,feature_fraction_seed=7)

In [None]:
l_gbm2 = LGBMRegressor(objective='regression',boosting_type='gbdt', verbose= -1,\
                      num_boost_round=5000, learning_rate=0.01,\
                      bagging_freq=5,bagging_seed=7, feature_fraction_seed=7,\
                     num_leaves=48,max_depth=65,bagging_fraction=0.9,feature_fraction=0.1,\
                     min_data_in_leaf=12)
score_2_gbm = cv_rmse(l_gbm2)
print(score_2_gbm.mean())
print(score_2_gbm.std())

In [None]:
# plot feature importance for lgb


In [None]:
# compare score2 with score1, choose the model with lower mean for submission.
# score 1 mean:0.09810103875246606
# score 2 mean: 0.08162113750825215
# score 2 is better, choose second model

In [None]:
submission = pd.read_csv("sample_submission.csv")
l_gbm_model=l_gbm2.fit(X_train,target_log)
submission.iloc[:,1] = np.floor(np.expm1(l_gbm_model.predict(X_test)))
print(submission.head())
submission.to_csv("l_gbm_brr.csv", index=False) 

In [None]:
# Support Vector Regression
# https://towardsdatascience.com/an-introduction-to-support-vector-regression-svr-a3ebc1672c2?gi=bad5aefbad71
svr = SVR()
score_svr = cv_rmse(svr)
print(score_svr.mean())
print(score_svr.std())
#https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html

In [None]:
# grid search
from sklearn.model_selection import GridSearchCV
grid_para_svm = [{'C': [1, 10, 100, 1000], 'kernel': ['poly'], 'degree': [2, 3, 4]}, \
                 {'C': [1, 10, 100, 1000], 'kernel': ['linear']}, \
                 {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}]

In [None]:
grid_search_svm = GridSearchCV(svr, grid_para_svm, scoring = 'neg_mean_squared_error', cv=10)
grid_search_svm.fit(X_train, target_log)

In [None]:
grid_search_svm.best_params_

In [None]:
svr2 = SVR(kernel='poly',C=1000, degree=4)
# based on the result of grid search
score2_svr = cv_rmse(svr2)
print(score2_svr.mean())
print(score2_svr.std())

In [None]:
# compare score2 with score1, choose the model with lower mean for submission.
# score 1:0.26649
# score 2: 0.1686

In [None]:
submission = pd.read_csv("sample_submission.csv")
svr_model=svr2.fit(X_train,target_log)
submission.iloc[:,1] = np.floor(np.expm1(svr_model.predict(X_test)))
print(submission.head())
submission.to_csv("sub_svr.csv", index=False) 

In [None]:
# Decision Tree Regressor
dtr = DecisionTreeRegressor()
score_dtr = cv_rmse(dtr)
print(score_dtr.mean())
print(score_dtr.std())

In [None]:
# please use grid search to decide the tuning parameters for decision tree regressor
# plot the feature importance
# choose the best model for submission

In [None]:
# Random Forest Regressor

rfr = RandomForestRegressor()
score_rfr = cv_rmse(rfr)
print(score_rfr.mean())
print(score_rfr.std())

In [None]:
# please use grid search to decide the tuning parameters for  Random Forest Regressor
# plot the feature importance
# choose the best model for submission

In [None]:
# Gradient Boost Regressor

gbr = GradientBoostingRegressor()
score_gbr = cv_rmse(gbr)
print(score_gbr.mean())
print(score_gbr.std())

In [None]:
# please use grid search to decide the tuning parameters for  Gradient Boost Regressor
# plot the feature importance
# choose the best model for submission

In [None]:
# Cat Boost Regressor

catb = CatBoostRegressor()
score_catb = cv_rmse(catb)
#https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
print(score_catb.mean())
print(score_catb.std())

In [None]:
submission = pd.read_csv("sample_submission.csv")
catb_model=catb.fit(X_train,target_log)
submission.iloc[:,1] = np.floor(np.expm1(catb_model.predict(X_test)))
print(submission.head())
submission.to_csv("sub_catb.csv", index=False) 

In [None]:
# XGB Regressor

xgb = XGBRegressor()
score_xgb = cv_rmse(xgb)
print(score_xgb.mean())
print(score_xgb.std())

In [None]:
# please use bayesian optimization to tune hyperparameters of xgb
#  plot the feature importance
# choose the best model for submission

In [None]:
# arrange models' order based score from lowest to highest
# arrange models' order based submission score from lowest to highest