### Feature Engineering :

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

train = pd.read_csv("../rawdata/train.csv")
test = pd.read_csv("../rawdata/test.csv")

x = train.drop(["SalePrice", "Id"], axis=1).copy()
y = train["SalePrice"].copy()

x_train_full, x_val_full, y_train, y_val = train_test_split(x, y, train_size=0.8)

numerical_cols = [c for c in x_train_full.columns if x_train_full[c].dtype in ['int64', 'float64']]
categorical_cols = [c for c in x_train_full.columns if x_train_full[c].dtype=='object']
low_cardinality_cols = [c for c in x_val_full.columns if x_val_full[c].nunique()<10 and x_val_full[c].dtype=="object"]

In [2]:
x_train_full.isnull().sum().sort_values(ascending=False)[x_train_full.isnull().sum()>0]/x_train_full.shape[0]*100

PoolQC          99.657534
MiscFeature     96.061644
Alley           93.664384
Fence           80.907534
FireplaceQu     48.287671
LotFrontage     17.893836
GarageCond       5.736301
GarageType       5.736301
GarageYrBlt      5.736301
GarageFinish     5.736301
GarageQual       5.736301
BsmtExposure     2.654110
BsmtFinType2     2.654110
BsmtFinType1     2.568493
BsmtCond         2.568493
BsmtQual         2.568493
MasVnrArea       0.599315
MasVnrType       0.599315
Electrical       0.085616
dtype: float64

#### Baseline "RandomForest" Model (MVI/One Hot Encoding/GridSearchCV) :

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

x_train = x_train_full[numerical_cols + low_cardinality_cols].copy()
x_val = x_val_full[numerical_cols + low_cardinality_cols].copy()

num_imputer = SimpleImputer() #index is included below as it was getting reset in the new dataframe
data1_train = pd.DataFrame(num_imputer.fit_transform(x_train[numerical_cols]), index=x_train.index, columns=numerical_cols)
data1_val = pd.DataFrame(num_imputer.transform(x_val[numerical_cols]), index=x_val.index, columns=numerical_cols)

char_imputer = SimpleImputer(strategy='most_frequent') #index is included below as it was getting reset in the new dataframe
data2_train = pd.DataFrame(char_imputer.fit_transform(x_train[low_cardinality_cols]), index=x_train.index, columns=low_cardinality_cols)
data2_val = pd.DataFrame(char_imputer.transform(x_val[low_cardinality_cols]), index=x_val.index, columns=low_cardinality_cols)

oh_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
data2_train = pd.DataFrame(oh_encoder.fit_transform(data2_train), index=data2_train.index)
data2_val = pd.DataFrame(oh_encoder.transform(data2_val), index=data2_val.index)

x_train = pd.concat([data1_train, data2_train], axis=1)
x_val = pd.concat([data1_val, data2_val], axis=1)

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

rfmodel = RandomForestRegressor()
rfmodel.fit(x_train, y_train)
predicted = rfmodel.predict(x_val)
print("MAE for Random Forest(20%validation data) :",mean_absolute_error(y_val, predicted))

x_full = pd.concat([x_train, x_val])
y_full = pd.concat([y_train, y_val])
scores = -1*cross_val_score(RandomForestRegressor(), x_full, y_full, cv=5, scoring='neg_mean_absolute_error')
print("MAE for Random Forest(5foldCV) :", scores.mean())

MAE for Random Forest(20%validation data) : 18149.980787671237
MAE for Random Forest(5foldCV) : 17710.306232876716


In [10]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators' : [50, 75, 100, 125, 150, 200],
    'criterion' : ['mae'],
    'max_depth' : [4, 5, 6, 7, 8],
    'max_features' : ['auto', 'sqrt', 'log2']
}

rfreg = RandomForestRegressor()

cv_rfreg = GridSearchCV(estimator=rfreg, param_grid=param_grid, cv=5)
cv_rfreg.fit(x_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [11]:
cv_rfreg.best_params_

{'criterion': 'mae',
 'max_depth': 8,
 'max_features': 'auto',
 'n_estimators': 50}

In [23]:
test_p1 = pd.DataFrame(num_imputer.transform(test[numerical_cols]), index=test.index, columns=numerical_cols)
test_p2 = pd.DataFrame(char_imputer.transform(test[low_cardinality_cols]), index=test.index, columns=low_cardinality_cols)
test_p2 = pd.DataFrame(oh_encoder.transform(test_p2), index=test_p2.index)
test_v2 = pd.concat([test_p1, test_p2], axis=1)

test_pred = cv_rfreg.best_estimator_.predict(test_v2)
submission = pd.DataFrame({"Id" : test.Id,
                          "SalePrice" : test_pred})
submission.to_csv("../rawdata/submission_v1n2.csv", index=False)
#Kaggle Score : 0.15692

#### Baseline "XGBoost" Model (MVI/One Hot Encoding/GridSearchCV) :

In [27]:
from xgboost import XGBRegressor

xgbmodel = XGBRegressor(objective='reg:squarederror')
xgbmodel.fit(x_train, y_train)
predicted = xgbmodel.predict(x_val)
print("MAE for xgb(20%validation data) :",mean_absolute_error(y_val, predicted))

x_full = pd.concat([x_train, x_val])
y_full = pd.concat([y_train, y_val])
scores = -1*cross_val_score(XGBRegressor(objective='reg:squarederror'), x_full, y_full, cv=5, scoring='neg_mean_absolute_error')
print("MAE for xgb(5foldCV) :", scores.mean())

MAE for xgb(20%validation data) : 17424.220756635274
MAE for xgb(5foldCV) : 16353.975989940067


In [28]:
param_grid = {
    'n_estimators' : [50, 75, 100, 150],
    'max_depth' : [3, 4, 5, 6],
    'learning_rate' : [.05, .1],
    'objective' : ['reg:squarederror'],
    'n_jobs' : [3]    
}

xgbmodel = XGBRegressor()

cv_xgb = GridSearchCV(estimator=xgbmodel, param_grid=param_grid, cv=5)
cv_xgb.fit(x_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.05, 0.1],
                         'max_depth': [3, 4, 5, 6],
                         

In [29]:
cv_xgb.best_params_

{'learning_rate': 0.05,
 'max_depth': 6,
 'n_estimators': 150,
 'n_jobs': 3,
 'objective': 'reg:squarederror'}

In [30]:
predicted = cv_xgb.best_estimator_.predict(x_val)
print("MAE for xgb(gridsearchcv) :",mean_absolute_error(y_val, predicted))

MAE for xgb(gridsearchcv) : 17199.011584974316


In [31]:
test_p1 = pd.DataFrame(num_imputer.transform(test[numerical_cols]), index=test.index, columns=numerical_cols)
test_p2 = pd.DataFrame(char_imputer.transform(test[low_cardinality_cols]), index=test.index, columns=low_cardinality_cols)
test_p2 = pd.DataFrame(oh_encoder.transform(test_p2), index=test_p2.index)
test_v2 = pd.concat([test_p1, test_p2], axis=1)

test_pred = cv_xgb.best_estimator_.predict(test_v2)
submission = pd.DataFrame({"Id" : test.Id,
                          "SalePrice" : test_pred})
submission.to_csv("../rawdata/submission_v1n2(2).csv", index=False)
#Kaggle Rank : 2618/5099 (Score : 0.13876)

#### Baseline "LightGBM" model (MVI/No Encoding/GridSearchCV):

In [7]:
x_train = x_train_full[numerical_cols + categorical_cols].copy()
x_val = x_val_full[numerical_cols + categorical_cols].copy()

num_imputer = SimpleImputer() #index is included below as it was getting reset in the new dataframe
data1_train = pd.DataFrame(num_imputer.fit_transform(x_train[numerical_cols]), index=x_train.index, columns=numerical_cols)
data1_val = pd.DataFrame(num_imputer.transform(x_val[numerical_cols]), index=x_val.index, columns=numerical_cols)

char_imputer = SimpleImputer(strategy='most_frequent') #index is included below as it was getting reset in the new dataframe
data2_train = pd.DataFrame(char_imputer.fit_transform(x_train[categorical_cols]), index=x_train.index, columns=categorical_cols)
data2_val = pd.DataFrame(char_imputer.transform(x_val[categorical_cols]), index=x_val.index, columns=categorical_cols)

x_train = pd.concat([data1_train, data2_train], axis=1)
x_val = pd.concat([data1_val, data2_val], axis=1)

In [16]:
import lightgbm as lgb

for c in categorical_cols:
    x_train[c]=x_train[c].astype('category')
    x_val[c]=x_val[c].astype('category')

lgb_train = lgb.Dataset(x_train, label=y_train)
lgb_val = lgb.Dataset(x_val, label=y_val)

lparam = {'objective':'mape', 'min_data_in_leaf':30}

lgbmodel = lgb.train(params=lparam, train_set=lgb_train, num_boost_round=1000, valid_sets=lgb_val, early_stopping_rounds=10)

[1]	valid_0's mape: 0.258664
Training until validation scores don't improve for 10 rounds
[2]	valid_0's mape: 0.23897
[3]	valid_0's mape: 0.221476
[4]	valid_0's mape: 0.206401
[5]	valid_0's mape: 0.19533
[6]	valid_0's mape: 0.184278
[7]	valid_0's mape: 0.175184
[8]	valid_0's mape: 0.167298
[9]	valid_0's mape: 0.158971
[10]	valid_0's mape: 0.15144
[11]	valid_0's mape: 0.145381
[12]	valid_0's mape: 0.139351
[13]	valid_0's mape: 0.134357
[14]	valid_0's mape: 0.130369
[15]	valid_0's mape: 0.126256
[16]	valid_0's mape: 0.122518
[17]	valid_0's mape: 0.119654
[18]	valid_0's mape: 0.117447
[19]	valid_0's mape: 0.115606
[20]	valid_0's mape: 0.11349
[21]	valid_0's mape: 0.111842
[22]	valid_0's mape: 0.110432
[23]	valid_0's mape: 0.109263
[24]	valid_0's mape: 0.108022
[25]	valid_0's mape: 0.107098
[26]	valid_0's mape: 0.105714
[27]	valid_0's mape: 0.105039
[28]	valid_0's mape: 0.104249
[29]	valid_0's mape: 0.103229
[30]	valid_0's mape: 0.102157
[31]	valid_0's mape: 0.10176
[32]	valid_0's mape: 0.

In [18]:
test_p1 = pd.DataFrame(num_imputer.transform(test[numerical_cols]), index=test.index, columns=numerical_cols)
test_p2 = pd.DataFrame(char_imputer.transform(test[categorical_cols]), index=test.index, columns=categorical_cols)
test_v3 = pd.concat([test_p1, test_p2], axis=1)

for c in categorical_cols:
    test_v3[c]=test_v3[c].astype('category')

test_pred = lgbmodel.predict(test_v3[numerical_cols + categorical_cols])
submission = pd.DataFrame({"Id" : test.Id,
                          "SalePrice" : test_pred})
submission.to_csv("../rawdata/submission_v1n2(3).csv", index=False)
#Kaggle Score : 0.14571

#### Count Encoding / Target Encoding / Catboost Encoding in "LightGBM" :

In [6]:
import category_encoders as ce

x_train = x_train_full[numerical_cols + categorical_cols].copy()
x_val = x_val_full[numerical_cols + categorical_cols].copy()

num_imputer = SimpleImputer() #index is included below as it was getting reset in the new dataframe
data1_train = pd.DataFrame(num_imputer.fit_transform(x_train[numerical_cols]), index=x_train.index, columns=numerical_cols)
data1_val = pd.DataFrame(num_imputer.transform(x_val[numerical_cols]), index=x_val.index, columns=numerical_cols)

char_imputer = SimpleImputer(strategy='most_frequent') #index is included below as it was getting reset in the new dataframe
data2_train = pd.DataFrame(char_imputer.fit_transform(x_train[categorical_cols]), index=x_train.index, columns=categorical_cols)
data2_val = pd.DataFrame(char_imputer.transform(x_val[categorical_cols]), index=x_val.index, columns=categorical_cols)

x_train_temp = pd.concat([data1_train, data2_train], axis=1)
x_val_temp = pd.concat([data1_val, data2_val], axis=1)

count_enc = ce.CountEncoder()
train_ce = count_enc.fit_transform(data2_train).add_suffix('_count')
val_ce = count_enc.transform(data2_val).add_suffix('_count')
x_train1 = data1_train.join(train_ce)
x_val1 = data1_val.join(val_ce)

target_enc = ce.TargetEncoder()
train_te = target_enc.fit_transform(data2_train, y_train).add_suffix('_target')
val_te = target_enc.transform(data2_val).add_suffix('_target')
x_train2 = data1_train.join(train_te)
x_val2 = data1_val.join(val_te)

targecb_enc = ce.CatBoostEncoder()
train_cbe = targecb_enc.fit_transform(data2_train, y_train).add_suffix('_targetcb')
val_cbe = targecb_enc.transform(data2_val).add_suffix('_targetcb')
x_train3 = data1_train.join(train_cbe)
x_val3 = data1_val.join(val_cbe)

  X.loc[:, self.cols] = X.fillna(value=pd.np.nan)


In [20]:
lgb_train1 = lgb.Dataset(x_train1, label=y_train)
lgb_train2 = lgb.Dataset(x_train2, label=y_train)
lgb_train3 = lgb.Dataset(x_train3, label=y_train)
lgb_val1 = lgb.Dataset(x_val1, label=y_val)
lgb_val2 = lgb.Dataset(x_val2, label=y_val)
lgb_val3 = lgb.Dataset(x_val3, label=y_val)

lparam = {'objective':'mape', 'min_data_in_leaf':30}

lgbmodel1 = lgb.train(params=lparam, train_set=lgb_train1, num_boost_round=1000, valid_sets=lgb_val1,
                     early_stopping_rounds=10, verbose_eval=False)
lgbmodel2 = lgb.train(params=lparam, train_set=lgb_train2, num_boost_round=1000, valid_sets=lgb_val2,
                     early_stopping_rounds=10, verbose_eval=False)
lgbmodel3 = lgb.train(params=lparam, train_set=lgb_train3, num_boost_round=1000, valid_sets=lgb_val3,
                     early_stopping_rounds=10, verbose_eval=False)

test_p1 = pd.DataFrame(num_imputer.transform(test[numerical_cols]), index=test.index, columns=numerical_cols)
test_p2 = pd.DataFrame(char_imputer.transform(test[categorical_cols]), index=test.index, columns=categorical_cols)
test_temp = pd.concat([test_p1, test_p2], axis=1)
test_ce = count_enc.transform(test_p2).add_suffix('_count')
test_te = target_enc.transform(test_p2).add_suffix('_target')
test_cbe = targecb_enc.transform(test_p2).add_suffix('_targetcb')
test_v4 = test_p1.join(test_ce)
test_v5 = test_p1.join(test_te)
test_v6 = test_p1.join(test_cbe)

test_pred4 = lgbmodel1.predict(test_v4)
test_pred5 = lgbmodel2.predict(test_v5)
test_pred6 = lgbmodel3.predict(test_v6)

submission1 = pd.DataFrame({"Id" : test.Id, "SalePrice" : test_pred4})
submission2 = pd.DataFrame({"Id" : test.Id, "SalePrice" : test_pred5})
submission3 = pd.DataFrame({"Id" : test.Id, "SalePrice" : test_pred6})
submission1.to_csv("../rawdata/submission_v1n2(4).csv", index=False)
#Kaggle Score : 0.14389
submission2.to_csv("../rawdata/submission_v1n2(5).csv", index=False)
#Kaggle Score : 0.14579
submission3.to_csv("../rawdata/submission_v1n2(6).csv", index=False)
#Kaggle Score : 0.14623

  X.loc[:, self.cols] = X.fillna(value=pd.np.nan)
