In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [2]:
def drop_id(ds):
    return ds.drop(columns='Id')


def drop_nans(train, threshold):
    new_train = train.copy()
    
    for feature in new_train.columns:
        nans = new_train[feature].isnull().sum()
        if (nans / train.shape[0] > threshold):
            new_train = new_train.drop(columns=feature)    
            
    return new_train


def replace_nans(train, method='ffill'):
    return train.copy().fillna(method=method)


def encode_features(train):
    new_train = train.copy()
    l_encoder = LabelEncoder()
    
    for feature in new_train.columns[:-1]:
        if (new_train[feature].dtype == 'object'):
            new_train[feature] = l_encoder.fit_transform(new_train[feature])
            
    return new_train

# Data preprocessing

In [3]:
train = pd.read_csv('data/train.csv')

train = train.drop_duplicates()
train = drop_id(train)
train = drop_nans(train, threshold=0.2)
train = replace_nans(train)
print("nan count: ", train.isnull().sum().max())

train = encode_features(train)
train

nan count:  0


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,3,65.0,8450,1,3,3,0,4,0,...,0,0,0,0,0,2,2008,8,4,208500
1,20,3,80.0,9600,1,3,3,0,2,0,...,0,0,0,0,0,5,2007,8,4,181500
2,60,3,68.0,11250,1,0,3,0,4,0,...,0,0,0,0,0,9,2008,8,4,223500
3,70,3,60.0,9550,1,0,3,0,0,0,...,272,0,0,0,0,2,2006,8,0,140000
4,60,3,84.0,14260,1,0,3,0,2,0,...,0,0,0,0,0,12,2008,8,4,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,3,62.0,7917,1,3,3,0,4,0,...,0,0,0,0,0,8,2007,8,4,175000
1456,20,3,85.0,13175,1,3,3,0,4,0,...,0,0,0,0,0,2,2010,8,4,210000
1457,70,3,66.0,9042,1,3,3,0,4,0,...,0,0,0,0,2500,5,2010,8,4,266500
1458,20,3,68.0,9717,1,3,3,0,4,0,...,112,0,0,0,0,4,2010,8,4,142125


In [4]:
y_train = np.log1p(train['SalePrice'].values)
x_train = train.drop(columns='SalePrice').values

X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1, random_state=98987)

# skilearn.RandomForest

In [9]:
params = {
    'max_depth': (100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500),
    'max_features':('auto', 'sqrt', 'log2')
}

model = RandomForestRegressor()
gs_model = GridSearchCV(model, params)
gs_model.fit(X_train[: 300], y_train[: 300])
print(f"best params: {gs_model.best_params_}")

best params: {'max_depth': 400, 'max_features': 'sqrt'}


In [10]:
model = RandomForestRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
print("mean_squared_log_error: ", np.sqrt(metrics.mean_squared_log_error(model.predict(X_test), y_test)))

mean_squared_log_error:  0.013115388831395455


# XGBoost

In [11]:
params = {
    "learning_rate": (0.05, 0.1),
    "max_depth": [5, 6],
    "min_child_weight": [1, 2],
    "gamma":[0.0, 0.1],
    "colsample_bytree":[0.3, 0.5]
}

model = XGBRegressor()
gs_model = GridSearchCV(model, params)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'colsample_bytree': 0.3, 'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 2}


In [12]:
model = XGBRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
print("mean_squared_log_error: ", np.sqrt(metrics.mean_squared_log_error(model.predict(X_test), y_test)))

mean_squared_log_error:  0.010740966445025148


# catboost

In [17]:
model = CatBoostRegressor()

params = {
        'learning_rate': [0.05, 0.1],
        'depth': [6, 10],
        'l2_leaf_reg': [1, 3],
        'logging_level': ['Silent']
}

gs_model = GridSearchCV(model, params, n_jobs=-1, cv=3)
gs_model.fit(X_train[:200], y_train[:200])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'depth': 6, 'l2_leaf_reg': 1, 'learning_rate': 0.05, 'logging_level': 'Silent'}


In [18]:
model = CatBoostRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
print("mean_squared_log_error: ", np.sqrt(metrics.mean_squared_log_error(model.predict(X_test), y_test)))

mean_squared_log_error:  0.01116565304279612


# lightgbm

In [21]:
model = LGBMRegressor()

params = {
    'num_leaves': [i for i in range(3, 20)],
    'learning_rate': [0.1, 0.05, 0.01, 0.005, 0.001],
    'max_depth': [5, 6, 7, 8],
    'n_estimators': [50, 100, 200, 300, 400, 500],}

gs_model = GridSearchCV(model, params, n_jobs=-1, cv=3)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 500, 'num_leaves': 3}


In [22]:
model = LGBMRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
print("mean_squared_log_error: ", np.sqrt(metrics.mean_squared_log_error(model.predict(X_test), y_test)))

mean_squared_log_error:  0.011043907457434415
