In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Single seed reused for NumPy's global RNG and for every `random_state` argument below,
# so that shuffling, splitting, the search and the model are reproducible.
SEED_SET = 123
np.random.seed(SEED_SET)


In [3]:
# Load the raw test and training tables from the working directory.
test_data, train_data = (
    pd.read_csv(csv_name) for csv_name in ('test.csv', 'train.csv')
)

In [4]:
# Display the column names available in the training table.
train_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [5]:
def change_yearbuilt(data, reference_min=None):
    """Convert construction years into a decade offset relative to a reference year.

    Each year is floor-divided by 10 and the decade of the reference year is
    subtracted, so the reference decade maps to 0, the next decade to 1, etc.

    Parameters
    ----------
    data : pandas.Series
        Years to convert.
    reference_min : scalar, optional
        Year whose decade becomes 0. When omitted, ``train_data.YearBuilt.min()``
        is read from the notebook's global ``train_data`` (the original behaviour),
        so the same reference is used for both training and test data.

    Returns
    -------
    pandas.Series
        ``data // 10 - reference_min // 10`` with the index of ``data``.
    """
    if reference_min is None:
        # Look the reference up once per call; the previous lambda recomputed
        # the minimum of the whole training column for every single element.
        reference_min = train_data.YearBuilt.min()
    # Vectorized floor division replaces the element-wise ``apply``.
    return data // 10 - reference_min // 10

In [6]:
# Add the decade-offset feature to both tables; change_yearbuilt anchors both
# on the earliest decade found in the training data.
train_data['YearBuilt_con'] = change_yearbuilt(train_data['YearBuilt'])
test_data['YearBuilt_con'] = change_yearbuilt(test_data['YearBuilt'])

In [7]:
# Columns fed to the numeric pipeline (imputation + scaling).
con_vars = ['YearBuilt_con', 'FullBath', 'GarageArea', 'LotArea', 'GrLivArea', 'Fireplaces', 'HalfBath', 'MoSold', 'OverallCond', 'OverallQual']

# Columns fed to the categorical pipeline (imputation + one-hot encoding).
cat_vars = ['LotShape', 'BedroomAbvGr', 'FireplaceQu', 'HouseStyle', 'GarageType', 'Foundation', 'HeatingQC', 'KitchenQual', 'Neighborhood', 'SaleCondition']

# Columns dropped from both train and test tables before modelling.
to_remove = ['Id', 'MSSubClass', 'BldgType', 'BsmtCond', 'YearBuilt',
             'Alley', 'PoolQC', 'Fence', 'MiscFeature', '1stFlrSF', '2ndFlrSF', '3SsnPorch', 
             'Street', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'RoofStyle', 'RoofMatl', 'Heating', 
             'CentralAir', 'BsmtHalfBath', 'KitchenAbvGr', 'Functional', 
             'GarageQual', 'GarageCond', 'GarageCars', 'GarageFinish', 'GarageYrBlt', 
             'LotFrontage', 'LowQualFinSF',
             'PavedDrive', 'Condition1', 'Condition2', 'OpenPorchSF',
            'BsmtExposure','BsmtFinSF1','BsmtFinSF2','BsmtFinType1','BsmtFinType2','BsmtFullBath','BsmtQual','BsmtUnfSF', 
             'Electrical', 'EnclosedPorch', 
             'ExterCond','ExterQual','Exterior1st','Exterior2nd', 
             'MSZoning', 'MasVnrArea','MasVnrType','MiscVal', 'PoolArea', 
             'SaleType', 'ScreenPorch','TotRmsAbvGrd','TotalBsmtSF','WoodDeckSF','YearRemodAdd','YrSold']

# Prediction target (kept as a list, so selecting with it yields a one-column DataFrame).
target = ['SalePrice']

In [8]:
# Sanity check: columns of train_data not assigned to any group above.
# An empty set means every column is accounted for.
set(train_data.columns) - set(con_vars) - set(cat_vars) - set(to_remove) - set(target)

set()

# Feature Engineering

In [9]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [10]:
# Only referenced by the disabled PCA step below.
PCA_COMPONENTS = 0.9

# Numeric pipeline: replace missing values with the sentinel -1, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1)),
    ('scaler', StandardScaler()),
    # Disabled step (PCA is not imported in this notebook):
    # ('pca', PCA(n_components = PCA_COMPONENTS))
])

In [11]:
# Categorical pipeline: fill gaps with the most frequent level, then one-hot encode.
# The first level of each feature is dropped; a level unseen during fit raises an error.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='error')),
])

In [12]:
# Route each feature group through its own pipeline; columns not listed are discarded.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, con_vars),
    ('cat', categorical_transformer, cat_vars),
])

In [13]:
# Discard the columns listed in `to_remove` from both tables.
train_data = train_data.drop(columns=to_remove)
test_data = test_data.drop(columns=to_remove)

In [14]:
# Finish FE: shuffle, split into train/validation, fit the preprocessor, transform all sets.

# Shuffle train_data (seeded, so the row order is reproducible)
train_data = shuffle(train_data, random_state=SEED_SET)

# Divide train_data into training (75%) and validation (25%) parts.
# The target is selected with the list `target`, so y_train / y_valid are one-column DataFrames.
X_train, X_valid, y_train, y_valid = train_test_split(train_data[con_vars + cat_vars], train_data[target], 
                                                      test_size=0.25, random_state=SEED_SET)

# Fit preprocessor on the training part only
selector = preprocessor.fit(X_train)

# Transforming training and validation datasets with the fitted preprocessor
X_train = selector.transform(X_train)
X_valid = selector.transform(X_valid)

# Preprocessing testing datasets: same feature columns, same fitted preprocessor
X_test = test_data[con_vars + cat_vars]
X_test = selector.transform(X_test)


# Modelling

In [15]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV

In [16]:
# One-step pipeline around the LightGBM regressor; the step name 'regressor'
# is the prefix used when addressing its hyper-parameters in the search grid.
reg_ = Pipeline([
    (
        'regressor',
        LGBMRegressor(
            boosting_type='gbdt',    # Gradient Boosting Decision Tree
            objective='regression',
            random_state=SEED_SET,   # seed
            n_jobs=-1,
            silent=False,
        ),
    ),
])

In [17]:
# List the hyper-parameter names exposed by LGBMRegressor (usable in the search grid).
LGBMRegressor().get_params().keys()

dict_keys(['boosting_type', 'class_weight', 'colsample_bytree', 'importance_type', 'learning_rate', 'max_depth', 'min_child_samples', 'min_child_weight', 'min_split_gain', 'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin', 'subsample_freq'])

In [18]:
# Search space with a single candidate value per hyper-parameter.

learning_rate = [0.15]      # shrinks the contribution of each tree by this factor
num_leaves = [750]          # maximum tree leaves for base learners
max_depth = [3]             # maximum number of levels in a tree
n_estimators = [150]        # number of boosted trees
subsample = [0.8]           # fraction of samples used to fit each base learner
colsample_bytree = [0.6]    # fraction of columns sampled when constructing each tree
reg_lambda = [0.75]         # regularization parameter
reg_alpha = [20]            # regularization parameter

# NOTE(review): with max_depth = 3 a tree can hold at most 2**3 = 8 leaves, so the
# num_leaves cap of 750 is presumably never reached — confirm this is intended.

# Map each candidate list to the corresponding parameter of the 'regressor' pipeline step.
regressor_grid_params = dict([
    ('regressor__learning_rate', learning_rate),
    ('regressor__num_leaves', num_leaves),
    ('regressor__max_depth', max_depth),
    ('regressor__n_estimators', n_estimators),
    ('regressor__subsample', subsample),
    ('regressor__colsample_bytree', colsample_bytree),
    ('regressor__reg_lambda', reg_lambda),
    ('regressor__reg_alpha', reg_alpha),
])

In [19]:
# Full search space handed to the search; currently just a shallow copy of the regressor grid.
grid_params = dict(regressor_grid_params)

In [20]:
# Randomized search over `grid_params`: 5-fold cross-validation, scored by negative
# mean absolute error, up to 50 sampled candidates, all cores, train scores recorded.
# TODO (original note): change n_folds to time-series one (var: row_id)
bst_ = RandomizedSearchCV(
    reg_,
    grid_params,
    n_iter=50,
    scoring='neg_mean_absolute_error',
    cv=5,
    return_train_score=True,
    random_state=SEED_SET,
    n_jobs=-1,
    verbose=1,
)

In [21]:
# Fitting the data, starting the training.
# y_train is a one-column DataFrame (the target was selected with a list); flatten it
# to shape (n_samples,) so scikit-learn does not emit the column-vector
# DataConversionWarning ("y = column_or_1d(y, warn=True)") during fitting.
bst_.fit(X_train, np.ravel(y_train))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   10.6s remaining:   16.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.9s finished
  y = column_or_1d(y, warn=True)


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('regressor',
                                              LGBMRegressor(boosting_type='gbdt',
                                                            class_weight=None,
                                                            colsample_bytree=1.0,
                                                            importance_type='split',
                                                            learning_rate=0.1,
                                                            max_depth=-1,
                                                            min_child_samples=20,
                                                            min_child_weight=0.001,
                                                            min_split_gain=0.0,
                                                            n_estimators=100,
                                           

In [22]:
# Best hyper-parameter combination found by the search.
# (The bare `bst_.cv_results_` expression that used to precede this line was never
# displayed, because a cell only shows its last expression; the per-candidate scores
# remain available through that attribute.)
bst_.best_params_

{'regressor__subsample': 0.8,
 'regressor__reg_lambda': 0.75,
 'regressor__reg_alpha': 20,
 'regressor__num_leaves': 750,
 'regressor__n_estimators': 150,
 'regressor__max_depth': 3,
 'regressor__learning_rate': 0.15,
 'regressor__colsample_bytree': 0.6}

In [23]:
# Predict sale prices for the held-out validation rows with the best pipeline from the search.
y_pred = bst_.best_estimator_.predict(X_valid)

In [24]:
# Display the true validation targets (one-column DataFrame indexed by original row id).
y_valid

Unnamed: 0,SalePrice
767,160000
932,320000
1239,265900
262,151000
1159,185000
...,...
590,185900
255,230000
627,153000
781,175900


# Validation of models

In [25]:
from sklearn.metrics import mean_absolute_error

In [26]:
# Mean absolute error on the validation split, in the units of SalePrice.
mean_absolute_error(y_valid, y_pred)

17763.41675942811

# Finished Model

In [27]:
# Predict sale prices for the preprocessed test set with the best pipeline from the search.
test_pred = bst_.best_estimator_.predict(X_test)

In [28]:
# Display the predicted test-set prices (NumPy array).
test_pred

array([138645.83277313, 160953.75606752, 179208.02246013, ...,
       166709.62926351, 137398.31685163, 204977.4998172 ])

In [30]:
# Load the submission template; its 'Id' and 'SalePrice' columns are used below.
submission = pd.read_csv('sample_submission.csv')

In [32]:
# Replace the template's placeholder prices with the model predictions.
# Item assignment always writes the column; attribute assignment only does so when
# the column already exists and otherwise sets a plain Python attribute.
# Assumes sample_submission.csv rows are in the same order as test.csv — TODO confirm.
submission['SalePrice'] = test_pred

In [7]:
# Write only the Id and SalePrice columns, without the DataFrame index.
submission.to_csv('submission.csv', columns=['Id', 'SalePrice'], index=False)