In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
SEED_SET = 123
np.random.seed(SEED_SET)


In [3]:
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')

In [4]:
train_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [5]:
def change_yearbuilt(data):
    return data.apply(lambda x: x//10 - train_data.YearBuilt.min()//10)

In [6]:
def change_yearsold(data):
    return data.apply(lambda x: x - train_data.YrSold.min())

In [7]:
train_data['YearBuilt_con'] = change_yearbuilt(train_data.YearBuilt)
test_data['YearBuilt_con'] = change_yearbuilt(test_data.YearBuilt)

In [8]:
train_data['YearSold_con'] = change_yearbuilt(train_data.YrSold)
test_data['YearSold_con'] = change_yearbuilt(test_data.YrSold)

In [9]:
con_vars = ['YearBuilt_con', 'YearSold_con',
            'FullBath', 'GarageArea', 'LotArea', 'GrLivArea', 'Fireplaces', 
            'HalfBath', 'MoSold', 'OverallCond', 'OverallQual', 'KitchenAbvGr', 'TotRmsAbvGrd']

cat_vars = ['LotShape', 'BedroomAbvGr', 'FireplaceQu', 'HouseStyle', 
            'GarageType', 'Foundation', 'HeatingQC', 'KitchenQual', 
            'Neighborhood', 'SaleCondition', 'BldgType', 'CentralAir', 'Functional', 'PavedDrive']

to_remove = ['Id', 'MSSubClass', 'BsmtCond', 'YearBuilt',
             'Alley', 'PoolQC', 'Fence', 'MiscFeature', '1stFlrSF', '2ndFlrSF', '3SsnPorch', 
             'Street', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'RoofStyle', 'RoofMatl', 'Heating', 
             'BsmtHalfBath', 
             'GarageQual', 'GarageCond', 'GarageCars', 'GarageFinish', 'GarageYrBlt', 
             'LotFrontage', 'LowQualFinSF',
             'Condition1', 'Condition2', 'OpenPorchSF',
             'BsmtExposure','BsmtFinSF1','BsmtFinSF2','BsmtFinType1','BsmtFinType2','BsmtFullBath','BsmtQual','BsmtUnfSF', 
             'Electrical', 'EnclosedPorch', 
             'ExterCond','ExterQual','Exterior1st','Exterior2nd', 
             'MSZoning', 'MasVnrArea','MasVnrType','MiscVal', 'PoolArea', 
             'SaleType', 'ScreenPorch','TotalBsmtSF','WoodDeckSF','YearRemodAdd','YrSold']

target = ['SalePrice']

In [10]:
set(train_data.columns) - set(con_vars) - set(cat_vars) - set(to_remove) - set(target)

set()

# Feature Engineering

In [11]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [12]:
PCA_COMPONENTS = 0.9

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'constant', missing_values = np.nan, fill_value=-1)),
    ('scaler', StandardScaler()),
    # ('pca', PCA(n_components = PCA_COMPONENTS))
])

In [13]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'most_frequent', missing_values = np.nan)),
    ('onehot', OneHotEncoder(handle_unknown = 'error', drop = 'first'))
])

In [14]:
# Column transformer
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, con_vars),
        ('cat', categorical_transformer, cat_vars)
    ]
)

In [15]:
train_data = train_data.drop(to_remove, axis = 1)
test_data = test_data.drop(to_remove, axis = 1)

In [16]:
# Finish FE

# Shuffle train_data
train_data = shuffle(train_data, random_state=SEED_SET)

# Divide train_data into train_data and valid_data
X_train, X_valid, y_train, y_valid = train_test_split(train_data[con_vars + cat_vars], train_data[target], 
                                                      test_size=0.25, random_state=SEED_SET)

# Fit preprocessor
selector = preprocessor.fit(X_train)

# Transforming training and validation datasets
X_train = selector.transform(X_train)
X_valid = selector.transform(X_valid)

# Preprocessing testing datasets
X_test = test_data[con_vars + cat_vars]
X_test = selector.transform(X_test)


# Modelling

In [17]:
import optuna
from optuna.samplers import TPESampler
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

In [18]:
def create_model(trial):
    
    # HP_to_O
    # parameter that shrinks the contribution of each tree by its value
    learning_rate = trial.suggest_uniform('learning_rate', 0, 0.15)
    # Maximum tree leaves for base learners
    num_leaves = trial.suggest_int('num_leaves', 100, 300)
    # Maximum number of levels in tree
    max_depth = trial.suggest_int('max_depth', 3, 8)
    # Number of trees in random forest
    n_estimators = trial.suggest_int('n_estimators', 100, 300)
    # the fraction of samples to be used for fitting the individual base learners
    subsample = trial.suggest_uniform('subsample', 0.5, 0.9) 
    # Subsample ratio of columns when constructing each tree
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.5, 0.9)
    # Regularization parameters
    reg_lambda = trial.suggest_uniform('reg_lambda', 0, 0.9)
    reg_alpha = trial.suggest_uniform('reg_alpha', 0, 20)
    
    # Define the model with:
    reg_ = LGBMRegressor(boosting_type='gbdt', # Gradient Boosting Decision Tree
                         random_state = SEED_SET, # seed 
                         silent = False, 
                         learning_rate = learning_rate, 
                         num_leaves=num_leaves,
                         max_depth=max_depth,
                         n_estimators=n_estimators,
                         subsample=subsample,
                         colsample_bytree=colsample_bytree,
                         reg_lambda=reg_lambda,
                         reg_alpha=reg_alpha
                        )
    
    return reg_


In [27]:
def objective(trial):
    
    model = create_model(trial)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_valid)
    valid_rmse = mean_squared_error(y_true = y_valid, y_pred = y_pred, squared = False)
    
    return valid_rmse

In [43]:
sampler = TPESampler(seed=SEED_SET)
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=30)

[32m[I 2020-10-31 14:21:42,044][0m A new study created in memory with name: no-name-2f24d654-03df-4e14-b960-53b8bfcd8746[0m
  y = column_or_1d(y, warn=True)
[32m[I 2020-10-31 14:21:44,286][0m Trial 0 finished with value: 29946.55347469908 and parameters: {'learning_rate': 0.10447037783967925, 'num_leaves': 226, 'max_depth': 5, 'n_estimators': 198, 'subsample': 0.7205259076331565, 'colsample_bytree': 0.7877875879142253, 'reg_lambda': 0.38079581411201485, 'reg_alpha': 19.61528396769231}. Best is trial 0 with value: 29946.55347469908.[0m
  y = column_or_1d(y, warn=True)
[32m[I 2020-10-31 14:21:45,674][0m Trial 1 finished with value: 29850.366397899594 and parameters: {'learning_rate': 0.1027244607877295, 'num_leaves': 226, 'max_depth': 4, 'n_estimators': 132, 'subsample': 0.6372712064603477, 'colsample_bytree': 0.7916198829536167, 'reg_lambda': 0.394715020211662, 'reg_alpha': 1.193557932191367}. Best is trial 1 with value: 29850.366397899594.[0m
  y = column_or_1d(y, warn=True)


In [44]:
# Final parameters
params = study.best_params

In [45]:
params

{'learning_rate': 0.053539620452168056,
 'num_leaves': 194,
 'max_depth': 6,
 'n_estimators': 156,
 'subsample': 0.7965912381040505,
 'colsample_bytree': 0.5696441624644419,
 'reg_lambda': 0.6888825984159586,
 'reg_alpha': 5.802499862796052}

In [46]:
model = LGBMRegressor(**params)
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LGBMRegressor(boosting_type='gbdt', class_weight=None,
              colsample_bytree=0.5696441624644419, importance_type='split',
              learning_rate=0.053539620452168056, max_depth=6,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=156, n_jobs=-1, num_leaves=194, objective=None,
              random_state=None, reg_alpha=5.802499862796052,
              reg_lambda=0.6888825984159586, silent=True,
              subsample=0.7965912381040505, subsample_for_bin=200000,
              subsample_freq=0)

In [47]:
y_pred = model.predict(X_valid)

# Validation of models

In [48]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [49]:
mean_squared_error(y_true = y_valid, y_pred = y_pred, squared = False)

27896.039072841388

# Finished Model

In [51]:
test_pred = model.predict(X_test)

In [52]:
test_pred

array([135360.76496107, 159983.62069084, 180040.0621394 , ...,
       151490.21554895, 127927.56244332, 213674.60570128])

In [53]:
submission = pd.read_csv('sample_submission.csv')

In [54]:
submission.SalePrice = test_pred

In [55]:
submission[['Id', 'SalePrice']].to_csv('submission.csv', index = False)