# Imports

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


: 

# Load Dataset

In [None]:
# Read the training data and drop the "Id" column in one chained expression.
train = pd.read_csv("train.csv").drop(columns = ["Id"])

: 

# Log Transform

In [None]:
# Log transform the target (log1p) as described by the competition guidelines.
# NOTE: SalePrice is overwritten with the transformed values, so running this
# cell a second time would apply log1p to already-transformed prices.
y = np.log1p(train['SalePrice'])
train['SalePrice'] = y

: 

# Fill Missing Values

In [None]:
# Partition the columns by dtype: object columns are treated as categorical,
# every other column except the SalePrice target is treated as numerical.
categorical_features = train.select_dtypes(include = ["object"]).columns
numerical_features = train.select_dtypes(exclude = ["object"]).columns.drop("SalePrice")

train_categorical_features = train[categorical_features]
train_numerical_features = train[numerical_features]

# Count the missing cells in each group before any imputation.
missing_numerical = train_numerical_features.isnull().values.sum()
missing_categorical = train_categorical_features.isnull().values.sum()

print('Number of missing numerical values: ' + str(missing_numerical))
print('Number of missing categorical values: ' + str(missing_categorical))

: 

In [None]:
# Numerical features: replace each missing value with its column's median.
column_medians = train_numerical_features.median()
train_numerical_features = train_numerical_features.fillna(column_medians)

# Categorical features: one-hot encode. With get_dummies' default
# dummy_na=False a missing category gets no indicator column of its own,
# so the encoded frame contains no NaN.
train_categorical_features = pd.get_dummies(train_categorical_features)

# Re-count missing cells to confirm both groups are now complete.
missing_numerical = train_numerical_features.isnull().values.sum()
missing_categorical = train_categorical_features.isnull().values.sum()

print('Number of missing numerical values: ' + str(missing_numerical))
print('Number of missing categorical values: ' + str(missing_categorical))

: 

# Train-Test Split

In [None]:
# Join the imputed numerical and one-hot encoded categorical features into a
# single design matrix, then hold out 30% of the rows as the test set.
train = pd.concat(
    [train_numerical_features, train_categorical_features],
    axis = 1,
)
X_train, X_test, y_train, y_test = train_test_split(
    train, y, test_size = 0.3, random_state = 42
)

: 

# (1) Linear Regression - Baseline

In [None]:
# Baseline: ordinary least squares with default settings.
lr = LinearRegression().fit(X_train, y_train)

y_train_preds = lr.predict(X_train)
y_test_preds = lr.predict(X_test)

# Score both splits once; RMSE is derived from the stored MSE.
mse_train = mean_squared_error(y_train, y_train_preds)
mse_test = mean_squared_error(y_test, y_test_preds)
mae_train = mean_absolute_error(y_train, y_train_preds)
mae_test = mean_absolute_error(y_test, y_test_preds)
r2_train = r2_score(y_train, y_train_preds)
r2_test = r2_score(y_test, y_test_preds)

print('MSE (train):', mse_train)
print('MSE (test):', mse_test)
print('RMSE (train):', np.sqrt(mse_train))
print('RMSE (test):', np.sqrt(mse_test))
print('MAE (train):', mae_train)
print('MAE (test):', mae_test)
print('R2 Score (train):', r2_train)
print('R2 Score (test):', r2_test)

: 

# (1) Linear Regression - Improved

In [None]:
# Candidate settings for LinearRegression. copy_X only controls whether the
# input may be overwritten in place, so candidates that differ only in copy_X
# are expected to tie on score.
params = {"fit_intercept": [True, False],
          "copy_X": [True, False],
          "positive": [True, False]
         }

improved_lr = LinearRegression()

# The grid holds fewer combinations (2 * 2 * 2 = 8) than RandomizedSearchCV's
# default n_iter=10, which makes scikit-learn warn and fall back to the grid
# size. Size n_iter to the grid explicitly so every combination is tried once.
n_candidates = 1
for candidate_values in params.values():
    n_candidates *= len(candidate_values)

# random_state fixes the order in which candidates are evaluated, so ties are
# broken the same way on every run and best_params_ is reproducible.
grid_lr = RandomizedSearchCV(estimator=improved_lr,
                             param_distributions=params,
                             n_iter=n_candidates,
                             cv = 2,
                             random_state=42,
                             n_jobs=-1)

grid_lr.fit(X_train, y_train)

: 

In [None]:
# Display the parameter combination that achieved the best cross-validated score.
grid_lr.best_params_


: 

In [None]:
# Refit LinearRegression with the selected hyperparameters and evaluate it on
# the SAME train/test split that the baseline and the search used. Drawing a
# new split here would move rows that were seen during tuning into the test
# set and would make these metrics incomparable with the baseline's.
# The values are hard-coded (presumably copied from grid_lr.best_params_ —
# re-check after re-running the search); they equal LinearRegression's defaults.
best_lr = LinearRegression(positive=False, fit_intercept=True, copy_X=True)
best_lr.fit(X_train, y_train)

y_train_preds = best_lr.predict(X_train)
y_test_preds = best_lr.predict(X_test)

# Score both splits once; RMSE is derived from the stored MSE.
mse_train = mean_squared_error(y_train, y_train_preds)
mse_test = mean_squared_error(y_test, y_test_preds)
mae_train = mean_absolute_error(y_train, y_train_preds)
mae_test = mean_absolute_error(y_test, y_test_preds)
r2_train = r2_score(y_train, y_train_preds)
r2_test = r2_score(y_test, y_test_preds)

print('MSE (train):', mse_train)
print('MSE (test):', mse_test)
print('RMSE (train):', np.sqrt(mse_train))
print('RMSE (test):', np.sqrt(mse_test))
print('MAE (train):', mae_train)
print('MAE (test):', mae_test)
print('R2 Score (train):', r2_train)
print('R2 Score (test):', r2_test)


: 

# (2) Random Forest - Baseline

In [None]:
# Baseline: random forest with default hyperparameters and a fixed seed.
rf = RandomForestRegressor(random_state=155).fit(X_train, y_train)

y_train_preds = rf.predict(X_train)
y_test_preds = rf.predict(X_test)

# Score both splits once; RMSE is derived from the stored MSE.
mse_train = mean_squared_error(y_train, y_train_preds)
mse_test = mean_squared_error(y_test, y_test_preds)
mae_train = mean_absolute_error(y_train, y_train_preds)
mae_test = mean_absolute_error(y_test, y_test_preds)
r2_train = r2_score(y_train, y_train_preds)
r2_test = r2_score(y_test, y_test_preds)

print('MSE (train):', mse_train)
print('MSE (test):', mse_test)
print('RMSE (train):', np.sqrt(mse_train))
print('RMSE (test):', np.sqrt(mse_test))
print('MAE (train):', mae_train)
print('MAE (test):', mae_test)
print('R2 Score (train):', r2_train)
print('R2 Score (test):', r2_test)


: 

# (2) Random Forest - Improved

In [None]:
# Search space for the random forest. For RandomForestRegressor,
# max_features=1.0 and max_features=None both mean "consider every feature".
rf_grid = {"n_estimators": np.arange(10, 2000, 10),
           "max_depth": [None, 2, 4, 6, 8, 10],
           "min_samples_split": np.arange(2, 10, 2),
           "min_samples_leaf": np.arange(1, 10, 2),
           "max_features": [1.0, "sqrt", "log2", None],
           "max_leaf_nodes": [None, 2, 4, 6, 8, 10]
           }

# Seed the estimator as well as the search: the search's random_state only
# fixes which candidates are sampled, while an unseeded forest would still
# produce different CV scores (and possibly a different winner) on every run.
randomized_search_rf_model = RandomizedSearchCV(RandomForestRegressor(random_state=42),
                              param_distributions=rf_grid,
                              n_jobs=-1,
                              random_state=42,
                              n_iter=50,
                              scoring='r2',
                              cv=5,
                              verbose=True)

randomized_search_rf_model.fit(X_train, y_train)

: 

In [None]:
# Display the sampled parameter combination with the best cross-validated R2.
randomized_search_rf_model.best_params_

: 

In [None]:
# Refit the random forest with the tuned hyperparameters. The values are
# hard-coded (presumably copied from randomized_search_rf_model.best_params_ —
# re-check after re-running the search).
best_rf_model = RandomForestRegressor(
                           random_state=30,
                           n_estimators=150,
                           min_samples_split=4,
                           min_samples_leaf=7,
                           max_leaf_nodes=None,
                           max_features=1.0,
                           max_depth=8
                             )

# Evaluate on the SAME train/test split that the baseline and the search used.
# Re-splitting here (with a different seed and test size) would place rows
# seen during tuning into the test set and would make these metrics
# incomparable with the baseline's.
best_rf_model.fit(X_train, y_train)

y_train_preds = best_rf_model.predict(X_train)
y_test_preds = best_rf_model.predict(X_test)

# Score both splits once; RMSE is derived from the stored MSE.
mse_train = mean_squared_error(y_train, y_train_preds)
mse_test = mean_squared_error(y_test, y_test_preds)
mae_train = mean_absolute_error(y_train, y_train_preds)
mae_test = mean_absolute_error(y_test, y_test_preds)
r2_train = r2_score(y_train, y_train_preds)
r2_test = r2_score(y_test, y_test_preds)

print('MSE (train):', mse_train)
print('MSE (test):', mse_test)
print('RMSE (train):', np.sqrt(mse_train))
print('RMSE (test):', np.sqrt(mse_test))
print('MAE (train):', mae_train)
print('MAE (test):', mae_test)
print('R2 Score (train):', r2_train)
print('R2 Score (test):', r2_test)

: 

# (3) XGBoost - Baseline

In [None]:
# Baseline: XGBoost regressor with default hyperparameters.
xgb_baseline = xgb.XGBRegressor()
xgb_baseline.fit(X_train, y_train)

y_train_preds = xgb_baseline.predict(X_train)
y_test_preds = xgb_baseline.predict(X_test)

# Score both splits once; RMSE is derived from the stored MSE.
mse_train = mean_squared_error(y_train, y_train_preds)
mse_test = mean_squared_error(y_test, y_test_preds)
mae_train = mean_absolute_error(y_train, y_train_preds)
mae_test = mean_absolute_error(y_test, y_test_preds)
r2_train = r2_score(y_train, y_train_preds)
r2_test = r2_score(y_test, y_test_preds)

print('MSE (train):', mse_train)
print('MSE (test):', mse_test)
print('RMSE (train):', np.sqrt(mse_train))
print('RMSE (test):', np.sqrt(mse_test))
print('MAE (train):', mae_train)
print('MAE (test):', mae_test)
print('R2 Score (train):', r2_train)
print('R2 Score (test):', r2_test)

: 

# (3) XGBoost - Improved

In [None]:
# Hyperparameter tuning
# https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663
#
# The scikit-learn wrapper (XGBRegressor) consumes X_train / y_train directly,
# so no xgb.DMatrix needs to be built for this search.

params = { 'max_depth': [3,6,10],
           'learning_rate': [0.01, 0.05, 0.1],
           'n_estimators': [100, 500, 1000],
           'colsample_bytree': [0.3, 0.7]
         }

xgbr = xgb.XGBRegressor()

# n_iter is left at its default, so only a random subset of the combinations
# is tried; random_state fixes which subset, making best_params_ reproducible.
clf = RandomizedSearchCV(estimator=xgbr,
                         param_distributions=params,
                         n_jobs=1,
                         scoring='r2',
                         cv=5,
                         random_state=42,
                         verbose=1)

clf.fit(X_train, y_train)

: 

In [None]:
# Display the sampled parameter combination with the best cross-validated R2.
clf.best_params_

: 

In [None]:
# Refit XGBoost with fixed, hand-entered hyperparameters, predict on both
# splits, and print the regression metrics.
# https://scikit-learn.org/stable/modules/model_evaluation.html#
tuned_xgb_params = dict(colsample_bytree=0.3,
                        learning_rate=0.01,
                        max_depth=10,
                        n_estimators=1000)
best_xgbr_model = xgb.XGBRegressor(**tuned_xgb_params)
best_xgbr_model.fit(X_train, y_train)

y_train_preds = best_xgbr_model.predict(X_train)
y_test_preds = best_xgbr_model.predict(X_test)

# Score both splits once; RMSE is derived from the stored MSE.
mse_train = mean_squared_error(y_train, y_train_preds)
mse_test = mean_squared_error(y_test, y_test_preds)
mae_train = mean_absolute_error(y_train, y_train_preds)
mae_test = mean_absolute_error(y_test, y_test_preds)
r2_train = r2_score(y_train, y_train_preds)
r2_test = r2_score(y_test, y_test_preds)

print('MSE (train):', mse_train)
print('MSE (test):', mse_test)
print('RMSE (train):', np.sqrt(mse_train))
print('RMSE (test):', np.sqrt(mse_test))
print('MAE (train):', mae_train)
print('MAE (test):', mae_test)
print('R2 Score (train):', r2_train)
print('R2 Score (test):', r2_test)

: 

# (4) LightGBM - Baseline

In [None]:
# Baseline: LightGBM regressor with default hyperparameters.
lgbModel = lgb.LGBMRegressor()
lgbModel.fit(X_train, y_train)

y_train_prediction = lgbModel.predict(X_train)
y_test_prediction = lgbModel.predict(X_test)

# Score both splits once; RMSE is derived from the stored MSE.
mse_train = mean_squared_error(y_train, y_train_prediction)
mse_test = mean_squared_error(y_test, y_test_prediction)
mae_train = mean_absolute_error(y_train, y_train_prediction)
mae_test = mean_absolute_error(y_test, y_test_prediction)
r2_train = r2_score(y_train, y_train_prediction)
r2_test = r2_score(y_test, y_test_prediction)

print('MSE (train):', mse_train)
print('MSE (test):', mse_test)
print('RMSE (train):', np.sqrt(mse_train))
print('RMSE (test):', np.sqrt(mse_test))
print('MAE (train):', mae_train)
print('MAE (test):', mae_test)
print('R2 Score (train):', r2_train)
print('R2 Score (test):', r2_test)


: 

# (4) LightGBM - Improved

In [None]:

# Randomized hyperparameter search for LightGBM, scored by 5-fold CV R2.
lgbModel = lgb.LGBMRegressor()

params = { 
            'learning_rate': [0.025, 0.05, 0.1, 0.2],
            'num_leaves': [10, 20, 30, 40],
            'n_estimators': [50, 100, 150]
         }

# n_iter is left at its default, so only a random subset of the combinations
# is tried; random_state fixes which subset, making best_params_ reproducible.
randomSearchCVLGB = RandomizedSearchCV(estimator=lgbModel,
                         param_distributions=params,
                         n_jobs=1,
                         scoring='r2',
                         cv=5,
                         random_state=42,
                         verbose=1)

randomSearchCVLGB.fit(X_train, y_train)
# Last expression of the cell: display the winning combination.
randomSearchCVLGB.best_params_

: 

In [None]:
# Refit LightGBM with fixed, hand-entered hyperparameters and report metrics.
lgbBestModel = lgb.LGBMRegressor(
    learning_rate=0.05,
    n_estimators=100,
    num_leaves=40,
)
lgbBestModel.fit(X_train, y_train)

y_train_prediction = lgbBestModel.predict(X_train)
y_test_prediction = lgbBestModel.predict(X_test)

# Score both splits once; RMSE is derived from the stored MSE.
mse_train = mean_squared_error(y_train, y_train_prediction)
mse_test = mean_squared_error(y_test, y_test_prediction)
mae_train = mean_absolute_error(y_train, y_train_prediction)
mae_test = mean_absolute_error(y_test, y_test_prediction)
r2_train = r2_score(y_train, y_train_prediction)
r2_test = r2_score(y_test, y_test_prediction)

print('MSE (train):', mse_train)
print('MSE (test):', mse_test)
print('RMSE (train):', np.sqrt(mse_train))
print('RMSE (test):', np.sqrt(mse_test))
print('MAE (train):', mae_train)
print('MAE (test):', mae_test)
print('R2 Score (train):', r2_train)
print('R2 Score (test):', r2_test)

: 

# (5) Lasso Regression - Baseline


In [None]:
# Baseline: LassoCV with default settings.
lasso_regression = LassoCV().fit(X_train, y_train)

y_train_preds = lasso_regression.predict(X_train)
y_test_preds = lasso_regression.predict(X_test)

# Score both splits once; RMSE is derived from the stored MSE.
mse_train = mean_squared_error(y_train, y_train_preds)
mse_test = mean_squared_error(y_test, y_test_preds)
mae_train = mean_absolute_error(y_train, y_train_preds)
mae_test = mean_absolute_error(y_test, y_test_preds)
r2_train = r2_score(y_train, y_train_preds)
r2_test = r2_score(y_test, y_test_preds)

print('MSE (train):', mse_train)
print('MSE (test):', mse_test)
print('RMSE (train):', np.sqrt(mse_train))
print('RMSE (test):', np.sqrt(mse_test))
print('MAE (train):', mae_train)
print('MAE (test):', mae_test)
print('R2 Score (train):', r2_train)
print('R2 Score (test):', r2_test)

: 

# (5) Lasso Regression - Improved



In [None]:
# Candidate settings for LassoCV. copy_X only controls whether the input may
# be overwritten in place, so candidates that differ only in copy_X are
# expected to tie on score.
params = {"fit_intercept": [True, False],
          "copy_X": [True, False],
          "positive": [True, False]
         }

improved_lasso_r = LassoCV()

# The grid holds fewer combinations (2 * 2 * 2 = 8) than RandomizedSearchCV's
# default n_iter=10, which makes scikit-learn warn and fall back to the grid
# size. Size n_iter to the grid explicitly so every combination is tried once.
n_candidates = 1
for candidate_values in params.values():
    n_candidates *= len(candidate_values)

# random_state fixes the order in which candidates are evaluated, so ties are
# broken the same way on every run and best_params_ is reproducible.
grid_lasso_r = RandomizedSearchCV(estimator=improved_lasso_r,
                             param_distributions=params,
                             n_iter=n_candidates,
                             cv = 2,
                             random_state=42,
                             n_jobs=-1)

grid_lasso_r.fit(X_train, y_train)

: 

In [None]:
# Display the parameter combination that achieved the best cross-validated score.
grid_lasso_r.best_params_

: 

In [None]:
# Refit LassoCV with the selected hyperparameters and evaluate it on the SAME
# train/test split that the baseline and the search used. Drawing a new split
# here would move rows that were seen during tuning into the test set and
# would make these metrics incomparable with the baseline's.
# The values are hard-coded (presumably copied from grid_lasso_r.best_params_ —
# re-check after re-running the search).
best_lasso_r = LassoCV(positive=True, fit_intercept=True, copy_X=True)
best_lasso_r.fit(X_train, y_train)

y_train_preds = best_lasso_r.predict(X_train)
y_test_preds = best_lasso_r.predict(X_test)

# Score both splits once; RMSE is derived from the stored MSE.
mse_train = mean_squared_error(y_train, y_train_preds)
mse_test = mean_squared_error(y_test, y_test_preds)
mae_train = mean_absolute_error(y_train, y_train_preds)
mae_test = mean_absolute_error(y_test, y_test_preds)
r2_train = r2_score(y_train, y_train_preds)
r2_test = r2_score(y_test, y_test_preds)

print('MSE (train):', mse_train)
print('MSE (test):', mse_test)
print('RMSE (train):', np.sqrt(mse_train))
print('RMSE (test):', np.sqrt(mse_test))
print('MAE (train):', mae_train)
print('MAE (test):', mae_test)
print('R2 Score (train):', r2_train)
print('R2 Score (test):', r2_test)

: 

# (6) Neural Network - Baseline

In [None]:
# Silence warnings for the cells that follow.
# This uses the standard warnings filter instead of overwriting warnings.warn
# with a no-op: the filter leaves the stdlib function intact for any code that
# relies on it, and it can be undone later with warnings.resetwarnings().
import warnings

warnings.simplefilter("ignore")

: 

In [None]:
# Baseline: multi-layer perceptron with default hyperparameters and a fixed seed.
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html
nn = MLPRegressor(random_state=42).fit(X_train, y_train)

y_train_preds = nn.predict(X_train)
y_test_preds = nn.predict(X_test)

# Score both splits once; RMSE is derived from the stored MSE.
mse_train = mean_squared_error(y_train, y_train_preds)
mse_test = mean_squared_error(y_test, y_test_preds)
mae_train = mean_absolute_error(y_train, y_train_preds)
mae_test = mean_absolute_error(y_test, y_test_preds)
r2_train = r2_score(y_train, y_train_preds)
r2_test = r2_score(y_test, y_test_preds)

print('MSE (train):', mse_train)
print('MSE (test):', mse_test)
print('RMSE (train):', np.sqrt(mse_train))
print('RMSE (test):', np.sqrt(mse_test))
print('MAE (train):', mae_train)
print('MAE (test):', mae_test)
print('R2 Score (train):', r2_train)
print('R2 Score (test):', r2_test)

: 

# (6) Neural Network - Improved

In [None]:
# Find the best hyperparameters for neural network
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

nn_grid = {"hidden_layer_sizes": [(10,), (20,), (30,), (40,), (50,), (60,), (70,), (80,), (90,), (100,)],
           "activation": ['identity', 'logistic', 'tanh', 'relu'],
           "solver": ['lbfgs','sgd', 'adam'],
           "alpha": [0.0001, 0.0005, 0.001],
           "learning_rate_init": [0.001, 0.005, 0.01],
           }

# Seed the estimator as well as the search: the search's random_state only
# fixes which candidates are sampled, while an unseeded MLP would start from
# different random weights on every run and could crown a different winner.
randomized_search_nn_model = RandomizedSearchCV(MLPRegressor(random_state=42),
                              param_distributions=nn_grid,
                              n_jobs=-1,
                              random_state=42,
                              n_iter=50,
                              cv=5)

randomized_search_nn_model.fit(X_train, y_train)
# Last expression of the cell: display the winning combination.
randomized_search_nn_model.best_params_

: 

In [None]:
# Refit the neural network with fixed, hand-entered hyperparameters and a
# fixed seed, then report metrics on both splits.
tuned_nn_params = dict(solver='lbfgs',
                       learning_rate_init=0.001,
                       hidden_layer_sizes=(80, ),
                       alpha=0.0001,
                       activation='logistic',
                       random_state=42)
best_nn_model = MLPRegressor(**tuned_nn_params)
best_nn_model.fit(X_train, y_train)

y_train_preds = best_nn_model.predict(X_train)
y_test_preds = best_nn_model.predict(X_test)

# Score both splits once; RMSE is derived from the stored MSE.
mse_train = mean_squared_error(y_train, y_train_preds)
mse_test = mean_squared_error(y_test, y_test_preds)
mae_train = mean_absolute_error(y_train, y_train_preds)
mae_test = mean_absolute_error(y_test, y_test_preds)
r2_train = r2_score(y_train, y_train_preds)
r2_test = r2_score(y_test, y_test_preds)

print('MSE (train):', mse_train)
print('MSE (test):', mse_test)
print('RMSE (train):', np.sqrt(mse_train))
print('RMSE (test):', np.sqrt(mse_test))
print('MAE (train):', mae_train)
print('MAE (test):', mae_test)
print('R2 Score (train):', r2_train)
print('R2 Score (test):', r2_test)

: 

: 