In [1]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

**References**
> https://www.kaggle.com/code/bextuychiev/lasso-regression-with-pipelines-tutorial

> https://www.kaggle.com/code/carlosdg/xgboost-with-scikit-learn-pipeline-gridsearchcv

In [2]:
prject_folder = "/Users/richardxu/Dropbox/UIUC_CS598_Statistical_Learning/Project1/proj1/fold6"
df_train = pd.read_csv(os.path.join(prject_folder, "train.csv"))
df_test_x = pd.read_csv(os.path.join(prject_folder, "test.csv"))
df_test_y = pd.read_csv(os.path.join(prject_folder, "test_y.csv"))

In [3]:
df_train_y = pd.DataFrame()
df_train_y['Sale_Price'] = df_train['Sale_Price'].copy()
df_train_x = df_train.drop(columns=['PID', 'Sale_Price'])
df_test_x.drop(columns=['PID'], inplace=True)
df_test_y.drop(columns=['PID'], inplace=True)

### Step 1. Missing value imputation

In [4]:
# Impute missing values in "Garage_Yr_Blt" variable with 0
df_train_x['Garage_Yr_Blt'].fillna(0, inplace=True)
df_test_x['Garage_Yr_Blt'].fillna(0, inplace=True)

### Step 2. Features to Remove

In [5]:
# The following features are either highly imbalanced or not informative
remove_features_set = ['Condition_2', 'Heating', 'Latitude', 'Longitude', 'Low_Qual_Fin_SF',
                       'Misc_Feature','Pool_Area','Pool_QC','Roof_Matl','Street','Utilities']
df_train_x.drop(columns=remove_features_set, inplace=True)
df_test_x.drop(columns=remove_features_set, inplace=True)

### Step 3. Winsorization

In [6]:
winsor_features_set = ['BsmtFin_SF_2', 'Bsmt_Unf_SF', 'Enclosed_Porch', 'First_Flr_SF',
                 'Garage_Area', 'Gr_Liv_Area', 'Lot_Area', 'Lot_Frontage','Mas_Vnr_Area',
                 'Misc_Val', 'Open_Porch_SF', 'Screen_Porch', 'Second_Flr_SF', 'Three_season_porch',
                 'Total_Bsmt_SF', 'Wood_Deck_SF']

In [7]:
for val in winsor_features_set:
    upper_limit = df_train[val].quantile(0.95)
    df_train_x[val] = df_train_x[val].apply(lambda x: upper_limit if x > upper_limit else x)

### Step 4. Categorical feature transformation using one-hot encoder

In [8]:
# Treating the two features "Mo_Sold" (1~12), and "Year_Sold" (2006~2010) as categorical
# variables can improve model performance
df_train_x['Mo_Sold'] = df_train_x['Mo_Sold'].values.astype('object')
df_test_x['Mo_Sold'] = df_test_x['Mo_Sold'].values.astype('object')
df_train_x['Year_Sold'] = df_train_x['Year_Sold'].values.astype('object')
df_test_x['Year_Sold'] = df_test_x['Year_Sold'].values.astype('object')

In [9]:
def categorical_variable_transform(train_df, test_df):
    # IMPORTANT:
    # The test_dataframe needs to use the encoder from the trainng_dataframe, because some categories might be
    # missing in the test data
    
    categorical_feature_set = [feature for feature in train_df.columns if train_df[feature].dtypes=='object']

    for feature in categorical_feature_set:
        encoder = OneHotEncoder(handle_unknown='ignore')
        train_category_matrix = [[element] for element in train_df[feature]]
        test_category_matrix = [[element] for element in test_df[feature]]

        encoder.fit(train_category_matrix)
        train_df_hot_code = pd.DataFrame(encoder.transform(train_category_matrix).toarray())
        test_df_hot_code = pd.DataFrame(encoder.transform(test_category_matrix).toarray())

        train_df_hot_code.columns = [feature + '_' + str(c) for c in train_df_hot_code.columns]
        test_df_hot_code.columns = [feature + '_' + str(c) for c in test_df_hot_code.columns]


        # Replace the original feature with one-hot encoded feature
        train_df.drop(columns=feature, inplace=True)
        train_df = pd.concat([train_df, train_df_hot_code], axis=1)
        test_df.drop(columns=feature, inplace=True)
        test_df = pd.concat([test_df, test_df_hot_code], axis=1)


    return train_df, test_df

In [10]:
df_train_x_trans, df_test_x_trans =  categorical_variable_transform(df_train_x, df_test_x)

### Build Models

In [11]:
# Log-scale sale_price
df_train_y['Sale_Price'] =  df_train_y['Sale_Price'].apply(lambda y: np.log(y))
df_test_y['Sale_Price'] =  df_test_y['Sale_Price'].apply(lambda y: np.log(y))

### Lasso Linear Regression

In [12]:
lasso = Lasso(fit_intercept=True, random_state=0, max_iter=10000)
lasso_cv_pipeline = Pipeline(steps=[("scalar",StandardScaler()), ("lasso", lasso)])
alphas = np.logspace(-5, 0.1, 20)
param_grid = {"lasso__alpha":alphas}
n_folds = 10
lasso_clf = GridSearchCV(lasso_cv_pipeline, param_grid, cv=n_folds, refit=False, scoring='neg_mean_squared_error')

In [13]:
lasso_clf.fit(df_train_x_trans , df_train_y)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [14]:
best_alpha = lasso_clf.best_params_['lasso__alpha']
print(best_alpha)

0.0026048905108264305


In [15]:
best_lasso = Lasso(fit_intercept=True, random_state=0, alpha=best_alpha, max_iter=10000)
best_lasso_pipeline = Pipeline(steps=[("scalar",StandardScaler()), ("lasso", best_lasso)])

best_lasso_pipeline.fit(df_train_x_trans , df_train_y)

In [16]:
lasso_train_predict = best_lasso_pipeline.predict(df_train_x_trans)
lasso_test_predict = best_lasso_pipeline.predict(df_test_x_trans)

In [17]:
lasso_rmse_train = np.sqrt(np.mean(lasso_train_predict - np.squeeze(df_train_y.values))**2)
lasso_rmse_test = np.sqrt(np.mean(lasso_test_predict - np.squeeze(df_test_y.values))**2)

In [18]:
print("Lasso Linear Regression: Root mean squared error of train result is {}".format(lasso_rmse_train))
print("Lasso Linear Regression: Root mean squared error of test result is {}".format(lasso_rmse_test))

Lasso Linear Regression: Root mean squared error of train result is 6.599629018152076e-16
Lasso Linear Regression: Root mean squared error of test result is 0.018549771255439863


### XGBoost model

In [19]:
xgb = XGBRegressor()
xgb_cv_pipeline = Pipeline(steps=[("scalar",StandardScaler()), ("xgb", xgb)])
param_grid = {"xgb__max_depth":[2, 3, 4, 5, 6],
              "xgb__n_estimators":[200, 400, 600, 800, 1000]}

n_folds = 10
xgb_clf = GridSearchCV(xgb_cv_pipeline, param_grid, cv=n_folds, scoring='neg_mean_squared_error')
xgb_clf.fit(df_train_x_trans , df_train_y)

In [20]:
best_params_xgb = xgb_clf.best_params_
print(best_params_xgb)

{'xgb__max_depth': 2, 'xgb__n_estimators': 400}


In [21]:
best_xgb = XGBRegressor(max_depth=3, n_estimators=400)
best_xgb_pipeline = Pipeline(steps=[("scalar",StandardScaler()), ("xgb", best_xgb)])

best_xgb_pipeline.fit(df_train_x_trans , df_train_y)

In [22]:
xgb_train_predict = best_xgb_pipeline.predict(df_train_x_trans)
xgb_test_predict = best_xgb_pipeline.predict(df_test_x_trans)

In [23]:
xgb_rmse_train = np.sqrt(np.mean(xgb_train_predict - np.squeeze(df_train_y.values))**2)
xgb_rmse_test = np.sqrt(np.mean(xgb_test_predict - np.squeeze(df_test_y.values))**2)

In [24]:
print("XGBoost: Root mean squared error of train result is {}".format(xgb_rmse_train))
print("XGBoost: Root mean squared error of test result is {}".format(xgb_rmse_test))

XGBoost: Root mean squared error of train result is 7.828190586791124e-06
XGBoost: Root mean squared error of test result is 0.002091242407641755
