In [1]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

**References**
> https://www.kaggle.com/code/bextuychiev/lasso-regression-with-pipelines-tutorial

> https://www.kaggle.com/code/carlosdg/xgboost-with-scikit-learn-pipeline-gridsearchcv

In [2]:
def categorical_variable_transform(train_df, test_df):
    # IMPORTANT:
    # The test_dataframe needs to use the encoder from the trainng_dataframe, because some categories might be
    # missing in the test data

    categorical_feature_set = [feature for feature in train_df.columns if train_df[feature].dtypes=='object']

    for feature in categorical_feature_set:
        encoder = OneHotEncoder(handle_unknown='ignore')
        train_category_matrix = [[element] for element in train_df[feature]]
        test_category_matrix = [[element] for element in test_df[feature]]

        encoder.fit(train_category_matrix)
        train_df_hot_code = pd.DataFrame(encoder.transform(train_category_matrix).toarray())
        test_df_hot_code = pd.DataFrame(encoder.transform(test_category_matrix).toarray())

        train_df_hot_code.columns = [feature + '_' + str(c) for c in train_df_hot_code.columns]
        test_df_hot_code.columns = [feature + '_' + str(c) for c in test_df_hot_code.columns]


        # Replace the original feature with one-hot encoded feature
        train_df.drop(columns=feature, inplace=True)
        train_df = pd.concat([train_df, train_df_hot_code], axis=1)
        test_df.drop(columns=feature, inplace=True)
        test_df = pd.concat([test_df, test_df_hot_code], axis=1)


    return train_df, test_df

In [3]:
def preprocessing(data_folder):
  df_train = pd.read_csv(os.path.join(data_folder, "train.csv"))
  df_test_x = pd.read_csv(os.path.join(data_folder, "test.csv"))
  df_test_y = pd.read_csv(os.path.join(data_folder, "test_y.csv"))

  df_train_y = pd.DataFrame()
  df_train_y['Sale_Price'] = df_train['Sale_Price'].copy()
  df_train_x = df_train.drop(columns=['PID', 'Sale_Price'])
  df_test_x.drop(columns=['PID'], inplace=True)
  df_test_y.drop(columns=['PID'], inplace=True)

  # Impute missing values in "Garage_Yr_Blt" variable with 0
  df_train_x['Garage_Yr_Blt'].fillna(0, inplace=True)
  df_test_x['Garage_Yr_Blt'].fillna(0, inplace=True)

  # The following features are either highly imbalanced or not informative
  remove_features_set = ['Condition_2', 'Heating', 'Latitude', 'Longitude', 'Low_Qual_Fin_SF',
                        'Misc_Feature','Pool_Area','Pool_QC','Roof_Matl','Street','Utilities']
  df_train_x.drop(columns=remove_features_set, inplace=True)
  df_test_x.drop(columns=remove_features_set, inplace=True)

  winsor_features_set = ['BsmtFin_SF_2', 'Bsmt_Unf_SF', 'Enclosed_Porch', 'First_Flr_SF',
                  'Garage_Area', 'Gr_Liv_Area', 'Lot_Area', 'Lot_Frontage','Mas_Vnr_Area',
                  'Misc_Val', 'Open_Porch_SF', 'Screen_Porch', 'Second_Flr_SF', 'Three_season_porch',
                  'Total_Bsmt_SF', 'Wood_Deck_SF']
  for val in winsor_features_set:
    upper_limit1 = df_train[val].quantile(0.974)
    df_train_x[val] = df_train_x[val].apply(lambda x: upper_limit1 if x > upper_limit1 else x)
    upper_limit2 = df_test_x[val].quantile(0.974)
    df_test_x[val] = df_test_x[val].apply(lambda x: upper_limit2 if x > upper_limit2 else x)

  # Treating the two features "Mo_Sold" (1~12), and "Year_Sold" (2006~2010) as categorical
  # variables can improve model performance
  df_train_x['Mo_Sold'] = df_train_x['Mo_Sold'].values.astype('object')
  df_test_x['Mo_Sold'] = df_test_x['Mo_Sold'].values.astype('object')
  df_train_x['Year_Sold'] = df_train_x['Year_Sold'].values.astype('object')
  df_test_x['Year_Sold'] = df_test_x['Year_Sold'].values.astype('object')

  df_train_x_trans, df_test_x_trans =  categorical_variable_transform(df_train_x, df_test_x)

  # Log-scale sale_price
  df_train_y['Sale_Price'] =  df_train_y['Sale_Price'].apply(lambda y: np.log(y))
  df_test_y['Sale_Price'] =  df_test_y['Sale_Price'].apply(lambda y: np.log(y))

  return df_train_x_trans, df_test_x_trans, df_train_y, df_test_y

In [4]:
best_alpha = 0.0026048905108264305

In [5]:
def lasso_rmse(df_train_x_trans, df_test_x_trans, df_train_y, df_test_y, best_alpha):
  best_lasso = Lasso(fit_intercept=True, random_state=0, alpha=best_alpha, max_iter=10000)
  best_lasso_pipeline = Pipeline(steps=[("scalar",StandardScaler()), ("lasso", best_lasso)])

  best_lasso_pipeline.fit(df_train_x_trans , df_train_y)

  lasso_train_predict = best_lasso_pipeline.predict(df_train_x_trans)
  lasso_test_predict = best_lasso_pipeline.predict(df_test_x_trans)

  lasso_rmse_train = np.sqrt(np.mean((lasso_train_predict - np.squeeze(df_train_y.values))**2))
  lasso_rmse_test = np.sqrt(np.mean((lasso_test_predict - np.squeeze(df_test_y.values))**2))
  return lasso_rmse_test

In [6]:
max_depth = 2
n_estimators = 420

In [7]:
def XGBoost_rmse(df_train_x_trans, df_test_x_trans, df_train_y, df_test_y, max_depth, n_estimators):
  best_xgb = XGBRegressor(max_depth=max_depth, n_estimators=400)
  best_xgb_pipeline = Pipeline(steps=[("scalar",StandardScaler()), ("xgb", best_xgb)])

  best_xgb_pipeline.fit(df_train_x_trans , df_train_y)
  xgb_train_predict = best_xgb_pipeline.predict(df_train_x_trans)
  xgb_test_predict = best_xgb_pipeline.predict(df_test_x_trans)
  xgb_rmse_train = np.sqrt(np.mean((xgb_train_predict - np.squeeze(df_train_y.values))**2))
  xgb_rmse_test = np.sqrt(np.mean((xgb_test_predict - np.squeeze(df_test_y.values))**2))
  return xgb_rmse_test


In [8]:
import time
root = '/Users/richardxu/Dropbox/UIUC_CS598_Statistical_Learning/Project1/proj1/fold'
lasso_rmse_list = []
tree_rmse_list = []
start_time = time.time()
for i in range(1, 11):
  df_train_x_trans, df_test_x_trans, df_train_y, df_test_y = preprocessing(root+str(i))
  lasso_rmse_list.append(lasso_rmse(df_train_x_trans, df_test_x_trans, df_train_y, df_test_y, best_alpha))
  tree_rmse_list.append(XGBoost_rmse(df_train_x_trans, df_test_x_trans, df_train_y, df_test_y, max_depth, n_estimators))
end_time = time.time()
print(end_time - start_time)

13.214696884155273


In [9]:
for rmse in lasso_rmse_list:
  print("%.4f" % rmse)
print()
for rmse in tree_rmse_list:
  print("%.4f" % rmse)

0.1240
0.1176
0.1222
0.1296
0.1124
0.1338
0.1269
0.1203
0.1304
0.1235

0.1163
0.1226
0.1187
0.1242
0.1181
0.1356
0.1325
0.1288
0.1317
0.1321
