In [1]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
from scipy.stats import skew, norm

**References**
> https://www.kaggle.com/code/bextuychiev/lasso-regression-with-pipelines-tutorial

> https://www.kaggle.com/code/carlosdg/xgboost-with-scikit-learn-pipeline-gridsearchcv

In [2]:
# Folder that holds train.csv, test.csv and test_y.csv.
# Set the PROJ1_DATA_DIR environment variable to point at another copy of the
# data; when it is unset the original author's local path is used.
project_folder = os.environ.get(
    "PROJ1_DATA_DIR",
    "/Users/richardxu/Dropbox/UIUC_CS598_Statistical_Learning/Project1/proj1/fold4",
)
# Backward-compatible alias for the original (misspelled) variable name.
prject_folder = project_folder

df_train = pd.read_csv(os.path.join(project_folder, "train.csv"))
df_test_x = pd.read_csv(os.path.join(project_folder, "test.csv"))
df_test_y = pd.read_csv(os.path.join(project_folder, "test_y.csv"))

In [3]:
# categorical_features = {'Street' : {'Grvl' : 1, 'Pave' : 2},
#                        'Alley' : {'Gravel' : 1, 'Paved' : 2, 'No_Alley_Access':3},
#                        'Lot_Shape' : {'Irregular' : 1, 'Moderately_Irregular' : 2, 'Slightly_Irregular' : 2, 'Regular' : 3},
#                        'Land_Contour': {'Low' : 1, 'HLS' : 2, 'Bnk' : 2, 'Lvl' : 3},
#                        'Land_Slope' : {'Sev' : 1, 'Mod' : 2, 'Gtl' : 3},
#                        'Bsmt_Exposure' : {'No' : 1, 'Mn' : 2, 'Av': 3, 'Gd' : 4, 'No_Basement':5},
#                        'BsmtFin_Type_1' : {'Unf' : 1, 'LwQ': 2, 'Rec' : 3, 'BLQ' : 4, 
#                                          'ALQ' : 5, 'GLQ' : 6, 'No_Basement':7},
#                        'BsmtFin_Type_2' : {'Unf' : 1, 'LwQ': 2, 'Rec' : 3, 'BLQ' : 4, 
#                                          'ALQ' : 5, 'GLQ' : 6, 'No_Basement':7},
#                        'Functional' : {'Sal' : 1, 'Sev' : 2, 'Maj2' : 3, 'Maj1' : 4, 'Mod': 5, 
#                                        'Min2' : 6, 'Min1' : 7, 'Typ' : 8},
#                        'Garage_Finish' : {'Unf' : 1, 'RFn' : 2, 'Fin' : 3, 'No_Garage':4},
#                        'Paved_Drive' : {'Paved' : 1, 'Dirt_Gravel' : 2, 'Partial_Pavement' : 3},
#                        'Fence' : {'Good_Privacy' : 1, 'Good_Wood' : 1, 'Minimum_Wood_Wire' : 2, 'Minimum_Privacy':2 ,'No_Fence': 3},
#                        'Sale_Condition' : {'Normal' : 2, 'Abnorml' : 1, 'AdjLand' : 1, 'Alloca' : 1, 'Family' : 1, 'Partial' : 1}
#                       }



# #df_train = df_train.replace(categorical_features).astype('object')
# #df_test_x = df_test_x.replace(categorical_features).astype('object')
# df_train = df_train.replace(categorical_features)
# df_test_x = df_test_x.replace(categorical_features)
# for key in categorical_features.keys():
#     df_train[key] = df_train[key].astype('object')
#     df_test_x[key] = df_test_x[key].astype('object')

In [4]:
# Target: a single-column frame holding a copy of the training Sale_Price.
df_train_y = df_train[['Sale_Price']].copy()

# Predictors: everything except the PID identifier and the target itself.
df_train_x = df_train.drop(columns=['PID', 'Sale_Price'])

# The PID identifier is dropped from both test frames as well.
df_test_x = df_test_x.drop(columns=['PID'])
df_test_y = df_test_y.drop(columns=['PID'])

### Step 1. Missing value imputation

In [5]:
# Impute missing values in "Garage_Yr_Blt" with 0.
# The filled column is assigned back explicitly: calling
# `df[col].fillna(..., inplace=True)` operates on a column selection, which
# pandas warns about and which does not update the frame under Copy-on-Write.
df_train_x['Garage_Yr_Blt'] = df_train_x['Garage_Yr_Blt'].fillna(0)
df_test_x['Garage_Yr_Blt'] = df_test_x['Garage_Yr_Blt'].fillna(0)

### Step 2. Features to Remove

In [6]:
# Columns excluded from modelling: per the original note they are either
# highly imbalanced or not informative.
remove_features_set = [
    'Condition_2',
    'Heating',
    'Latitude',
    'Longitude',
    'Low_Qual_Fin_SF',
    'Misc_Feature',
    'Pool_Area',
    'Pool_QC',
    'Roof_Matl',
    'Street',
    'Utilities',
]


# Drop the excluded columns from the training and test predictors alike.
df_train_x, df_test_x = (
    frame.drop(columns=remove_features_set)
    for frame in (df_train_x, df_test_x)
)

### Step 3. Winsorization

In [7]:
# Numeric columns whose upper tail is capped in the winsorization step.
winsor_features_set = [
    'BsmtFin_SF_2',
    'Bsmt_Unf_SF',
    'Enclosed_Porch',
    'First_Flr_SF',
    'Garage_Area',
    'Gr_Liv_Area',
    'Lot_Area',
    'Lot_Frontage',
    'Mas_Vnr_Area',
    'Misc_Val',
    'Open_Porch_SF',
    'Screen_Porch',
    'Second_Flr_SF',
    'Three_season_porch',
    'Total_Bsmt_SF',
    'Wood_Deck_SF',
]

In [8]:
# Cap the upper tail of each listed feature at the 98th percentile.
for val in winsor_features_set:
    # The threshold is taken from the unmodified training frame (`df_train`)
    # and reused for the test set, so no test-set statistic enters the
    # preprocessing and re-running this cell yields the same cap.
    upper_limit = df_train[val].quantile(0.98)
    # Vectorized equivalent of "upper_limit if x > upper_limit else x";
    # missing values are left untouched.
    df_train_x[val] = df_train_x[val].clip(upper=upper_limit)
    df_test_x[val] = df_test_x[val].clip(upper=upper_limit)

### Step 4. Categorical feature transformation using one-hot encoder

In [9]:
# "Mo_Sold" (1~12) and "Year_Sold" (2006~2010) are stored as numbers; casting
# them to object dtype makes the one-hot step treat them as categorical
# variables, which (per the original note) can improve model performance.
for sale_date_column in ('Mo_Sold', 'Year_Sold'):
    df_train_x[sale_date_column] = df_train_x[sale_date_column].astype('object')
    df_test_x[sale_date_column] = df_test_x[sale_date_column].astype('object')




# Neighborhoods are bucketed into five numeric groups (0-4). Per the original
# note, the grouping came from observing SalePrice by Neighborhood, e.g.:
#df_train_x.groupby('Neighborhood')['Sale_Price'].mean().sort_values()
neighborhood_groups = {
    0: [
        'Meadow_Village',
        'Iowa_DOT_and_Rail_Road',
        'Briardale',
    ],
    1: [
        'Brookside',
        'Old_Town',
        'Edwards',
        'Sawyer',
        'Landmark',
        'Blueste',
        'South_and_West_of_Iowa_State_University',
        'Northpark_Villa',
        'North_Ames',
        'Mitchell',
    ],
    2: [
        'Sawyer_West',
        'Northwest_Ames',
        'Gilbert',
        'Bloomington_Heights',
        'Greens',
    ],
    3: [
        'Crawford',
        'Clear_Creek',
        'College_Creek',
        'Somerset',
        'Green_Hills',
        'Veenker',
        'Timberland',
    ],
    4: [
        'Stone_Brook',
        'Northridge_Heights',
        'Northridge',
    ],
}

# Invert the grouping into the neighborhood -> group lookup used for mapping.
neighbor_map = {
    neighborhood: group
    for group, neighborhoods in neighborhood_groups.items()
    for neighborhood in neighborhoods
}

# Replace each neighborhood name by its group number, stored as object dtype.
# A name missing from `neighbor_map` becomes NaN (behaviour of Series.map).
for predictors in (df_train_x, df_test_x):
    predictors['Neighborhood'] = predictors['Neighborhood'].map(neighbor_map).astype('object')

In [10]:
def categorical_variable_transform(train_df, test_df):
    """One-hot encode the object-dtype columns of ``train_df`` in both frames.

    A separate ``OneHotEncoder(handle_unknown='ignore')`` is fitted for every
    categorical column using the training values only, and that same encoder
    is then applied to the test values. The test frame must use the encoder
    fitted on the training frame because some categories might be missing in
    the test data (or only appear there); with ``handle_unknown='ignore'`` a
    category unseen during fitting is encoded as all zeros for that feature.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training predictors. Its object-dtype columns define which features
        are treated as categorical.
    test_df : pandas.DataFrame
        Test predictors; must contain the same categorical columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames: the non-categorical columns in their
        original order, followed by one block of columns per categorical
        feature named ``<feature>_<position>``, where ``position`` is the
        column position in the encoder output. Neither input frame is
        modified, and each output keeps the index of its input.

    Raises
    ------
    KeyError
        If ``test_df`` lacks one of the categorical columns of ``train_df``.
    """
    categorical_feature_set = [feature for feature in train_df.columns
                               if train_df[feature].dtypes == 'object']

    # Start from copies without the categorical columns; the caller's frames
    # are never mutated.
    train_blocks = [train_df.drop(columns=categorical_feature_set)]
    test_blocks = [test_df.drop(columns=categorical_feature_set)]

    for feature in categorical_feature_set:
        encoder = OneHotEncoder(handle_unknown='ignore')
        train_category_matrix = [[element] for element in train_df[feature]]
        test_category_matrix = [[element] for element in test_df[feature]]

        # Fit on the training values only, then encode both sets.
        encoder.fit(train_category_matrix)
        train_hot_code = encoder.transform(train_category_matrix).toarray()
        test_hot_code = encoder.transform(test_category_matrix).toarray()

        hot_code_columns = [feature + '_' + str(position)
                            for position in range(train_hot_code.shape[1])]

        # Reuse the source index so the final column-wise concat aligns row
        # for row even when the frames do not carry a default RangeIndex.
        train_blocks.append(
            pd.DataFrame(train_hot_code, index=train_df.index, columns=hot_code_columns))
        test_blocks.append(
            pd.DataFrame(test_hot_code, index=test_df.index, columns=hot_code_columns))

    # Single concat per frame instead of growing the frame feature by feature.
    return pd.concat(train_blocks, axis=1), pd.concat(test_blocks, axis=1)

In [11]:
# Encode the categorical columns of both predictor frames; the `_trans`
# suffix marks the encoded versions used for model fitting.
encoded_predictors = categorical_variable_transform(df_train_x, df_test_x)
df_train_x_trans, df_test_x_trans = encoded_predictors

### Build Models

In [12]:
#df_train_x_trans['']

In [13]:
# Log-scale Sale_Price: both target frames are overwritten with the natural
# log of the price (vectorized np.log instead of a per-element lambda).
# NOTE: the columns are replaced in place, so running this cell a second time
# would apply the log twice.
df_train_y['Sale_Price'] = np.log(df_train_y['Sale_Price'])
df_test_y['Sale_Price'] = np.log(df_test_y['Sale_Price'])

### Elastic Net

In [14]:
# elastic = ElasticNet(fit_intercept=True, max_iter=10000)
# elastic_cv_pipeline = Pipeline(steps=[("scalar",Normalizer()), ("elastic", elastic)])
# alphas = np.logspace(-6, 0.1, 20)
# param_grid = {"elastic__alpha":alphas}
# n_folds = 10
# elastic_clf = GridSearchCV(elastic_cv_pipeline, param_grid, cv=n_folds, refit=False, scoring='neg_mean_squared_error')

# elastic_clf.fit(df_train_x_trans , df_train_y)
# best_alpha = elastic_clf.best_params_['elastic__alpha']
# print(best_alpha)

# best_elastic = ElasticNet(fit_intercept=True, random_state=0, alpha=best_alpha, max_iter=10000)
# best_elastic_pipeline = Pipeline(steps=[("scalar",Normalizer()),("elastic", best_elastic)])

# best_elastic_pipeline.fit(df_train_x_trans , df_train_y)

# elastic_test_predict = best_elastic_pipeline.predict(df_test_x_trans)

# elastic_rmse_test =np.sqrt(np.mean((elastic_test_predict - np.squeeze(df_test_y.values))**2))
# elastic_rmse_test

### Lasso Linear Regression

In [15]:
# lasso = Lasso(fit_intercept=True, max_iter=10000)
# lasso_cv_pipeline = Pipeline(steps=[("scalar",StandardScaler()), ("lasso", lasso)])

# alphas = np.linspace(0.001, 0.005, 5)
# param_grid = {"lasso__alpha":alphas}
# n_folds = 10
# lasso_clf = GridSearchCV(lasso_cv_pipeline, param_grid, cv=n_folds, refit=False, scoring='neg_mean_squared_error')

In [16]:
#lasso_clf.fit(df_train_x_trans , df_train_y)


In [17]:
# best_alpha = lasso_clf.best_params_['lasso__alpha']
# print(best_alpha)

In [18]:
# Regularisation strength is hard-coded here.
# NOTE(review): presumably taken from an earlier search; 0.0026 is not one of
# the values in the commented-out grid above — confirm.
best_alpha = 0.0026

# Standardise the features, then fit the Lasso on the training data.
best_lasso = Lasso(max_iter=10000, alpha=best_alpha)
lasso_steps = [
    ("scalar", StandardScaler()),
    ("lasso", best_lasso),
]
best_lasso_pipeline = Pipeline(steps=lasso_steps)

best_lasso_pipeline.fit(df_train_x_trans, df_train_y)

In [19]:
# Predict with the fitted Lasso pipeline on the training and test predictors.
lasso_train_predict, lasso_test_predict = (
    best_lasso_pipeline.predict(predictors)
    for predictors in (df_train_x_trans, df_test_x_trans)
)

In [20]:
# Root mean squared error between predictions and the target frames.
# np.squeeze flattens the single-column target frame to 1-D first.
lasso_train_residuals = lasso_train_predict - np.squeeze(df_train_y.values)
lasso_test_residuals = lasso_test_predict - np.squeeze(df_test_y.values)

lasso_rmse_train = np.sqrt(np.mean(np.square(lasso_train_residuals)))
lasso_rmse_test = np.sqrt(np.mean(np.square(lasso_test_residuals)))
lasso_rmse_test

0.13174114804155168

In [21]:
# Report the train and test RMSE of the Lasso model, one per line.
print(
    "Lasso Linear Regression: Root mean squared error of train result is {}".format(lasso_rmse_train),
    "Lasso Linear Regression: Root mean squared error of test result is {}".format(lasso_rmse_test),
    sep="\n",
)

Lasso Linear Regression: Root mean squared error of train result is 0.10608944012756337
Lasso Linear Regression: Root mean squared error of test result is 0.13174114804155168


### XGBoost model

In [22]:
# xgb = XGBRegressor()
# xgb_cv_pipeline = Pipeline(steps=[("scalar",StandardScaler()), ("xgb", xgb)])
# param_grid = {"xgb__max_depth":[2, 3, 4, 5, 6],
#               "xgb__n_estimators":[200, 400, 600, 800, 1000]}

# n_folds = 10
# xgb_clf = GridSearchCV(xgb_cv_pipeline, param_grid, cv=n_folds, scoring='neg_mean_squared_error')
# xgb_clf.fit(df_train_x_trans , df_train_y)

In [23]:
# best_params_xgb = xgb_clf.best_params_
# print(best_params_xgb)

In [24]:
# Hard-coded XGBoost hyper-parameters.
# NOTE(review): both values lie inside the commented-out search grid above;
# presumably they are that search's best result — confirm.
xgb_params = {"max_depth": 2, "n_estimators": 400}
best_xgb = XGBRegressor(**xgb_params)

# Same layout as the Lasso pipeline: standardise the features, then fit.
xgb_steps = [
    ("scalar", StandardScaler()),
    ("xgb", best_xgb),
]
best_xgb_pipeline = Pipeline(steps=xgb_steps)

best_xgb_pipeline.fit(df_train_x_trans, df_train_y)

In [25]:
# Predict with the fitted XGBoost pipeline on the training and test predictors.
xgb_train_predict, xgb_test_predict = (
    best_xgb_pipeline.predict(predictors)
    for predictors in (df_train_x_trans, df_test_x_trans)
)

In [26]:
# Root mean squared error of the XGBoost predictions against the target
# frames (flattened to 1-D with np.squeeze).
xgb_train_residuals = xgb_train_predict - np.squeeze(df_train_y.values)
xgb_test_residuals = xgb_test_predict - np.squeeze(df_test_y.values)

xgb_rmse_train = np.sqrt(np.mean(np.square(xgb_train_residuals)))
xgb_rmse_test = np.sqrt(np.mean(np.square(xgb_test_residuals)))

In [27]:
# Report the train and test RMSE of the XGBoost model, one per line.
print(
    "XGBoost: Root mean squared error of train result is {}".format(xgb_rmse_train),
    "XGBoost: Root mean squared error of test result is {}".format(xgb_rmse_test),
    sep="\n",
)

XGBoost: Root mean squared error of train result is 0.06159460836806481
XGBoost: Root mean squared error of test result is 0.13036797427448082
