In [1]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
prject_folder = "/Users/richardxu/Dropbox/UIUC_CS598_Statistical_Learning/Project1/proj1"
df_train1 = pd.read_csv(os.path.join(prject_folder, "fold1/train.csv"))
df_test1_x = pd.read_csv(os.path.join(prject_folder, "fold1/test.csv"))
df_test1_y = pd.read_csv(os.path.join(prject_folder, "fold1/test_y.csv"))

In [3]:
df_train1_y = pd.DataFrame()
df_train1_y['Sale_Price'] = df_train1['Sale_Price'].copy()
df_train1_x = df_train1.drop(columns=['PID', 'Sale_Price'])
df_test1_x.drop(columns=['PID'], inplace=True)
df_test1_y.drop(columns=['PID'], inplace=True)

### Step 1. Missing value imputation

In [4]:
# Impute missing values in "Garage_Yr_Blt" variable with 0
df_train1_x['Garage_Yr_Blt'].fillna(0, inplace=True)
df_test1_x['Garage_Yr_Blt'].fillna(0, inplace=True)

### Step 2. Features to Remove

In [5]:
# The following features are either highly imbalanced or not informative
remove_features_set = ['Condition_2', 'Heating', 'Latitude', 'Longitude', 'Low_Qual_Fin_SF',
                       'Misc_Feature','Pool_Area','Pool_QC','Roof_Matl','Street','Utilities']
df_train1_x.drop(columns=remove_features_set, inplace=True)
df_test1_x.drop(columns=remove_features_set, inplace=True)

### Step 3. Winsorization

In [6]:
winsor_features_set = ['BsmtFin_SF_2', 'Bsmt_Unf_SF', 'Enclosed_Porch', 'First_Flr_SF',
                 'Garage_Area', 'Gr_Liv_Area', 'Lot_Area', 'Lot_Frontage','Mas_Vnr_Area',
                 'Misc_Val', 'Open_Porch_SF', 'Screen_Porch', 'Second_Flr_SF', 'Three_season_porch',
                 'Total_Bsmt_SF', 'Wood_Deck_SF']

In [7]:
for val in winsor_features_set:
    upper_limit = df_train1[val].quantile(0.95)
    df_train1_x[val] = df_train1_x[val].apply(lambda x: upper_limit if x > upper_limit else x)

### Step 4. Categorical feature transformation using one-hot encoder

In [8]:
categorical_feature_set_test = [val for val in df_test1_x.columns if df_test1_x[val].dtypes=='object']
print(len(categorical_feature_set_test))

39


In [9]:
def categorical_variable_transform(train_df, test_df):
    # IMPORTANT:
    # The test_dataframe needs to use the encoder from the trainng_dataframe, because some categories might be
    # missing in the test data
    
    categorical_feature_set = [feature for feature in train_df.columns if train_df[feature].dtypes=='object']

    for feature in categorical_feature_set:
        encoder = OneHotEncoder(handle_unknown='ignore')
        train_category_matrix = [[element] for element in train_df[feature]]
        test_category_matrix = [[element] for element in test_df[feature]]

        encoder.fit(train_category_matrix)
        train_df_hot_code = pd.DataFrame(encoder.transform(train_category_matrix).toarray())
        test_df_hot_code = pd.DataFrame(encoder.transform(test_category_matrix).toarray())

        train_df_hot_code.columns = [feature + '_' + str(c) for c in train_df_hot_code.columns]
        test_df_hot_code.columns = [feature + '_' + str(c) for c in test_df_hot_code.columns]


        # Replace the original feature with one-hot encoded feature
        train_df.drop(columns=feature, inplace=True)
        train_df = pd.concat([train_df, train_df_hot_code], axis=1)
        test_df.drop(columns=feature, inplace=True)
        test_df = pd.concat([test_df, test_df_hot_code], axis=1)


    return train_df, test_df

In [10]:
df_train1_x_trans, df_test1_x_trans =  categorical_variable_transform(df_train1_x, df_test1_x)

In [11]:
df_train1_x_trans.columns

Index(['Lot_Frontage', 'Lot_Area', 'Year_Built', 'Year_Remod_Add',
       'Mas_Vnr_Area', 'BsmtFin_SF_1', 'BsmtFin_SF_2', 'Bsmt_Unf_SF',
       'Total_Bsmt_SF', 'First_Flr_SF',
       ...
       'Sale_Type_6', 'Sale_Type_7', 'Sale_Type_8', 'Sale_Type_9',
       'Sale_Condition_0', 'Sale_Condition_1', 'Sale_Condition_2',
       'Sale_Condition_3', 'Sale_Condition_4', 'Sale_Condition_5'],
      dtype='object', length=308)

In [12]:
df_test1_x_trans.columns

Index(['Lot_Frontage', 'Lot_Area', 'Year_Built', 'Year_Remod_Add',
       'Mas_Vnr_Area', 'BsmtFin_SF_1', 'BsmtFin_SF_2', 'Bsmt_Unf_SF',
       'Total_Bsmt_SF', 'First_Flr_SF',
       ...
       'Sale_Type_6', 'Sale_Type_7', 'Sale_Type_8', 'Sale_Type_9',
       'Sale_Condition_0', 'Sale_Condition_1', 'Sale_Condition_2',
       'Sale_Condition_3', 'Sale_Condition_4', 'Sale_Condition_5'],
      dtype='object', length=308)

### Step 5. Remove highly correlated features

In [None]:
# Create  correation matrix
df_train1_x_trans_corr_matrix= df_train1_x_trans.corr().abs()

In [None]:
# Select upper traingle of correlation matrix
upper = df_train1_x_trans_corr_matrix.where(np.triu(np.ones(df_train1_x_trans_corr_matrix.shape), k=1).astype(np.bool))

In [None]:
upper

In [None]:
# Find features with correlation greater than 0.9
correlated_columns = [column for column in upper.columns if any(upper[column]>0.9)]

In [None]:
sorted(correlated_columns)

In [None]:
# Drop Marked Features
df_train1_x_trans.drop(columns=correlated_columns, inplace=True)
df_test1_x_trans.drop(columns=correlated_columns, inplace=True)

In [None]:
df_train1_x_trans

In [None]:
df_test1_x_trans

### Step 6 Add some new features

In [None]:
np.unique(df_train1_x_trans['Year_Sold'])

### Build Models

In [13]:
df_train1_y['Sale_Price'] =  df_train1_y['Sale_Price'].apply(lambda y: np.log(y))
df_test1_y['Sale_Price'] =  df_test1_y['Sale_Price'].apply(lambda y: np.log(y))

In [14]:
df_train1_y

Unnamed: 0,Sale_Price
0,11.561716
1,12.055250
2,12.154253
3,12.183316
4,12.271392
...,...
2046,11.867097
2047,11.782953
2048,11.790557
2049,12.043554


In [15]:
df_test1_y

Unnamed: 0,Sale_Price
0,11.626254
1,11.911702
2,12.083905
3,11.982929
4,11.813030
...,...
874,12.190959
875,11.986049
876,12.676076
877,11.951180


In [16]:
lasso = Lasso(fit_intercept=True, random_state=0, max_iter=10000)
pipe = Pipeline(steps=[("scalar",StandardScaler()), ("lasso", lasso)])
alphas = np.logspace(-6, 0.1, 20)
param_grid = {"lasso__alpha":alphas}
n_folds = 5
clf = GridSearchCV(pipe, param_grid, cv=n_folds, refit=False, scoring='neg_mean_squared_error')

In [17]:
clf.fit(df_train1_x_trans , df_train1_y)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [18]:
best_alpha = clf.best_params_['lasso__alpha']
print(best_alpha)

0.0034007832065426507


In [19]:
model = Lasso(fit_intercept=True, alpha = 0.00005,max_iter=100000)
model.fit(df_train1_x_trans , df_train1_y)

In [21]:
test1_predict = model.predict(df_test1_x_trans)
test1_predict

array([11.20060558, 11.97145233, 12.1155181 , 11.91310677, 11.87038029,
       11.77011617, 11.78746753, 12.1496726 , 11.52945881, 12.71326895,
       11.88600259, 11.88831806, 12.73295174, 12.00971531, 12.27044649,
       11.84288762, 12.15970806, 11.59642849, 11.84922715, 11.82251274,
       12.51935472, 12.04234172, 12.21349105, 11.93713728, 11.29445182,
       11.27055291, 11.97948539, 11.81272682, 11.72916911, 12.33559712,
       11.4192188 , 11.73849904, 12.83831491, 12.2465275 , 11.83237188,
       11.88623488, 12.30578894, 11.72999457, 11.56525176, 12.47015159,
       12.01107129, 12.57265474, 12.52070866, 12.74324212, 12.19535479,
       11.52117595, 12.15368435, 11.77717418, 11.99542592, 11.58713311,
       12.12354834, 12.07006706, 11.44106587, 11.33103628, 12.98668476,
       12.31313207, 12.1784252 , 11.96268182, 12.20060585, 12.01920178,
       11.81625517, 12.25374687, 12.34983316, 11.78736109, 11.68352691,
       12.11516241, 11.48885126, 12.22873086, 11.74451328, 11.80

In [22]:
np.sum((test1_predict - df_test1_y.values)**2)

252527.5611132153

In [None]:
mse = np.mean((test1_predict - df_test1_y.values)**2)

In [None]:
mse

In [None]:
train_1_predict = model.predict(df_train1_x_trans)

In [None]:
mse_train = np.mean((train_1_predict - df_train1_y.values)**2)

In [None]:
mse_train