In [1]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import (train_test_split, GridSearchCV, cross_val_score)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (StandardScaler, OneHotEncoder, FunctionTransformer, KBinsDiscretizer)


In [2]:
# constants and helper methods

# Numeric score shared by every quality/condition column below.
# The three spellings of a missing value ("NA", "NaN", "nan") all score 0.
CONDITIONS_DICT = {"NA": 0, "NaN": 0, "nan": 0, "Po": 2, "Fa": 3, "TA": 4, "Gd": 6, "Ex": 10}

# Hand-assigned numeric score for each label, keyed by column name.
# Every label must appear only once per column: a repeated key in a dict
# literal silently overwrites the earlier score.
CATEGORY_LABELS = {
    "KitchenQual": CONDITIONS_DICT,
    "GarageCond": CONDITIONS_DICT,
    "GarageQual": CONDITIONS_DICT,
    "ExterQual": CONDITIONS_DICT,
    "ExterCond": CONDITIONS_DICT,
    "BsmtQual": CONDITIONS_DICT,
    "BsmtCond": CONDITIONS_DICT,
    "FireplaceQu": CONDITIONS_DICT,
    "HeatingQC": CONDITIONS_DICT,
    "LotConfig": {"Inside": 0, "Corner": 6, "CulDSac": 10, "FR2": 3, "FR3": 4},
    "Utilities": {"ELO": 0, "NoSeWa": 1, "NoSewr": 2, "AllPub": 3},
    "LandSlope": {"Gtl": 10, "Mod": 4, "Sev": 1},
    "LotShape": {"Reg": 10, "IR1": 5, "IR2": 3, "IR3": 1},
    "GarageType": {"NA": 0, "nan": 0, "Basment": 4, "Detchd": 1, "CarPort": 3, "BuiltIn": 5, "Attchd": 7, "2Types": 12},
    "BldgType": {"TwnhsI": 1, "Twnhs": 2, "TwnhsE": 3, "Duplex": 5, "2fmCon": 7, "1Fam": 12},
    "CentralAir": {"N": 1, "Y": 10},
    "Electrical": {"Mix": 1, "FuseP": 3, "FuseF": 5, "FuseA": 7, "SBrkr": 12},
    "MSZoning": {"RL": 100, "RM": 60, "C (all)": 20, "FV": 30, "RH": 30},
    "LandContour": {"Lvl": 100, "Low": 15, "Bnk": 25, "HLS": 5},
    "Fence": {"NA": 0, "MnPrv": 25, "MnWw": 15, "GdWo": 40, 'GdPrv': 100},
    # "Min2" used to be listed a second time with score 10, which replaced the
    # intended 50; the stray entry is gone so the scores decrease monotonically.
    "Functional": {"Typ": 100, "Min1": 70, "Min2": 50, "Mod": 40, "Maj1": 25, "Maj2": 20, "Sev": 5, "Sal": 1},
    "MiscFeature": {"NA": 0, "Shed": 30, "Gar2": 40, "Othr": 25, "TenC": 100},
    "PavedDrive": {"Y": 100, "P": 30, "N": 0},
}

# Columns that have a score table in CATEGORY_LABELS but are left out of CAT_COLS.
CAT_COLS_TO_IGNORE = [
    "Functional", "MiscFeature", "Electrical",
    "Fence", "FireplaceQu", "HeatingQC",
]

# Remaining scored columns, in CATEGORY_LABELS insertion order.
CAT_COLS = list(filter(lambda column: column not in CAT_COLS_TO_IGNORE, CATEGORY_LABELS))
print(CAT_COLS)

# plot correlations
def plotCoorelations(df):
    # remove non_numeric features  
    corr = df.corr()
    corr.style.background_gradient(cmap='coolwarm').set_precision(2)

# define a method to use Isolation Forest for outlier detection
def outlierRemoval_IsolationForest(X, y, outlierFraction = 0.02, random_state = None):
    """Drop the rows that an IsolationForest labels as outliers.

    Parameters
    ----------
    X : array-like
        Feature matrix the forest is fitted on.
    y : array-like or pandas.Series
        Targets aligned with the rows of ``X``.
    outlierFraction : float
        Passed to IsolationForest as ``contamination``.
    random_state : int or None
        Seed forwarded to IsolationForest. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for repeatable results.

    Returns
    -------
    Whatever ``dropOutliers`` returns for the rows predicted as -1.
    """
    # `behaviour='new'` is intentionally not passed: scikit-learn deprecated the
    # argument in 0.22 and removed it in 0.24, where passing it raises TypeError.
    clf = IsolationForest(contamination = outlierFraction, random_state = random_state)
    preds = clf.fit_predict(X)
    # fit_predict marks outliers with -1
    outliers = np.where(preds == -1)
    return dropOutliers(X, y, outliers)

def dropOutliers(X, y, outliers):
    """Remove the flagged rows from ``X`` and ``y``.

    Parameters
    ----------
    X : array-like
        Feature matrix; rows are deleted along axis 0.
    y : array-like or pandas.Series
        Targets aligned with the rows of ``X``.
    outliers : tuple of arrays
        Result of ``np.where``; only the first element (row positions) is
        used. A row may be listed several times, e.g. when several of its
        cells were flagged.

    Returns
    -------
    (X_clean, y_clean, outlier_rows)
        The reduced arrays and the sorted, de-duplicated row positions that
        were removed.
    """
    # A 2-D np.where result repeats a row once per flagged cell; collapse the
    # repeats so the count and the returned positions refer to rows.
    outlier_rows = np.unique(outliers[0])
    print("number of outliers = {0}".format(len(outlier_rows)))
    # drop outliers
    X_clean = np.delete(X, outlier_rows, axis = 0)
    # np.asarray accepts a Series as well as a plain list/array
    y_clean = np.delete(np.asarray(y), outlier_rows)
    return X_clean, y_clean, outlier_rows

# define a method to use the z-score for outlier detection
def outlierRemoval_ZScore(X, y, zValue = 3, bypass=False):
    """Drop rows that contain a value whose absolute z-score exceeds ``zValue``.

    With ``bypass=True`` nothing is computed and ``(X, y, [])`` is returned
    unchanged. Otherwise the flagged positions are handed to ``dropOutliers``
    and its result is returned.
    """
    if not bypass:
        abs_scores = np.absolute(stats.zscore(X))
        flagged = np.where(abs_scores > zValue)
        return dropOutliers(X, y, flagged)
    return X, y, []

def getTransformedColumnNames(ct):
    """Print and collect the output feature names exposed by the steps of ``ct``.

    Parameters
    ----------
    ct : fitted ColumnTransformer-like object
        Only its ``named_transformers_`` mapping is read.

    Returns
    -------
    list of (transformer_name, step_name, names)
        One entry per step that can report feature names. Steps without a
        ``get_feature_names_out`` / ``get_feature_names`` method are skipped
        instead of raising AttributeError.
    """
    collected = []
    for name, transformer in ct.named_transformers_.items():
        # Entries that are not pipelines (a bare transformer, or a string
        # placeholder) have no named_steps; treat them as a single step.
        steps = getattr(transformer, "named_steps", {name: transformer})
        for step_name, step in steps.items():
            # Prefer the newer accessor, fall back to the older one.
            getter = getattr(step, "get_feature_names_out", None) or getattr(step, "get_feature_names", None)
            if getter is None:
                continue
            names = getter()
            print(names)
            collected.append((name, step_name, list(names)))
    return collected
            
  
def displayScoresExp1p(scores):
    """Print the scores, their mean and their standard deviation, each passed through np.expm1."""
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("standard deviation:", lambda values: values.std()),
    )
    # each statistic is computed right before it is printed
    for label, statistic in report:
        print(label, np.expm1(statistic(scores)))
    
def displayScores(scores):
    """Print the scores followed by their mean and standard deviation."""
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("standard deviation:", lambda values: values.std()),
    )
    # each statistic is computed right before it is printed
    for label, statistic in report:
        print(label, statistic(scores))


['KitchenQual', 'GarageCond', 'GarageQual', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'LotConfig', 'Utilities', 'LandSlope', 'LotShape', 'GarageType', 'BldgType', 'CentralAir', 'MSZoning', 'LandContour', 'PavedDrive']


In [3]:
# Read the training CSV of the housing data into a DataFrame
iowa_file_path = '../data/train.csv'
home_data = pd.read_csv(filepath_or_buffer=iowa_file_path)



In [4]:
# drop outliers: remove every row whose value exceeds the ceiling for that column
OUTLIER_CEILINGS = (
    ('LotFrontage', 200),
    ('LotArea', 100000),
    ('BsmtFinSF1', 4000),
    ('TotalBsmtSF', 6000),
    ('1stFlrSF', 4000),
    ('LowQualFinSF', 550),
)
# applied one column after another, each on the already-reduced frame
for column_name, ceiling in OUTLIER_CEILINGS:
    too_large = home_data[column_name] > ceiling
    home_data = home_data.drop(home_data.loc[too_large].index)

In [5]:
# Separate the regression target from the predictors (the Id column is not a feature)
Y = home_data["SalePrice"]
X = home_data.drop(["Id", "SalePrice"], axis=1)

In [6]:
# Numeric preprocessing: impute missing values, optionally log-transform, then standardise.
numeric_features = X.select_dtypes(exclude=object)
num_features_names = numeric_features.columns

# columns that go through log1p before scaling
log_features_names = ["LotFrontage", "LotArea", "1stFlrSF", "GrLivArea", "OpenPorchSF"]

log_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('logscaler', FunctionTransformer(np.log1p, validate=False)),
    ('scaler', StandardScaler()),
])

# every other numeric column is only imputed (mean) and scaled
numeric_features_names = list(
    filter(lambda column: column not in log_features_names, num_features_names)
)

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

transformers = [
    ('log', log_pipeline, log_features_names),
    ('num', numeric_pipeline, numeric_features_names),
]

# sparse_threshold = 0 ensures that the result is always a dense matrix
ct = ColumnTransformer(transformers, sparse_threshold=0)
Xt = ct.fit_transform(X)

In [7]:
# Z-score outlier step for the numeric data. It is switched off here
# (bypass=True), so Xt and Y come back unchanged and `outliers` is an empty list.
Xt, Y, outliers = outlierRemoval_ZScore(Xt, Y, zValue=8, bypass=True)

In [8]:
# use PCA to fit and transform the data, keeping 0.99 of the variance
pca = PCA(n_components=0.99)
X_pca_t = pca.fit_transform(Xt)
for label, matrix in (("Xt shape: ", Xt), ("X_pca_t shape: ", X_pca_t)):
    print(label, matrix.shape)

Xt shape:  (1453, 36)
X_pca_t shape:  (1453, 32)


In [9]:
# One-hot encode every object-typed column; missing values become their own
# 'missing' category. (CAT_COLS would be the hand-scored subset instead.)
categorical_features_names = X.select_dtypes(include=object).columns

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

X_cat_t = cat_pipeline.fit_transform(X[categorical_features_names])

# remove corresponding outlier rows from the categorical data
# .toarray() gives a plain ndarray; .todense() returned an np.matrix, which
# leaked into the concatenated feature matrix and is rejected by recent
# scikit-learn estimators. The positions are coerced to an integer array so an
# empty `outliers` list is also a valid index.
X_cat_t2 = np.delete(X_cat_t.toarray(), np.asarray(outliers, dtype=int), axis=0)


In [10]:
# Join the PCA-reduced numeric block and the one-hot categorical block column-wise
Xt = np.hstack((X_pca_t, X_cat_t2))
print("concatenated Xt shape: ", Xt.shape)

concatenated Xt shape:  (1453, 299)


In [None]:
# Random forest baseline, scored with 10-fold cross-validation
# NOTE(review): criterion="mae" was renamed in later scikit-learn releases;
# confirm it is still accepted by the installed version.
rf_settings = dict(random_state=1, n_estimators=100, criterion="mae", n_jobs=-1)
clf = RandomForestRegressor(**rf_settings)
for dataset in (Xt, Y):
    print(dataset.shape)

# Y is passed as-is (no log transform), so the MAE is in SalePrice units
cv_options = dict(scoring="neg_mean_absolute_error", n_jobs=-1, verbose=4, cv=10)
start = time.time()
scores = cross_val_score(clf, Xt, Y, **cv_options)
end = time.time()
elapsed_time = end - start
print("cross_val_score took {0} seconds".format(elapsed_time))


In [None]:
# the scorer returns negated MAE values; flip the sign before reporting
displayScores(np.negative(scores))


In [11]:
# Ridge regression baseline (Ridge is imported in the import cell at the top)
ridge_reg = Ridge(alpha = 0.1, solver="auto")
# 10-fold CV on the untransformed SalePrice: Y is not log-transformed here,
# so the reported MAE is in SalePrice units
start = time.time()
scores = cross_val_score(ridge_reg, Xt, Y, scoring = "neg_mean_absolute_error", n_jobs = -1, verbose = 4, cv = 10)
end = time.time()
elapsed_time = end - start
print("cross_val_score took {0} seconds".format(elapsed_time))
# the scorer returns negated MAE values; flip the sign before reporting
displayScores(-scores)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


cross_val_score took 2.2111661434173584 seconds
Scores: [16465.16959213 18225.30628047 18255.28813667 19827.58313817
 20477.93474478 19864.05317795 16561.17837685 16306.64426933
 20864.92408848 14854.21868569]
Mean: 18170.230049052312
standard deviation: 1953.9233101178902


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    2.0s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.1s finished


In [12]:
# Lasso regression baseline (Lasso is imported in the import cell at the top)
lasso_reg = Lasso(alpha = 0.1)
# 10-fold CV on the untransformed SalePrice: Y is not log-transformed here,
# so the reported MAE is in SalePrice units
start = time.time()
scores = cross_val_score(lasso_reg, Xt, Y, scoring = "neg_mean_absolute_error", n_jobs = -1, verbose = 4, cv = 10)
end = time.time()
elapsed_time = end - start
print("cross_val_score took {0} seconds".format(elapsed_time))
# the scorer returns negated MAE values; flip the sign before reporting
displayScores(-scores)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    2.7s remaining:    1.8s


cross_val_score took 3.8688018321990967 seconds
Scores: [16762.57302271 18472.89073683 18530.10763561 19875.2405406
 20586.2274657  20323.22356883 16506.48917108 16346.20203232
 20964.05727597 18958.08199806]
Mean: 18732.5093447715
standard deviation: 1641.3961185873336


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.8s finished


In [13]:
# Elastic-net baseline (ElasticNet is imported in the import cell at the top)
elastic_net= ElasticNet(alpha = 0.2, l1_ratio = 0.9)
# 10-fold CV on the untransformed SalePrice: Y is not log-transformed here,
# so the reported MAE is in SalePrice units
start = time.time()
scores = cross_val_score(elastic_net, Xt, Y, scoring = "neg_mean_absolute_error", n_jobs = -1, verbose = 4, cv = 10)
end = time.time()
elapsed_time = end - start
print("cross_val_score took {0} seconds".format(elapsed_time))
# the scorer returns negated MAE values; flip the sign before reporting
displayScores(-scores)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


cross_val_score took 0.652238130569458 seconds
Scores: [16172.94900749 16173.17659262 16675.81966436 18876.69768604
 18075.2008204  15207.34116078 15007.51638548 14521.20329742
 18972.85280259 15387.28135757]
Mean: 16507.003877475196
standard deviation: 1534.32075469751


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.6s finished


In [14]:
# Elastic net had the lowest mean MAE of the linear models above, so run a
# GridSearchCV over its alpha and l1_ratio.
param_grid = [
    dict(
        alpha=[0.2, 0.5, 0.7, 1.0],
        l1_ratio=[0.1, 0.4, 0.5, 0.7, 0.8, 0.9],
    )
]

grid_search = GridSearchCV(
    estimator=elastic_net,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
    verbose=10,
    cv=10,
)
grid_search.fit(Xt, Y)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1672s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 172 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 202 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:    8.4s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=ElasticNet(alpha=0.2, copy_X=True, fit_intercept=True,
                                  l1_ratio=0.9, max_iter=1000, normalize=False,
                                  positive=False, precompute=False,
                                  random_state=None, selection='cyclic',
                                  tol=0.0001, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'alpha': [0.2, 0.5, 0.7, 1.0],
                          'l1_ratio': [0.1, 0.4, 0.5, 0.7, 0.8, 0.9]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_absolute_error', verbose=10)

In [15]:
# show the parameter combination with the best cross-validated score
grid_search.best_params_


{'alpha': 0.2, 'l1_ratio': 0.9}

In [16]:
# use the grid search results to create the final predictor for the test
# The hyper-parameters are read from the search instead of being retyped, so
# they cannot drift from what GridSearchCV actually selected.
elastic_net = ElasticNet(**grid_search.best_params_)
elastic_net.fit(Xt, Y)

# path to the file used for predictions
test_data_path = '../data/test.csv'
test_data = pd.read_csv(test_data_path)

X_test = test_data.drop(columns = ["Id"])

# numerical columns: transform only. Calling fit_transform here would re-learn
# the imputation and scaling statistics from the test set, so the test features
# would no longer be on the scale the model (and the PCA) was fitted on.
X_num_test = ct.transform(X_test)
# PCA on the numerical data
X_pca_test = pca.transform(X_num_test)
# categorical columns that are OHE
X_cat_test = cat_pipeline.transform(X_test[categorical_features_names])

# .toarray() keeps the result a plain ndarray (todense() yields np.matrix)
X_final_test = np.concatenate((X_pca_test, X_cat_test.toarray()), axis = 1)
# make the predictions that will be submitted
y_pred = elastic_net.predict(X_final_test)

# save the predictions in the format used for competition scoring
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': y_pred})
output.to_csv('../data/submission.csv', index=False)

In [17]:
# Train a larger random forest on the same features and predict the test set.
# The model is bound to `clf` only; it used to be stored in `elastic_net` as
# well, which made that name refer to a random forest from this cell on.
# NOTE(review): criterion="mae" was renamed in later scikit-learn releases;
# confirm it is still accepted by the installed version.
clf = RandomForestRegressor(random_state=1, n_estimators = 500, criterion="mae", n_jobs=-1)
clf.fit(Xt, Y)

# path to the file used for predictions
test_data_path = '../data/test.csv'
test_data = pd.read_csv(test_data_path)

X_test = test_data.drop(columns = ["Id"])

# numerical columns: transform only. Calling fit_transform here would re-learn
# the imputation and scaling statistics from the test set, so the test features
# would no longer be on the scale the model (and the PCA) was fitted on.
X_num_test = ct.transform(X_test)
# PCA on the numerical data
X_pca_test = pca.transform(X_num_test)
# categorical columns that are OHE
X_cat_test = cat_pipeline.transform(X_test[categorical_features_names])

# .toarray() keeps the result a plain ndarray (todense() yields np.matrix)
X_final_test = np.concatenate((X_pca_test, X_cat_test.toarray()), axis = 1)
# make the predictions that will be submitted
y_pred = clf.predict(X_final_test)

# save the predictions in the format used for competition scoring
# (same output path as the elastic-net submission, which this overwrites)
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': y_pred})
output.to_csv('../data/submission.csv', index=False)

In [18]:
# Gradient boosting fitted on log(SalePrice); predictions are mapped back with exp.
# (GradientBoostingRegressor is imported in the import cell at the top.)
# `loss` is left at its default, which is least squares: the old spelling 'ls'
# is rejected by recent scikit-learn releases, while the default works on both.
# random_state makes the fit repeatable between runs.
est = GradientBoostingRegressor(n_estimators=400, max_depth=5, min_samples_split=2,
                                learning_rate=0.1, random_state=1, verbose = 5)
est.fit(Xt, np.log(Y))
# X_final_test and test_data are the test-set objects built in the previous cell
y_pred = est.predict(X_final_test)

# save the predictions in the format used for competition scoring
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': np.exp(y_pred)})
output.to_csv('../data/submission.csv', index=False)

      Iter       Train Loss   Remaining Time 
         1           0.1318           20.20s
         2           0.1101           18.95s
         3           0.0924           18.84s
         4           0.0777           19.00s
         5           0.0658           21.12s
         6           0.0560           20.62s
         7           0.0479           20.13s
         8           0.0412           19.78s
         9           0.0356           19.46s
        10           0.0310           20.04s
        11           0.0271           19.83s
        12           0.0239           19.59s
        13           0.0212           19.35s
        14           0.0189           19.55s
        15           0.0169           20.00s
        16           0.0153           19.77s
        17           0.0139           19.61s
        18           0.0127           19.48s
        19           0.0116           19.77s
        20           0.0108           19.57s
        21           0.0100           19.40s
        2

       183           0.0004            9.22s
       184           0.0004            9.23s
       185           0.0004            9.20s
       186           0.0004            9.20s
       187           0.0004            9.16s
       188           0.0004            9.11s
       189           0.0004            9.07s
       190           0.0003            9.03s
       191           0.0003            8.98s
       192           0.0003            8.96s
       193           0.0003            8.91s
       194           0.0003            8.86s
       195           0.0003            8.81s
       196           0.0003            8.79s
       197           0.0003            8.75s
       198           0.0003            8.70s
       199           0.0003            8.64s
       200           0.0003            8.61s
       201           0.0003            8.60s
       202           0.0003            8.57s
       203           0.0003            8.54s
       204           0.0003            8.48s
       205

       368           0.0000            1.46s
       369           0.0000            1.42s
       370           0.0000            1.37s
       371           0.0000            1.33s
       372           0.0000            1.28s
       373           0.0000            1.24s
       374           0.0000            1.20s
       375           0.0000            1.15s
       376           0.0000            1.11s
       377           0.0000            1.06s
       378           0.0000            1.02s
       379           0.0000            0.97s
       380           0.0000            0.93s
       381           0.0000            0.89s
       382           0.0000            0.84s
       383           0.0000            0.79s
       384           0.0000            0.75s
       385           0.0000            0.70s
       386           0.0000            0.66s
       387           0.0000            0.61s
       388           0.0000            0.56s
       389           0.0000            0.52s
       390