In [16]:
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import (train_test_split, GridSearchCV, cross_val_score)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (StandardScaler, OneHotEncoder, FunctionTransformer, KBinsDiscretizer)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from scipy import stats
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns


In [37]:
# constants and helper methods

CONDITIONS_DICT = {"NA": 0, "NaN": 0, "nan": 0, "Po": 2, "Fa": 3, "TA": 4, "Gd":6, "Ex": 10}

# constants
CATEGORY_LABELS = {"KitchenQual":       CONDITIONS_DICT,
                    "GarageCond":       CONDITIONS_DICT,
                    "GarageQual":       CONDITIONS_DICT,
                    "ExterQual":        CONDITIONS_DICT,
                    "ExterCond":        CONDITIONS_DICT,
                    "BsmtQual":         CONDITIONS_DICT,
                    "BsmtCond":         CONDITIONS_DICT,
                    "FireplaceQu" :     CONDITIONS_DICT,
                    "HeatingQC" :       CONDITIONS_DICT,
                    "LotConfig":     {"Inside": 0, "Corner": 6, "CulDSac": 10, "FR2": 3, "FR3":4},
                    "Utilities":     {"ELO": 0, "NoSeWa": 1, "NoSewr": 2, "AllPub": 3},
                    "LandSlope":     {"Gtl": 10, "Mod": 4, "Sev": 1},
                    "LotShape":     {"Reg": 10, "IR1": 5, "IR2": 3, "IR3": 1},
                    "GarageType":     {"NA": 0, "nan": 0, "Basment": 4,  "Detchd": 1, "CarPort": 3, "BuiltIn": 5, "Attchd": 7, "2Types": 12},
                    "BldgType":     {"TwnhsI": 1, "Twnhs": 2, "TwnhsE": 3, "Duplex": 5,  "2fmCon": 7, "1Fam": 12},
                    "CentralAir":     {"N": 1, "Y": 10},
                    "Electrical":     {"Mix": 1, "FuseP": 3, "FuseF": 5,  "FuseA": 7, "SBrkr": 12},
                    "MSZoning":     {"RL": 100, "RM": 60, "C (all)": 20, "FV": 30, "RH": 30},
                    "LandContour":     {"Lvl": 100, "Low": 15, "Bnk": 25, "HLS": 5},
                    "Fence":     {"NA": 0, "MnPrv": 25, "MnWw": 15, "GdWo": 40, 'GdPrv': 100},
                    "Functional":     {"Typ": 100, "Min1": 70, "Min2": 50, "Mod": 40, "Maj1": 25, "Maj2": 20, "Min2": 10, "Sev": 5, "Sal": 1},
                    "MiscFeature":     {"NA": 0, "Shed": 30, "Gar2": 40, "Othr": 25, "TenC": 100},
                    "PavedDrive":     {"Y": 100, "P": 30, "N": 0},
                    }

CAT_COLS_TO_IGNORE = ["Functional",
                        "MiscFeature",
                        "Electrical",
                        "Fence",
                        "FireplaceQu",
                        "HeatingQC"           
                    ]

CAT_COLS = [ x for x in CATEGORY_LABELS.keys() if x not in CAT_COLS_TO_IGNORE]
print(CAT_COLS)

# plot correlations
def plotCoorelations(df):
    # remove non_numeric features  
    corr = df.corr()
    corr.style.background_gradient(cmap='coolwarm').set_precision(2)

# define a method to use Isolation Forest for outlier detection
def outlierRemoval_IsolationForest(X, y, outlierFraction = 0.02):
    clf = IsolationForest( behaviour = 'new', contamination = outlierFraction)
    preds = clf.fit_predict(X)
    outliers = np.where(preds == -1)
    return dropOutliers(X, y, outliers)

def dropOutliers(X, y, outliers):
    print("number of outliers = {0}".format(len(outliers[0])))
    # drop outliers
    X_clean = np.delete(X, outliers[0], axis = 0)
    y_clean = np.delete(y.values, outliers[0])
    return X_clean, y_clean, outliers[0] 

# define a method to use Isolation Forest for outlier detection
def outlierRemoval_ZScore(X, y, zValue = 3, bypass=False):
    if bypass:
        return X, y, []
    z = np.abs(stats.zscore(X))
    outliers = np.where(z > zValue)
    return dropOutliers(X, y, outliers)

def getTransformedColumnNames(ct):
    for item in ct.named_transformers_:
       pipeline = ct.named_transformers_[item]
       for step in pipeline.named_steps:
           t1 = pipeline.named_steps[step]
           print(t1.get_feature_names())
            
  
def displayScoresExp1p(scores):
    print("Scores:", np.expm1(scores))
    print("Mean:", np.expm1(scores.mean()))
    print("standard deviation:", np.expm1(scores.std()))
    
def displayScores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("standard deviation:", scores.std())


['KitchenQual', 'GarageCond', 'GarageQual', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'LotConfig', 'Utilities', 'LandSlope', 'LotShape', 'GarageType', 'BldgType', 'CentralAir', 'MSZoning', 'LandContour', 'PavedDrive']


In [50]:
# load housing data
iowa_file_path = '../data/train.csv'
home_data = pd.read_csv(iowa_file_path)
Y = home_data["SalePrice"]
X = home_data.drop(columns = ["Id", "SalePrice"])


In [51]:
# perform feature scaling for numerical features
numeric_features = X.select_dtypes(exclude=object) 
num_features_names = numeric_features.columns

# features that need a log transformation
log_features_names = ["LotFrontage", "LotArea", "1stFlrSF", "GrLivArea", "OpenPorchSF"]

log_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('logscaler', FunctionTransformer(np.log1p, validate=False)),
    ('scaler', StandardScaler())])

#numeric features that require a normal transformation
numeric_features_names = [x for x in num_features_names if x not in log_features_names]

numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

transformers=[
        ('log', log_pipeline, log_features_names),
        ('num', numeric_pipeline, numeric_features_names),  
    ]

# ensure that result is always a dense matrix
ct = ColumnTransformer(transformers=transformers, sparse_threshold = 0)
Xt = ct.fit_transform(X)

In [52]:
# remove outliers from numeric data
Xt, Y, outliers = outlierRemoval_ZScore(Xt, Y, 8, bypass=False)

number of outliers = 36


In [53]:
# use PCA to fit and transform the data using a 0.95 variance
pca = PCA(0.99)
X_pca_t = pca.fit_transform(Xt)
print("Xt shape: ", Xt.shape)
print("X_pca_t shape: ", X_pca_t.shape)

Xt shape:  (1427, 36)
X_pca_t shape:  (1427, 31)


In [54]:
categorical_features_names = X.select_dtypes(include=object).columns
#categorical_features_names = CAT_COLS

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

X_cat_t = cat_pipeline.fit_transform(X[categorical_features_names])

# remove corresponding outlier rows from the categorical data
X_cat_t2 = np.delete(X_cat_t.todense(), outliers, axis=0)


In [55]:
# concatenate the numerical and one hot encoded categorical data
Xt = np.concatenate((X_pca_t, X_cat_t2 ), axis = 1)
print("concatenated Xt shape: ", Xt.shape)

concatenated Xt shape:  (1427, 299)


In [None]:
# use random forest regressor 

clf = RandomForestRegressor(random_state=1, n_estimators = 100, criterion="mae", n_jobs=-1)
print(Xt.shape)
print(Y.shape)

# fit the log of the sale price
start = time.time()
scores = cross_val_score(clf, Xt, Y, scoring = "neg_mean_absolute_error", n_jobs = -1, verbose = 4, cv = 10)
end = time.time()
elapsed_time = end - start
print("cross_val_score took {0} seconds".format(elapsed_time))




In [None]:
displayScores(-scores)


In [56]:
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha = 0.1, solver="auto")
# fit the log of the sale price
start = time.time()
scores = cross_val_score(ridge_reg, Xt, Y, scoring = "neg_mean_absolute_error", n_jobs = -1, verbose = 4, cv = 10)
end = time.time()
elapsed_time = end - start
print("cross_val_score took {0} seconds".format(elapsed_time))
displayScores(-scores)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.1s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.2s remaining:    0.1s


cross_val_score took 0.41911959648132324 seconds
Scores: [16535.30262927 16230.0643879  18073.45307663 19802.65871948
 19264.47623081 17784.61381912 16858.0509664  15379.02919795
 18532.69324927 14167.95265616]
Mean: 17262.82949329868
standard deviation: 1669.9088760063983


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished


In [57]:
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha = 0.1)
# fit the log of the sale price
start = time.time()
scores = cross_val_score(lasso_reg, Xt, Y, scoring = "neg_mean_absolute_error", n_jobs = -1, verbose = 4, cv = 10)
end = time.time()
elapsed_time = end - start
print("cross_val_score took {0} seconds".format(elapsed_time))
displayScores(-scores)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.0s remaining:    2.0s


cross_val_score took 4.740268230438232 seconds
Scores: [16718.07698855 16459.41663437 18243.65270362 19851.94331315
 19535.52586534 18200.39398637 16802.95985102 15550.72750226
 18529.8058102  14084.99494838]
Mean: 17397.74976032493
standard deviation: 1712.2069952847648


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.6s finished


In [58]:
from sklearn.linear_model import ElasticNet
elastic_net= ElasticNet(alpha = 0.2, l1_ratio = 0.9)
# fit the log of the sale price
start = time.time()
scores = cross_val_score(elastic_net, Xt, Y, scoring = "neg_mean_absolute_error", n_jobs = -1, verbose = 4, cv = 10)
end = time.time()
elapsed_time = end - start
print("cross_val_score took {0} seconds".format(elapsed_time))
displayScores(-scores)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


cross_val_score took 0.7573914527893066 seconds
Scores: [15723.11342325 15413.91786181 16277.94231296 18477.75985012
 17725.58820143 14219.5079973  14923.36250838 13817.21215037
 17115.50499708 14805.20570642]
Mean: 15849.91150091082
standard deviation: 1454.4703839414574


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.5s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.7s finished


In [59]:
# elastic net seems to give the best scores. use a gridsearchCV to find the best elasticNet parameters
param_grid = [
    {'alpha' : [0.2, 0.5, 0.7,  1.0], 'l1_ratio': [ 0.1,0.4,  0.5, 0.7, 0.8, 0.9]}
]

grid_search = GridSearchCV(elastic_net, param_grid, scoring = "neg_mean_absolute_error", n_jobs = -1, verbose = 10, cv = 10)
grid_search.fit(Xt, Y)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=ElasticNet(alpha=0.2, copy_X=True, fit_intercept=True,
                                  l1_ratio=0.9, max_iter=1000, normalize=False,
                                  positive=False, precompute=False,
                                  random_state=None, selection='cyclic',
                                  tol=0.0001, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'alpha': [0.2, 0.5, 0.7, 1.0],
                          'l1_ratio': [0.1, 0.4, 0.5, 0.7, 0.8, 0.9]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_absolute_error', verbose=10)

In [60]:
grid_search.best_params_


{'alpha': 0.2, 'l1_ratio': 0.9}

In [62]:
# use the grid serach results to create the final predictor for the test
elastic_net= ElasticNet(alpha = 0.2, l1_ratio = 0.9)
elastic_net.fit(Xt,Y)

#path to file you will use for predictions
test_data_path = '../data/test.csv'
test_data = pd.read_csv(test_data_path)

X_test = test_data.drop(columns = ["Id"])

# numerical columns
X_num_test = ct.fit_transform(X_test)
# PCA on the numerical data
X_pca_test = pca.transform(X_num_test)
# categorical colums that are OHE
X_cat_test = cat_pipeline.transform(X_test[categorical_features_names])


#make predictions which we will submit. 
y_pred = elastic_net.predict(X_test)

#The lines below shows how to save predictions in format used for competition scoring
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': y_pred})
output.to_csv('../data/submission.csv', index=False)

ValueError: could not convert string to float: 'Normal'