In [19]:
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import (train_test_split, GridSearchCV, cross_val_score)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (StandardScaler, RobustScaler, OneHotEncoder, FunctionTransformer, KBinsDiscretizer)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import (ElasticNet, Ridge, Lasso)
from scipy import stats
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
import AveragedModels as av
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb
import ml_helper as mlh
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
import lightgbm as lgb

In [20]:
# constants and helper methods

# Integer score for each quality/condition label shared by several columns.
# The missing-value spellings "NA", "NaN" and "nan" all score 0.
CONDITIONS_DICT = {"NA": 0, "NaN": 0, "nan": 0, "Po": 2, "Fa": 3, "TA": 4, "Gd":6, "Ex": 10}

# Per-column mapping from categorical label to integer score.
# Columns that share the quality/condition scale reuse CONDITIONS_DICT (same object).
CATEGORY_LABELS = {"KitchenQual":       CONDITIONS_DICT,
                    "GarageCond":       CONDITIONS_DICT,
                    "GarageQual":       CONDITIONS_DICT,
                    "ExterQual":        CONDITIONS_DICT,
                    "ExterCond":        CONDITIONS_DICT,
                    "BsmtQual":         CONDITIONS_DICT,
                    "BsmtCond":         CONDITIONS_DICT,
                    "FireplaceQu" :     CONDITIONS_DICT,
                    "HeatingQC" :       CONDITIONS_DICT,
                    "LotConfig":     {"Inside": 0, "Corner": 6, "CulDSac": 10, "FR2": 3, "FR3":4},
                    "Utilities":     {"ELO": 0, "NoSeWa": 1, "NoSewr": 2, "AllPub": 3},
                    "LandSlope":     {"Gtl": 10, "Mod": 4, "Sev": 1},
                    "LotShape":     {"Reg": 10, "IR1": 5, "IR2": 3, "IR3": 1},
                    "GarageType":     {"NA": 0, "nan": 0, "Basment": 4,  "Detchd": 1, "CarPort": 3, "BuiltIn": 5, "Attchd": 7, "2Types": 12},
                    "BldgType":     {"TwnhsI": 1, "Twnhs": 2, "TwnhsE": 3, "Duplex": 5,  "2fmCon": 7, "1Fam": 12},
                    "CentralAir":     {"N": 1, "Y": 10},
                    "Electrical":     {"Mix": 1, "FuseP": 3, "FuseF": 5,  "FuseA": 7, "SBrkr": 12},
                    "MSZoning":     {"RL": 100, "RM": 60, "C (all)": 20, "FV": 30, "RH": 30},
                    "LandContour":     {"Lvl": 100, "Low": 15, "Bnk": 25, "HLS": 5},
                    "Fence":     {"NA": 0, "MnPrv": 25, "MnWw": 15, "GdWo": 40, 'GdPrv': 100},
                    # "Min2" used to appear twice in this literal (50, then 10); the later entry
                    # silently overrode the first and ranked Min2 below Maj1/Maj2. Only the
                    # first entry is kept so the scores decrease monotonically with severity.
                    "Functional":     {"Typ": 100, "Min1": 70, "Min2": 50, "Mod": 40, "Maj1": 25, "Maj2": 20, "Sev": 5, "Sal": 1},
                    "MiscFeature":     {"NA": 0, "Shed": 30, "Gar2": 40, "Othr": 25, "TenC": 100},
                    "PavedDrive":     {"Y": 100, "P": 30, "N": 0},
                    }

# Columns that have a label mapping in CATEGORY_LABELS but are excluded from CAT_COLS.
CAT_COLS_TO_IGNORE = [
    "Functional", "MiscFeature", "Electrical",
    "Fence", "FireplaceQu", "HeatingQC",
]

# Remaining mapped columns, in CATEGORY_LABELS insertion order.
CAT_COLS = [column for column in CATEGORY_LABELS if column not in CAT_COLS_TO_IGNORE]

# plot correlations
def plotCoorelations(df):
    """Build a colour-graded correlation table for the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Non-numeric columns are dropped before ``corr`` is
        called, so string columns cannot break the computation.

    Returns
    -------
    pandas Styler
        The correlation matrix styled with the 'coolwarm' gradient and
        two-decimal formatting. The Styler must be returned (and be the last
        expression of a notebook cell, or passed to ``display``) to render;
        previously it was built and discarded, so nothing was ever shown.
    """
    # remove non_numeric features
    corr = df.select_dtypes(include="number").corr()
    # A format string replaces Styler.set_precision, which newer pandas no longer provides.
    return corr.style.background_gradient(cmap='coolwarm').format("{:.2f}")

# define a method to use Isolation Forest for outlier detection
def outlierDetect_IsolationForest(X, outlierFraction = 0.02, random_state = None):
    """Locate the rows of ``X`` that an Isolation Forest labels as outliers.

    Parameters
    ----------
    X : array-like
        Samples handed to ``IsolationForest.fit_predict``.
    outlierFraction : float, default 0.02
        Passed as ``contamination``.
    random_state : int or None, default None
        Forwarded to ``IsolationForest`` so a run can be made repeatable;
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple of numpy.ndarray
        Result of ``np.where`` on the predictions equal to -1, i.e. the
        positions of the samples predicted as outliers.
    """
    # The former behaviour='new' argument is omitted: scikit-learn deprecated and then
    # removed it, so passing it raises TypeError on current releases.
    clf = IsolationForest(contamination = outlierFraction, random_state = random_state)
    preds = clf.fit_predict(X)
    return np.where(preds == -1)

# define a method to use z-scores for outlier detection
def outlierDetect_ZScore(X, zValue = 3):
    """Return the positions in ``X`` whose absolute z-score exceeds ``zValue``.

    The z-scores come from ``scipy.stats.zscore`` with its default settings.
    The result is a tuple of index arrays, one per dimension, exactly as
    ``np.where`` would return for the boolean mask.
    """
    abs_scores = np.absolute(stats.zscore(X))
    exceeds_limit = abs_scores > zValue
    return np.asarray(exceeds_limit).nonzero()
    
def displayScoresExp1p(scores):
    """Print ``np.expm1`` of the scores, of their mean and of their standard deviation."""
    report = (
        ("Scores:", lambda s: s),
        ("Mean:", lambda s: s.mean()),
        ("standard deviation:", lambda s: s.std()),
    )
    # Each statistic is computed just before its line is printed, one line per entry.
    for label, statistic in report:
        print(label, np.expm1(statistic(scores)))
    
def displayScores(scores):
    """Print the scores followed by their mean and standard deviation."""
    print("Scores:", scores)
    # The remaining lines call the like-named reduction method on ``scores``.
    for label, method_name in (("Mean:", "mean"), ("standard deviation:", "std")):
        print(label, getattr(scores, method_name)())
    
def arrayStats(arrayToInspect):
    """Print the minimum and the maximum of ``arrayToInspect`` taken along axis 1."""
    for label, reducer in (("min:", np.min), ("max:", np.max)):
        print(label, reducer(arrayToInspect, axis=1))
    
def crossValidateModel(model, X, y, name="<unknown>", crossVal = 5):
    """Cross-validate ``model`` on ``(X, y)`` and print timing plus score statistics.

    Scoring is "neg_mean_absolute_error" with ``crossVal`` folds, run with
    ``n_jobs = -1`` and ``verbose = 4``. ``name`` only labels the timing line.
    Nothing is returned; results are printed via ``displayScores``.
    """
    started_at = time.time()
    negated_mae = cross_val_score(
        model, X, y,
        scoring = "neg_mean_absolute_error",
        cv = crossVal,
        n_jobs = -1,
        verbose = 4,
    )
    elapsed_time = time.time() - started_at
    print("model {0} cross_val_score took {1} seconds".format(name, elapsed_time))
    # The scorer yields negated errors; flip the sign so positive MAE values are reported.
    displayScores(-negated_mae)
    
def columnsWithMissingData(X, threshold = 0.9):
    """Report the columns of ``X`` that are mostly null or mostly zero.

    For every column the fraction of null cells and the fraction of cells
    equal to 0 are computed against the number of rows. Columns whose
    fraction is strictly greater than ``threshold`` are returned as one
    Series (column name -> fraction): null-based entries first, then
    zero-based entries.
    """
    row_count = len(X.index)

    def fraction_above_threshold(flags):
        # Only columns with at least one flagged cell are considered.
        flagged_columns = X.columns[flags.any()]
        fraction = flags[flagged_columns].sum() / row_count
        return fraction[fraction > threshold]

    mostly_null = fraction_above_threshold(X.isnull())
    # fraction of zero values for each column
    mostly_zero = fraction_above_threshold(X == 0)
    return pd.concat([mostly_null, mostly_zero])


In [21]:
# load housing data
# The training CSV is read from a relative path, so it resolves against the
# notebook's working directory.
iowa_file_path = '../data/train.csv'
home_data = pd.read_csv(iowa_file_path)

In [22]:
# drop outliers: remove every row whose value in a column exceeds that column's upper bound
outlier_upper_bounds = (
    ("LotFrontage", 200),
    ("LotArea", 100000),
    ("BsmtFinSF1", 4000),
    ("TotalBsmtSF", 6000),
    ("1stFlrSF", 4000),
    ("LowQualFinSF", 550),
)

for column_name, upper_bound in outlier_upper_bounds:
    # Rows with a missing value compare as False and are therefore kept.
    exceeds_bound = home_data[column_name] > upper_bound
    home_data = home_data.drop(home_data.loc[exceeds_bound].index)

In [23]:
#home_data.replace(CATEGORY_LABELS, inplace=True)

In [24]:
# Target: the sale price column.
Y = home_data["SalePrice"]

# Columns left out of the feature matrix; the same list is reused for the test data.
columns_to_ignore = ["Id", "1stFlrSF", "2ndFlrSF", "GarageYrBlt", "GarageArea", "TotalBsmtSF", "TotRmsAbvGrd"]

# Features: everything except the ignored columns and the target itself.
X = home_data.drop(columns = columns_to_ignore + ["SalePrice"])

In [25]:
# find sparse columns: drop those where more than 90% of the rows are null
# or more than 90% of the rows are zero
missingData = columnsWithMissingData(X, threshold = 0.9)
print("following columns have greater than 90% data missing or NULL \n",missingData)
print("original df shape = ", X.shape)
# The Series index holds the names of the columns to remove.
sparse_column_names = missingData.index
X = X.drop(columns = sparse_column_names)
print("final df shape = ", X.shape)

following columns have greater than 90% data missing or NULL 
 Alley           0.937371
PoolQC          0.995871
MiscFeature     0.964212
LowQualFinSF    0.982794
BsmtHalfBath    0.944253
3SsnPorch       0.983482
ScreenPorch     0.920853
PoolArea        0.995871
MiscVal         0.965588
dtype: float64
original df shape =  (1453, 73)
final df shape =  (1453, 64)


In [26]:
# perform feature scaling for numerical features
# Steps: 1. impute, log-transform and scale the columns listed in log_features_names
#        2. impute and scale the remaining numeric columns
#        3. combine the results of 1 & 2 with a ColumnTransformer
# NOTE: the PCA step is currently commented out below, so despite the names
#       `pca_pipeline` / `X_pca_t` no dimensionality reduction is applied.
numeric_features = X.select_dtypes(exclude=object) 
num_features_names = numeric_features.columns

# features that need a log transformation
log_features_names = ["LotFrontage", "LotArea", "GrLivArea", "OpenPorchSF"]

# missing values are filled with 0 first, so log1p maps them to 0
log_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value = 0) ),
    ('logscaler', FunctionTransformer(np.log1p, validate=False)),
    ('scaler', RobustScaler())])

# numeric features that are only imputed and scaled (everything not log-transformed)
numeric_features_names = [x for x in num_features_names if x not in log_features_names]

# missing values are replaced by the column mean before scaling
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler())])

# output column order: the log-transformed columns first, then the remaining numeric ones
num_transformers=[
        ('log', log_pipeline, log_features_names),
        ('num', numeric_pipeline, numeric_features_names),  
    ]

# ensure that result is always a dense matrix
num_col_transformer = ColumnTransformer(transformers=num_transformers, sparse_threshold = 0)

pca_pipeline = Pipeline(steps = [
                       ('num_ct', num_col_transformer), # apply the columntransformation
                     # ('pca', PCA(0.99)) # apply PCA to data (log + numeric)
                       ])

print(X.shape)

# fit on the training features; the fitted pipeline is reused later with transform() on the test data
X_pca_t = pca_pipeline.fit_transform(X)

(1453, 64)


In [27]:
# every object-dtype column of X is treated as a categorical feature
cat_features_names = X.select_dtypes(include=object).columns
#cat_features_names = CAT_COLS

# missing values become the literal category 'missing' before one-hot encoding;
# handle_unknown='ignore' is set so transform() does not raise on categories not seen during fit
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# NOTE: the encoded result is not densified here; a later cell converts X_cat_t
# to a dense representation before concatenating it with the numeric features
X_cat_t = cat_pipeline.fit_transform(X[cat_features_names])

In [28]:
# concatenate the numerical and one hot encoded categorical data
# .toarray() gives a plain ndarray. The former .todense() returned np.matrix, which
# turned the concatenated result into np.matrix as well; current scikit-learn
# estimators refuse np.matrix input.
Xt = np.concatenate((X_pca_t, X_cat_t.toarray()), axis = 1)
print("concatenated Xt shape: ", Xt.shape)

concatenated Xt shape:  (1453, 279)


In [29]:


#rf = RandomForestRegressor(random_state=1, n_estimators = 300)



In [30]:
# try xgboost
# NOTE(review): `refit` does not look like an XGBRegressor parameter (it is a
# GridSearchCV option) — confirm it can be dropped. `silent` and `nthread` look
# like older spellings of `verbosity` and `n_jobs`; verify against the installed
# xgboost version before changing them.
xgb_model = xgb.XGBRegressor(colsample_bytree=0.8, subsample=0.5,
                             learning_rate=0.05, max_depth=5, 
                             min_child_weight=1.8, n_estimators=700,
                             reg_alpha=0.9, reg_lambda=0.9, gamma=0.01, 
                             silent=1, random_state =7, nthread = -1, refit = True)

# 5-fold cross-validated mean absolute error on the untransformed SalePrice target
crossValidateModel(xgb_model, Xt, Y, "xgb", crossVal = 5)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   20.7s remaining:   31.2s


model xgb cross_val_score took 21.606999397277832 seconds
Scores: [14272.44022498 16497.28641806 16828.97477717 13679.10806843
 14966.9826778 ]
Mean: 15248.958433287786
standard deviation: 1229.0079503730358


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   21.5s finished


In [32]:
# try lightgbm
# Hyperparameters are collected first so they can be read and tuned in one place.
lgb_params = dict(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin = 55,
    bagging_fraction = 0.8,
    bagging_freq = 5,
    feature_fraction = 0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf =6,
    min_sum_hessian_in_leaf = 11,
)
model_lgb = lgb.LGBMRegressor(**lgb_params)

# 5-fold cross-validated mean absolute error
crossValidateModel(model_lgb, Xt, Y, "lightgbm", crossVal = 5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.


model lightgbm cross_val_score took 3.2480008602142334 seconds
Scores: [15064.26461589 15890.98702418 15978.90011449 13390.03451107
 15923.23113996]
Mean: 15249.483481118408
standard deviation: 988.8748989468207


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.8s remaining:    4.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.8s finished


In [33]:
# Gradient boosting with the huber loss; hyperparameters gathered in one place.
gboost_params = dict(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state =5,
)
GBoost = GradientBoostingRegressor(**gboost_params)

# 5-fold cross-validated mean absolute error
crossValidateModel(GBoost, Xt, Y, "gboost", crossVal = 5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   17.2s remaining:   25.8s


model gboost cross_val_score took 18.118000984191895 seconds
Scores: [13951.60447371 16570.35474789 15780.72794483 12958.34744944
 15740.82824381]
Mean: 15000.37257193725
standard deviation: 1333.555718189555


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   17.7s finished


In [44]:
# Combine the three boosted regressors through the project-local AveragedModels module (imported as `av`).
# NOTE(review): presumably this averages the base models' predictions — confirm in AveragedModels.py
averaged_models = av.AveragingModels(models = (GBoost, model_lgb, xgb_model))

(GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.05, loss='huber', max_depth=4,
                          max_features='sqrt', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=15, min_samples_split=10,
                          min_weight_fraction_leaf=0.0, n_estimators=3000,
                          n_iter_no_change=None, presort='auto', random_state=5,
                          subsample=1.0, tol=0.0001, validation_fraction=0.1,
                          verbose=0, warm_start=False), LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
              boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              feature_fraction=0.2319, feature_fraction_seed=9,
              importance_type='split', learning_rate=0.05, max_bin=55,
              max_depth=-1, min_child_samples=20, min_child_weight=

In [37]:
# ElasticNet baseline, scored with crossValidateModel's default number of folds
elastic_net = ElasticNet(alpha = 0.2, l1_ratio = 0.9, max_iter = 5000)
crossValidateModel(model = elastic_net, X = Xt, y = Y, name = "elasticnet")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.


model elasticnet cross_val_score took 2.760103702545166 seconds
Scores: [16030.8790045  18247.91017923 16607.89928757 15009.87747313
 17090.26522252]
Mean: 16597.36623338838
standard deviation: 1077.8111170617822


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.6s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.7s finished


In [41]:
# Lasso baseline, scored with crossValidateModel's default number of folds
lasso_reg = Lasso(alpha = 0.1, max_iter = 5000)
crossValidateModel(model = lasso_reg, X = Xt, y = Y, name = "lasso")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    7.1s remaining:   10.7s


model lasso cross_val_score took 8.222692728042603 seconds
Scores: [17501.7688862  20610.35382631 20272.59015596 17575.72286984
 18581.49186997]
Mean: 18908.3855216559
standard deviation: 1312.9485866954244


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.8s finished


In [42]:
# Stacked ensemble from the project-local AveragedModels module: the same three
# base regressors, with the lasso model passed as the meta model.
# NOTE(review): no cell visible in this notebook fits or predicts with
# stacked_averaged_models — confirm whether it is still needed.
stacked_averaged_models = av.StackingAveragedModels(base_models = (GBoost, model_lgb, xgb_model),
                                                 meta_model = lasso_reg)

In [46]:
# Fit the averaged ensemble on the log1p-transformed target, then predict the test set.
# Predictions are mapped back to the price scale with expm1 below.
averaged_models.fit(Xt,np.log1p(Y.values))

#path to file you will use for predictions
test_data_path = '../data/test.csv'
test_data = pd.read_csv(test_data_path)
print(test_data.shape)
# apply the same column removals that were applied to the training features
X_test = test_data.drop(columns = columns_to_ignore)
X_test = X_test.drop(columns = missingData.keys())
print(X_test.shape)
# reuse the transformers fitted on the training data: transform only, no refit
X_pca_test = pca_pipeline.transform(X_test)
# categorical columns that are OHE
X_cat_test = cat_pipeline.transform(X_test[cat_features_names])

# .toarray() gives a plain ndarray; the former .todense() produced np.matrix,
# which current scikit-learn estimators refuse as input
X_final_test = np.concatenate((X_pca_test, X_cat_test.toarray()), axis = 1)
#make predictions which we will submit. 
y_pred = np.expm1(averaged_models.predict(X_final_test))
print(y_pred)

#The lines below show how to save predictions in the format used for competition scoring
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': y_pred})
output.to_csv('../data/submission.csv', index=False)

(1459, 80)
(1459, 64)
[GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.05, loss='huber', max_depth=4,
                          max_features='sqrt', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=15, min_samples_split=10,
                          min_weight_fraction_leaf=0.0, n_estimators=3000,
                          n_iter_no_change=None, presort='auto', random_state=5,
                          subsample=1.0, tol=0.0001, validation_fraction=0.1,
                          verbose=0, warm_start=False), LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
              boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              feature_fraction=0.2319, feature_fraction_seed=9,
              importance_type='split', learning_rate=0.05, max_bin=55,
              max_depth=-1, min_child_samples

In [None]:
# other models that are not as good.
elastic_net = ElasticNet(alpha = 0.2, l1_ratio = 0.9, max_iter = 5000)
ridge_reg = Ridge(alpha = 0.1, solver="auto", max_iter = 5000)
lasso_reg = Lasso(alpha = 0.1, max_iter = 5000)
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

# Cross-validate each candidate in turn with the default number of folds.
for model_label, candidate in (
    ("elasticnet", elastic_net),
    ("ridge", ridge_reg),
    ("lasso", lasso_reg),
    ("krr", KRR),
):
    crossValidateModel(candidate, Xt, Y, model_label)