In [19]:
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import (train_test_split, GridSearchCV, cross_val_score)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (StandardScaler, OneHotEncoder, FunctionTransformer, KBinsDiscretizer, RobustScaler)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from scipy import stats
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
import AveragedModels as av
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb
import ml_helper as mlh
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
import lightgbm as lgb

In [2]:
# constants and helper methods

# Ordinal score shared by every quality/condition column below; the different
# spellings of a missing value all map to 0.
CONDITIONS_DICT = {"NA": 0, "NaN": 0, "nan": 0, "Po": 2, "Fa": 3, "TA": 4, "Gd":6, "Ex": 10}

# constants
# Hand-assigned numeric score for each category label, keyed by column name.
CATEGORY_LABELS = {"KitchenQual":       CONDITIONS_DICT,
                    "GarageCond":       CONDITIONS_DICT,
                    "GarageQual":       CONDITIONS_DICT,
                    "ExterQual":        CONDITIONS_DICT,
                    "ExterCond":        CONDITIONS_DICT,
                    "BsmtQual":         CONDITIONS_DICT,
                    "BsmtCond":         CONDITIONS_DICT,
                    "FireplaceQu" :     CONDITIONS_DICT,
                    "HeatingQC" :       CONDITIONS_DICT,
                    "LotConfig":     {"Inside": 0, "Corner": 6, "CulDSac": 10, "FR2": 3, "FR3":4},
                    "Utilities":     {"ELO": 0, "NoSeWa": 1, "NoSewr": 2, "AllPub": 3},
                    "LandSlope":     {"Gtl": 10, "Mod": 4, "Sev": 1},
                    "LotShape":     {"Reg": 10, "IR1": 5, "IR2": 3, "IR3": 1},
                    "GarageType":     {"NA": 0, "nan": 0, "Basment": 4,  "Detchd": 1, "CarPort": 3, "BuiltIn": 5, "Attchd": 7, "2Types": 12},
                    "BldgType":     {"TwnhsI": 1, "Twnhs": 2, "TwnhsE": 3, "Duplex": 5,  "2fmCon": 7, "1Fam": 12},
                    "CentralAir":     {"N": 1, "Y": 10},
                    "Electrical":     {"Mix": 1, "FuseP": 3, "FuseF": 5,  "FuseA": 7, "SBrkr": 12},
                    "MSZoning":     {"RL": 100, "RM": 60, "C (all)": 20, "FV": 30, "RH": 30},
                    "LandContour":     {"Lvl": 100, "Low": 15, "Bnk": 25, "HLS": 5},
                    "Fence":     {"NA": 0, "MnPrv": 25, "MnWw": 15, "GdWo": 40, 'GdPrv': 100},
                    # "Min2" used to appear twice in this literal (50, then 10); Python keeps
                    # the last duplicate key, which silently scored Min2 below Mod/Maj1/Maj2.
                    # A single entry keeps the scores in rank order.
                    "Functional":     {"Typ": 100, "Min1": 70, "Min2": 50, "Mod": 40, "Maj1": 25, "Maj2": 20, "Sev": 5, "Sal": 1},
                    "MiscFeature":     {"NA": 0, "Shed": 30, "Gar2": 40, "Othr": 25, "TenC": 100},
                    "PavedDrive":     {"Y": 100, "P": 30, "N": 0},
                    }

# Columns that have an entry in CATEGORY_LABELS but are excluded from CAT_COLS
CAT_COLS_TO_IGNORE = [
    "Functional",
    "MiscFeature",
    "Electrical",
    "Fence",
    "FireplaceQu",
    "HeatingQC",
]

# Every CATEGORY_LABELS column that is not explicitly ignored, in dictionary order
CAT_COLS = [column for column in CATEGORY_LABELS if column not in CAT_COLS_TO_IGNORE]
print(CAT_COLS)

# plot correlations
def plotCoorelations(df):
    """Build a colour-graded correlation table for the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric and boolean columns are correlated pairwise;
        all other columns are left out.

    Returns
    -------
    pandas Styler
        The styled correlation matrix ('coolwarm' gradient, two decimals).
        Put the call on the last line of a cell (or pass the result to
        ``display``) to render it.
    """
    # remove non_numeric features: recent pandas raises on string columns in
    # corr() instead of dropping them silently
    corr = df.select_dtypes(include=["number", "bool"]).corr()
    # Styler.set_precision no longer exists in pandas 2.x; a plain format string
    # is accepted by every Styler version. The Styler has to be returned,
    # otherwise it is discarded and nothing is ever shown.
    return corr.style.background_gradient(cmap='coolwarm').format("{:.2f}")

# define a method to use Isolation Forest for outlier detection
def outlierRemoval_IsolationForest(X, y, outlierFraction = 0.02, random_state = None):
    """Drop the rows that an Isolation Forest labels as outliers.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix the forest is fitted on.
    y : array-like
        Target values aligned with the rows of ``X``.
    outlierFraction : float, default 0.02
        Passed to the forest as ``contamination``.
    random_state : int or None, default None
        Seed forwarded to the forest; set it to get the same rows on every run.

    Returns
    -------
    tuple
        Whatever ``dropOutliers`` returns for the flagged rows.
    """
    # The `behaviour` keyword was removed from IsolationForest in scikit-learn
    # 0.24, so passing behaviour='new' raises TypeError on current versions.
    clf = IsolationForest(contamination = outlierFraction, random_state = random_state)
    preds = clf.fit_predict(X)
    # rows predicted as -1 are the ones handed to dropOutliers
    outliers = np.where(preds == -1)
    return dropOutliers(X, y, outliers)

def dropOutliers(X, y, outliers):
    """Remove the flagged rows from ``X`` and ``y``.

    Parameters
    ----------
    X : array-like
        Feature matrix; rows are removed along axis 0.
    y : array-like
        Targets aligned with ``X`` (pandas Series, ndarray or list).
    outliers : tuple
        Result of ``np.where``; only element 0 (the row indices) is used.

    Returns
    -------
    (X_clean, y_clean, rows)
        The filtered arrays and the sorted, de-duplicated row indices that
        were removed.
    """
    # np.where on a 2-D mask repeats a row index once per offending column;
    # collapse the repeats so the count and the returned indices refer to rows.
    rows = np.unique(np.asarray(outliers[0], dtype=np.intp))
    print("number of outliers = {0}".format(len(rows)))
    # drop outliers
    X_clean = np.delete(X, rows, axis = 0)
    # np.asarray accepts a Series as well as a plain array, so the output of a
    # previous clean-up pass can be fed back in
    y_clean = np.delete(np.asarray(y), rows)
    return X_clean, y_clean, rows

# define a method that uses per-column z-scores for outlier detection
def outlierRemoval_ZScore(X, y, zValue = 3, bypass=False):
    """Drop the rows holding any value whose absolute z-score exceeds ``zValue``.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``scipy.stats.zscore``.
    y : array-like
        Targets aligned with the rows of ``X``.
    zValue : float, default 3
        Threshold on the absolute z-score.
    bypass : bool, default False
        When true, nothing is computed and ``(X, y, [])`` is returned as is.

    Returns
    -------
    tuple
        ``(X, y, [])`` when bypassed, otherwise the result of ``dropOutliers``.
    """
    if not bypass:
        abs_scores = np.abs(stats.zscore(X))
        flagged = np.where(abs_scores > zValue)
        return dropOutliers(X, y, flagged)
    return X, y, []

def getTransformedColumnNames(ct):
    """Print the output feature names of every step that can report them.

    Parameters
    ----------
    ct : fitted ColumnTransformer-like object
        Only ``ct.named_transformers_`` (a mapping of name -> transformer) is read.

    Notes
    -----
    Steps that expose neither ``get_feature_names_out`` nor
    ``get_feature_names`` are skipped instead of raising AttributeError.
    Entries that are not pipelines (a bare transformer, or a string such as
    'drop') are treated as a single step.
    """
    for name, transformer in ct.named_transformers_.items():
        steps = getattr(transformer, "named_steps", None)
        if steps is None:
            # not a Pipeline: inspect the object itself
            steps = {name: transformer}
        for step in steps.values():
            # prefer the current scikit-learn API and fall back to the legacy name
            getter = getattr(step, "get_feature_names_out", None)
            if getter is None:
                getter = getattr(step, "get_feature_names", None)
            if getter is None:
                continue
            print(getter())
            
  
def displayScoresExp1p(scores):
    """Print the scores, their mean and their standard deviation, each passed through ``np.expm1``.

    Parameters
    ----------
    scores : numpy.ndarray
        Score array; its ``mean()`` and ``std()`` methods are used.
    """
    report = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("standard deviation:", scores.std()),
    )
    for label, value in report:
        print(label, np.expm1(value))
    
def displayScores(scores):
    """Print the scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores : numpy.ndarray
        Score array; its ``mean()`` and ``std()`` methods are used.
    """
    report = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("standard deviation:", scores.std()),
    )
    for label, value in report:
        print(label, value)
    
def columnsWithMissingData(X, threshold = 0.9):
    """List the columns that are mostly null or mostly zero.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to inspect.
    threshold : float, default 0.9
        A column is reported when its share of nulls (or of zeros) is
        strictly greater than this value.

    Returns
    -------
    pandas.Series
        Share per reported column, indexed by column name: the null-heavy
        columns first, then the zero-heavy ones.
    """
    row_count = len(X.index)

    def _share_above_threshold(mask):
        # restrict to columns with at least one hit, then convert counts to shares
        hit_columns = X.columns[mask.any()]
        share = mask[hit_columns].sum() / row_count
        return share.loc[share > threshold]

    null_share = _share_above_threshold(X.isnull())
    zero_share = _share_above_threshold(X == 0)
    return pd.concat([null_share, zero_share])

def crossValidateModel(model, X, y, name="<unknown>", cv = 5):
    """Cross-validate ``model`` on mean absolute error and print timing and scores.

    Parameters
    ----------
    model : estimator
        Handed to ``cross_val_score`` unchanged.
    X, y : array-like
        Features and targets.
    name : str, default "<unknown>"
        Label used in the timing message.
    cv : int or splitter, default 5
        Forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    None
        The timing line and the score summary are printed.
    """
    start = time.time()
    # cv has to be passed by keyword: a positional argument after keyword
    # arguments is a SyntaxError, which broke the whole helper cell
    scores = cross_val_score(model, X, y, scoring = "neg_mean_absolute_error", n_jobs = -1, verbose = 4, cv = cv)
    end = time.time()
    elapsed_time = end - start
    print("model {0} cross_val_score took {1} seconds".format(name, elapsed_time))
    # the scorer yields negated errors; flip the sign before reporting
    displayScores(-scores)


['KitchenQual', 'GarageCond', 'GarageQual', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'LotConfig', 'Utilities', 'LandSlope', 'LotShape', 'GarageType', 'BldgType', 'CentralAir', 'MSZoning', 'LandContour', 'PavedDrive']


In [3]:
# Location of the housing training set, relative to the notebook
iowa_file_path = '../data/train.csv'
# Parse the CSV into a DataFrame
home_data = pd.read_csv(filepath_or_buffer=iowa_file_path)

In [None]:
# Target: the sale price column
Y = home_data["SalePrice"]
# Columns left out of the feature matrix
columns_to_ignore = ["Id", "1stFlrSF", "2ndFlrSF", "GarageYrBlt", "GarageArea", "TotalBsmtSF", "TotRmsAbvGrd"]
# Features: everything except the ignored columns and the target itself
X = home_data.drop(columns = columns_to_ignore + ["SalePrice"])

In [6]:
# perform feature scaling for numerical features
numeric_features = X.select_dtypes(exclude=object)
num_features_names = numeric_features.columns

# features that need a log transformation, limited to the numeric columns that
# are actually present in X: "1stFlrSF" is removed from X through
# columns_to_ignore, and ColumnTransformer raises when asked for a column the
# frame does not contain
log_features_names = [name for name in ("LotFrontage", "LotArea", "1stFlrSF", "GrLivArea", "OpenPorchSF")
                      if name in num_features_names]

# median-impute, apply log(1 + x), then standardise
log_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('logscaler', FunctionTransformer(np.log1p, validate=False)),
    ('scaler', StandardScaler())])

#numeric features that require a normal transformation
numeric_features_names = [x for x in num_features_names if x not in log_features_names]

# mean-impute, then standardise
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

transformers=[
        ('log', log_pipeline, log_features_names),
        ('num', numeric_pipeline, numeric_features_names),
    ]

# ensure that result is always a dense matrix
ct = ColumnTransformer(transformers=transformers, sparse_threshold = 0)
Xt = ct.fit_transform(X)

In [8]:
# Fit PCA on the scaled numeric matrix, keeping as many components as are needed
# to explain 99% of the variance, and project the data onto them
pca = PCA(n_components=0.99)
X_pca_t = pca.fit_transform(Xt)
print("Xt shape: ", Xt.shape)
print("X_pca_t shape: ", X_pca_t.shape)

Xt shape:  (1460, 30)
X_pca_t shape:  (1460, 26)


In [9]:
categorical_features_names = X.select_dtypes(include=object).columns
#categorical_features_names = CAT_COLS

# fill missing categories with the literal 'missing', then one-hot encode;
# categories unseen at fit time are ignored at transform time
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

X_cat_t = cat_pipeline.fit_transform(X[categorical_features_names])

# remove corresponding outlier rows from the categorical data
# No cell before this one produces `outliers`, so on a fresh kernel this line
# raised NameError. X_pca_t and Y keep every row, so no rows are dropped here.
# NOTE(review): if an outlier-removal step is added, apply the same row indices
# to X_pca_t, Y and this matrix.
outliers = np.array([], dtype=int)
# .toarray() yields a plain ndarray; .todense() returns np.matrix, which then
# leaks into the concatenated feature matrix
X_cat_t2 = np.delete(X_cat_t.toarray(), outliers, axis=0)


In [10]:
# Join the PCA-reduced numeric block and the one-hot encoded categorical block
# column-wise into the final feature matrix
Xt = np.concatenate([X_pca_t, X_cat_t2], axis=1)
print("concatenated Xt shape: ", Xt.shape)

concatenated Xt shape:  (1460, 282)


In [14]:
from sklearn.linear_model import Ridge

# Ridge regression, scored with the shared cross-validation helper
ridge_reg = Ridge(alpha=0.1, solver="auto")
crossValidateModel(model=ridge_reg, X=Xt, y=Y, name="Ridge regression")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


model ridge cross_val_score took 2.8813681602478027 seconds
Scores: [16905.59386017 17561.30405217 18038.42793387 20740.01297549
 21017.02874298 18901.88766255 16570.16643303 17096.61020592
 24536.2374048  15907.62357156]
Mean: 18727.489284254734
standard deviation: 2519.5983733462613


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    2.3s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.4s finished


In [15]:
from sklearn.linear_model import Lasso

# Lasso regression, scored with the shared cross-validation helper
lasso_reg = Lasso(alpha=0.1)
crossValidateModel(model=lasso_reg, X=Xt, y=Y, name="Lasso regression")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    4.9s remaining:    3.2s


model Lasso regression cross_val_score took 6.298163414001465 seconds
Scores: [16670.53151506 17320.29554    18121.53714252 20788.2428677
 21242.19237775 19429.3214528  16562.02687746 16971.54945768
 24720.73981431 15872.88643936]
Mean: 18769.932348464492
standard deviation: 2635.598606767343


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.9s finished


In [16]:
from sklearn.linear_model import ElasticNet

# Elastic net with a 0.9 L1 ratio, scored with the shared cross-validation helper
elastic_net = ElasticNet(alpha=0.2, l1_ratio=0.9)
crossValidateModel(model=elastic_net, X=Xt, y=Y, name="Elastic Net")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


model Elastic Net cross_val_score took 4.349373817443848 seconds
Scores: [16044.72852165 16401.7084421  16411.76633519 19492.74849727
 19437.52476923 15332.74858197 15857.43083867 16114.53391832
 22687.22429867 17004.62803543]
Mean: 17478.50422384968
standard deviation: 2206.7160619819147


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.7s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.9s finished


In [18]:
# try xgboost
# - `refit` is not an XGBRegressor parameter and has been dropped
# - `silent=1` is replaced by `verbosity=0` and `nthread` by `n_jobs`, the
#   current names in the scikit-learn wrapper
xgb_model = xgb.XGBRegressor(colsample_bytree=0.8, subsample=0.5,
                             learning_rate=0.05, max_depth=5,
                             min_child_weight=1.8, n_estimators=700,
                             reg_alpha=0.9, reg_lambda=0.9, gamma=0.01,
                             verbosity=0, random_state=7, n_jobs=-1)

crossValidateModel(xgb_model, Xt, Y, "xgb")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  1.0min remaining:   41.8s


model xgb cross_val_score took 92.5908932685852 seconds
Scores: [13354.99842145 15525.23828125 12745.27723673 18759.93637628
 17686.33489405 13101.69889769 13227.56785103 13957.05934289
 19565.36344178 15527.70077055]
Mean: 15345.117551369864
standard deviation: 2391.274453708356


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.5min finished


In [None]:
# GradientBoostingRegressor is already imported in the notebook's import cell.
# Fit on the log of the sale price. The loss is left at its default (least
# squares): the explicit 'ls' spelling is rejected by recent scikit-learn.
est = GradientBoostingRegressor(n_estimators=600, max_depth=5, min_samples_split=2,
                                learning_rate=0.1, verbose=5).fit(Xt, np.log(Y))
# NOTE(review): X_final_test and test_data are not defined in any cell shown in
# this notebook. The test CSV presumably has to be loaded and passed through
# the same fitted transformers first - confirm.
y_pred = est.predict(X_final_test)

#The lines below shows how to save predictions in format used for competition scoring
# predictions are in log space, so np.exp maps them back to prices
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': np.exp(y_pred)})
output.to_csv('../data/submission.csv', index=False)

In [22]:
from sklearn.ensemble import AdaBoostRegressor

# The base learner is passed positionally: its keyword was `base_estimator` in
# older scikit-learn and is `estimator` in current releases, while the first
# positional slot is the same in both.
adaboost_model = AdaBoostRegressor(elastic_net, random_state=5, n_estimators=1000)
# fit on the log of the sale price
adaboost_model.fit(Xt, np.log(Y))
# NOTE(review): X_final_test and test_data are not defined in any cell shown in
# this notebook - confirm where they are prepared.
y_pred = adaboost_model.predict(X_final_test)

#The lines below shows how to save predictions in format used for competition scoring
# predictions are in log space, so np.exp maps them back to prices
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': np.exp(y_pred)})
output.to_csv('../data/submission.csv', index=False)