In [28]:
import time

import lightgbm as lgb
import matplotlib.pyplot as plt  # Matlab-style plotting
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.ensemble import (AdaBoostRegressor, GradientBoostingRegressor)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import (train_test_split, GridSearchCV, cross_val_score)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (StandardScaler, RobustScaler, OneHotEncoder, FunctionTransformer, KBinsDiscretizer)

import AveragedModels as av
import ml_helper2 as mlh

In [41]:
# constants and helper methods

# Ordinal score shared by every quality/condition style column; all spellings
# of a missing value map to 0.
CONDITIONS_DICT = {"NA": 0, "NaN": 0, "nan": 0, "Po": 2, "Fa": 3, "TA": 4, "Gd":6, "Ex": 10}

# constants
# Hand-assigned numeric score for each level of the listed categorical columns.
# The quality/condition columns all reference the same CONDITIONS_DICT object,
# so mutating it in place affects every one of them.
CATEGORY_LABELS = {"KitchenQual":       CONDITIONS_DICT,
                    "GarageCond":       CONDITIONS_DICT,
                    "GarageQual":       CONDITIONS_DICT,
                    "ExterQual":        CONDITIONS_DICT,
                    "ExterCond":        CONDITIONS_DICT,
                    "BsmtQual":         CONDITIONS_DICT,
                    "BsmtCond":         CONDITIONS_DICT,
                    "FireplaceQu" :     CONDITIONS_DICT,
                    "HeatingQC" :       CONDITIONS_DICT,
                    "LotConfig":     {"Inside": 0, "Corner": 6, "CulDSac": 10, "FR2": 3, "FR3":4},
                    "Utilities":     {"ELO": 0, "NoSeWa": 1, "NoSewr": 2, "AllPub": 3},
                    "LandSlope":     {"Gtl": 10, "Mod": 4, "Sev": 1},
                    "LotShape":     {"Reg": 10, "IR1": 5, "IR2": 3, "IR3": 1},
                    "GarageType":     {"NA": 0, "nan": 0, "Basment": 4,  "Detchd": 1, "CarPort": 3, "BuiltIn": 5, "Attchd": 7, "2Types": 12},
                    "BldgType":     {"TwnhsI": 1, "Twnhs": 2, "TwnhsE": 3, "Duplex": 5,  "2fmCon": 7, "1Fam": 12},
                    "CentralAir":     {"N": 1, "Y": 10},
                    "Electrical":     {"Mix": 1, "FuseP": 3, "FuseF": 5,  "FuseA": 7, "SBrkr": 12},
                    "MSZoning":     {"RL": 100, "RM": 60, "C (all)": 20, "FV": 30, "RH": 30},
                    "LandContour":     {"Lvl": 100, "Low": 15, "Bnk": 25, "HLS": 5},
                    "Fence":     {"NA": 0, "MnPrv": 25, "MnWw": 15, "GdWo": 40, 'GdPrv': 100},
                    # "Min2" used to appear twice (50, then 10); a dict literal keeps
                    # only the last value, which silently ranked Min2 below Maj2.
                    # The stray duplicate is removed so the scale is monotonic again.
                    "Functional":     {"Typ": 100, "Min1": 70, "Min2": 50, "Mod": 40, "Maj1": 25, "Maj2": 20, "Sev": 5, "Sal": 1},
                    "MiscFeature":     {"NA": 0, "Shed": 30, "Gar2": 40, "Othr": 25, "TenC": 100},
                    "PavedDrive":     {"Y": 100, "P": 30, "N": 0},
                    }

# Columns that have a label mapping above but are excluded from CAT_COLS.
CAT_COLS_TO_IGNORE = ["Functional",
                        "MiscFeature",
                        "Electrical",
                        "Fence",
                        "FireplaceQu",
                        "HeatingQC"
                    ]

# Mapped columns that remain after the exclusions, in CATEGORY_LABELS order.
CAT_COLS = [ x for x in CATEGORY_LABELS.keys() if x not in CAT_COLS_TO_IGNORE]
print(CAT_COLS)

# plot correlations
def plotCoorelations(df):
    # remove non_numeric features  
    corr = df.corr()
    corr.style.background_gradient(cmap='coolwarm').set_precision(2)

# define a method to use Isolation Forest for outlier detection
def outlierRemoval_IsolationForest(X, y, outlierFraction = 0.02, random_state = None):
    """Drop the rows that an Isolation Forest flags as outliers.

    Parameters
    ----------
    X : array-like
        Feature matrix; rows are samples.
    y : array-like or pandas.Series
        Targets aligned with the rows of ``X``.
    outlierFraction : float, default 0.02
        Passed to the forest as ``contamination``.
    random_state : int or None, default None
        Seed forwarded to the forest; pass an int for repeatable results.

    Returns
    -------
    tuple
        Whatever ``dropOutliers`` returns for the flagged rows.
    """
    # The former ``behaviour='new'`` argument is omitted: current scikit-learn
    # rejects it, and fit_predict labels outliers with -1 either way.
    clf = IsolationForest(contamination = outlierFraction, random_state = random_state)
    preds = clf.fit_predict(X)
    outliers = np.where(preds == -1)
    return dropOutliers(X, y, outliers)

def dropOutliers(X, y, outliers):
    """Remove the given outlier rows from ``X`` and ``y``.

    Parameters
    ----------
    X : array-like
        Feature matrix; rows are samples.
    y : array-like or pandas.Series
        Targets aligned with the rows of ``X``.
    outliers : tuple of arrays
        Result of ``np.where``; only the first element (row indices) is used.

    Returns
    -------
    (X_clean, y_clean, rows)
        The inputs without the outlier rows, and the sorted, de-duplicated row
        indices that were removed.
    """
    # A 2-D ``np.where`` result lists a row once per offending column, so the
    # indices are collapsed first; otherwise the count below is inflated.
    rows = np.unique(np.asarray(outliers[0], dtype=np.intp))
    print("number of outliers = {0}".format(len(rows)))
    # drop outliers
    X_clean = np.delete(X, rows, axis = 0)
    # np.asarray accepts both a pandas Series and a plain ndarray target.
    y_clean = np.delete(np.asarray(y), rows)
    return X_clean, y_clean, rows

# define a method to use per-column z-scores for outlier detection
def outlierRemoval_ZScore(X, y, zValue = 3, bypass=False):
    """Drop rows that contain a value whose absolute z-score exceeds ``zValue``.

    With ``bypass=True`` nothing is computed: ``X`` and ``y`` are handed back
    untouched together with an empty list of outlier indices. Otherwise the
    flagged positions are passed on to ``dropOutliers`` and its result is
    returned.
    """
    if not bypass:
        abs_z = np.abs(stats.zscore(X))
        return dropOutliers(X, y, np.where(abs_z > zValue))
    return X, y, []

def getTransformedColumnNames(ct):
    """Print the output feature names of every step that can report them.

    Parameters
    ----------
    ct : fitted ColumnTransformer-like object
        Must expose ``named_transformers_`` (a mapping of name -> transformer).

    Each transformer is inspected step by step when it has ``named_steps``
    (a pipeline); otherwise the transformer itself is inspected. Steps that
    offer neither ``get_feature_names_out`` nor the older ``get_feature_names``
    are skipped instead of raising. That covers imputers and scalers on older
    scikit-learn, and the 'drop' / 'passthrough' remainder entries.
    """
    for name, transformer in ct.named_transformers_.items():
        # Bare transformers have no named_steps; treat them as one-step pipelines.
        steps = getattr(transformer, "named_steps", {name: transformer})
        for step in steps.values():
            # Prefer the current API and fall back to the legacy spelling.
            get_names = (getattr(step, "get_feature_names_out", None)
                         or getattr(step, "get_feature_names", None))
            if get_names is None:
                continue
            print(get_names())
            
  
def displayScoresExp1p(scores):
    """Print the scores, their mean and their standard deviation, each passed
    through ``np.expm1`` first (the mean and deviation are computed before the
    transform is applied)."""
    report = (
        ("Scores:", lambda s: s),
        ("Mean:", lambda s: s.mean()),
        ("standard deviation:", lambda s: s.std()),
    )
    for label, pick in report:
        print(label, np.expm1(pick(scores)))
    
def displayScores(scores):
    """Print the scores followed by their mean and standard deviation."""
    report = (
        ("Scores:", lambda s: s),
        ("Mean:", lambda s: s.mean()),
        ("standard deviation:", lambda s: s.std()),
    )
    for label, pick in report:
        print(label, pick(scores))
    
def columnsWithMissingData(X, threshold = 0.9):
    """Report columns that are mostly null or mostly zero.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to inspect.
    threshold : float, default 0.9
        A column is reported when its fraction of null values, or its fraction
        of values equal to 0, is strictly greater than this.

    Returns
    -------
    pandas.Series
        Offending fractions indexed by column name: the null-based entries
        first, then the zero-based entries.
    """
    n_rows = len(X.index)

    def _fractions_above_threshold(mask):
        # Only columns with at least one flagged cell are considered.
        flagged_cols = X.columns[mask.any()]
        fractions = mask[flagged_cols].sum() / n_rows
        return fractions.loc[fractions > threshold]

    mostly_null = _fractions_above_threshold(X.isnull())
    # share of zero values per column
    mostly_zero = _fractions_above_threshold(X == 0)
    return pd.concat([mostly_null, mostly_zero])

def crossValidateModel(model, X, y, name="<unknown>", nthreads = -1, cv = 5):
    """Cross-validate ``model`` on mean absolute error and print a summary.

    Parameters
    ----------
    model : estimator
        Anything ``cross_val_score`` accepts.
    X, y : array-like
        Features and targets.
    name : str, default "<unknown>"
        Label used in the timing message.
    nthreads : int, default -1
        Forwarded to ``cross_val_score`` as ``n_jobs``.
    cv : int or splitter, default 5
        Forwarded to ``cross_val_score`` as ``cv`` (previously fixed at 5).

    Prints the elapsed time and then the per-fold errors, their mean and
    their standard deviation.
    """
    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    scores = cross_val_score(model, X, y, scoring = "neg_mean_absolute_error", n_jobs = nthreads, verbose = 10, cv = cv)
    elapsed_time = time.perf_counter() - start
    print("model {0} cross_val_score took {1} seconds".format(name, elapsed_time))
    # The scorer returns negated errors; flip the sign back for display.
    displayScores(-scores)


['KitchenQual', 'GarageCond', 'GarageQual', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'LotConfig', 'Utilities', 'LandSlope', 'LotShape', 'GarageType', 'BldgType', 'CentralAir', 'MSZoning', 'LandContour', 'PavedDrive']


In [30]:
# load housing data
# The path is relative to the notebook's working directory.
iowa_file_path = '../data/train.csv'
home_data = pd.read_csv(iowa_file_path)



In [31]:
# Split the raw frame: Y is the SalePrice column, X is everything else
# except the "Id" column.
Y = home_data["SalePrice"]
X = home_data.drop(columns = ["Id", "SalePrice"])

In [32]:
# find sparse columns and remove those where more than 90% of the values carry
# no information
# NOTE: columnsWithMissingData reports columns that are > threshold null *or*
# > threshold equal to 0, so zero-dominated numeric columns are dropped too.
# NOTE(review): X is reassigned here, so re-running this cell recomputes
# missingData on the already reduced frame; later cells that read missingData
# would then see a different column list.
missingData = columnsWithMissingData(X, threshold = 0.9)
print("following columns have greater than 90% data missing or NULL \n",missingData)
print("original df shape = ", X.shape)
X = X.drop(columns = missingData.keys())
print("final df shape = ", X.shape)
   

following columns have greater than 90% data missing or NULL 
 Alley           0.937671
PoolQC          0.995205
MiscFeature     0.963014
LowQualFinSF    0.982192
BsmtHalfBath    0.943836
3SsnPorch       0.983562
ScreenPorch     0.920548
PoolArea        0.995205
MiscVal         0.964384
dtype: float64
original df shape =  (1460, 79)
final df shape =  (1460, 70)


In [33]:
# perform feature scaling for numerical features
numeric_features = X.select_dtypes(exclude=object)
num_features_names = numeric_features.columns

# features that need a log transformation
log_features_names = ["LotFrontage", "LotArea", "1stFlrSF", "GrLivArea", "OpenPorchSF"]

# every remaining non-object feature gets the plain impute-and-scale treatment
numeric_features_names = [
    name for name in num_features_names
    if name not in log_features_names
]

# log features: median-impute, apply log1p, then scale with RobustScaler
log_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('logscaler', FunctionTransformer(np.log1p, validate=False)),
    ('scaler', RobustScaler()),
])

# other numeric features: mean-impute, then scale with RobustScaler
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler()),
])

# output column order follows this list: log features first, then the rest
transformers = [
    ('log', log_pipeline, log_features_names),
    ('num', numeric_pipeline, numeric_features_names),
]

# ensure that result is always a dense matrix
ct = ColumnTransformer(transformers=transformers, sparse_threshold = 0)
Xt = ct.fit_transform(X)

In [34]:
# remove outliers from numeric data
# With bypass=True the helper returns Xt and Y unchanged plus an empty outlier
# list, so no rows are dropped here; the z-score cutoff of 8 only takes effect
# once bypass is switched off.
Xt, Y, outliers = outlierRemoval_ZScore(Xt, Y, 8, bypass=True)

In [35]:
# PCA is currently disabled (the construction below is commented out): the
# numeric matrix is passed through unchanged under the name X_pca_t, so both
# printed shapes are identical.
#pca = PCA(0.99)
X_pca_t = Xt
print("Xt shape: ", Xt.shape)
print("X_pca_t shape: ", X_pca_t.shape)

Xt shape:  (1460, 30)
X_pca_t shape:  (1460, 30)


In [36]:
# every object-typed column is treated as categorical
categorical_features_names = X.select_dtypes(include=object).columns
#categorical_features_names = CAT_COLS

# fill gaps with the literal category 'missing', then one-hot encode;
# categories unseen during fit are ignored at transform time
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

X_cat_t = cat_pipeline.fit_transform(X[categorical_features_names])

# remove corresponding outlier rows from the categorical data
# .toarray() yields a plain ndarray; .todense() returned np.matrix, which then
# propagated into the concatenated training matrix and is rejected by current
# scikit-learn estimators.
X_cat_t2 = np.delete(X_cat_t.toarray(), outliers, axis=0)


In [37]:
# concatenate the numerical and one hot encoded categorical data
# (column-wise, numeric block first; Xt is rebound to the combined matrix)
Xt = np.concatenate((X_pca_t, X_cat_t2 ), axis = 1)
print("concatenated Xt shape: ", Xt.shape)

concatenated Xt shape:  (1460, 286)


In [38]:
# try xgboost
# Dropped from the original call: `refit` (a GridSearchCV option, not an
# XGBRegressor parameter) and `silent` (obsolete verbosity flag).
# `nthread` is spelled `n_jobs`, its current name.
xgb_model = xgb.XGBRegressor(colsample_bytree=0.8, subsample=0.5,
                             learning_rate=0.05, max_depth=5,
                             min_child_weight=1.8, n_estimators=500,
                             reg_alpha=0.9, reg_lambda=0.9, gamma=0.001,
                             random_state=7, n_jobs=-1)

crossValidateModel(xgb_model, Xt, Y, "xgb")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   18.1s remaining:   27.2s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   18.2s remaining:   12.1s


model xgb cross_val_score took 28.092942714691162 seconds
Scores: [14448.54759739 15613.00917701 15384.99277611 12693.11966235
 15511.87829088]
Mean: 14730.309500749145
standard deviation: 1099.831886343229


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   27.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   27.7s finished


In [45]:
# LightGBM hyper-parameters gathered in one mapping so they are easy to scan
lgb_params = dict(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)
model_lgb = lgb.LGBMRegressor(**lgb_params)

crossValidateModel(model_lgb, Xt, Y, "lightgbm", 1) # 1 job

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ............................... , score=-14622.205, total=   0.3s
[CV]  ................................................................
[CV] ............................... , score=-16395.725, total=   0.2s

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s



[CV]  ................................................................
[CV] ............................... , score=-16166.830, total=   0.3s
[CV]  ................................................................
[CV] ............................... , score=-13678.097, total=   0.2s

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.7s remaining:    0.0s



[CV]  ................................................................
[CV] ............................... , score=-16310.203, total=   0.3s
model lightgbm cross_val_score took 1.2451128959655762 seconds
Scores: [14622.20450507 16395.72494209 16166.82998958 13678.09667234
 16310.20252795]
Mean: 15434.611727405258
standard deviation: 1092.8762124313746


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.1s finished


In [20]:
# use the grid search results to create the final predictor for the test
# NOTE(review): no grid search is visible in this notebook; alpha and l1_ratio
# are presumably carried over from an earlier experiment. Confirm.
elastic_net = ElasticNet(alpha = 0.2, l1_ratio = 0.9)
elastic_net.fit(Xt, Y)

# path to the file used for predictions
test_data_path = '../data/test.csv'
test_data = pd.read_csv(test_data_path)

# Drop the id and the same sparse columns that were removed from the training
# frame. The second drop chains on X_test; it used to restart from test_data,
# which discarded the "Id" drop.
X_test = test_data.drop(columns = ["Id"])
X_test = X_test.drop(columns = missingData.keys())
# numerical columns: reuse the imputers/scalers fitted on the training data.
# Refitting on the test set (fit_transform) would scale the test features
# differently from the features the model was trained on.
X_num_test = ct.transform(X_test)
# PCA is disabled, so the numeric block is passed through unchanged
X_pca_test = X_num_test
# categorical columns that are OHE
X_cat_test = cat_pipeline.transform(X_test[categorical_features_names])

# .toarray() keeps the result a plain ndarray (todense() returns np.matrix)
X_final_test = np.concatenate((X_pca_test, X_cat_test.toarray()), axis = 1)
# make predictions which we will submit.
y_pred = elastic_net.predict(X_final_test)

# The lines below show how to save predictions in the format used for competition scoring
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': y_pred})
output.to_csv('../data/submission.csv', index=False)

In [None]:
# Gradient boosting fitted on log(SalePrice); predictions are mapped back with
# exp before they are written out.
# The loss is left at its default (squared error): the explicit 'ls' alias is
# no longer accepted by newer scikit-learn, and the default is the same loss.
# GradientBoostingRegressor is imported in the notebook's import cell.
est = GradientBoostingRegressor(n_estimators=600, max_depth=5, min_samples_split=2,
                                learning_rate=0.1, verbose = 5).fit(Xt, np.log(Y))
y_pred = est.predict(X_final_test)

# The lines below show how to save predictions in the format used for competition scoring
# NOTE: this overwrites the submission file written by the previous model.
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': np.exp(y_pred)})
output.to_csv('../data/submission.csv', index=False)

In [22]:
# AdaBoost over the elastic-net regressor, fitted on log(SalePrice).
# The base estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn, while the first
# positional slot works with both spellings.
# AdaBoostRegressor is imported in the notebook's import cell.
adaboost_model = AdaBoostRegressor(elastic_net, random_state=5, n_estimators=1000)
adaboost_model.fit(Xt, np.log(Y))
y_pred = adaboost_model.predict(X_final_test)

# The lines below show how to save predictions in the format used for competition scoring
# NOTE: this overwrites the submission file written by the previous model.
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': np.exp(y_pred)})
output.to_csv('../data/submission.csv', index=False)