In [1]:
%load_ext autoreload
%autoreload 2

In [89]:
# Here we call the different libraries required. Fastai's library is utilized because a lot of the functions 
# that need to be used are simplified. Finally, two algorithms that we will look at, RandomForestRegressor &
# GradientBoostingRegressor are also imported
%matplotlib inline
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from IPython.display import display
from sklearn import cross_validation, metrics
import graphviz



In [5]:
# The 
PATH = "data/housing/"

In [8]:
# Training and Testing data are initialized
train = pd.read_csv(f'{PATH}train.csv', low_memory = False)
test = pd.read_csv(f'{PATH}test.csv', low_memory = False)
print('train shape ', train.shape)
print('test shape ', test.shape)

train shape  (1460, 81)
test shape  (1459, 80)


In [9]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [10]:
# We need to combine both training and testing data in one dataframe for pre-processing. The SalePrice column
# needs to be dropped from the test data
df_combined = pd.concat([train.drop('SalePrice', axis = 1), test])
print(df_combined.shape)

(2919, 80)


In [25]:
# Function to find out the lotfrontage value with the highest occurence in each neighborhood
def mode(df, key_cols, value_col, count_col):
    return df.groupby(key_cols + [value_col]).size().to_frame(count_col).reset_index() \
             .sort_values(count_col, ascending=False).drop_duplicates(subset=key_cols)

In [33]:
# Create DataFrame suitable for pre-processing
def pre_proc_suitable(train, test):
    # create dummy SalePrice column for the test data for now
    test['SalePrice'] = 1
    df_combined = pd.concat([train, test], ignore_index = True)
    return df_combined

In [78]:
def impute(df, cols):
    for x in range(len(cols)):
        fills = mode(df, ['Neighborhood'], cols[x], 'count').reset_index(drop = True)
        for y in range(len(df)):
            Neigh = df.loc[y, 'Neighborhood']
            if pd.isnull(df.loc[y, cols[x]]):
                for z in range(len(fills)):
                    if (fills.loc[z, 'Neighborhood'] == Neigh):
                        df.loc[y, cols[x]] = fills.loc[z, cols[x]]
    return df

In [81]:
# Write function for pre-processing
def pre_proc(df):
    # Drop 'Id' column as it does not decide the SalePrice
    df.drop(['Id'], axis = 1, inplace = True)
    # Make new feature 'Age' of house
    df['Age'] = df['YrSold'] - df['YearBuilt']
    # Make new feature 'PeakSeason' when SalePrices are high
    df['PeakSeason'] = df.MoSold.replace({1:0, 2:0, 3:0, 4:0, 5:1, 6:1, 7:1, 8:0, 9:0, 10:0, 11:0, 12:0})
    # Convert MSSubClass from numerical datatype to string datatype
    df.MSSubClass = df.MSSubClass.astype('object')
    # Impute with values which occur must number of times in the neighborhood that the house is present
    cols_to_impute = ['LotFrontage', 'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'BsmtFinSF1',
                     'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual',
                     'Functional', 'GarageCars', 'GarageArea']
    df = impute(df, cols_to_impute)
    # Change any string datatypes to categorical values
    train_cats(df)
    # Convert dataframe to numerical only by stripping the target column & converting dummy columns 
    df_proc, y, nas = proc_df(df, 'SalePrice', max_n_cat = 15)
    return df_proc, y

In [82]:
# Combine the train and test data
df_combined = pre_proc_suitable(train, test)
# Perform pre-processing on the combined data
processed_df, combined_y = pre_proc(df_combined)

In [83]:
processed_df.shape

(2919, 294)

In [95]:
# Collect back the pre-processed data into training and testing data using shape information from before
df_train = processed_df.iloc[:1460, :]
df_test = processed_df.iloc[1460:,:]
y = combined_y[:1460]

In [98]:
# Log Transformation on SalePrice
y = np.log(y)

In [100]:
# Splitting Original Data into Validation and Training Set
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

n_valid = 500  # Validation Set Size
n_trn = len(df_train)-n_valid
X_train, X_valid = split_vals(df_train, n_trn)
y_train, y_valid = split_vals(y_train, n_trn)

X_train.shape, y_train.shape, X_valid.shape

((960, 294), (960,), (500, 294))

In [101]:
# Printing accuracy and RMSE
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

In [None]:
# A RandomForestRegressor with tuned parameters and cross validation performed on the training set
rf = RandomForestRegressor(n_estimators=6000, n_jobs=-1, oob_score=True,  max_features = 0.5,
                              min_samples_leaf = 2)
rf.fit(X_train, y_train)
print_score(rf)
cv_score_rf = cross_validation.cross_val_score(rf, df_train, y, cv=5); cv_score_rf

In [None]:
np.mean(cv_score_rf)

In [113]:
# A GradientBoostingRegressor with tuned parameters and cross validation performed on the training set
gbm = GradientBoostingRegressor(min_samples_split = 0.001, min_samples_leaf = 20, max_depth = 15, 
                                n_estimators = 4000, max_features = 0.5, subsample = 1.0, learning_rate = 0.015,
                               random_state = 500)
gbm.fit(X_train, y_train)
cv_score_gbm = cross_validation.cross_val_score(gbm, df_train, y_train, cv=5); cv_score_gbm

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.015, loss='ls', max_depth=15,
             max_features=0.5, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=20, min_samples_split=0.001,
             min_weight_fraction_leaf=0.0, n_estimators=4000,
             presort='auto', random_state=500, subsample=1.0, verbose=0,
             warm_start=False)

In [91]:
# Highest score
np.mean(cv_score_gbm)

0.8970969921267689

In [108]:
# Keeping only important features from Random Forest Model
rf_fi = rf_feat_importance(rf, df_train)
to_keep_rf = rf_fi[rf_fi.imp > 0.005].cols
df_train_rf_keep = df_train[to_keep_rf].copy()
df_train_rf_keep.shape

(1460, 23)

In [127]:
rf_new = RandomForestRegressor(n_estimators=6000, n_jobs=-1, oob_score=True,  max_features = 0.5,
                              min_samples_leaf = 2)
rf_new.fit(X_train[to_keep_rf], y_train)
cv_score_rf_new = cross_validation.cross_val_score(rf, df_train_rf_keep, y, cv=5); cv_score_rf_new

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=6000, n_jobs=-1,
           oob_score=True, random_state=None, verbose=0, warm_start=False)

In [110]:
# Slight Improvement in RandomForest Score after removing features to reduce overfitting
np.mean(cv_score_rf_new)

0.881380839568029

In [114]:
# Keeping only important features from Gradient Boosting Model
gbm_fi = rf_feat_importance(gbm, df_train)
to_keep_gbm = gbm_fi[gbm_fi.imp > 0.005].cols
df_train_gbm_keep = df_train[to_keep_gbm].copy()
df_train_gbm_keep.shape

(1460, 32)

In [None]:
gbm.fit(X_train[to_keep_gbm], y)

In [116]:
cv_score_gbm_new = cross_validation.cross_val_score(rf, df_train_gbm_keep, y, cv=5); cv_score_gbm_new

array([0.87621, 0.87661, 0.87647, 0.88759, 0.85322])

In [118]:
# We don't see an improvement here possibly because Gradient Boosting models sequentially work on those samples
# that didn't fit well in previous trees, hence, all features are utilized more optimally in boosting
np.mean(cv_score_gbm_new)

0.8740197936377999

In [131]:
# We will use the first GBM model to predict the SalePrice of the pre-processed test data since it achieved the 
# highest accuracy of all models
output = gbm.predict(df_test)
output_antilog = np.exp(output)
submission = pd.read_csv(f'{PATH}sample_submission.csv')
saleP = pd.DataFrame(data = output_antilog, columns = ['SalePrice'])
submission.SalePrice = saleP
submission.to_csv('final.csv')
# Remove First blank column before submission