In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, RidgeCV, LassoCV, Lasso, ElasticNet
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.model_selection import KFold
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline




# Read data and combine the training and test data

In [2]:
data=pd.read_csv("input/train.csv", index_col=0)
real_test=pd.read_csv("input/test.csv", index_col=0)

#combine the training and test data
X=data.iloc[:, 0:-1]
y=data.iloc[:, -1]
y=y.apply(np.log)

frames=[X, real_test]
X=pd.concat(frames)
print('The shape of data is:', X.shape)

The shape of data is: (2919, 79)


# Deal with NaN

In [3]:
def exploration(col):
    return X[col].value_counts()

def most_frequent(col):
    series=exploration(col)
    types=series.index.tolist()
    counts=series.tolist()
    
    id_max=counts.index(max(counts))
    return types[id_max]

def imputation(col, value):
    X.loc[X[col].isnull(), col]=value

#return the feature names with missing values
def show_missing():
    missing=X.columns[X.isnull().any()].tolist()
    return missing

In [4]:
Nan=X[show_missing()].isnull().sum()
#If the Nan appears less than 10 times, replace it with the most frequent value (categorical data) or the mean (numerical)
Nan_feature_1=Nan[Nan<=10].index.tolist()
print(Nan_feature_1)

for f in Nan_feature_1:
    if X[f].dtype==np.object:
        imputation(f, 'None')
    else:
        imputation(f, X[f].mean())

['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Electrical', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'GarageCars', 'GarageArea', 'SaleType']


In [5]:
#Fill the NaN of LotFrontage using the sqrt of LotArea
X['SqrtLotArea']=np.sqrt(X['LotArea'])
cond=X['LotFrontage'].isnull()
X.LotFrontage[cond]=X.SqrtLotArea[cond]
del X['SqrtLotArea']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [6]:
#Alley: None represents there is no alley
imputation('Alley', 'None')

#MasVnAre and MasVnrType

cond=X['MasVnrArea'].isnull()
X['MasVnrArea'][cond]=0
X['MasVnrType'][cond]='None'

cond=X['MasVnrType'].isnull()
X['MasVnrType']=most_frequent('MasVnrType')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
#None means there is no such attribute for that house
imputation('BsmtQual', 'None')
imputation('BsmtCond', 'None')
imputation('BsmtExposure', 'None')
imputation('BsmtFinType1', 'None')
imputation('BsmtFinType2', 'None')
imputation('FireplaceQu', 'None')
imputation('GarageType', 'None')
imputation('GarageYrBlt', 0)
imputation('GarageFinish', 'None')
imputation('GarageQual', 'None')
imputation('GarageCond', 'None')
imputation('PoolQC', 'None')
imputation('Fence', 'None')
imputation('MiscFeature', 'None')

In [8]:
X.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


# Label the features

In [9]:
X['MSSubClass']=X['MSSubClass'].astype(str)
X['GarageYrBlt']=X['GarageYrBlt'].astype(int)
numerical_feature=X.columns[X.dtypes!='object']
print('number of numerical features:',len(numerical_feature))

#standarize numercical_features
X_numerical=StandardScaler().fit_transform(X[numerical_feature])
X_numerical=pd.DataFrame(X_numerical, columns=numerical_feature)
X_numerical['ID']=list(np.arange(1,len(X_numerical)+1))
X_numerical.head()

number of numerical features: 35


Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,ID
0,-0.358406,-0.217879,0.646183,-0.507284,1.046258,0.896833,0.529034,0.580907,-0.29313,-0.934863,...,-0.74076,0.200006,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-1.552184,0.157646,1
1,0.159847,-0.072044,-0.063185,2.188279,0.154764,-0.395604,-0.567016,1.178112,-0.29313,-0.629896,...,1.614879,-0.702843,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-0.446925,-0.602962,2
2,-0.254756,0.137197,0.646183,-0.507284,0.980221,0.848965,0.338903,0.097873,-0.29313,-0.288516,...,-0.74076,-0.081209,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,1.026753,0.157646,3
3,-0.531157,-0.078385,0.646183,-0.507284,-1.859351,-0.682812,-0.567016,-0.494941,-0.29313,-0.047275,...,-0.74076,-0.184815,3.874967,-0.103331,-0.285935,-0.06315,-0.089592,-1.552184,-1.363569,4
4,0.298048,0.518903,1.355551,-0.507284,0.947203,0.753229,1.390216,0.468931,-0.29313,-0.161068,...,0.776967,0.540424,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,2.132012,0.157646,5


# Determine ordinal features and non ordinal features

In [10]:
#determine ordinal features and non ordinal features

non_ordinal_feature=['MSSubClass','MSZoning', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 
                    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
                   'MasVnrType', 'Foundation', 'Heating', 'MiscFeature', 'SaleType', 'SaleCondition', 
                     'LandSlope']
print('number of non_ordinal_features', len(non_ordinal_feature))

ordinal_feature=['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
                 'Functional', 'FireplaceQu', 'GarageType','GarageFinish',
                 'GarageQual','GarageCond', 'PavedDrive', 'PoolQC',
                 'Fence', 'Street', 'Alley', 'LotShape', 'LandContour',
               'Utilities', 'LotConfig']
print('number of ordinal_features', len(ordinal_feature))

number of non_ordinal_features 18
number of ordinal_features 26


# One hot encode the non ordinal features

In [11]:
X_non_ordinal=pd.get_dummies(X[non_ordinal_feature])
print(X_non_ordinal.shape)
X_non_ordinal['ID']=np.arange(1, len(X_non_ordinal)+1)
X_non_ordinal.head()

(2919, 161)


Unnamed: 0_level_0,MSSubClass_120,MSSubClass_150,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,...,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,LandSlope_Gtl,LandSlope_Mod,LandSlope_Sev,ID
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,3
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,4
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,5


# Mapping the ordinal features

In [12]:
ordinal_values={}
ordinal_values['Alley']=['None','Grvl','Pave']
ordinal_values['BsmtCond']=['None','Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_values['BsmtExposure']=['None','No', 'Mn', 'Av', 'Gd']
ordinal_values['BsmtFinType1']=['None','Unf', 'LwQ', 'Rec', 'BLQ',
                              'ALQ', 'GLQ']
ordinal_values['BsmtFinType2']=['None','Unf', 'LwQ', 'Rec', 'BLQ',
                              'ALQ', 'GLQ']
ordinal_values['BsmtQual']=['None','Fa', 'TA', 'Gd', 'Ex']
ordinal_values['BsmtCond']=['None','Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_values['CentralAir']=['N', 'Y']
ordinal_values['Electrical']=['None','Mix', 'FuseP', 'FuseF', 'FuseA',
                            'SBrkr']
ordinal_values['ExterCond']=['None','Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_values['ExterQual']=['None','Fa', 'TA', 'Gd', 'Ex']
ordinal_values['Fence']=['None','MnWw', 'GdWo', 'MnPrv', 'GdPrv']
ordinal_values['FireplaceQu']=['None','Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_values['Functional']=['None','Sev', 'Maj2', 'Maj1', 'Mod',
                            'Min2', 'Min1', 'Typ']
ordinal_values['GarageCond']=['None','Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_values['GarageFinish']=['None','Unf', 'RFn', 'Fin']
ordinal_values['GarageQual']=['None','Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_values['GarageType']=['None','Detchd','CarPort', 'BuiltIn',
                            'Basment', 'Attchd', '2Types']
ordinal_values['HeatingQC']=['None','Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_values['KitchenQual']=['None','Fa', 'TA', 'Gd', 'Ex']
ordinal_values['LandContour']=['None','Low', 'HLS', 'Bnk','Lvl']
ordinal_values['LotConfig']=['None','FR3', 'FR2', 'CulDSac','Corner',
                           'Inside']
ordinal_values['LotShape']=['None','IR3', 'IR2', 'IR1', 'Reg']
ordinal_values['PavedDrive']=['None','N', 'P', 'Y']
ordinal_values['PoolQC']=['None','Fa', 'Gd', 'Ex']
ordinal_values['Street']=['None','Grvl', 'Pave']
ordinal_values['Utilities']=['None','NoSeWa', 'AllPub']
len(ordinal_values)

26

In [13]:
#ordial mapping
ordinal_mapping={}
for f in ordinal_feature:
    mapping={}
    grades=ordinal_values[f]
    n=len(ordinal_values[f])
    
    for i in range(n):
        mapping[grades[i]]=i
    
    ordinal_mapping[f]=mapping

In [14]:
X_ordinal=X[ordinal_feature]
for f in ordinal_feature:
    X_ordinal[f]=X_ordinal[f].map(ordinal_mapping[f])

#standarize the number
X_ordinal=StandardScaler().fit_transform(X_ordinal)
X_ordinal=pd.DataFrame(X_ordinal, columns=ordinal_feature)
X_ordinal['ID']=np.arange(1, len(X_ordinal)+1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [15]:
print('numerical shape', X_numerical.shape)
print('non_ordinal shape', X_non_ordinal.shape)
print('ordinal shape', X_ordinal.shape)

X_clean=X_numerical
X_clean=X_clean.merge(X_non_ordinal, left_on='ID', right_on='ID', how='inner')
X_clean=X_clean.merge(X_ordinal, left_on='ID', right_on='ID', how='inner')


del X_clean['ID']
X_clean.shape

numerical shape (2919, 36)
non_ordinal shape (2919, 162)
ordinal shape (2919, 27)


(2919, 222)

# Seperate training and test

In [16]:
X_train=X_clean.iloc[0:1460,:]
X_test=X_clean.iloc[1460:2920, :]

ntrain=X_train.shape[0]
ntest=X_test.shape[0]
NFOLDS=4
SEED=10

kf=KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

# SKlearn Wrappers

In [17]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state']=seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict(x)

class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.params=params
        self.params['seed']=seed
        self.nrounds=params.pop('nrounds', 250)
        
    def train(self, x_train, y_train):
        dtrain=xgb.DMatrix(x_train, label=y_train)
        self.gbdt=xgb.train(self.params, dtrain, self.nrounds)
    
    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [18]:
def get_oof(clf):
    oof_train=np.zeros((ntrain,))
    oof_test=np.zeros((ntest,))
    oof_test_skf=np.empty((NFOLDS, ntest))
    
    for i, (train_index, test_index) in enumerate(kf.split(X_train)):
        X_tr = X_train.iloc[train_index,:]
        y_tr = y.iloc[train_index]
        X_te = X_train.iloc[test_index,:]
        
        clf.train(X_tr, y_tr)
        
        oof_train[test_index]=clf.predict(X_te)
        oof_test_skf[i, :] = clf.predict(X_test)
        
    oof_test[:]=oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1,1), oof_test.reshape(-1,1)
        

In [19]:
rf_params={
    'n_estimators':800,
    'max_features':0.3,
    'max_depth':None,
    'min_samples_leaf':2
}

rd_params={
    'alpha':10
}

ls_params={
    'alpha':0.00001
}

xgb1_params={
    'n_estimators':1000,
    'learning_rate':0.075,
    'max_depth':4,
    'seed':0,
    'objective':'reg:linear'
    
}

xgb2_params={
    'n_estimators':500,
    'learning_rate':0.1,
    'max_depth':3,
    'seed':0,
    'objective':'reg:linear'
    
}

et_params={
    'n_jobs':16,
    'n_estimators':100,
    'max_features':0.5,
    'max_depth':12,
    'min_samples_leaf':2
}

gb1_params={
    'n_estimators':500,
    'learning_rate':0.05,
    'min_samples_leaf': 2,
    'max_depth': 4,
}


gb2_params={
    'n_estimators':1000,
    'learning_rate':0.03,
    'min_samples_leaf': 1,
    'max_depth': 5,
}

In [20]:
rd=SklearnWrapper(clf=Ridge, seed=SEED, params=rd_params)
ls=SklearnWrapper(clf=Lasso, seed=SEED, params=ls_params)
rf=SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
xg1=XgbWrapper(seed=SEED, params=xgb1_params)
xg2=XgbWrapper(seed=SEED, params=xgb2_params)
et=SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
gb1=SklearnWrapper(clf=GradientBoostingRegressor, seed=SEED, params=gb1_params)
gb2=SklearnWrapper(clf=GradientBoostingRegressor, seed=SEED, params=gb2_params)

In [21]:
rd_oof_train, rd_oof_test = get_oof(rd)
print("RD-CV:{}".format(np.sqrt(mean_squared_error(y, rd_oof_train))))

RD-CV:0.14156751310916565


In [22]:
ls_oof_train, ls_oof_test = get_oof(ls)
print("LS-CV:{}".format(np.sqrt(mean_squared_error(y, ls_oof_train))))

LS-CV:0.14525089514044706


In [23]:
rf_oof_train, rf_oof_test = get_oof(rf)
print("RF-CV:{}".format(np.sqrt(mean_squared_error(y, rf_oof_train))))

RF-CV:0.1401533877252378


In [24]:
xg1_oof_train, xg1_oof_test = get_oof(xg1)
print("XGB1-CV:{}".format(np.sqrt(mean_squared_error(y, xg1_oof_train))))

XGB1-CV:0.12816541581959198


In [25]:
xg2_oof_train, xg2_oof_test = get_oof(xg2)
print("XGB2-CV:{}".format(np.sqrt(mean_squared_error(y, xg2_oof_train))))

XGB2-CV:0.1279290524569504


In [26]:
et_oof_train, et_oof_test = get_oof(et)
print("ET-CV:{}".format(np.sqrt(mean_squared_error(y, et_oof_train))))

ET-CV:0.14080582399758046


In [27]:
gb1_oof_train, gb1_oof_test = get_oof(gb1)
print("GBR1-CV:{}".format(np.sqrt(mean_squared_error(y, gb1_oof_train))))

GBR1-CV:0.12725946611393413


In [28]:
gb2_oof_train, gb2_oof_test = get_oof(gb2)
print("GBR2-CV:{}".format(np.sqrt(mean_squared_error(y, gb2_oof_train))))

GBR2-CV:0.13012695979572436


In [29]:
X_train_l2=np.concatenate((rd_oof_train, ls_oof_train, rf_oof_train, xg1_oof_train, 
                           xg2_oof_train, et_oof_train, gb1_oof_train, gb2_oof_train), axis=1)
X_test_l2=np.concatenate((rd_oof_test, ls_oof_test, rf_oof_test, xg1_oof_test, 
                          xg2_oof_test, et_oof_test, gb1_oof_test, gb2_oof_test), axis=1)
print("{}, {}".format(X_train_l2.shape, X_test_l2.shape))

(1460, 8), (1459, 8)


In [33]:
dtrain=xgb.DMatrix(X_train_l2, label=y)
dtest=xgb.DMatrix(X_test_l2)

xgb_params={
    'seed':0,
    'learning_rate':0.01,
    'objective': 'reg:linear',
    'max_depth':1,
    'num_parallel_tree':1,
    'min_child_weight':1,
    'eval_metric':'rmse'
}

res=xgb.cv(xgb_params, dtrain, num_boost_round=2000, nfold=4, seed=SEED, stratified=False,
          early_stopping_rounds=25, verbose_eval=10, show_stdv=True)

best_nrounds=res.shape[0]-1
cv_mean=res.iloc[-1,0]
cv_std=res.iloc[-1,1]

print('Ensemble-CV:{0}+{1}'.format(cv_mean, cv_std))

gbdt=xgb.train(xgb_params, dtrain, best_nrounds)
y_pre=gbdt.predict(dtest)

[0]	train-rmse:11.4159+0.00316624	test-rmse:11.4159+0.00963709
[10]	train-rmse:10.3267+0.0028241	test-rmse:10.3267+0.00989543
[20]	train-rmse:9.34165+0.00253001	test-rmse:9.34168+0.00999443
[30]	train-rmse:8.45069+0.00226978	test-rmse:8.45073+0.0101101
[40]	train-rmse:7.64483+0.00203225	test-rmse:7.64495+0.0100731
[50]	train-rmse:6.91593+0.001817	test-rmse:6.91609+0.0100841
[60]	train-rmse:6.25665+0.00162355	test-rmse:6.25681+0.00988492
[70]	train-rmse:5.66033+0.00144943	test-rmse:5.66049+0.00980944
[80]	train-rmse:5.12098+0.00129175	test-rmse:5.12114+0.00950897
[90]	train-rmse:4.63316+0.00114994	test-rmse:4.63334+0.00913126
[100]	train-rmse:4.19194+0.00102167	test-rmse:4.19207+0.00878895
[110]	train-rmse:3.79288+0.000906035	test-rmse:3.79302+0.00843386
[120]	train-rmse:3.43197+0.000803437	test-rmse:3.43208+0.0081116
[130]	train-rmse:3.10557+0.000713161	test-rmse:3.10561+0.00773612
[140]	train-rmse:2.81038+0.00063239	test-rmse:2.81041+0.00737193
[150]	train-rmse:2.54344+0.000559526	tes

# Make submission file

In [35]:
y_submit=np.exp(y_pre)
y_submit

array([ 118530.5703125,  158262.09375  ,  172393.375    , ...,
        164440.09375  ,  115878.046875 ,  220940.3125   ], dtype=float32)

In [36]:
submission=pd.DataFrame(np.arange(1461, 2920), columns=['Id'])
submission['SalePrice']=pd.Series(y_submit)
submission.to_csv(path_or_buf= 
                  'submission_stacking_8_models.csv',index=False)
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,118530.570312
1,1462,158262.09375
2,1463,172393.375
3,1464,183098.359375
4,1465,190103.6875
