In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# Read data and combine the training and test data

In [2]:
data=pd.read_csv("input/train.csv", index_col=0)
real_test=pd.read_csv("input/test.csv", index_col=0)

#combine the training and test data
X=data.iloc[:, 0:-1]
y=data.iloc[:, -1]
y=y.apply(np.log)

frames=[X, real_test]
X=pd.concat(frames)
print('The shape of data is:', X.shape)

The shape of data is: (2919, 79)


# Deal with NaN

In [3]:
def exploration(col):
    return X[col].value_counts()

def most_frequent(col):
    series=exploration(col)
    types=series.index.tolist()
    counts=series.tolist()
    
    id_max=counts.index(max(counts))
    return types[id_max]

def imputation(col, value):
    X.loc[X[col].isnull(), col]=value

#return the feature names with missing values
def show_missing():
    missing=X.columns[X.isnull().any()].tolist()
    return missing

In [4]:
Nan=X[show_missing()].isnull().sum()
#If the Nan appears less than 10 times, replace it with the most frequent value (categorical data) or the mean (numerical)
Nan_feature_1=Nan[Nan<=10].index.tolist()
print(Nan_feature_1)

for f in Nan_feature_1:
    if X[f].dtype==np.object:
        imputation(f, 'None')
    else:
        imputation(f, X[f].mean())

['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Electrical', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'GarageCars', 'GarageArea', 'SaleType']


In [5]:
#Fill the NaN of LotFrontage using the sqrt of LotArea
X['SqrtLotArea']=np.sqrt(X['LotArea'])
cond=X['LotFrontage'].isnull()
X.LotFrontage[cond]=X.SqrtLotArea[cond]
del X['SqrtLotArea']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [6]:
#Alley: None represents there is no alley
imputation('Alley', 'None')

#MasVnAre and MasVnrType

cond=X['MasVnrArea'].isnull()
X['MasVnrArea'][cond]=0
X['MasVnrType'][cond]='None'

cond=X['MasVnrType'].isnull()
X['MasVnrType']=most_frequent('MasVnrType')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
#None means there is no such attribute for that house
imputation('BsmtQual', 'None')
imputation('BsmtCond', 'None')
imputation('BsmtExposure', 'None')
imputation('BsmtFinType1', 'None')
imputation('BsmtFinType2', 'None')
imputation('FireplaceQu', 'None')
imputation('GarageType', 'None')
imputation('GarageYrBlt', 0)
imputation('GarageFinish', 'None')
imputation('GarageQual', 'None')
imputation('GarageCond', 'None')
imputation('PoolQC', 'None')
imputation('Fence', 'None')
imputation('MiscFeature', 'None')

In [8]:
X.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


# Label the features

In [9]:
X['MSSubClass']=X['MSSubClass'].astype(str)
X['GarageYrBlt']=X['GarageYrBlt'].astype(int)
numerical_feature=X.columns[X.dtypes!='object']
print('number of numerical features:',len(numerical_feature))

#standarize numercical_features
X_numerical=StandardScaler().fit_transform(X[numerical_feature])
X_numerical=pd.DataFrame(X_numerical, columns=numerical_feature)
X_numerical['ID']=list(np.arange(1,len(X_numerical)+1))
X_numerical.head()

number of numerical features: 35


Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,ID
0,-0.358406,-0.217879,0.646183,-0.507284,1.046258,0.896833,0.529034,0.580907,-0.29313,-0.934863,...,-0.74076,0.200006,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-1.552184,0.157646,1
1,0.159847,-0.072044,-0.063185,2.188279,0.154764,-0.395604,-0.567016,1.178112,-0.29313,-0.629896,...,1.614879,-0.702843,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-0.446925,-0.602962,2
2,-0.254756,0.137197,0.646183,-0.507284,0.980221,0.848965,0.338903,0.097873,-0.29313,-0.288516,...,-0.74076,-0.081209,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,1.026753,0.157646,3
3,-0.531157,-0.078385,0.646183,-0.507284,-1.859351,-0.682812,-0.567016,-0.494941,-0.29313,-0.047275,...,-0.74076,-0.184815,3.874967,-0.103331,-0.285935,-0.06315,-0.089592,-1.552184,-1.363569,4
4,0.298048,0.518903,1.355551,-0.507284,0.947203,0.753229,1.390216,0.468931,-0.29313,-0.161068,...,0.776967,0.540424,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,2.132012,0.157646,5


# Determine ordinal features and non ordinal features

In [10]:
#determine ordinal features and non ordinal features

non_ordinal_feature=['MSSubClass','MSZoning', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 
                    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
                   'MasVnrType', 'Foundation', 'Heating', 'MiscFeature', 'SaleType', 'SaleCondition', 
                     'LandSlope']
print('number of non_ordinal_features', len(non_ordinal_feature))

ordinal_feature=['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
                 'Functional', 'FireplaceQu', 'GarageType','GarageFinish',
                 'GarageQual','GarageCond', 'PavedDrive', 'PoolQC',
                 'Fence', 'Street', 'Alley', 'LotShape', 'LandContour',
               'Utilities', 'LotConfig']
print('number of ordinal_features', len(ordinal_feature))

number of non_ordinal_features 18
number of ordinal_features 26


# One hot encode the non ordinal features

In [11]:
X_non_ordinal=pd.get_dummies(X[non_ordinal_feature])
print(X_non_ordinal.shape)
X_non_ordinal['ID']=np.arange(1, len(X_non_ordinal)+1)
X_non_ordinal.head()

(2919, 161)


Unnamed: 0_level_0,MSSubClass_120,MSSubClass_150,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,...,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,LandSlope_Gtl,LandSlope_Mod,LandSlope_Sev,ID
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,3
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,4
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,5


# Mapping the ordinal features

In [12]:
ordinal_values={}
ordinal_values['Alley']=['None','Grvl','Pave']
ordinal_values['BsmtCond']=['None','Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_values['BsmtExposure']=['None','No', 'Mn', 'Av', 'Gd']
ordinal_values['BsmtFinType1']=['None','Unf', 'LwQ', 'Rec', 'BLQ',
                              'ALQ', 'GLQ']
ordinal_values['BsmtFinType2']=['None','Unf', 'LwQ', 'Rec', 'BLQ',
                              'ALQ', 'GLQ']
ordinal_values['BsmtQual']=['None','Fa', 'TA', 'Gd', 'Ex']
ordinal_values['BsmtCond']=['None','Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_values['CentralAir']=['N', 'Y']
ordinal_values['Electrical']=['None','Mix', 'FuseP', 'FuseF', 'FuseA',
                            'SBrkr']
ordinal_values['ExterCond']=['None','Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_values['ExterQual']=['None','Fa', 'TA', 'Gd', 'Ex']
ordinal_values['Fence']=['None','MnWw', 'GdWo', 'MnPrv', 'GdPrv']
ordinal_values['FireplaceQu']=['None','Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_values['Functional']=['None','Sev', 'Maj2', 'Maj1', 'Mod',
                            'Min2', 'Min1', 'Typ']
ordinal_values['GarageCond']=['None','Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_values['GarageFinish']=['None','Unf', 'RFn', 'Fin']
ordinal_values['GarageQual']=['None','Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_values['GarageType']=['None','Detchd','CarPort', 'BuiltIn',
                            'Basment', 'Attchd', '2Types']
ordinal_values['HeatingQC']=['None','Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_values['KitchenQual']=['None','Fa', 'TA', 'Gd', 'Ex']
ordinal_values['LandContour']=['None','Low', 'HLS', 'Bnk','Lvl']
ordinal_values['LotConfig']=['None','FR3', 'FR2', 'CulDSac','Corner',
                           'Inside']
ordinal_values['LotShape']=['None','IR3', 'IR2', 'IR1', 'Reg']
ordinal_values['PavedDrive']=['None','N', 'P', 'Y']
ordinal_values['PoolQC']=['None','Fa', 'Gd', 'Ex']
ordinal_values['Street']=['None','Grvl', 'Pave']
ordinal_values['Utilities']=['None','NoSeWa', 'AllPub']
len(ordinal_values)

26

In [13]:
#ordial mapping
ordinal_mapping={}
for f in ordinal_feature:
    mapping={}
    grades=ordinal_values[f]
    n=len(ordinal_values[f])
    
    for i in range(n):
        mapping[grades[i]]=i
    
    ordinal_mapping[f]=mapping

In [14]:
X_ordinal=X[ordinal_feature]
for f in ordinal_feature:
    X_ordinal[f]=X_ordinal[f].map(ordinal_mapping[f])

#standarize the number
X_ordinal=StandardScaler().fit_transform(X_ordinal)
X_ordinal=pd.DataFrame(X_ordinal, columns=ordinal_feature)
X_ordinal['ID']=np.arange(1, len(X_ordinal)+1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [15]:
print('numerical shape', X_numerical.shape)
print('non_ordinal shape', X_non_ordinal.shape)
print('ordinal shape', X_ordinal.shape)

X_clean=X_numerical
X_clean=X_clean.merge(X_non_ordinal, left_on='ID', right_on='ID', how='inner')
X_clean=X_clean.merge(X_ordinal, left_on='ID', right_on='ID', how='inner')


del X_clean['ID']
X_clean.shape

numerical shape (2919, 36)
non_ordinal shape (2919, 162)
ordinal shape (2919, 27)


(2919, 222)

# Seperate training and test

In [16]:
X_new=X_clean.iloc[0:1460,:]
real_test_new=X_clean.iloc[1460:2920, :]
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=8)

# Linear Regression

In [17]:
#not feasible here
from sklearn.linear_model import LinearRegression
lg=LinearRegression()
lg.fit(X_train, y_train)
print("log error of training:", np.sqrt(mean_squared_error(y_train, lg.predict(X_train))))
print("log error of test:", np.sqrt(mean_squared_error(y_test, lg.predict(X_test))))

log error of training: 0.0951279605655
log error of test: 50148319.4928


# XGB 

In [53]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

xgb=XGBRegressor()
parameters={'max_depth':list(np.arange(3,6)), 'learning_rate': np.arange(0.01, 0.1, 0.01), 'n_estimators': [500]}
clf_xgb=GridSearchCV(xgb, parameters, verbose=1, cv=4)
clf_xgb.fit(X_train, y_train)

print("log error of training:", np.sqrt(mean_squared_error(y_train, np.asarray(clf_xgb.predict(X_train), dtype='float64'))))
print("log error of test:", np.sqrt(mean_squared_error(y_test, np.asarray(clf_xgb.predict(X_test),dtype='float64'))))

Fitting 4 folds for each of 27 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  5.2min finished


log error of training: 0.0544381008761
log error of test: 0.124738939014


In [57]:
clf_xgb.best_estimator_

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.060000000000000005, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [76]:
y_pre_xgb=clf_xgb.predict(real_test_new)
y_pre_xgb=np.exp(y_pre_xgb)

# Random Forest

In [51]:
parameters={'max_features':list(np.arange(20,40,2))}
rf=RandomForestRegressor(n_estimators=200, min_samples_leaf=2)
clf_rf=GridSearchCV(rf, parameters, verbose=1, cv=7)
clf_rf.fit(X_train, y_train)

print("log error of training:", np.sqrt(mean_squared_error(y_train, np.asarray(clf_rf.predict(X_train), dtype='float64'))))
print("log error of test:", np.sqrt(mean_squared_error(y_test, np.asarray(clf_rf.predict(X_test),dtype='float64'))))

Fitting 7 folds for each of 10 candidates, totalling 70 fits


[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed:  2.2min finished


log error of training: 0.0719929345215
log error of test: 0.130108305858


In [52]:
clf_rf.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=38, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [75]:
y_pre_rf=clf_rf.predict(real_test_new)
y_pre_rf=np.exp(y_pre_rf)

# Ridge linear regression

In [49]:
from sklearn.linear_model import Ridge
parameters={'alpha':np.arange(50, 200, 10)}
rdg=Ridge()

clf_rdg=GridSearchCV(rdg, parameters, verbose=1, cv=10)
clf_rdg.fit(X_train, y_train)

print("log error of training:", np.sqrt(mean_squared_error(y_train, np.asarray(clf_rdg.predict(X_train), dtype='float64'))))
print("log error of test:", np.sqrt(mean_squared_error(y_test, np.asarray(clf_rdg.predict(X_test),dtype='float64'))))

Fitting 10 folds for each of 15 candidates, totalling 150 fits
log error of training: 0.127488494176
log error of test: 0.136768457036


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    2.8s finished


In [50]:
clf_rdg.best_estimator_

Ridge(alpha=140, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [74]:
y_pre_rdg=clf_rdg.predict(real_test_new)
y_pre_rdg=np.exp(y_pre_rdg)

# Lasso

In [47]:
from sklearn.linear_model import Lasso

parameters={'alpha':np.arange(0.0001, 0.002, 0.0001)}
las=Lasso()

clf_las=GridSearchCV(las, parameters, verbose=1, cv=10)
clf_las.fit(X_train, y_train)

print("log error of training:", np.sqrt(mean_squared_error(y_train, np.asarray(clf_las.predict(X_train), dtype='float64'))))
print("log error of test:", np.sqrt(mean_squared_error(y_test, np.asarray(clf_las.predict(X_test),dtype='float64'))))

Fitting 10 folds for each of 19 candidates, totalling 190 fits


[Parallel(n_jobs=1)]: Done 190 out of 190 | elapsed:   19.4s finished


log error of training: 0.10305093953
log error of test: 0.132023438984


In [48]:
clf_las.best_estimator_

Lasso(alpha=0.00040000000000000002, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [73]:
y_pre_las=clf_las.predict(real_test_new)
y_pre_las=np.exp(y_pre_las)

# SVR

In [70]:
from sklearn.svm import SVR
parameters={'C': [0.05], 'epsilon': np.arange(0.001, 0.02, 0.003)}
svr=SVR()

clf_svr=GridSearchCV(svr, parameters, verbose=1, cv=10)
clf_svr.fit(X_train, y_train)

print("log error of training:", np.sqrt(mean_squared_error(y_train, np.asarray(clf_svr.predict(X_train), dtype='float64'))))
print("log error of test:", np.sqrt(mean_squared_error(y_test, np.asarray(clf_svr.predict(X_test),dtype='float64'))))

Fitting 10 folds for each of 7 candidates, totalling 70 fits


[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed:  1.9min finished


log error of training: 0.147603564578
log error of test: 0.136803895748


In [71]:
clf_svr.best_estimator_

SVR(C=0.05, cache_size=200, coef0=0.0, degree=3,
  epsilon=0.0070000000000000001, gamma='auto', kernel='rbf', max_iter=-1,
  shrinking=True, tol=0.001, verbose=False)

In [72]:
y_pre_svr=clf_svr.predict(real_test_new)
y_pre_svr=np.exp(y_pre_svr)

# Average

In [77]:
y_pre=(y_pre_xgb+y_pre_rf+y_pre_rdg+y_pre_las+y_pre_svr)/5

submission=pd.DataFrame(np.arange(1461, 2920), columns=['Id'])
submission['SalePrice']=y_pre
len(submission)
submission.head()

submission.to_csv(path_or_buf= 'submission_average_xgb_lasso_rf_ridge_svr_all_features.csv',index=False)