In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read data and split the data to training and test set

In [2]:
data=pd.read_csv("input/train.csv")
real_test=pd.read_csv("input/test.csv")
print('The shape of data is:', data.shape)
data.head()

The shape of data is: (1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
X=data.iloc[:, 0:-1]
y=data.iloc[:, -1]
y=y.apply(np.log)

frames=[X, real_test]
X=pd.concat(frames)


print("shape of X:", X.shape)
print("shape of y:", y.shape)


all_features=set(X.columns)
print("Total number of features:", len(all_features))

#Seperate number features and string features
g=X.columns.to_series().groupby(X.dtypes).groups
fgroup={k.name: v for k, v in g.items()}

int_feature=fgroup['int64']
float_feature=fgroup['float64']
int_feature.remove('Id')
int_feature.remove('YrSold')
int_feature.remove('MoSold')
string_feature=fgroup['object']

print('number of int number features:',len(int_feature))
print('number of string features:', len(string_feature))


imr=Imputer(missing_values='NaN', strategy='mean', axis=0)
X_int=imr.fit_transform(X[int_feature])
#print('int_features:',int_feature)
#print('float_features:', float_feature)

X_int=pd.DataFrame(X_int, columns=int_feature)
X_int[['YearBuilt']]=2017-X_int[['YearBuilt']]
X_int[['YearRemodAdd']]=2017-X_int[['YearRemodAdd']]
X_int.iloc[1,:]

shape of X: (2919, 80)
shape of y: (1460,)
Total number of features: 80
number of int number features: 23
number of string features: 43


MSSubClass         20
LotArea          9600
OverallQual         6
OverallCond         8
YearBuilt          41
YearRemodAdd       41
1stFlrSF         1262
2ndFlrSF            0
LowQualFinSF        0
GrLivArea        1262
FullBath            2
HalfBath            0
BedroomAbvGr        3
KitchenAbvGr        1
TotRmsAbvGrd        6
Fireplaces          1
WoodDeckSF        298
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
MiscVal             0
Name: 1, dtype: float64

In [4]:
# Scale the features in int_feature
scale_feature=['LotArea', 'YearBuilt', 'YearRemodAdd', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
               'GrLivArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
              'PoolArea', 'MiscVal']
unscale_features=[item for item in int_feature if item not in scale_feature]
unscale_features.remove('MSSubClass')

X_scale=StandardScaler().fit_transform(X_int[scale_feature])
X_scale=pd.DataFrame(X_scale, columns=scale_feature)

In [5]:
#Onehotencoder the class
Mssclass=OneHotEncoder().fit_transform(X_int[['MSSubClass']]).toarray()
Mssclass_col=['MSSubClass'+str(i) for i in range(16)]
Mssclass=pd.DataFrame(Mssclass, columns=Mssclass_col)

In [6]:
X_scale=X_scale.join(Mssclass)
X_scale=X_scale.join(X_int[unscale_features])
X_scale.shape

(2919, 38)

# Deal with float data

In [7]:
float_feature

['LotFrontage',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea']

In [8]:
#substitute NaN with mean
imr=Imputer(missing_values='NaN', strategy='mean', axis=0)
X_float=imr.fit_transform(X[float_feature])
X_float=pd.DataFrame(X_float, columns=float_feature)
#standardlize float feature data
X_float=StandardScaler().fit_transform(X_float)
X_float=pd.DataFrame(X_float, columns=float_feature)
X_float.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageYrBlt,GarageCars,GarageArea
0,-0.202068,0.525202,0.580907,-0.29313,-0.934863,-0.444328,1.087023,-0.249895,1.000929,0.306528,0.3489
1,0.50187,-0.57225,1.178112,-0.29313,-0.629896,0.477111,-0.819679,3.822419,-0.085,0.306528,-0.059792
2,-0.06128,0.334828,0.097873,-0.29313,-0.288516,-0.299076,1.087023,-0.249895,0.92049,0.306528,0.627553
3,-0.436714,-0.57225,-0.494941,-0.29313,-0.047275,-0.671283,1.087023,-0.249895,0.799831,1.619961,0.785457
4,0.689587,1.387486,0.468931,-0.29313,-0.161068,0.211573,1.087023,-0.249895,0.88027,1.619961,1.686437


In [9]:
#combine the integer data and float data
X_scale=X_scale.join(X_float)
X_scale.shape

(2919, 49)

In [10]:
X_scale.head()

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,...,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageYrBlt,GarageCars,GarageArea
0,-0.217879,-1.046258,-0.896833,-0.773861,1.207379,-0.101197,0.413547,-0.74076,0.200006,-0.359601,...,0.525202,0.580907,-0.29313,-0.934863,-0.444328,1.087023,-0.249895,1.000929,0.306528,0.3489
1,-0.072044,-0.154764,0.395604,0.261075,-0.785025,-0.101197,-0.471891,1.614879,-0.702843,-0.359601,...,-0.57225,1.178112,-0.29313,-0.629896,0.477111,-0.819679,3.822419,-0.085,0.306528,-0.059792
2,0.137197,-0.980221,-0.848965,-0.610718,1.235375,-0.101197,0.563755,-0.74076,-0.081209,-0.359601,...,0.334828,0.097873,-0.29313,-0.288516,-0.299076,1.087023,-0.249895,0.92049,0.306528,0.627553
3,-0.078385,1.859351,0.682812,-0.506205,0.978742,-0.101197,0.427382,-0.74076,-0.184815,3.874967,...,-0.57225,-0.494941,-0.29313,-0.047275,-0.671283,1.087023,-0.249895,0.799831,1.619961,0.785457
4,0.518903,-0.947203,-0.753229,-0.03717,1.671651,-0.101197,1.378042,0.776967,0.540424,-0.359601,...,1.387486,0.468931,-0.29313,-0.161068,0.211573,1.087023,-0.249895,0.88027,1.619961,1.686437


# Deal with string data

In [11]:
X[string_feature].iloc[1,:]

MSZoning              RL
Street              Pave
Alley                NaN
LotShape             Reg
LandContour          Lvl
Utilities         AllPub
LotConfig            FR2
LandSlope            Gtl
Neighborhood     Veenker
Condition1         Feedr
Condition2          Norm
BldgType            1Fam
HouseStyle        1Story
RoofStyle          Gable
RoofMatl         CompShg
Exterior1st      MetalSd
Exterior2nd      MetalSd
MasVnrType          None
ExterQual             TA
ExterCond             TA
Foundation        CBlock
BsmtQual              Gd
BsmtCond              TA
BsmtExposure          Gd
BsmtFinType1         ALQ
BsmtFinType2         Unf
Heating             GasA
HeatingQC             Ex
CentralAir             Y
Electrical         SBrkr
KitchenQual           TA
Functional           Typ
FireplaceQu           TA
GarageType        Attchd
GarageFinish         RFn
GarageQual            TA
GarageCond            TA
PavedDrive             Y
PoolQC               NaN
Fence                NaN


In [12]:
non_ordial_feature=['MSZoning', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 
                    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
                   'MasVnrType', 'Foundation', 'Heating', 'MiscFeature', 'SaleType', 'SaleCondition', 'LandSlope']
len(non_ordial_feature)


17

In [13]:
ordial_feature=['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
                 'Functional', 'FireplaceQu', 'GarageType','GarageFinish',
                 'GarageQual','GarageCond', 'PavedDrive', 'PoolQC',
                 'Fence', 'Street', 'Alley', 'LotShape', 'LandContour',
               'Utilities', 'LotConfig']
len(ordial_feature)

26

In [14]:
len(string_feature)

43

In [15]:
#make sure we don't have any missing or duplicate features
non_ordial_set=set(non_ordial_feature)
ordial_set=set(ordial_feature)
int_set=set(int_feature)
float_set=set(float_feature)
int_set.intersection(float_set)

number_set=int_set.union(float_set)
str_set=non_ordial_set.union(ordial_set)
number_set.intersection(str_set)
com=number_set.union(str_set)

all_set=set(X.columns.values)
print(all_set.difference(com))
print(com.difference(all_set))

{'Id', 'MoSold', 'YrSold'}
set()


In [16]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [17]:
#substitute NaN with most frequent
X_non_ordial=DataFrameImputer().fit_transform(X[non_ordial_feature])

In [18]:
X_non_ordial.shape

(2919, 17)

# Label non_ordial features

In [19]:
#may contain nan ??
non_ordial_feature.remove('MasVnrType')
non_ordial_feature.remove('MiscFeature')

In [21]:
class_mapping_1=[]
for f in non_ordial_feature:
    mapping={label: idx for idx, label in enumerate(np.unique(X[f]))}
    class_mapping_1.append(mapping)
class_mapping_1

X_non_ordial=X[non_ordial_feature]

for i, f in enumerate(non_ordial_feature):
    X_non_ordial[f]=X_non_ordial[f].map(class_mapping_1[i])
X_non_ordial.head()
    
#normalize the data
X_non_ordial=StandardScaler().fit_transform(X_non_ordial)
X_non_ordial=pd.DataFrame(X_non_ordial, columns=non_ordial_feature)
X_non_ordial.head()

MSZoning


TypeError: unorderable types: str() > float()

In [None]:
len(class_mapping_1)

In [None]:
X_scale=X_scale.join(X_non_ordial)
X_scale.shape

# Seperate training and test

In [None]:
X_new=X_scale.iloc[0:1460,:]
real_test_new=X_scale.iloc[1460:2920, :]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=8)
X_train.head()

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
lg=LinearRegression()
lg.fit(X_train, y_train)
print("log error of training:", np.sqrt(mean_squared_error(y_train, lg.predict(X_train))))
print("log error of test:", np.sqrt(mean_squared_error(y_test, lg.predict(X_test))))

In [None]:
y_pre=lg.predict(real_test_new)
y_pre=np.exp(y_pre)

In [None]:
real_test_new

In [None]:
submission=pd.DataFrame(real_test['Id'])
submission['SalePrice']=y_pre
submission.head()

submission.to_csv(path_or_buf= 'submission_lg.csv',index=False)

# Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV
parameters={'max_features':list(np.arange(2,40,5))}
rf=RandomForestRegressor(n_estimators=200, min_samples_leaf=2)
clf=GridSearchCV(rf, parameters, verbose=1, cv=7)
clf.fit(X_train, y_train)

print("log error of training:", np.sqrt(mean_squared_error(y_train, np.asarray(clf.predict(X_train), dtype='float64'))))
print("log error of test:", np.sqrt(mean_squared_error(y_test, np.asarray(clf.predict(X_test),dtype='float64'))))

In [None]:
clf.best_estimator_

In [None]:
y_pre=clf.predict(real_test_new)
y_pre=np.exp(y_pre)

submission=pd.DataFrame(real_test['Id'])
submission['SalePrice']=y_pre
submission.head()

submission.to_csv(path_or_buf= 'submission_randomf_200est_27maxfeature.csv',index=False)

# XGB

In [None]:
from xgboost import XGBRegressor

In [None]:
xgb=XGBRegressor()
parameters={'max_depth':list(np.arange(3,6)), 'learning_rate': [0.05, 0.1, 0.15], 'n_estimators': [100, 200, 500]}
clf=GridSearchCV(xgb, parameters, verbose=1, cv=4)
clf.fit(X_train, y_train)

print("log error of training:", np.sqrt(mean_squared_error(y_train, np.asarray(clf.predict(X_train), dtype='float64'))))
print("log error of test:", np.sqrt(mean_squared_error(y_test, np.asarray(clf.predict(X_test),dtype='float64'))))

In [None]:
clf.best_estimator_

In [None]:
y_pre=clf.predict(real_test_new)
y_pre=np.exp(y_pre)

submission=pd.DataFrame(real_test['Id'])
submission['SalePrice']=y_pre
submission.head()

submission.to_csv(path_or_buf= 'submission_XGB_200est_0.1rate_4depth.csv',index=False)