In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read data and combine the training and test data

In [2]:
data=pd.read_csv("input/train.csv")
real_test=pd.read_csv("input/test.csv")
print('The shape of data is:', data.shape)
data.head()

The shape of data is: (1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
X=data.iloc[:, 0:-1]
y=data.iloc[:, -1]
y=y.apply(np.log)

frames=[X, real_test]
X=pd.concat(frames)


print("shape of X:", X.shape)
print("shape of y:", y.shape)


all_features=set(X.columns)
print("Total number of features:", len(all_features))

#Seperate number features and string features
g=X.columns.to_series().groupby(X.dtypes).groups
fgroup={k.name: v for k, v in g.items()}

int_feature=fgroup['int64']
float_feature=fgroup['float64']
int_feature.remove('Id')
int_feature.remove('YrSold')
int_feature.remove('MoSold')
string_feature=fgroup['object']

print('number of int number features:',len(int_feature))
print('number of string features:', len(string_feature))


imr=Imputer(missing_values='NaN', strategy='mean', axis=0)
X_int=imr.fit_transform(X[int_feature])
#print('int_features:',int_feature)
#print('float_features:', float_feature)

X_int=pd.DataFrame(X_int, columns=int_feature)
X_int[['YearBuilt']]=2017-X_int[['YearBuilt']]
X_int[['YearRemodAdd']]=2017-X_int[['YearRemodAdd']]

shape of X: (2919, 80)
shape of y: (1460,)
Total number of features: 80
number of int number features: 23
number of string features: 43


In [7]:
# Scale the features in int_feature
#scale_feature=['LotArea', 'YearBuilt', 'YearRemodAdd', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
#               'GrLivArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
#              'PoolArea', 'MiscVal']
#unscale_features=[item for item in int_feature if item not in scale_feature]

int_feature.remove('MSSubClass')

X_scale=StandardScaler().fit_transform(X_int[int_feature])
X_scale=pd.DataFrame(X_scale, columns=int_feature)

In [8]:
#Onehotencoder the class
Mssclass=OneHotEncoder().fit_transform(X_int[['MSSubClass']]).toarray()
Mssclass_col=['MSSubClass'+str(i) for i in range(16)]
Mssclass=pd.DataFrame(Mssclass, columns=Mssclass_col)

In [9]:
X_scale=X_scale.join(Mssclass)
X_scale.shape

(2919, 38)

# Deal with float data

In [10]:
float_feature

['LotFrontage',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea']

In [11]:
#substitute NaN with mean
imr=Imputer(missing_values='NaN', strategy='mean', axis=0)
X_float=imr.fit_transform(X[float_feature])
X_float=pd.DataFrame(X_float, columns=float_feature)
#standardlize float feature data
X_float=StandardScaler().fit_transform(X_float)
X_float=pd.DataFrame(X_float, columns=float_feature)
X_float.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageYrBlt,GarageCars,GarageArea
0,-0.202068,0.525202,0.580907,-0.29313,-0.934863,-0.444328,1.087023,-0.249895,1.000929,0.306528,0.3489
1,0.50187,-0.57225,1.178112,-0.29313,-0.629896,0.477111,-0.819679,3.822419,-0.085,0.306528,-0.059792
2,-0.06128,0.334828,0.097873,-0.29313,-0.288516,-0.299076,1.087023,-0.249895,0.92049,0.306528,0.627553
3,-0.436714,-0.57225,-0.494941,-0.29313,-0.047275,-0.671283,1.087023,-0.249895,0.799831,1.619961,0.785457
4,0.689587,1.387486,0.468931,-0.29313,-0.161068,0.211573,1.087023,-0.249895,0.88027,1.619961,1.686437


In [12]:
#combine the integer data and float data
X_scale=X_scale.join(X_float)
X_scale.shape

(2919, 49)

In [13]:
X_scale.head()

Unnamed: 0,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,FullBath,...,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageYrBlt,GarageCars,GarageArea
0,-0.217879,0.646183,-0.507284,-1.046258,-0.896833,-0.773861,1.207379,-0.101197,0.413547,0.781366,...,0.525202,0.580907,-0.29313,-0.934863,-0.444328,1.087023,-0.249895,1.000929,0.306528,0.3489
1,-0.072044,-0.063185,2.188279,-0.154764,0.395604,0.261075,-0.785025,-0.101197,-0.471891,0.781366,...,-0.57225,1.178112,-0.29313,-0.629896,0.477111,-0.819679,3.822419,-0.085,0.306528,-0.059792
2,0.137197,0.646183,-0.507284,-0.980221,-0.848965,-0.610718,1.235375,-0.101197,0.563755,0.781366,...,0.334828,0.097873,-0.29313,-0.288516,-0.299076,1.087023,-0.249895,0.92049,0.306528,0.627553
3,-0.078385,0.646183,-0.507284,1.859351,0.682812,-0.506205,0.978742,-0.101197,0.427382,-1.027363,...,-0.57225,-0.494941,-0.29313,-0.047275,-0.671283,1.087023,-0.249895,0.799831,1.619961,0.785457
4,0.518903,1.355551,-0.507284,-0.947203,-0.753229,-0.03717,1.671651,-0.101197,1.378042,0.781366,...,1.387486,0.468931,-0.29313,-0.161068,0.211573,1.087023,-0.249895,0.88027,1.619961,1.686437


# Deal with string data

In [14]:
X[string_feature].iloc[1,:]

MSZoning              RL
Street              Pave
Alley                NaN
LotShape             Reg
LandContour          Lvl
Utilities         AllPub
LotConfig            FR2
LandSlope            Gtl
Neighborhood     Veenker
Condition1         Feedr
Condition2          Norm
BldgType            1Fam
HouseStyle        1Story
RoofStyle          Gable
RoofMatl         CompShg
Exterior1st      MetalSd
Exterior2nd      MetalSd
MasVnrType          None
ExterQual             TA
ExterCond             TA
Foundation        CBlock
BsmtQual              Gd
BsmtCond              TA
BsmtExposure          Gd
BsmtFinType1         ALQ
BsmtFinType2         Unf
Heating             GasA
HeatingQC             Ex
CentralAir             Y
Electrical         SBrkr
KitchenQual           TA
Functional           Typ
FireplaceQu           TA
GarageType        Attchd
GarageFinish         RFn
GarageQual            TA
GarageCond            TA
PavedDrive             Y
PoolQC               NaN
Fence                NaN


In [15]:
non_ordial_feature=['MSZoning', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 
                    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
                   'MasVnrType', 'Foundation', 'Heating', 'MiscFeature', 'SaleType', 'SaleCondition', 'LandSlope']
len(non_ordial_feature)


17

In [16]:
ordial_feature=['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
                 'Functional', 'FireplaceQu', 'GarageType','GarageFinish',
                 'GarageQual','GarageCond', 'PavedDrive', 'PoolQC',
                 'Fence', 'Street', 'Alley', 'LotShape', 'LandContour',
               'Utilities', 'LotConfig']
len(ordial_feature)

26

In [17]:
len(string_feature)

43

In [18]:
#make sure we don't have any missing or duplicate features
non_ordial_set=set(non_ordial_feature)
ordial_set=set(ordial_feature)
int_set=set(int_feature)
float_set=set(float_feature)
int_set.intersection(float_set)

number_set=int_set.union(float_set)
str_set=non_ordial_set.union(ordial_set)
number_set.intersection(str_set)
com=number_set.union(str_set)

all_set=set(X.columns.values)
print(all_set.difference(com))
print(com.difference(all_set))

{'MSSubClass', 'MoSold', 'YrSold', 'Id'}
set()


In [29]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [44]:
#substitute NaN with most frequent
X_non_ordial=DataFrameImputer().fit_transform(X[non_ordial_feature])
X_non_ordial.iloc[2300,:]

MSZoning              RL
Neighborhood     NridgHt
Condition1          Norm
Condition2          Norm
BldgType            1Fam
HouseStyle        2Story
RoofStyle            Hip
RoofMatl         CompShg
Exterior1st      VinylSd
Exterior2nd      VinylSd
MasVnrType         Stone
Foundation         PConc
Heating             GasA
MiscFeature         Shed
SaleType             New
SaleCondition    Partial
LandSlope            Gtl
Name: 840, dtype: object

In [31]:
X_non_ordial.shape

(2919, 17)

# Label non_ordial features

In [45]:
class_mapping_1=[]
for f in non_ordial_feature:
    mapping={label: idx for idx, label in enumerate(np.unique(X_non_ordial[f]))}
    class_mapping_1.append(mapping)

for i, f in enumerate(non_ordial_feature):
    X_non_ordial[f]=X_non_ordial[f].map(class_mapping_1[i])
X_non_ordial.head()
    
#normalize the data
#X_non_ordial=StandardScaler().fit_transform(X_non_ordial)
#X_non_ordial=pd.DataFrame(X_non_ordial, columns=non_ordial_feature)
#X_non_ordial.head()

X_non_ordial=OneHotEncoder().fit_transform(X_non_ordial).toarray()
non_ordial_col=['non_ordial'+str(i) for i in range(143)]
X_non_ordial=pd.DataFrame(X_non_ordial, columns=non_ordial_col)
X_non_ordial.shape

(2919, 143)

In [46]:
X_scale=X_scale.join(X_non_ordial)
X_scale.shape

(2919, 192)

# Label ordial features 

In [47]:
X_ordial=DataFrameImputer().fit_transform(X[ordial_feature])
ordial_values_1={}

for f in ordial_feature:
    ordial_values_1[f]=list(X_ordial[f].unique())
ordial_values_1

{'Alley': ['Grvl', 'Pave'],
 'BsmtCond': ['TA', 'Gd', 'Fa', 'Po'],
 'BsmtExposure': ['No', 'Gd', 'Mn', 'Av'],
 'BsmtFinType1': ['GLQ', 'ALQ', 'Unf', 'Rec', 'BLQ', 'LwQ'],
 'BsmtFinType2': ['Unf', 'BLQ', 'ALQ', 'Rec', 'LwQ', 'GLQ'],
 'BsmtQual': ['Gd', 'TA', 'Ex', 'Fa'],
 'CentralAir': ['Y', 'N'],
 'Electrical': ['SBrkr', 'FuseF', 'FuseA', 'FuseP', 'Mix'],
 'ExterCond': ['TA', 'Gd', 'Fa', 'Po', 'Ex'],
 'ExterQual': ['Gd', 'TA', 'Ex', 'Fa'],
 'Fence': ['MnPrv', 'GdWo', 'GdPrv', 'MnWw'],
 'FireplaceQu': ['Gd', 'TA', 'Fa', 'Ex', 'Po'],
 'Functional': ['Typ', 'Min1', 'Maj1', 'Min2', 'Mod', 'Maj2', 'Sev'],
 'GarageCond': ['TA', 'Fa', 'Gd', 'Po', 'Ex'],
 'GarageFinish': ['RFn', 'Unf', 'Fin'],
 'GarageQual': ['TA', 'Fa', 'Gd', 'Ex', 'Po'],
 'GarageType': ['Attchd', 'Detchd', 'BuiltIn', 'CarPort', 'Basment', '2Types'],
 'HeatingQC': ['Ex', 'Gd', 'TA', 'Fa', 'Po'],
 'KitchenQual': ['Gd', 'TA', 'Ex', 'Fa'],
 'LandContour': ['Lvl', 'Bnk', 'Low', 'HLS'],
 'LotConfig': ['Inside', 'FR2', 'Corner', 'C

In [48]:
ordial_values={}
ordial_values['Alley']=['Grvl','Pave']
ordial_values['BsmtCond']=['Po', 'Fa', 'TA', 'Gd', 'Ex']
ordial_values['BsmtExposure']=['No', 'Mn', 'Av', 'Gd']
ordial_values['BsmtFinType1']=['Unf', 'LwQ', 'Rec', 'BLQ',
                              'ALQ', 'GLQ']
ordial_values['BsmtFinType2']=['Unf', 'LwQ', 'Rec', 'BLQ',
                              'ALQ', 'GLQ']
ordial_values['BsmtQual']=['Fa', 'TA', 'Gd', 'Ex']
ordial_values['BsmtCond']=['Po', 'Fa', 'TA', 'Gd', 'Ex']
ordial_values['CentralAir']=['N', 'Y']
ordial_values['Electrical']=['Mix', 'FuseP', 'FuseF', 'FuseA',
                            'SBrkr']
ordial_values['ExterCond']=['Po', 'Fa', 'TA', 'Gd', 'Ex']
ordial_values['ExterQual']=['Fa', 'TA', 'Gd', 'Ex']
ordial_values['Fence']=['MnWw', 'GdWo', 'MnPrv', 'GdPrv']
ordial_values['FireplaceQu']=['Po', 'Fa', 'TA', 'Gd', 'Ex']
ordial_values['Functional']=['Sev', 'Maj2', 'Maj1', 'Mod',
                            'Min2', 'Min1', 'Typ']
ordial_values['GarageCond']=['Po', 'Fa', 'TA', 'Gd', 'Ex']
ordial_values['GarageFinish']=['Unf', 'RFn', 'Fin']
ordial_values['GarageQual']=['Po', 'Fa', 'TA', 'Gd', 'Ex']
ordial_values['GarageType']=['Detchd','CarPort', 'BuiltIn',
                            'Basment', 'Attchd', '2Types']
ordial_values['HeatingQC']=['Po', 'Fa', 'TA', 'Gd', 'Ex']
ordial_values['KitchenQual']=['Fa', 'TA', 'Gd', 'Ex']
ordial_values['LandContour']=['Low', 'HLS', 'Bnk','Lvl']
ordial_values['LotConfig']=['FR3', 'FR2', 'CulDSac','Corner',
                           'Inside']
ordial_values['LotShape']=['IR3', 'IR2', 'IR1', 'Reg']
ordial_values['PavedDrive']=['N', 'P', 'Y']
ordial_values['PoolQC']=['Fa', 'Gd', 'Ex']
ordial_values['Street']=['Grvl', 'Pave']
ordial_values['Utilities']=['NoSeWa', 'AllPub']

#ordial mapping
ordial_mapping={}
for f in ordial_feature:
    mapping={}
    grades=ordial_values[f]
    n=len(ordial_values[f])
    
    for i in range(n):
        mapping[grades[i]]=i
    
    ordial_mapping[f]=mapping

In [49]:
#map the ordial string features to numbers
for f in ordial_feature:
    X_ordial[f]=X_ordial[f].map(ordial_mapping[f])

In [50]:
#standarize the number
X_ordial=StandardScaler().fit_transform(X_ordial)
X_ordial=pd.DataFrame(X_ordial, columns=ordial_feature)
X_ordial.head()

Unnamed: 0,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,HeatingQC,CentralAir,Electrical,...,GarageCond,PavedDrive,PoolQC,Fence,Street,Alley,LotShape,LandContour,Utilities,LotConfig
0,1.039805,-0.230047,0.631705,-0.009441,-0.627142,1.172283,-0.323051,0.885619,0.26829,0.278267,...,0.120517,0.31503,0.042784,0.019986,0.064249,-0.165696,0.7019,0.308471,0.018512,0.522154
1,-0.683756,-0.230047,0.631705,-0.009441,2.258765,0.690122,-0.323051,0.885619,0.26829,0.278267,...,0.120517,0.31503,0.042784,0.019986,0.064249,-0.165696,0.7019,0.308471,0.018512,-3.372655
2,1.039805,-0.230047,0.631705,-0.009441,0.334827,1.172283,-0.323051,0.885619,0.26829,0.278267,...,0.120517,0.31503,0.042784,0.019986,0.064249,-0.165696,-1.052249,0.308471,0.018512,0.522154
3,-0.683756,-0.230047,-0.806631,3.435397,-0.627142,0.690122,-0.323051,-0.158453,0.26829,0.278267,...,0.120517,0.31503,0.042784,0.019986,0.064249,-0.165696,-1.052249,0.308471,0.018512,-0.776115
4,1.039805,-0.230047,0.631705,-0.009441,1.296796,1.172283,-0.323051,0.885619,0.26829,0.278267,...,0.120517,0.31503,0.042784,0.019986,0.064249,-0.165696,-1.052249,0.308471,0.018512,-3.372655


In [51]:
X_scale=X_scale.join(X_ordial)
X_scale.shape

(2919, 218)

# Seperate training and test

In [52]:
X_new=X_scale.iloc[0:1460,:]
real_test_new=X_scale.iloc[1460:2920, :]

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=6)
X_train.iloc[2,:]

LotArea         -0.243495
OverallQual     -2.191288
OverallCond     -2.304326
YearBuilt        1.892369
YearRemodAdd     1.640173
1stFlrSF        -0.753468
2ndFlrSF        -0.785025
LowQualFinSF    -0.101197
GrLivArea       -1.258507
FullBath        -1.027363
HalfBath        -0.756321
BedroomAbvGr    -1.045801
KitchenAbvGr    -0.207698
TotRmsAbvGrd    -0.925062
Fireplaces      -0.924311
WoodDeckSF      -0.740760
OpenPorchSF     -0.702843
EnclosedPorch    1.197226
3SsnPorch       -0.103331
ScreenPorch     -0.285935
PoolArea        -0.063150
MiscVal         -0.089592
MSSubClass0      0.000000
MSSubClass1      1.000000
MSSubClass2      0.000000
MSSubClass3      0.000000
MSSubClass4      0.000000
MSSubClass5      0.000000
MSSubClass6      0.000000
MSSubClass7      0.000000
                   ...   
non_ordial139    0.000000
non_ordial140    1.000000
non_ordial141    0.000000
non_ordial142    0.000000
ExterQual       -0.683756
ExterCond       -0.230047
BsmtQual        -0.806631
BsmtCond    

# Linear Regression

In [70]:
from sklearn.linear_model import LinearRegression
lg=LinearRegression()
lg.fit(X_train, y_train)
print("log error of training:", np.sqrt(mean_squared_error(y_train, lg.predict(X_train))))
print("log error of test:", np.sqrt(mean_squared_error(y_test, lg.predict(X_test))))

log error of training: 0.110112608085
log error of test: 15662134492.5


In [140]:
y_pre=lg.predict(real_test_new)
y_pre=np.exp(y_pre)

In [25]:
submission=pd.DataFrame(real_test['Id'])
submission['SalePrice']=y_pre
submission.head()

submission.to_csv(path_or_buf= 'submission_lg_except_ordial_data.csv',index=False)

# Random Forest

In [57]:
from sklearn.model_selection import GridSearchCV
parameters={'max_features':list(np.arange(2,40,5))}
rf=RandomForestRegressor(n_estimators=200, min_samples_leaf=2)
clf=GridSearchCV(rf, parameters, verbose=1, cv=7)
clf.fit(X_train, y_train)

print("log error of training:", np.sqrt(mean_squared_error(y_train, np.asarray(clf.predict(X_train), dtype='float64'))))
print("log error of test:", np.sqrt(mean_squared_error(y_test, np.asarray(clf.predict(X_test),dtype='float64'))))

Fitting 7 folds for each of 8 candidates, totalling 56 fits


[Parallel(n_jobs=1)]: Done  56 out of  56 | elapsed:  1.3min finished


log error of training: 0.0737653365222
log error of test: 0.129737786384


In [58]:
clf.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=32, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [143]:
y_pre=clf.predict(real_test_new)
y_pre=np.exp(y_pre)

submission=pd.DataFrame(real_test['Id'])
submission['SalePrice']=y_pre
submission.head()

submission.to_csv(path_or_buf= 'submission_randomf_200est_37maxfeature_allfeatures.csv',index=False)

# XGB

In [59]:
from xgboost import XGBRegressor

In [60]:
xgb=XGBRegressor()
parameters={'max_depth':list(np.arange(3,6)), 'learning_rate': [0.01,0.03, 0.05, 0.1], 'n_estimators': [500]}
clf=GridSearchCV(xgb, parameters, verbose=1, cv=4)
clf.fit(X_train, y_train)

print("log error of training:", np.sqrt(mean_squared_error(y_train, np.asarray(clf.predict(X_train), dtype='float64'))))
print("log error of test:", np.sqrt(mean_squared_error(y_test, np.asarray(clf.predict(X_test),dtype='float64'))))

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  1.3min finished


log error of training: 0.0598968804576
log error of test: 0.130064230354


In [61]:
clf.best_estimator_

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [78]:
y_pre_rf=clf_rf.predict(real_test_new)
y_pre_rf=np.exp(y_pre_rf)

#submission=pd.DataFrame(real_test['Id'])
#submission['SalePrice']=y_pre
#submission.tail()
#submission.to_csv(path_or_buf= 'submission_XGB_500est_0.05rate_3depth_.csv',index=False)

# Ridge linear regression

In [None]:
from