In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline

In [2]:
def num_missing(data):
    ms_counts = data.isnull().sum()
    return ms_counts[ms_counts!=0]

In [3]:
raw_train = pd.read_csv('train.csv')
raw_test = pd.read_csv('test.csv')

In [4]:
raw_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
raw_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [7]:
missing = num_missing(raw_train)
missing

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [8]:
raw_train[missing.index.values].dtypes

LotFrontage     float64
Alley            object
MasVnrType       object
MasVnrArea      float64
BsmtQual         object
BsmtCond         object
BsmtExposure     object
BsmtFinType1     object
BsmtFinType2     object
Electrical       object
FireplaceQu      object
GarageType       object
GarageYrBlt     float64
GarageFinish     object
GarageQual       object
GarageCond       object
PoolQC           object
Fence            object
MiscFeature      object
dtype: object

In [9]:
train = raw_train.copy()
test = raw_test.copy()

In [10]:
train['MSSubClass'] = train['MSSubClass'].astype('object')
test['MSSubClass'] = test['MSSubClass'].astype('object')

In [11]:
train = pd.get_dummies(train,drop_first=True)
test = pd.get_dummies(test,drop_first=True)
train, test = train.align(test,join='left',axis=1)

In [12]:
train_X = train.drop(columns=['Id','SalePrice'])
train_y = train.SalePrice
test_X = test.drop(columns=['Id','SalePrice'])

In [16]:
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
xgb1 = XGBRegressor()
parameters = {'learning_rate':[0.01],'n_estimators':[3460],
                                     'max_depth':[3], 'min_child_weight':[0],
                                     'gamma':[0], 'subsample':[0.7],
                                     'colsample_bytree':[0.7],
                                     'objective':['reg:linear'], 'nthread':[-1],
                                     'scale_pos_weight':[1], 'seed':[27],
                                     #'reg_alpha'=[0.00006]
             }
xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 2,
                        n_jobs = -1,
                        verbose=True)
model = Pipeline([
    ('scale',StandardScaler()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('reg', xgb_grid)
])
model.fit(train_X,train_y)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   29.9s finished
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('imputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='most_frequent',
                               verbose=0)),
                ('reg',
                 GridSearchCV(cv=2, error_score='raise-deprecating',
                              estimator=XGBRegressor(base_score=0.5,
                                                     booster='gbtree',
                                                     colsample_bylevel=...
                              iid='warn', n_jobs=-1,
                              param_grid={'colsample_bytree': [0.7],
                                          'gamma': [0], 'learning_rate': [0.01],
                                          'max_depth': [3],
                                          'min_child_weight': [0],
         

In [17]:
from sklearn.metrics import mean_absolute_error
preds = model.predict(train_X)
mean_absolute_error(preds,train_y)

6556.308609803083

In [18]:
test_preds = model.predict(test_X)
output = pd.DataFrame({'Id':test.Id,'SalePrice':test_preds})
output.to_csv('submission.csv',index=False)