Import needed libraries.

In [None]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import gregw
import seaborn as sns
sns.set(color_codes=True)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

Bring in train and test data.

In [None]:
# Read the training and test sets from CSV files in the working directory.
# NOTE(review): the two file names use different prefixes ("house_" vs
# "housing_") - confirm both match the files on disk.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("house_prices_train.csv", "housing_prices_test.csv")
)

Look for duplicated data.

In [None]:
# Count fully duplicated rows in each data set.
# DataFrame.duplicated() yields a boolean Series, so summing it counts the
# True entries directly. print() is called with a single pre-formatted string
# so the output is the same under Python 2 and Python 3.
print("Number in train that are duplicated: {}".format(train.duplicated().sum()))
print("Number in test that are duplicated: {}".format(test.duplicated().sum()))

Have a look at the sale prices of the homes.

In [None]:
# Summarise the target column, then plot its distribution.
sale_price = train['SalePrice']
lowest_price, highest_price = sale_price.min(), sale_price.max()
mean_price, median_price = sale_price.mean(), sale_price.median()

print('The cheapest house sold for ${:,.0f} and the most expensive for ${:,.0f}'.format(
    lowest_price, highest_price))
print('The average sales price is ${:,.0f}, while median is ${:,.0f}'.format(
    mean_price, median_price))

# Histogram of sale prices across 75 bins.
sale_price.hist(bins=75, rwidth=.8, figsize=(14,4))
plt.title('How expensive are houses?')
plt.show()

Look at how old the homes are.

In [None]:
# Report the range of construction years, then plot their distribution.
year_built = train['YearBuilt']
earliest_year, latest_year = year_built.min(), year_built.max()
print('Oldest house built in {}. Newest house built in {}.'.format(
    earliest_year, latest_year))

# Histogram of construction years across 14 bins.
year_built.hist(bins=14, rwidth=.9, figsize=(12,4))
plt.title('When were the houses built?')
plt.show()

Look at the seasonality of the transactions.

In [None]:
# Count the sales recorded for each (year sold, month sold) pair and show
# them as a bar chart, one bar per pair.
sales_per_month = train.groupby(['YrSold','MoSold']).Id.count()
sales_per_month.plot(kind='bar', figsize=(14,4))
# Title typo fixed: "When where" -> "When were".
plt.title('When were houses sold?')
plt.show()

Cut out the sale price of the train data.

In [None]:
# Remove the target column from the training frame and keep it as `y`.
# DataFrame.pop returns the column and deletes it from `train` in place.
y = train.pop('SalePrice')

Save the test IDs, combine the train and test data, and drop the Id column.

In [None]:
# Keep the test Ids for the submission file.
test_id = test['Id']

# Stack train on top of test so both go through the same preprocessing, then
# discard the Id column. The concatenation keeps each frame's own index
# labels (ignore_index is not set).
alldata = pd.concat([train, test], axis=0).drop(['Id'], axis=1)

Find columns with missing data.

In [None]:
# Names of every column that contains at least one null value.
has_missing = alldata.isnull().any()
alldata.columns[has_missing].tolist()

Change the dtype of some of the columns.

In [None]:
# Cast these columns to object dtype, one at a time.
# Presumably so they are treated as categories rather than numbers by the
# later pd.get_dummies call - TODO confirm.
for code_column in ('MSSubClass', 'MoSold', 'YrSold', 'BsmtFullBath', 'BsmtHalfBath'):
    alldata[code_column] = alldata[code_column].astype(object)

Fill in simple missing values.

In [None]:
# Columns whose missing entries are replaced by a fixed number.
alldata['LotFrontage'] = alldata['LotFrontage'].fillna(0)
alldata['GarageCars'] = alldata['GarageCars'].fillna(0)
# NOTE(review): 519 is an unexplained constant - confirm where it comes from.
alldata['GarageArea'] = alldata['GarageArea'].fillna(519)

# Columns whose missing entries are replaced by the literal string 'None'.
# NOTE(review): GarageYrBlt is included here, so after the fill it mixes its
# existing values with the string 'None' - confirm that is intended.
for none_column in ('Alley', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
                    'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC',
                    'Fence', 'MiscFeature'):
    alldata[none_column] = alldata[none_column].fillna('None')

Impute values for missing values.

In [None]:
# Impute the remaining missing values with helpers from the project-local
# `gregw` module (not shown in this notebook).
# Presumably impute_cal handles categorical columns and impute_reg numeric
# ones, and both modify `alldata` in place since no return value is kept -
# TODO confirm against gregw.
# The calls are kept in their original order; whether a later imputation
# depends on an earlier one cannot be told from here.
gregw.impute_cal('MSZoning', alldata)
gregw.impute_cal('Utilities', alldata)
gregw.impute_cal('Exterior1st', alldata)
gregw.impute_cal('Exterior2nd', alldata)
gregw.impute_cal('MasVnrType', alldata)
gregw.impute_reg('MasVnrArea', alldata)
gregw.impute_cal('BsmtQual', alldata)
gregw.impute_cal('BsmtCond', alldata)
gregw.impute_cal('BsmtExposure', alldata)
gregw.impute_cal('BsmtFinType1', alldata)
gregw.impute_reg('BsmtFinSF1', alldata)
gregw.impute_reg('BsmtFinSF2', alldata)
gregw.impute_reg('BsmtUnfSF', alldata)
gregw.impute_reg('TotalBsmtSF', alldata)
gregw.impute_cal('BsmtFinType2', alldata)
gregw.impute_cal('Electrical', alldata)
gregw.impute_cal('BsmtFullBath', alldata)
gregw.impute_cal('BsmtHalfBath', alldata)
gregw.impute_cal('KitchenQual', alldata)
gregw.impute_cal('Functional', alldata)
# NOTE(review): GarageCars nulls were already filled with 0 in an earlier
# cell, so this call likely has nothing left to impute - confirm which
# treatment is intended.
gregw.impute_reg('GarageCars', alldata)
gregw.impute_cal('SaleType', alldata)

Check for anything I may have missed.

In [None]:
# Print the per-column non-null counts and dtypes so that any column still
# containing missing values stands out.
alldata.info()

Create dummy values.

In [None]:
# One-hot encode the non-numeric columns. drop_first=True omits the first
# level of each encoded column, giving k-1 indicator columns for k levels.
# (The disabled, commented-out MiscVal drop was removed as dead code.)
dummies_data = pd.get_dummies(alldata, drop_first=True)

Split data back into the original train and test data.

In [None]:
# Split the encoded frame back into its train and test parts by position.
# The training rows were stacked first and `y` holds one target per training
# row, so len(y) gives the boundary without hard-coding the row count.
n_train = len(y)
train = dummies_data.iloc[:n_train]
test = dummies_data.iloc[n_train:]

Create a baseline using the default parameters of XGBoost.

In [None]:
# Baseline: mean 3-fold cross-validation score of an XGBRegressor with
# default settings.
baseline_model = XGBRegressor()
baseline_scores = cross_val_score(baseline_model, train, y, cv=3)
baseline_scores.mean()

Create a dictionary of parameters to be tuned and cross-validated.

In [None]:
# Hyperparameter grid for the search: each key is an XGBRegressor argument
# and each value lists the candidates to try. Only max_depth and
# colsample_bylevel have more than one candidate, giving 2 x 3 combinations.
parameters = {
    'base_score': [0.55],
    'max_depth': [1, 2],
    'n_estimators': [1008],
    'min_child_weight': [1],
    'colsample_bytree': [0.8],
    'colsample_bylevel': [0.79, 0.8, 0.81],
}

Now I will use a three fold cross validation to find the best parameters.

In [None]:
# Search the grid with 3-fold cross-validation, using all available cores
# (n_jobs=-1) and printing progress messages (verbose=1).
XGBR = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
XGBR.fit(train, y)

Tuning achieved a 1.9% improvement over the baseline.

In [None]:
# Report the best mean cross-validated score, then display the winning
# estimator. print() is called with a single pre-formatted string so the
# output is the same under Python 2 and Python 3.
print('Best score for data: {}'.format(XGBR.best_score_))
XGBR.best_estimator_

Now that I have found the best parameters, I will use them to make my predictions.

In [None]:
# Use the winning model straight from the grid search instead of re-typing
# its hyperparameters by hand. GridSearchCV refits the best parameter set on
# the full training data by default (refit=True), so best_estimator_ is
# already trained. This keeps the final model in sync with the search and
# avoids constructor arguments tied to one xgboost version.
tuned_final = XGBR.best_estimator_

# Predict sale prices for the test rows and hold them in a one-column frame.
answer = pd.DataFrame({'SalePrice': tuned_final.predict(test)})

Then I will add in the original IDs and save the CSV.

In [None]:
# Put the saved test Ids next to the predicted prices (aligned on index) and
# write the two-column result to CSV without the index.
submission_columns = [test_id, answer['SalePrice']]
final = pd.concat(submission_columns, axis=1)
final.to_csv('kaggle_house_prices.csv', index=False)