In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb

import warnings

warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:
# Load the training set. The id and timestamp columns are dropped because the
# macro data is not used in this notebook.
train_df = pd.read_csv("../input/train.csv").drop(['id', 'timestamp'], axis=1)

# Turn the two-valued text fields into 1/0 flags (the original author's stated
# goal: improve one-hot encoding performance).
train_df = train_df.replace(
    to_replace=['Investment', 'OwnerOccupier', 'yes', 'no'],
    value=[1, 0, 1, 0],
)

print('Train set has {} samples and {} features.'.format(*train_df.shape))
train_df.head()

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals.
train_df.describe().round(2)

In [None]:
test_df = pd.read_csv("../input/test.csv")
# The ids are needed again when the submission file is written.
id_test = test_df['id']

# Same preprocessing as for the training set: drop id/timestamp and map the
# two-valued text fields to 1/0.
test_df = (
    test_df
    .drop(['id', 'timestamp'], axis=1)
    .replace(['Investment', 'OwnerOccupier', 'yes', 'no'], [1, 0, 1, 0])
)

print('Test set has {} samples and {} features.'.format(*test_df.shape))
test_df.head()

In [None]:
# Number of training-set columns that contain at least one missing value.
# `.any()` yields one boolean per column; summing the booleans counts them.
count = int(train_df.isnull().any().sum())

# The original format string had a single placeholder but was given two
# arguments, so the total column count was silently discarded.
print('There are {} columns (out of {}) in training set that have missing '
      'values.'.format(count, train_df.shape[1]))

# Perform one-hot encoding

In order to apply the XGBoost prediction model, we need to perform one-hot encoding on the categorical data.

In [None]:
# Every column pandas does not report as numeric is treated as categorical.
# NOTE: `_get_numeric_data` is a private pandas helper; it is kept here so the
# selected set of columns stays exactly what it was.
num_cols = train_df._get_numeric_data().columns

# Build the list in the frame's own column order. The previous
# `list(set(...) - set(...))` depended on string hashing, so the order of the
# generated dummy columns could change from one kernel run to the next.
cols = [col for col in train_df.columns if col not in num_cols]

train_df = pd.get_dummies(train_df, columns=cols)
train_df.head()

In [None]:
# Encode the test set with the same categorical column list (`cols`) that was
# derived from the training set.
test_df = pd.get_dummies(data=test_df, columns=cols)
test_df.head()

In [None]:
# Columns present in the encoded training frame but absent from the encoded
# test frame, ignoring the target column.
set(train_df.columns).difference(test_df.columns, ['price_doc'])

The test set is missing one column with an encoded city name, so we have to add it to the data frame in order for the predictor to operate correctly.

In [None]:
# No test row belongs to this sub-area, so its dummy column is all zeros.
# NOTE(review): assignment appends the column at the end of test_df, whereas in
# train_df it sits among the other sub_area dummies; column order must be
# aligned before the frames are converted to positional arrays.
test_df['sub_area_Poselenie Klenovskoe'] = 0

In [None]:
# Sale prices in ascending order, plotted against their rank.
sorted_prices = sorted(train_df['price_doc'])
plt.plot(range(len(sorted_prices)), sorted_prices)
plt.show()

In [None]:
# Histogram of the raw sale prices (KDE overlay switched off).
sns.distplot(a=train_df['price_doc'], kde=False)
plt.show()

In [None]:
# Despite its name, q1 is the 99th percentile of the sale price. Rows at or
# above it are left out of train_df_no.
q1 = train_df['price_doc'].quantile(0.99)

below_cutoff = train_df['price_doc'] < q1
train_df_no = train_df.loc[below_cutoff]

# Same rank plot as before, now on the trimmed data.
trimmed_sorted_prices = sorted(train_df_no['price_doc'])
plt.plot(range(len(trimmed_sorted_prices)), trimmed_sorted_prices)
plt.show()

In [None]:
# Histogram of the sale prices after trimming (KDE overlay switched off).
sns.distplot(a=train_df_no['price_doc'], kde=False)
plt.show()

In [None]:
# Split the trimmed training frame into target and features.
target = train_df_no['price_doc']
train = train_df_no.drop('price_doc', axis=1)

# The DMatrix objects below are built from bare arrays, so features are matched
# purely by position. The dummy column that was added to test_df by hand sits at
# the end of that frame, while in the training frame it sits among the other
# sub_area dummies; without re-ordering, every feature after it would be
# shifted by one in the test matrix. Reindexing to the training layout fixes
# the order, zero-fills any dummy column the test set lacks, and drops columns
# the model was never trained on.
test = test_df.reindex(columns=train.columns, fill_value=0)

xgtrain = xgb.DMatrix(train.values, target.values)
xgtest = xgb.DMatrix(test.values)

Using cross-validation, I try to find the optimal parameters for the algorithm based on the RMSE on the training set.

In [None]:
# Booster settings shared by the cross-validation run and the final fit.
# NOTE(review): newer XGBoost releases deprecate 'reg:linear' and 'silent' in
# favour of 'reg:squarederror' and 'verbosity' -- confirm against the installed
# version before changing them.
params = dict(
    eta=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

# Cross-validate for up to 1000 rounds with a 20-round early-stopping window,
# logging progress every 50 rounds.
cv_output = xgb.cv(
    params=params,
    dtrain=xgtrain,
    num_boost_round=1000,
    early_stopping_rounds=20,
    verbose_eval=50,
    show_stdv=False,
)

In [None]:
# Use the length of the cross-validation history as the number of boosting
# rounds for the final model.
num_boost_rounds = cv_output.shape[0]

# Same settings as in cross-validation, except that 'silent' is set to 0.
model = xgb.train(
    params=dict(params, silent=0),
    dtrain=xgtrain,
    num_boost_round=num_boost_rounds,
)
y_predict = model.predict(data=xgtest)

In [None]:
# Submission file: the saved test ids next to their predicted prices.
output = pd.DataFrame({'id': id_test}).assign(price_doc=y_predict)
output.to_csv('xgb_submission.csv', index=False)