### A first look at XGBoost (xgb) in this competition.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn import model_selection, preprocessing
from sklearn.model_selection import cross_val_score, train_test_split
%matplotlib inline
import xgboost as xgb
import seaborn as sns

In [None]:
# Load the raw competition tables from the Kaggle input directory.
total = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
macro = pd.read_csv('../input/macro.csv')

# Attach the macro columns to the training rows by timestamp; the left join
# keeps every row of `total`. The id and the target are not features.
df_total = pd.merge(total, macro, on='timestamp', how='left')
df_total.drop(['id', 'price_doc'], axis=1, inplace=True)
# The target is modelled as log1p(price_doc). `.values` is used instead of
# `Series.as_matrix()`, which was removed in pandas 1.0.
Ytotal = np.log1p(total['price_doc']).values

# Same preparation for the test rows (there is no target column to drop).
df_test = pd.merge(test, macro, on='timestamp', how='left')
df_test.drop('id', axis=1, inplace=True)
# Stack train and test under the outer index keys 'total' / 'test' so they
# can be processed together and split apart again by key.
df_all = pd.concat([df_total, df_test], keys=['total', 'test'])

print ('df_total: ', df_total.shape)
print ('df_test: ', df_test.shape)
print ('macro: ', macro.shape)
print ('df_all: ', df_all.shape)

In [None]:
def missingPattern(df):
    """Split the columns of ``df`` by type and summarise its missing values.

    Prints the number of categorical/numerical columns and the number of
    columns that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    numGroup : list
        Names of the numeric columns, in ``df`` column order.
    catGroup : list
        Names of all remaining columns, in ``df`` column order.
    missing_data : pandas.DataFrame
        One row per column that has at least one missing value, sorted by
        descending ``Count``, with columns ``Count`` (number of nulls),
        ``Percent`` (``Count`` divided by the number of rows) and ``Type``
        (the column dtype).
    """
    # NOTE: `_get_numeric_data` is a private pandas helper; it is kept so the
    # numeric/categorical split stays exactly what this notebook relied on.
    numGroup = list(df._get_numeric_data().columns)
    # Keep the DataFrame's own column order. A set difference would make the
    # order (and the downstream one-hot column order) vary between runs.
    numeric_names = set(numGroup)
    catGroup = [col for col in df.columns if col not in numeric_names]
    print('Total categorical/numerical variables are %s/%s' % (len(catGroup), len(numGroup)))

    # Per-column missing-value statistics.
    n = df.shape[0]
    count = df.isnull().sum()
    percent = 1.0 * count / n
    dtype = df.dtypes

    missing_data = pd.concat([count, percent, dtype], axis=1, keys=['Count', 'Percent', 'Type'])
    missing_data = missing_data.sort_values('Count', ascending=False)
    # Only columns that actually have missing values are reported.
    missing_data = missing_data[missing_data['Count'] > 0]
    print('Total missing columns is %s' % len(missing_data))

    return numGroup, catGroup, missing_data

# numGroup, catGroup, missing_data = missingPattern(df_all)
# missing_data

In [None]:
# Columns removed from the combined frame before modelling.
drop_list = ['timestamp']
df_all.drop(drop_list, axis=1, inplace=True)

numGroup, catGroup, _ = missingPattern(df_all)

# To use a hand-picked set of numeric features instead, overwrite numGroup
# here before the split below.

# Split the combined frame back into train ('total') and test rows using the
# outer index key, separately for numeric and categorical columns.
# `.loc` is used instead of `.ix`, which was removed in pandas 1.0.
df_total_num = df_all.loc['total', numGroup]
df_test_num = df_all.loc['test', numGroup]
df_total_cat = df_all.loc['total', catGroup]
df_test_cat = df_all.loc['test', catGroup]
print('Current training numerical variables count is %d '  %(df_total_num.shape[1]))
print('Current training categorical variables count is %d '  %(df_total_cat.shape[1]))
print('Current test numerical variables count is %d '  %(df_test_num.shape[1]))
print('Current test categorical variables count is %d '  %(df_test_cat.shape[1]))

In [None]:
# One-hot encode the categorical variables.
# Train and test are encoded together so both end up with the same dummy
# columns, then split apart again by the outer index key.
df_concat_cat = pd.concat([df_total_cat, df_test_cat], keys=['total', 'test'])
# Encode once and reuse the result for both splits. `.loc` is used instead of
# `.ix`, which was removed in pandas 1.0.
df_dummies = pd.get_dummies(df_concat_cat)
df_total_cat = df_dummies.loc['total']
df_test_cat = df_dummies.loc['test']


print('After one-hot encoding, total training cat variables are %d' %(df_total_cat.shape[1]))
print('After one-hot encoding, total test cat variables are %d' %(df_test_cat.shape[1]))



In [None]:
# Join the numeric and one-hot features side by side (axis=1) so each row
# stays one observation. The default axis=0 would stack the two frames as
# extra rows, so the row count would no longer match the labels in Ytotal.
Xtrain = pd.concat([df_total_num, df_total_cat], axis=1)
Xtest = pd.concat([df_test_num, df_test_cat], axis=1)

# Wrap the feature matrices for xgboost; only the training matrix has labels.
dtrain = xgb.DMatrix(Xtrain, Ytotal)
dtest = xgb.DMatrix(Xtest)

In [None]:
# Booster settings shared by cross-validation and the final fit.
xgb_params = dict(
    eta=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

# Cross-validate for up to 1000 rounds with early stopping after 20 rounds,
# reporting progress every 50 rounds.
cv_output = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=1000,
    early_stopping_rounds=20,
    verbose_eval=50,
    show_stdv=False,
)
# Plot the train and held-out RMSE curves from the CV log.
cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()

In [None]:
# Use as many boosting rounds as there are rows in the CV log (presumably the
# log is truncated at the early-stopping point; confirm in the xgboost docs).
num_boost_rounds = cv_output.shape[0]
# Refit on the full training matrix with the same settings, except silent=0.
model = xgb.train(
    {**xgb_params, 'silent': 0},
    dtrain,
    num_boost_round=num_boost_rounds,
)

In [None]:
# Tall single-axes figure so up to 50 feature labels fit.
fig = plt.figure(figsize=(8, 13))
ax = fig.add_subplot(1, 1, 1)
# Draw the importance chart for the trained model, limited to 50 features.
xgb.plot_importance(model, ax=ax, height=0.5, max_num_features=50)

### Prediction

In [None]:
# Ids come from the raw test table, since 'id' was dropped from the features.
testId = list(test['id'])
# The model was fit on log1p(price_doc), so its predictions are on the log
# scale. Invert with expm1 to write prices in the original units.
Ypred = np.expm1(model.predict(dtest))
output = pd.DataFrame({'id': testId, 'price_doc': Ypred})
output.to_csv('xgb1Sub.csv', index=False)
# Kept as the last expression so the notebook displays a preview.
output.head()