In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Seed passed to the stochastic steps below (RandomForestRegressor, train_test_split).
random_state=7

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Show up to 500 columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 500)

# Reading Data

In [None]:
# Load the three input tables; `timestamp` is parsed as datetime in each one.
read_options = {'parse_dates': ['timestamp']}
train = pd.read_csv('input/train.csv', **read_options)
test = pd.read_csv('input/test.csv', **read_options)
macro = pd.read_csv('input/macro.csv', **read_options)

# Keep the test ids aside; they are written into the submission files later.
test_id = test['id']

for message, frame in (
    ('The shape of training data is', train),
    ('The shape of test data is', test),
    ('The shape of macro data is', macro),
):
    print(message, frame.shape)

# Number of training and test examples (n_train is used later to split the
# stacked train+test frame by position).
n_train = len(train)
n_test = len(test)

# fts holds the feature names: every train column except the first and the
# last one (presumably `id` and the `price_doc` target -- TODO confirm the
# column order of train.csv).
fts = train.columns[1:-1].tolist()

# The models are fitted on log(1 + price); plot that distribution and keep it
# as the regression target.
log_price = np.log(1 + train['price_doc'])
plt.hist(log_price)
target = log_price

# Combine train and test and Change Categorical Data to Dummies

In [None]:
# Stack the train and test feature columns (label slice from the first to the
# last name in fts) so both sets go through the same preprocessing.
feature_span = slice(fts[0], fts[-1])
raw_data = pd.concat([frame.loc[:, feature_span] for frame in (train, test)])
print('shape of train plus test', raw_data.shape)

# List the macro columns that were read with object dtype.
macro_dtypes = macro.dtypes
is_object = macro_dtypes == 'object'
print(macro_dtypes[is_object])

In [None]:
# Show the distinct values of three macro columns. The values do not look
# meaningful, so the columns are dropped before running PCA.
suspect_columns = ['child_on_acc_pre_school', 'modern_education_share',
                   'old_education_build_share']
for column in suspect_columns:
    print(column, macro[column].unique())

print('shape of macro data frame', macro.shape)
macro = macro.drop(suspect_columns, axis=1)
print('shpe of macro afeter dropping', macro.shape)

In [None]:
# Set the timestamp column aside so only the remaining columns are imputed
# and scaled; it is re-attached to the PCA output later.
time_stamp = macro['timestamp']
macro = macro.drop('timestamp', axis=1)
print('shape of macro', macro.shape)

# Impute missing values with the column medians, reporting the NaN count
# before and after.
print('number of nas in macro', macro.isnull().values.sum())
column_medians = macro.median()
macro = macro.fillna(column_medians)
print('number of nas in macro', macro.isnull().values.sum())

# Standardise the macro columns (StandardScaler defaults) ahead of PCA.
ss = StandardScaler().fit(macro)
macro_std = ss.transform(macro)

In [None]:
# do PCA on macro data
from sklearn.decomposition import PCA

# Number of principal components kept from the standardised macro columns.
n_components=30
# Seed the decomposition with the notebook-wide random_state so the components
# are reproducible even if scikit-learn selects its randomized SVD solver.
pca=PCA(n_components=n_components, random_state=random_state)
macro_pca=pca.fit_transform(macro_std)
# Total fraction of variance retained by the kept components.
print('explained_variance_ratio', pca.explained_variance_ratio_.sum())

In [None]:
# Wrap the PCA scores in a DataFrame with string column names
# ('macro_pca_0', 'macro_pca_1', ...). The default integer labels would end up
# next to the string-named feature columns after the merge, and recent
# scikit-learn versions refuse DataFrames whose column labels mix types.
# Labels that are already strings are left untouched, so re-running the cell
# does not rename anything twice.
macro_pca = pd.DataFrame(macro_pca).rename(
    columns=lambda label: label if isinstance(label, str) else 'macro_pca_{}'.format(label)
)
# Re-attach the timestamps (aligned by row position) as the join key.
macro_pca['timestamp'] = time_stamp
macro_pca.head()

In [None]:
# Attach the macro principal components to each train/test row by matching
# on timestamp; the left join keeps every row of raw_data.
# NOTE(review): the later positional train/test split assumes macro has at
# most one row per timestamp (otherwise rows would be duplicated) -- confirm.
raw_data = raw_data.merge(macro_pca, how='left', on='timestamp')
print('shape of the merged data', raw_data.shape)

In [None]:
# One-hot encode the categorical columns of the merged frame.
data = pd.get_dummies(raw_data)
print('shape after get_dummies', data.shape)

# Replace the full timestamp by its calendar year, stored as an integer.
year_values = data['timestamp'].dt.year.astype(int)
data['timestamp'] = year_values

# Fill NaN with median values

In [None]:
# get_dummies ignores NaN in categorical columns, so any NaN still present
# are in numeric columns; fill them with the column median.
missing_before = data.isnull().values.sum()
print('number of NaN in train and test', missing_before)

column_medians = data.median()
data = data.fillna(column_medians)

missing_after = data.isnull().values.sum()
print('number of NaN in train and test', missing_after)

# Try StandardScaler before fitting

In [None]:
from sklearn.preprocessing import StandardScaler
# Scale every column to unit variance without centring it, then wrap the
# resulting array in a DataFrame again.
ss = StandardScaler(with_std=True, with_mean=False)
data_std = pd.DataFrame(ss.fit_transform(data))

# Split by position: the first n_train rows form the training part and the
# remaining rows the test part.
train_std = data_std.iloc[:n_train]
test_std = data_std.iloc[n_train:]

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Random forest of 500 trees, seeded with the notebook-wide random_state and
# fitted on the scaled training rows against the log-transformed target.
rf_settings = dict(n_estimators=500, random_state=random_state)
clf1 = RandomForestRegressor(**rf_settings)
clf1.fit(train_std, target)

In [None]:
# The forest was fitted on log(1 + price), so its predictions are mapped back
# to prices with expm1 (exp(x) - 1). A plain exp would leave every predicted
# price 1 too high.
pre_rf = np.expm1(clf1.predict(test_std))

# Write the submission file: one row per test id with its predicted price.
test_id = np.array(test_id)
submission = pd.DataFrame({'id': test_id, 'price_doc': pre_rf})
submission.to_csv(path_or_buf='170517submission_randomforest_with_macro_data.csv', index=False)

In [None]:
# Preview the first rows of the random-forest submission built above.
submission.head()

# XGBOOST

In [None]:
import xgboost as xgb

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows (seeded split) as a validation set.
splits = train_test_split(train_std, target, test_size=0.2, random_state=random_state)
train_part, val_train, target_part, val_target = splits

for message, part in (
    ('shape of train_part', train_part),
    ('shape of target_part', target_part),
    ('shape of val_train', val_train),
    ('shape of val_target', val_target),
):
    print(message, part.shape)

In [None]:
# Wrap the training split, the validation split and the test rows as DMatrix
# objects (the second argument is the label).
dtrain=xgb.DMatrix(train_part, target_part)
dval=xgb.DMatrix(val_train, val_target)
dtest=xgb.DMatrix(test_std)

# NOTE(review): 'reg:linear' and 'silent' are deprecated spellings in recent
# xgboost releases ('reg:squarederror' / 'verbosity'); confirm against the
# installed version before renaming them.
xgb_params = {
    'eta': 0.5,
    'max_depth': 6,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0
}

# Train with early stopping: stop once the validation RMSE has not improved
# for 20 consecutive rounds, logging every 20th round.
model=xgb.train(xgb_params, dtrain, num_boost_round=1300, 
                evals=[(dval, 'validation')], early_stopping_rounds=20,
               verbose_eval=20)
# best_iteration is the 0-based index of the best round, so the number of
# rounds needed to reproduce it is one more than that index.
num_boost_round=model.best_iteration + 1

In [None]:
# DMatrix over all scaled training rows with the log-transformed target as
# label, used to refit the final model on the full training set.
Ddata_std = xgb.DMatrix(data=train_std, label=target)

In [None]:
#use xgb.cv to do cross validation
#xgb.cv(xgb_params, Ddata_std, num_boost_round=20, 
#       nfold=5, verbose_eval=5)

In [None]:
# Refit on the full training set with the same parameters, using the number
# of boosting rounds chosen during the early-stopping run.
full_model = xgb.train(
    params=xgb_params,
    dtrain=Ddata_std,
    num_boost_round=num_boost_round,
)

In [None]:
# Predict the test rows with the refitted model; the values are still on the
# log(1 + price) scale of the training target.
pre_xgb = full_model.predict(data=dtest)

# Histogram of the log-scale predictions as a quick sanity check.
plt.hist(pre_xgb)

In [None]:
# The model was fitted on log(1 + price), so map its predictions back to
# prices with expm1 (exp(x) - 1). A plain exp would leave every predicted
# price 1 too high.
pre_xgb = np.expm1(pre_xgb)

# Write the xgboost submission file and preview its first rows.
submission = pd.DataFrame({'id': test_id, 'price_doc': pre_xgb})
submission.to_csv(path_or_buf='170518submission_xgb_with_30PCA_macr0.csv', index=False)
submission.head()