In [None]:
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [None]:
# Macro-economic indicators to pull from macro.csv alongside the join key.
macro_cols = [
    "balance_trade",
    "balance_trade_growth",
    "eurrub",
    "average_provision_of_build_contract",
    "micex_rgbi_tr",
    "micex_cbi_tr",
    "deposits_rate",
    "mortgage_value",
    "mortgage_rate",
    "income_per_cap",
    "rent_price_4+room_bus",
    "museum_visitis_per_100_cap",
    "apartment_build",
    'usdrub',
]

# Raw inputs: the two listing tables are read whole, while the macro table is
# restricted to 'timestamp' plus the indicators listed above.
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')
macro_df = pd.read_csv(
    '../input/macro.csv',
    usecols=['timestamp', *macro_cols],
)

# Show (rows, columns) of the training table.
train_df.shape

In [None]:
# Attach the selected macro indicators to every training row, matching on the
# 'timestamp' column and keeping all rows of train_df (left join).
# NOTE: this rebinds train_df, so re-running the cell without reloading the
# CSVs merges the macro columns a second time.
train_df = pd.merge_ordered(
    left=train_df,
    right=macro_df,
    how='left',
    on='timestamp',
)
train_df.head(5)

In [None]:
def _month_index(stamp):
    """Return the number of months between August 2011 and ``stamp``.

    Characters 0-3 of ``stamp`` are read as the year and characters 5-6 as the
    month, so August 2011 maps to 0, September 2011 to 1, and so on.
    Assumes ``stamp`` is a 'YYYY-MM-...' string -- TODO confirm against the CSV.
    """
    year = int(stamp[0:4])
    month_of_year = int(stamp[5:7])
    return (year - 2011) * 12 + month_of_year - 8


# Running month counter derived from the transaction timestamp.
train_df['month'] = train_df['timestamp'].map(_month_index)

In [None]:
# Preview the first five rows of train_df.
train_df.head(n=5)

In [None]:
# Only the last expression of a cell is rendered, so the first two summaries
# are passed to display() (provided by the Jupyter kernel) explicitly;
# previously they were computed and thrown away.

# Numeric summary of test listings whose total area lies strictly in (1, 200).
display(test_df[(test_df.full_sq < 200) & (test_df.full_sq > 1)].describe())

# Summary of the object (string) columns of the test set.
display(test_df.describe(include=['O']))

# Mean sale price per calendar year. No visible cell creates a 'year' column
# on train_df, so grouping by ['year'] would raise a KeyError unless the CSV
# itself supplies one; the key is therefore derived from the first four
# characters of 'timestamp' (the same slice the 'month' feature reads as the
# year).
sale_year = train_df['timestamp'].str[:4].astype(int).rename('year')
train_df.groupby(sale_year)['price_doc'].mean()

In [None]:
# Inspect training rows in sub_area 'Hovrino' whose life_sq equals 19.
# The filter is built once and reused for both views.
hovrino_life19 = train_df[(train_df.life_sq == 19) & (train_df.sub_area == 'Hovrino')]

# Narrow view (area, district, price) of up to 50 matching rows. A cell only
# renders its last expression, so this one is shown with display() (provided
# by the Jupyter kernel); previously its result was silently discarded.
display(hovrino_life19[['full_sq', 'life_sq', 'sub_area', 'price_doc']].head(50))

# Full-width view of the first 20 matching rows.
hovrino_life19.head(20)

In [None]:
# Keep rows with full_sq below 200 and at least one room.
keep_rows = (train_df['full_sq'] < 200) & (train_df['num_room'] > 0)

# Four hand-picked features for the retained rows.
X_train = train_df.loc[
    keep_rows,
    ['full_sq', 'sport_objects_raion', 'num_room', 'kitch_sq'],
]

# Target: log1p of the sale price as an (n, 1) column array.
Y_train = np.log1p(train_df.loc[keep_rows, ['price_doc']].values)

In [None]:
# Split the merged training table by dtype so the string columns can be
# integer-encoded while the numeric columns pass through unchanged.
train_df_numeric = train_df.select_dtypes(exclude=['object'])
train_df_obj = train_df.select_dtypes(include=['object']).copy()

# Replace each string column with the integer codes from pd.factorize
# (missing values receive the code -1).
for column in train_df_obj:
    train_df_obj[column] = pd.factorize(train_df_obj[column])[0]

# Build the encoded table once and split it positionally into a training part
# and a hold-out part. The hold-out now starts at row 25000: the previous
# [25001:] slice left row 25000 out of both parts.
encoded_df = pd.concat([train_df_numeric, train_df_obj], axis=1)
train_df_values = encoded_df.iloc[:25000]
test_df_values = encoded_df.iloc[25000:]

In [None]:
# Feature matrix made of every non-object column of train_df.
# NOTE(review): nothing is dropped here, so any numeric target/identifier
# columns (price_doc, id) remain among the features -- confirm this is
# intended.
X_train = train_df.select_dtypes(exclude='object')

# Target: log1p of the sale price as an (n, 1) column array.
Y_train = np.log1p(train_df[['price_doc']].values)

X_train.shape

In [None]:
# Training features: the encoded training slice without the target, the row
# identifier and the timestamp column.
X_train = train_df_values.drop(columns=['price_doc', 'id', 'timestamp'])

# Training target: log1p of the sale price as an (n, 1) column array.
Y_train = np.log1p(train_df_values[['price_doc']].values)

X_train.shape

In [None]:
# Hold-out features: same column removal as for the training features, applied
# to test_df_values.
X_test = test_df_values.drop(columns=['price_doc', 'id', 'timestamp'])

# Hold-out target: log1p of the sale price as an (n, 1) column array.
Y_test = np.log1p(test_df_values[['price_doc']].values)

X_test.shape

In [None]:
# Wrap features and log-price labels in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=Y_train)
dtest = xgb.DMatrix(data=X_test, label=Y_test)

In [None]:
# Booster configuration.
# NOTE(review): newer XGBoost releases appear to name the objective
# 'reg:squarederror' and to replace 'silent' with 'verbosity' -- confirm
# against the installed version before changing either key.
xgb_params = dict(
    eta=0.05,
    max_depth=5,
    subsample=1.0,
    colsample_bytree=0.7,
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

# Train for up to 200 rounds, reporting RMSE on dtest (labelled 'val') every
# 10 rounds and stopping once it has not improved for 30 rounds.
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=200,
    evals=[(dtest, 'val')],
    early_stopping_rounds=30,
    verbose_eval=10,
)

# Round index recorded by early stopping as the best one.
num_boost_round = model.best_iteration

# Predictions for the hold-out rows; the labels were log1p prices, so these
# are on the same log1p scale.
Y_pred = model.predict(dtest)

In [None]:
# Tall single-axes figure so up to 40 feature labels stay readable.
fig, ax = plt.subplots(figsize=(8, 16))

# Bar chart of the 40 most important features of the trained booster.
xgb.plot_importance(model, ax=ax, height=0.5, max_num_features=40)

In [None]:
# Y_train was built with np.log1p(price_doc) and Y_pred comes from a model
# fitted on those labels, so both are already on the log1p scale. Applying
# np.log1p again (as this cell used to) produced log(1 + log(1 + price)).
# The names are kept as plain aliases so any later reference still resolves.
logY_train = Y_train
logY_pred = Y_pred

# Hold-out RMSE on the log1p scale. Arguments are passed in scikit-learn's
# documented (y_true, y_pred) order, and Y_test is flattened from (n, 1) to
# (n,) to match the prediction vector.
np.sqrt(mean_squared_error(Y_test.ravel(), Y_pred))