In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt

# ========================
# Get the data
# ========================
# Subset of macro.csv columns to load, taken from:
# https://www.kaggle.com/robertoruiz/sberbank-russian-housing-market/dealing-with-multicollinearity/notebook
macro_cols = [
    "balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
    "micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
    "income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build",
]

df_train = pd.read_csv("../input/train.csv", parse_dates=['timestamp'])
df_test = pd.read_csv("../input/test.csv", parse_dates=['timestamp'])
df_macro = pd.read_csv("../input/macro.csv", parse_dates=['timestamp'], usecols=['timestamp'] + macro_cols)

# ========================
# ylog will be log(1+y), as suggested by https://github.com/dmlc/xgboost/issues/446#issuecomment-135555130
# ========================
# Both arrays below are aligned with the train / test rows BY POSITION, so every
# later step has to keep the original row order intact.
ylog_train_all = np.log1p(df_train['price_doc'].values)
id_test = df_test['id']

# Remove the identifier and the target so that only feature columns remain.
df_train = df_train.drop(['id', 'price_doc'], axis=1)
df_test = df_test.drop(['id'], axis=1)

# ========================
# Build df_all = (df_train+df_test).join(df_macro)
# ========================
# The first num_train rows of df_all are the training rows, the rest are the test rows.
num_train = len(df_train)
df_all = pd.concat([df_train, df_test])

# Use a plain left merge rather than pd.merge_ordered: merge_ordered sorts the
# result by the join key, which would reorder the rows whenever the input is not
# already sorted by timestamp and silently break the positional alignment with
# ylog_train_all, id_test and num_train. A left merge keeps the left rows in order.
df_all = df_all.merge(df_macro, on='timestamp', how='left')

# A duplicated timestamp in df_macro would multiply rows and shift the train/test boundary.
assert len(df_all) == num_train + len(df_test), "macro join changed the number of rows"
print(df_all.shape)

In [None]:
# Add month-year
# Encode each row's timestamp as the integer year * 100 + month (e.g. 201402).
month_year = (df_all.timestamp.dt.month + df_all.timestamp.dt.year * 100)

# Number of df_all rows that share each month-year code.
month_year_cnt_map = month_year.value_counts().to_dict()

# Per-row count for that row's month-year; computed once and reused below.
month_year_cnt = month_year.map(month_year_cnt_map)

df_all['month_year'] = month_year
df_all['month_year_cnt'] = month_year_cnt
# TODO: add a 'month_year_cnt_growth' feature derived from month_year_cnt
# (the original ratio expression was left unfinished).

# Plot the per-row count against the Series index to eyeball how volume changes.
fig, ax = plt.subplots()
ax.plot(month_year_cnt)
ax.set(title="Rows per month-year", xlabel="df_all row index", ylabel="rows in the same month-year")
plt.show()
print(type(month_year_cnt))


In [None]:
# Display, for every row, how many rows share that row's month-year code.
per_row_month_cnt = month_year.map(month_year_cnt_map)
per_row_month_cnt

In [None]:
# Same per-row month-year count, moved down by five row positions.
# Note that shift() works on rows here, not on calendar months; the first five
# entries have no predecessor and come out as NaN.
per_row_month_cnt = month_year.map(month_year_cnt_map)
per_row_month_cnt.shift(periods=5)

In [None]:
# Spot-check five randomly drawn rows of the month-year columns.
# No random_state is passed, so a different set of rows is shown on each run.
month_year_columns = ['timestamp', 'month_year', 'month_year_cnt']
df_all[month_year_columns].sample(n=5)

In [None]:
# (rows, columns) of the slice of df_all whose month_year code equals 201402;
# the row count can be compared against month_year_cnt for that code.
in_201402 = df_all['month_year'] == 201402
df_all.loc[in_201402].shape