In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd
from sklearn.model_selection import KFold

In [None]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [None]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'] != 2013]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [None]:
train_item_ids = sales_train['item_id'].unique()
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [None]:
combinations = []

for dbn in range(np.min(train_blocks), np.max(train_blocks)+1):
    sales = sales_train[sales_train.date_block_num==dbn]
    dbn_combos = list(product(sales.shop_id.unique(), sales.item_id.unique(), [dbn]))
    for combo in dbn_combos:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])

In [None]:
len(all_combos)

In [None]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"y"})

In [None]:
#

In [None]:
training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)


training['y'] = training['y'].clip(0,20)
training['y'] = training['y'].astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

In [None]:
def get_mean_encoding(df, group_cols, target):
    cumsum = df.groupby(group_cols)[target].cumsum() - df[target]
    cumcnt = df.groupby(group_cols).cumcount()
    return cumsum/cumcnt

training['item_me'] = pd.to_numeric(get_mean_encoding(training, ['item_id'], 'y'), downcast='float')
training['shop_me'] = pd.to_numeric(get_mean_encoding(training, ['shop_id'], 'y'), downcast='float')
training['category_me'] = pd.to_numeric(get_mean_encoding(training, ['item_category_id'], 'y'), downcast='float')
training['shop_category_me'] = pd.to_numeric(get_mean_encoding(training, ['shop_id', 'item_category_id'], 'y'), downcast='float')
training['shop_item_me'] = pd.to_numeric(get_mean_encoding(training, ['shop_id', 'item_id'], 'y'), downcast='float')

training.fillna(0,inplace=True)

In [None]:
cols = ['item_id','date_block_num']
item_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'item_block_units'})

training = training.merge(item_block_units, on=cols, how='left').fillna(0)
del item_block_units
gc.collect()
#
cols = ['shop_id','date_block_num']
shop_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'shop_block_units'})

training = training.merge(shop_block_units, on=cols, how='left').fillna(0)
del shop_block_units
gc.collect()
#
cols = ['item_category_id','date_block_num']
cat_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'cat_block_units'})

training = training.merge(cat_block_units, on=cols, how='left').fillna(0)
del cat_block_units
gc.collect()
#

cols = ['shop_id', 'item_category_id','date_block_num']
shop_cat_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'shop_cat_block_units'})

training = training.merge(shop_cat_block_units, on=cols, how='left').fillna(0)
del shop_cat_block_units
gc.collect()
#
cols = ['shop_id', 'item_id','date_block_num']
shop_item_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'shop_item_block_units'})

training = training.merge(shop_item_block_units, on=cols, how='left').fillna(0)
del shop_item_block_units
gc.collect()

In [None]:
number_of_items = sales_train['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = sales_train['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = sales_train['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = sales_train['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = sales_train['item_cnt_day'].sum()
print("total_sales:", total_sales)
average_price = sales_train['item_price'].mean()
print("average_price:", average_price)

In [212]:
training['item_units'] = training.groupby(['date_block_num'])['item_block_units'].transform(np.sum)
training['item_max_units_block'] = training.groupby(['item_id'])['item_block_units']\
        .transform(np.max)
training['item_min_units_block'] = training.groupby(['item_id'])['item_block_units']\
        .transform(np.min)
training['item_minmax_mean'] = training[['item_max_units_block', 'item_min_units_block']].mean(axis=1)

for q in [0.25,0.50,0.75]:
    name = 'item_minmax_q' + str(q)
    training[name] =  training[['item_min_units_block','item_max_units_block']].quantile(q,axis=1)


#
training['shop_units'] = training.groupby(['date_block_num'])['shop_block_units'].transform(np.sum)
training['shop_max_units_block'] = training.groupby(['shop_id'])['shop_block_units']\
        .transform(np.max)
training['shop_min_units_block'] = training.groupby(['shop_id'])['shop_block_units']\
        .transform(np.min)
training['shop_minmax_mean'] = training[['shop_max_units_block', 'shop_min_units_block']].mean(axis=1)
for q in [0.25,0.50,0.75]:
    name = 'shop_minmax_q' + str(q)
    training[name] =  training[['shop_min_units_block','shop_max_units_block']].quantile(q,axis=1)

#
training['cat_units'] = training.groupby(['date_block_num'])['cat_block_units'].transform(np.sum)
training['cat_max_units_block'] = training.groupby(['item_category_id'])['cat_block_units']\
        .transform(np.max)
training['cat_min_units_block'] = training.groupby(['item_category_id'])['cat_block_units']\
        .transform(np.min)
training['cat_minmax_mean'] = training[['cat_max_units_block', 'cat_min_units_block']].mean(axis=1)
#
training['shop_cat_units'] = training.groupby(['date_block_num'])['shop_cat_block_units'].transform(np.sum)
training['shop_cat_max_units_block'] = training.groupby(['shop_id', 'item_category_id'])['shop_cat_block_units']\
        .transform(np.max)
training['shop_cat_min_units_block'] = training.groupby(['shop_id', 'item_category_id'])['shop_cat_block_units']\
        .transform(np.min)
training['shop_cat_minmax_mean'] = training[['shop_cat_max_units_block', 'shop_cat_min_units_block']].mean(axis=1)
#
training['shop_item_units'] = training.groupby(['date_block_num'])['shop_item_block_units'].transform(np.sum)
training['shop_item_max_units_block'] = training.groupby(['shop_id', 'item_id'])['shop_item_block_units']\
        .transform(np.max)
training['shop_item_min_units_block'] = training.groupby(['shop_id', 'item_id'])['shop_item_block_units']\
        .transform(np.min)
training['shop_item_minmax_mean'] = training[['shop_item_max_units_block', 'shop_item_min_units_block']].mean(axis=1)

In [246]:
sales_train.groupby(['item_id','date_block_num'],as_index=False)['item_price'].mean().rename(columns={'item_price':'mean_price_block'})

Unnamed: 0,item_id,date_block_num,mean_price_block
0,0,20,58.000000
1,1,15,4490.000000
2,1,18,4490.000000
3,1,19,4490.000000
4,1,20,4490.000000
5,1,21,4490.000000
6,2,19,58.000000
7,2,22,58.000000
8,3,18,100.000000
9,3,19,58.000000


In [247]:
mean_price_block = sales_train.groupby(['item_id','date_block_num'],as_index=False)['item_price']\
    .mean().rename(columns={'item_price':'mean_price_block'})
training = training.merge(mean_price_block, on=['item_id','date_block_num'], how='left')
training['item_max_price'] = training['item_id'].map(sales_train.groupby(['item_id'])['item_price'].max())
training['item_min_price'] = training['item_id'].map(sales_train.groupby(['item_id'])['item_price'].min())



for q in [0.25,0.50,0.75]:
    name = 'item_price_minmax_q' + str(q)
    training[name] =  training[['item_min_price','item_max_price']].quantile(q,axis=1)

In [252]:
training['price_1'] = training['mean_price_block'] * 100 / training['item_max_price']

In [None]:
training['item_share_of_total_units'] = training['item_units'] * 100 / total_sales
training['category_share_of_total_units'] = training['cat_units'] * 100 / total_sales
training['shop_share_of_units'] = training['shop_units'] * 100 / total_sales
training['shop_item_share_of_total_units'] = training['shop_item_units'] * 100\
                        / total_sales
training['shop_item_share_of_shop_units'] = training['shop_item_units'] * 100\
                        / training['shop_units']

In [307]:
#training['item_share_of_total_shop_units'] 

shop_item_units = sales_train.groupby(['shop_id', 'item_id'])['item_cnt_day'].sum().reset_index()\
        .rename(columns={'item_cnt_day':'shop_item_units'})
training = training.merge(shop_item_units, on=['shop_id', 'item_id'], how='left')
training['item_share_of_shop_units'] = training['shop_item_units_y'] * 100 / training['shop_units']

In [309]:
len(training)

6425094

In [None]:
training['shop_share_item_units_comp'] = training['item_units'] * training['shop_share_of_units']
training['shop_item_units_comp'] = training['item_units'] / training['shop_units']

In [108]:
rolls = [2,3,6,12]
cols = ['item_id','date_block_num']

for roll in rolls:
    print(roll)
    roll_name = "item_block_units_rolling_" + str(roll)
    roll_name_tmp = roll_name + "_tmp"
    
    item_block_units_rolling_temp = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_id'],as_index=False)\
        ['item_block_units'].rolling(roll,min_periods=2).mean().reset_index()\
        .rename(columns={'item_block_units':roll_name_tmp})\
        [['item_id','date_block_num',roll_name_tmp]]
    
    training = training.merge(item_block_units_rolling_temp, on=cols, how='left')
    del item_block_units_rolling_temp
    gc.collect()

    item_block_units_rolling = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_id'],as_index=False)\
        [roll_name_tmp].shift(1)\
        .rename(columns={roll_name_tmp:roll_name}).reset_index()

    training = training.merge(item_block_units_rolling, on=cols, how='left')
    training.drop(columns=[roll_name_tmp], inplace=True)
    del item_block_units_rolling
    gc.collect()

2
3
6
12


In [None]:
rolls = [3,6,12]
cols = ['shop_id','date_block_num']

for roll in rolls:
    print(roll)
    roll_name = "shop_block_units_rolling_" + str(roll)
    roll_name_tmp = roll_name + "_tmp"
    
    shop_block_units_rolling_temp = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id'],as_index=False)\
        ['shop_block_units'].rolling(roll,min_periods=2).mean().reset_index()\
        .rename(columns={'shop_block_units':roll_name_tmp})\
        [['shop_id','date_block_num',roll_name_tmp]]
    
    training = training.merge(shop_block_units_rolling_temp, on=cols, how='left')
    del shop_block_units_rolling_temp
    gc.collect()

    shop_block_units_rolling = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id'],as_index=False)\
        [roll_name_tmp].shift(1)\
        .rename(columns={roll_name_tmp:roll_name}).reset_index()

    training = training.merge(shop_block_units_rolling, on=cols, how='left')
    training.drop(columns=[roll_name_tmp], inplace=True)
    del shop_block_units_rolling
    gc.collect()

In [None]:
rolls = [3,6,12]
cols = ['item_category_id', 'date_block_num']

for roll in rolls:
    print(roll)
    roll_name = "cat_block_units_rolling_" + str(roll)
    roll_name_tmp = roll_name + "_tmp"
    
    cat_block_units_rolling_temp = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_category_id'],as_index=False)\
        ['cat_block_units'].rolling(roll,min_periods=2).mean().reset_index()\
        .rename(columns={'cat_block_units':roll_name_tmp})\
        [['item_category_id','date_block_num',roll_name_tmp]]
    
    training = training.merge(cat_block_units_rolling_temp, on=cols, how='left')
    del cat_block_units_rolling_temp
    gc.collect()
    
    cat_block_units_rolling = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_category_id'],as_index=False)\
        [roll_name_tmp].shift(1)\
        .rename(columns={roll_name_tmp:roll_name}).reset_index()

    training = training.merge(cat_block_units_rolling, on=cols, how='left')
    training.drop(columns=[roll_name_tmp], inplace=True)
    del cat_block_units_rolling
    gc.collect()

In [None]:
rolls = [3,6,12]
cols = ['shop_id', 'item_category_id', 'date_block_num']

for roll in rolls:
    print(roll)
    roll_name = "shop_cat_block_units_rolling_" + str(roll)
    roll_name_tmp = roll_name + "_tmp"
    
    shop_cat_block_units_rolling_temp = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id', 'item_category_id'],as_index=False)\
        ['shop_cat_block_units'].rolling(roll,min_periods=2).mean().reset_index()\
        .rename(columns={'shop_cat_block_units':roll_name_tmp})\
        [['shop_id', 'item_category_id','date_block_num',roll_name_tmp]]
    
    training = training.merge(shop_cat_block_units_rolling_temp, on=cols, how='left')
    del shop_cat_block_units_rolling_temp
    gc.collect()
    
    shop_cat_block_units_rolling = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id', 'item_category_id'],as_index=False)\
        [roll_name_tmp].shift(1)\
        .rename(columns={roll_name_tmp:roll_name}).reset_index()

    training = training.merge(shop_cat_block_units_rolling, on=cols, how='left')
    training.drop(columns=[roll_name_tmp], inplace=True)
    del shop_cat_block_units_rolling
    gc.collect()

In [None]:
training.fillna(0, inplace=True)

In [None]:
lags = [1,2,3,6,12]

cols = ['item_id', 'date_block_num']


for lag in lags:
    print(lag)
    lag_name = "item_block_units_lag_" + str(lag)
    item_block_units_lag = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_id'],as_index=False)\
        ['item_block_units'].shift(lag)\
        .rename(columns={'item_block_units':lag_name}).reset_index()

    training = training.merge(item_block_units_lag, on=cols, how='left')
    del item_block_units_lag
    gc.collect()

In [None]:
lags = [1,2,3,6,12]

cols = ['shop_id', 'date_block_num']


for lag in lags:
    print(lag)
    lag_name = "shop_block_units_lag_" + str(lag)
    shop_block_units_lag = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id'],as_index=False)\
        ['shop_block_units'].shift(lag)\
        .rename(columns={'shop_block_units':lag_name}).reset_index()

    training = training.merge(shop_block_units_lag, on=cols, how='left')
    del shop_block_units_lag
    gc.collect()

In [None]:
lags = [1,2,3,6,12]

cols = ['shop_id', 'item_id', 'date_block_num']


for lag in lags:
    print(lag)
    lag_name = "shop_item_block_units_lag_" + str(lag)
    shop_item_block_units_lag = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id', 'item_id'],as_index=False)\
        ['shop_item_block_units'].shift(lag)\
        .rename(columns={'shop_item_block_units':lag_name}).reset_index()

    training = training.merge(shop_item_block_units_lag, on=cols, how='left')
    del shop_item_block_units_lag
    gc.collect()

In [None]:
lags = [1,2,3,6,12]

cols = ['shop_id', 'item_category_id', 'date_block_num']


for lag in lags:
    print(lag)
    lag_name = "shop_cat_block_units_lag_" + str(lag)
    shop_cat_block_units_lag = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id', 'item_category_id'],as_index=False)\
        ['shop_cat_block_units'].shift(lag)\
        .rename(columns={'shop_cat_block_units':lag_name}).reset_index()

    training = training.merge(shop_cat_block_units_lag, on=cols, how='left')
    del shop_cat_block_units_lag
    gc.collect()

In [None]:
lags = [1,2,3,6,12]

cols = ['item_category_id', 'date_block_num']


for lag in lags:
    print(lag)
    lag_name = "cat_block_units_lag_" + str(lag)
    cat_block_units_lag = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_category_id'],as_index=False)\
        ['cat_block_units'].shift(lag)\
        .rename(columns={'cat_block_units':lag_name}).reset_index()

    training = training.merge(cat_block_units_lag, on=cols, how='left')
    del cat_block_units_lag
    gc.collect()

In [None]:
str(int(0.25))

In [None]:
training['rolling_composite'] =  training['shop_block_units_rolling_3'].clip(1, None) *\
            training['item_block_units_rolling_3'].clip(1, None) 
training['me_composite'] =  training['item_me'].clip(1, None)  * training['shop_me'].clip(1, None) 

In [315]:
training['comp_A'] = training['shop_block_units_lag_1'] * training['item_share_of_shop_units']

#training['shop_share_item_units_comp'] = training['item_units'] * training['shop_share_of_units']
training['comp_B'] = training['item_block_units_lag_1'] * training['item_share_of_shop_units']


In [260]:
def get_number_of_days_since_start(day,month, year):
    days = 0
    if year == 2015:
        days = 365
    def is_even(num):
        return num % 2 == 0
    half_of_month = int(month/2)
    even = (30*half_of_month) + (31*half_of_month)
    if is_even(month):
        days = days + even - 30 - day
    else:
        days = days + even + day
    return days

training['days_2_first_sale'] = training['item_id'].map(sales_train.set_index('item_id')\
    .apply(lambda row: get_number_of_days_since_start(row['day'],row['month'], row['year']),axis=1)\
    .groupby('item_id').min())
#transactions_items['item_days_since_start'] = transactions_items.apply(lambda row: get_number_of_days_since_start(row['day'],row['month'], row['year']),axis=1), downcast='unsigned') 





In [270]:
def get_number_of_days_since_start(day,month, year):
    days = 0
    if year == 2015:
        days = 365
    def is_even(num):
        return num % 2 == 0
    half_of_month = int(month/2)
    even = (30*half_of_month) + (31*half_of_month)
    if is_even(month):
        days = days + even - 30 - day
    else:
        days = days + even + day
    return days




def get_average_days_between_sales(days):
    days = sorted(np.unique(days))
    if len(days) == 0:
        return 9999
    if len(days) == 1:
        return 999
    return np.mean(np.ediff1d(days)) / len(days)



In [271]:
sales_train['item_days_since_start'] = sales_train\
    .apply(lambda row: get_number_of_days_since_start(row['day'],row['month'], row['year']),axis=1)
average_days_between_sales = sales_train.groupby(['item_id'])['item_days_since_start']\
    .apply(list).apply(lambda x: get_average_days_between_sales(x))

training['item_mean_day_between_activity'] = training['item_id'].map(average_days_between_sales)


In [277]:
sub_cats = {}
for i in range(1,8):
    sub_cats[i] = "Accessories"
sub_cats[8] = "Tickets"
sub_cats[9] = "Delivery of goods"
for i in range(10,18):
    sub_cats[i] = "Consoles"
for i in range(18,25):
    sub_cats[i] = "Game for Consoles"
sub_cats[25] = "Accessories for Games"
sub_cats[26] = "Android Games"
sub_cats[27] = "MAC Games"
for i in range(28,32):
    sub_cats[i] = "PC Games"
for i in range(32,37):
    sub_cats[i] = "Payment Cards"
for i in range(37,40):
    sub_cats[i] = "Cinema - Blu-ray"
sub_cats[40] = "Cinema - DVD"
sub_cats[41] = "Cinema - Collectible"
for i in range(42,46):
    sub_cats[i] = "Audiobooks"
for i in range(46,55):
    sub_cats[i] = "Books"
for i in range(55,57):
    sub_cats[i] = "Music - CD"
sub_cats[57] = "Music - MP3"
sub_cats[58] = "Music - Vinyl"
sub_cats[59] = "Music - Music Video"
sub_cats[60] = "Music - Gift Edition"
for i in range(61,74):
    sub_cats[i] = "Gifts"
for i in range(73,79):
    sub_cats[i] = "Software"
sub_cats[79] = "Utility"
for i in range(80,84):
    sub_cats[i] = "Misc"
    
    
training['subcategory'] = training['item_category_id'].apply(lambda x: sub_cats[x]).astype('category')
training['subcategory_me'] = get_mean_encoding(training, ['subcategory'], 'y')


In [281]:
shop_areas = {}
for i in range(0,2):
    shop_areas[i] = "Yakutsk"
shop_areas[2] = "Adygea"
shop_areas[3] = "Balashikha"
shop_areas[4] = "Volga"
shop_areas[5] = "Vologda"
for i in range(6,9):
    shop_areas[i] = "Voronezh"
shop_areas[9] = "Outbound Trading"
for i in range(10,12):
    shop_areas[i] = "Zhukovsky"
shop_areas[12] = "Online store emergency"
for i in range(13,15):
    shop_areas[i] = "Kazan"
shop_areas[15] = "Kaluga"
shop_areas[16] = "Kolomna"
for i in range(17,19):
    shop_areas[i] = "Krasnoyarsk"
shop_areas[19] = "Kursk"
for i in range(20,33):
    shop_areas[i] = "Moscow"
shop_areas[33] = "Mytishchi"
for i in range(34,36):
    shop_areas[i] = "N.Novgorod"
for i in range(36,38):
    shop_areas[i] = "Novosibirsk"
shop_areas[38] = "Omsk"
for i in range(39,42):
    shop_areas[i] = "RostovNaDonu"
for i in range(42,44):
    shop_areas[i] = "St. Petersburg"
for i in range(44,46):
    shop_areas[i] = "Samara"
shop_areas[46] = "Sergiev Posad"
shop_areas[47] = "Surgut"
shop_areas[48] = "Tomsk"
for i in range(49,52):
    shop_areas[i] = "Tyumen TC"
for i in range(52,54):
    shop_areas[i] = "Ufa"
shop_areas[54] = "Khimki"
shop_areas[55] = "Digital warehouse"
shop_areas[56] = "Chekhov"
for i in range(57,59):
    shop_areas[i] = "Yakutsk"
shop_areas[59] = "Yaroslavl"

training['area'] = training['shop_id'].apply(lambda x: shop_areas[x]).astype('category')
training['area_me'] = get_mean_encoding(training, ['area'], 'y')


In [258]:
training.fillna(0, inplace=True)
for column in training.columns.values:
    if "units" in column and "share" not in column:
        training[column] = pd.to_numeric(training[column].astype(int), downcast='unsigned')
    else:
        training[column] = pd.to_numeric(training[column], downcast='float')


In [259]:
gc.collect()

54

In [None]:
training.info(memory_usage='deep')

In [None]:
training.dtypes

In [None]:
training.sample(10)

In [None]:
training[(training['item_id'].isin([30,31])) & (training['shop_id'] == 30)]\
        .sort_values(['item_id','date_block_num'])[['item_id','shop_id',\
                                                    'date_block_num','item_block_units', 'item_block_units_lag_1',\
                                                    'item_block_units_lag_2','item_block_units_lag_3',\
                                                    'item_block_units_lag_6','item_block_units_lag_12'
                                                   ]]
                                                    #'item_block_units_rolling_3', 'item_block_units_rolling_6']]
                                                    #'item_block_units_rolling_6']]

In [None]:
len(training[training['item_block_units'] > 0])

In [None]:
len(transactions_items_blocks)

In [None]:
gc.collect()
val = training[training['date_block_num'] == 33]
print("val length", len(val))

unique_pairs_val = list(set(list(zip(val.shop_id, val.item_id))))
print("number of unique shop/item pairs in val", len(unique_pairs_val))
unique_pairs_val_ignore = unique_pairs_val[0:int(len(unique_pairs_val)/2)]


def tuple2key(t):
    return "%d_%d" % (t[0], t[1])

val_pairs_ignore_dict = {}
for t in unique_pairs_val_ignore:
    val_pairs_ignore_dict[tuple2key(t)] = 1
 
    
training['val_ignore'] = (training['shop_id'].astype(str) + '_' +  training['item_id'].astype(str))\
                                    .apply(lambda x: x in val_pairs_ignore_dict)

In [None]:
len(training[training['val_ignore'] == True])

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=5).fit(training[cb_features].fillna(0))

In [None]:
training = pca.transform(training[cb_features].fillna(0))

In [None]:
training

In [316]:
gc.collect()

ZEROS_KEEP=3


#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['y']


pos_train_len = len(y_train[y_train != 0])
print("pos_train_len", pos_train_len)

zeros_keep_indices_train = y_train[y_train == 0].sample(int(pos_train_len/ZEROS_KEEP)).index
print("zeros_keep_indices_train", len(zeros_keep_indices_train))
non_zeros_train_indices = y_train[y_train != 0].index
print("non_zeros_train_indices", len(non_zeros_train_indices))

train_indices = np.append(np.array(zeros_keep_indices_train), np.array(non_zeros_train_indices))

#y_train = y_train.loc[train_indices]
#x_train = x_train.loc[train_indices]




x_val = training[training['date_block_num'] == 33]
y_val = x_val['y']

pos_val_len = len(y_val[y_val != 0])
print("pos_val_len", pos_val_len)

zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
print("zeros_keep_indices_val", len(zeros_keep_indices_val))
non_zeros_val_indices = y_val[y_val != 0].index
print("non_zeros_val_indices", len(non_zeros_val_indices))

val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

y_val = y_val.loc[val_indices]
x_val = x_val.loc[val_indices]

pos_train_len 887869
zeros_keep_indices_train 295956
non_zeros_train_indices 887869
pos_val_len 31471
zeros_keep_indices_val 10490
non_zeros_val_indices 31471


In [254]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'y', 'item_category_id',
       'item_me', 'shop_me', 'category_me', 'shop_category_me',
       'shop_item_me', 'item_block_units', 'shop_block_units',
       'cat_block_units', 'shop_cat_block_units', 'shop_item_block_units',
       'item_units', 'item_max_units_block', 'item_min_units_block',
       'item_minmax_mean', 'item_minmax_q0.25', 'item_minmax_q0.5',
       'item_minmax_q0.75', 'shop_units', 'shop_max_units_block',
       'shop_min_units_block', 'shop_minmax_mean', 'cat_units',
       'cat_max_units_block', 'cat_min_units_block', 'cat_minmax_mean',
       'shop_cat_units', 'shop_cat_max_units_block',
       'shop_cat_min_units_block', 'shop_cat_minmax_mean',
       'shop_item_units', 'shop_item_max_units_block',
       'shop_item_min_units_block', 'shop_item_minmax_mean',
       'item_share_of_total_units', 'category_share_of_total_units',
       'shop_share_of_units', 'shop_item_share_of_total_units',
       'shop_item_share_of

In [322]:



features = [
       'item_me', 'shop_me', 'category_me', 'shop_category_me',
    'item_units',#'item_share_of_total_units',
'item_minmax_q0.25', 'item_minmax_q0.5',
       'item_minmax_q0.75',
    'item_block_units_rolling_2', 'item_block_units_rolling_12_x',
     'item_block_units_lag_1', 'shop_share_item_units_comp',
    'comp_A', 'comp_B'
]

In [323]:

gc.collect()
params =   {
    'objective' : 'reg:linear',
    'tree_method':'gpu_hist',
    #'gpu_id': 0,
    'learning_rate': 0.1, 
    #'gamma' : 0.3, 
    #'min_child_weight' : 3,
    #'nthread' : 16,
    #'max_depth' : 10,
    #'subsample' : 0.9, 
    #'colsample_bytree' : 0.5, 
    'seed':42, 
    'eval_metric' : "rmse",
    'num_boost_round' : 70000,
    #'n_estimators':999,
    #'max_leaves': 300
}


tr_data = xgb.DMatrix(x_train[features], y_train)
va_data = xgb.DMatrix(x_val[features], y_val)


watchlist = [(tr_data, 'train'), (va_data, 'valid')]

xg_model = xgb.train(params, tr_data, 5000, watchlist, maximize=False, early_stopping_rounds = 50, verbose_eval=10)

[0]	train-rmse:1.13971	valid-rmse:2.43454
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[10]	train-rmse:0.81045	valid-rmse:1.87757
[20]	train-rmse:0.734634	valid-rmse:1.72176
[30]	train-rmse:0.712105	valid-rmse:1.65274
[40]	train-rmse:0.7023	valid-rmse:1.6194
[50]	train-rmse:0.695697	valid-rmse:1.61029
[60]	train-rmse:0.690065	valid-rmse:1.59809
[70]	train-rmse:0.685076	valid-rmse:1.58817
[80]	train-rmse:0.679994	valid-rmse:1.58244
[90]	train-rmse:0.675213	valid-rmse:1.57476
[100]	train-rmse:0.671832	valid-rmse:1.56309
[110]	train-rmse:0.668755	valid-rmse:1.55712
[120]	train-rmse:0.665385	valid-rmse:1.55317
[130]	train-rmse:0.662641	valid-rmse:1.55046
[140]	train-rmse:0.660215	valid-rmse:1.5467
[150]	train-rmse:0.657838	valid-rmse:1.54332
[160]	train-rmse:0.655633	valid-rmse:1.54275
[170]	train-rmse:0.653544	valid-rmse:1.53846
[180]	train-rmse:0.651535	valid-rmse:1.53741
[190]	train-rmse:

In [137]:
item_features = ['item_me', 
       'item_units', 'item_max_units_block', 'item_min_units_block',
       'item_share_of_total_units',
     'item_minmax_q0.25', 'item_minmax_q0.5',
       'item_minmax_q0.75',
    ]

merge_col = ['item_id']

test = test.merge(training.drop_duplicates('item_id')[item_features]+merge_colge_col, on=merge_col, how='left')

NameError: name 'merge_colge_col' is not defined

In [None]:
shop_features = [
         'shop_me', 
        'shop_units', 'shop_max_units_block',
       'shop_min_units_block', 'shop_minmax_mean', 
       'shop_share_of_units',
]

merge_col = ['shop_id']

test = test.merge(training.drop_duplicates(merge_col)[shop_features]+merge_col, on=merge_col, how='left')

In [None]:
cat_features = [
             'category_me','cat_units',
       'cat_max_units_block', 'cat_min_units_block', 'cat_minmax_mean',
       'category_share_of_total_units',
]

merge_col = ['item_category_id']

test = test.merge(training.drop_duplicates(merge_col)[cat_features]+merge_col, on=merge_col, how='left')

In [None]:
shop_cat_features = [
           'shop_category_me',
       'shop_cat_units', 'shop_cat_max_units_block',
       'shop_cat_min_units_block', 'shop_cat_minmax_mean',
    'shop_cat_1', 'shop_cat_2', 'shop_cat_3',
       'shop_cat_4', 'shop_cat_5'
]

merge_col = ['shop_id', 'item_category_id']

test = test.merge(training.drop_duplicates(merge_col)[shop_cat_features]+merge_col, on=merge_col, how='left')

In [None]:
shop_item_features = [
              
'shop_item_1', 'shop_item_2',
       'shop_item_3', 'shop_item_4', 'shop_item_5'
]

merge_col = ['shop_id', 'item_id']

test = test.merge(training.drop_duplicates(merge_col)[shop_item_features]+merge_col, on=merge_col, how='left')

In [80]:
cb_model = CatBoostRegressor(iterations=70000,
                             learning_rate=0.01,
                             eval_metric='RMSE',
                             #thread_count=16,
                             task_type = "GPU",
                             use_best_model=True,
                             #l2_leaf_reg = 1000,
                             od_type = "Iter",
                             od_wait = 30,
                             #random_strength = 10,
                             #bagging_temperature = 1,
                             #one_hot_max_size = 2,
                             random_seed = 42)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


cb_model.fit(x_train[cb_features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[cb_features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

NameError: name 'cb_features' is not defined

In [None]:
#training.to_pickle("pickled/training")
#training = pd.read_pickle("pickled/training")

#pickle.dump(cb_model, open( "pickled/cb_model", "wb"), protocol=4)

#cb_model = pickle.load( open( "pickled/cb_model", "rb" ) )

In [None]:
scores = {}
for i,score in enumerate(cb_model.get_feature_importance()):
    scores[cb_features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

In [None]:
cb_features = [item[0] for item in scores.items() if item[1] > 4]

In [78]:

gc.collect()
params =   {
    'objective' : 'reg:linear',
    'tree_method':'gpu_hist',
    #'gpu_id': 0,
    'learning_rate': 0.1, 
    #'gamma' : 0.3, 
    #'min_child_weight' : 3,
    #'nthread' : 16,
    #'max_depth' : 30,
    #'subsample' : 0.9, 
    #'colsample_bytree' : 0.5, 
    'seed':42, 
    'eval_metric' : "rmse",
    'num_boost_round' : 70000,
    #'n_estimators':999,
    #'max_leaves': 300
}


tr_data = xgb.DMatrix(x_train[features], y_train)
va_data = xgb.DMatrix(x_val[features], y_val)


watchlist = [(tr_data, 'train'), (va_data, 'valid')]

xg_model = xgb.train(params, tr_data, 5000, watchlist, maximize=False, early_stopping_rounds = 50, verbose_eval=True)

[0]	train-rmse:1.16166	valid-rmse:2.46821
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[1]	train-rmse:1.12306	valid-rmse:2.39943
[2]	train-rmse:1.09049	valid-rmse:2.34348
[3]	train-rmse:1.06251	valid-rmse:2.28895
[4]	train-rmse:1.03855	valid-rmse:2.23949
[5]	train-rmse:1.01832	valid-rmse:2.20022
[6]	train-rmse:1.00086	valid-rmse:2.16756
[7]	train-rmse:0.986263	valid-rmse:2.13893
[8]	train-rmse:0.973513	valid-rmse:2.11018
[9]	train-rmse:0.962906	valid-rmse:2.09097
[10]	train-rmse:0.95376	valid-rmse:2.07079
[11]	train-rmse:0.945948	valid-rmse:2.05279
[12]	train-rmse:0.938977	valid-rmse:2.03648
[13]	train-rmse:0.932779	valid-rmse:2.02253
[14]	train-rmse:0.927522	valid-rmse:2.01
[15]	train-rmse:0.922954	valid-rmse:2.00099
[16]	train-rmse:0.919312	valid-rmse:1.99106
[17]	train-rmse:0.915796	valid-rmse:1.98382
[18]	train-rmse:0.91287	valid-rmse:1.97739
[19]	train-rmse:0.910199	valid-rmse:1.972

In [319]:
scores = xg_model.get_score(importance_type='gain')


sorted(scores.items(), key=lambda x: x[1])[::-1]

[('comp_B', 3893.307992959633),
 ('comp_A', 1480.827009120514),
 ('item_minmax_q0.75', 1205.196931300417),
 ('item_minmax_q0.5', 1180.6257736183404),
 ('item_block_units_lag_1', 907.2783703809503),
 ('item_block_units_rolling_12_x', 901.8569282721875),
 ('shop_share_item_units_comp', 736.9385751275056),
 ('item_minmax_q0.25', 307.23175405954134),
 ('shop_category_me', 288.04938141415823),
 ('item_units', 283.0780940970433),
 ('category_me', 250.62374796469726),
 ('item_me', 194.19629776902343),
 ('shop_me', 172.93935914930046),
 ('item_block_units_rolling_2', 139.27189915155606)]