In [63]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [64]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd
from sklearn.model_selection import KFold
from itertools import product

In [65]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [66]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'] != 2013]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [67]:
# Якутск Орджоникидзе, 56
sales_train.loc[sales_train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
sales_train.loc[sales_train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
sales_train.loc[sales_train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

In [68]:
sums = sales_train.groupby('item_id')['item_cnt_day'].sum().reset_index().rename(columns={"item_cnt_day":"item_total_sales"}).sort_values(by='item_total_sales')

ids_reject = sums[(sums['item_total_sales'] > 0) & (sums['item_total_sales'] < 1000)]['item_id'].unique()

In [69]:
train_item_ids = sales_train['item_id'].unique()
train_item_ids = np.setdiff1d(train_item_ids, ids_reject)
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [70]:
combinations = []

for dbn in range(np.min(train_blocks), np.max(train_blocks)+1):
    sales = sales_train[sales_train.date_block_num==dbn]
    item_ids = np.intersect1d(sales.item_id.unique(), test_item_ids)
    dbn_combos = list(product(sales.shop_id.unique(), item_ids, [dbn]))
    for combo in dbn_combos:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])

In [71]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"item_cnt_block"})

training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)


training['item_cnt_block'] = training['item_cnt_block'].clip(0,20).astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

for col in ['item_id', 'shop_id', 'item_category_id']:
    training[col] = pd.to_numeric(training[col], downcast='unsigned')

In [72]:
dates = sales_train[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
training['month'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')


In [92]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
#columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]
columns = ["item_id", "shop_id", "item_category_id", "month",  "date_block_num"]



y_train = training["item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1

fold 1
fold 2
fold 3
fold 4
fold 5


In [73]:
cols = ['item_id','date_block_num']

training.groupby(cols,as_index=False)['item_cnt_block'].mean().sample(10)

Unnamed: 0,item_id,date_block_num,item_cnt_block
25818,10325,33,0.318182
1639,838,14,0.125
28788,11272,32,0.069767
20891,8094,29,0.162791
52917,20736,30,0.255814
48300,18715,26,0.086957
46258,18100,26,0.065217
1847,972,26,1.347826
52128,20416,32,0.162791
14264,5336,31,1.214286


In [74]:
def add_block_units_mean(df, cols, name):
    print(name)
    name_units = name + '_units'
    name_mean = name + '_mean'
    
    try:
        df.drop(columns=[name_units, name_mean],inplace=True)
    except:
        pass

    
    block_units = df.groupby(cols,as_index=False)['item_cnt_block'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_units})
    df = df.merge(block_units, on=cols, how='left')
    df[name_units].fillna(0,inplace=True)
    df[name_units] = pd.to_numeric(df[name_units].astype(int),downcast='unsigned')
    del block_units
    
    block_means = df.groupby(cols,as_index=False)['item_cnt_block'].mean()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_mean})
    df = df.merge(block_means, on=cols, how='left')
    df[name_mean].fillna(0,inplace=True)
    df[name_mean] = pd.to_numeric(df[name_mean],downcast='float')
    del block_means
    
    gc.collect()
    return df


training = add_block_units_mean(training, ['item_id','date_block_num'], 'item_block')
training = add_block_units_mean(training, ['shop_id','date_block_num'], 'shop_block')
training = add_block_units_mean(training, ['item_category_id','date_block_num'], 'cat_block')
training = add_block_units_mean(training, ['shop_id', 'item_category_id','date_block_num'], 'shop_cat_block')
training = add_block_units_mean(training, ['shop_id', 'item_id','date_block_num'], 'shop_item_block')

item_block
shop_block
cat_block
shop_cat_block
shop_item_block


In [75]:
number_of_items = sales_train['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = sales_train['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = sales_train['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = sales_train['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = sales_train['item_cnt_day'].sum()
print("total_sales:", total_sales)
average_price = sales_train['item_price'].mean()
print("average_price:", average_price)

training['item_units'] = pd.to_numeric(training.groupby(['date_block_num'])['item_block_units'].transform(np.sum),downcast='unsigned')
training['cat_units'] = pd.to_numeric(training.groupby(['date_block_num'])['cat_block_units'].transform(np.sum),downcast='unsigned')
training['shop_units'] = pd.to_numeric(training.groupby(['date_block_num'])['shop_block_units'].transform(np.sum),downcast='unsigned')

training['item_share_of_total_units'] = pd.to_numeric(training['item_units'] * 100 / total_sales,downcast='float')
training['category_share_of_total_units'] = pd.to_numeric(training['cat_units'] * 100 / total_sales,downcast='float')
training['shop_share_of_units'] = pd.to_numeric(training['shop_units'] * 100 / total_sales,downcast='float')
training['shop_item_units'] = pd.to_numeric(training.groupby(['date_block_num'])\
                                            ['shop_item_block_units'].transform(np.sum),downcast='unsigned')

training['shop_item_share_of_total_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / total_sales,downcast='float')
training['shop_item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / training['shop_units'],downcast='float')


training['item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100 / training['shop_units'],downcast='float')

training['shop_item_share_of_shop_units_mean'] = training.groupby('item_id')['shop_item_share_of_shop_units'].transform(np.mean)


number_of_items: 17054
number_of_categories: 79
number_of_shops: 54
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473.0
average_price: 1015.5023073772021


In [76]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
#columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]
columns = ["item_id", "shop_id", "item_category_id", "month",  "date_block_num"]



y_train = training["item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1


training.fillna(0,inplace=True)

fold 1
fold 2
fold 3
fold 4
fold 5


In [77]:
def add_min_max_quantiles(df, cols, name):
    print(name)

    block_name = name+'_block_units'
    units_name = name+'_units'
    max_name = name+'_max_units_block'
    min_name = name+'_min_units_block'
    
    try:
        df.drop(columns=[units_name, max_name, min_name, min_max_name],inplace=True)
    except:
        pass


    df[units_name] = pd.to_numeric(df.groupby(['date_block_num'])[block_name].transform(np.sum), downcast='unsigned')
    df[max_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.max), downcast='unsigned')
    df[min_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.min), downcast='unsigned')
    


    for q in [0.25,0.50,0.75]:
        qname = name+'_minmax_q' + str(q)
        try:
            df.drop(columns=[qname],inplace=True)
        except:
            pass
        df[qname] =  pd.to_numeric(df[[min_name,max_name]].quantile(q,axis=1), downcast='unsigned')
        
    return df

training = add_min_max_quantiles(training, ['item_id'], 'item')
training = add_min_max_quantiles(training, ['shop_id'], 'shop')
training = add_min_max_quantiles(training, ['item_category_id'], 'cat')
training = add_min_max_quantiles(training, ['shop_id','item_category_id'], 'shop_cat')
training = add_min_max_quantiles(training, ['shop_id','item_id'], 'shop_item')

item
shop
cat
shop_cat
shop_item


In [78]:
def add_rolls(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name_tmp})\
            [cols+[roll_name_tmp]]
        
    
        df = df.merge(block_units_rolling_temp, on=cols, how='left')
        #print(df.columns.values)
        del block_units_rolling_temp
        gc.collect()
        

        block_units_rolling = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [roll_name_tmp].shift(1)\
            .rename(columns={roll_name_tmp:roll_name}).reset_index()

        df = df.merge(block_units_rolling, on=cols, how='left')
        df[roll_name].fillna(0,inplace=True)
        df[roll_name] = pd.to_numeric(df[roll_name], downcast='float')
        df.drop(columns=[roll_name_tmp], inplace=True)
        del block_units_rolling
        gc.collect()
    
    return df
    

training = add_rolls(training, ['item_id','date_block_num'], 'item_block_units')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item')

item_block_units 3
item_block_mean 3
shop_block_units 3
shop_block_mean 3
cat_block_units 3
cat_block_mean 3
shop_cat_block_units 3
shop_cat_block_mean 3


In [79]:
training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3


In [80]:
def add_lags(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()

        df = df.merge(result, on=cols, how='left')
        df[lag_name].fillna(0,inplace=True)
        if "mean" in name:
            df[lag_name] = pd.to_numeric(df[lag_name], downcast='float')
        else:
            df[lag_name] = pd.to_numeric(df[lag_name].astype(int), downcast='unsigned')
        del result
        gc.collect()
    
    return df
                                         

                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_block_units')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

item_block_units 1
item_block_mean 1
shop_block_units 1
shop_block_mean 1
cat_block_units 1
cat_block_mean 1
shop_cat_block_units 1
shop_cat_block_mean 1
shop_item_block_units 1
shop_item_block_mean 1


In [81]:
training['shop_block_units_lag_comp1'] = pd.to_numeric(training['shop_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

#training['shop_share_item_units_comp'] = training['item_units'] * training['shop_share_of_units']
training['item_block_units_lag_comp1'] = pd.to_numeric(training['item_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

In [82]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'item_block_units', 'item_block_mean',
       'shop_block_units', 'shop_block_mean', 'cat_block_units',
       'cat_block_mean', 'shop_cat_block_units', 'shop_cat_block_mean',
       'shop_item_block_units', 'shop_item_block_mean', 'item_units',
       'cat_units', 'shop_units', 'item_share_of_total_units',
       'category_share_of_total_units', 'shop_share_of_units',
       'shop_item_units', 'shop_item_share_of_total_units',
       'shop_item_share_of_shop_units', 'item_share_of_shop_units',
       'shop_item_share_of_shop_units_mean', 'item_id_mean_encoding',
       'shop_id_mean_encoding', 'item_category_id_mean_encoding',
       'month_mean_encoding', 'date_block_num_mean_encoding',
       'item_max_units_block', 'item_min_units_block',
       'item_minmax_q0.25', 'item_minmax_q0.5', 'item_minmax_q0.75',
       'shop_max_units_block', 'shop_min_units_block',
       'shop_minmax_q0

In [83]:

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_block,item_category_id,month,item_block_units,item_block_mean,shop_block_units,shop_block_mean,cat_block_units,cat_block_mean,shop_cat_block_units,shop_cat_block_mean,shop_item_block_units,shop_item_block_mean,item_units,cat_units,shop_units,item_share_of_total_units,category_share_of_total_units,shop_share_of_units,shop_item_units,shop_item_share_of_total_units,shop_item_share_of_shop_units,item_share_of_shop_units,shop_item_share_of_shop_units_mean,item_id_mean_encoding,shop_id_mean_encoding,item_category_id_mean_encoding,month_mean_encoding,date_block_num_mean_encoding,item_max_units_block,item_min_units_block,item_minmax_q0.25,item_minmax_q0.5,item_minmax_q0.75,shop_max_units_block,shop_min_units_block,shop_minmax_q0.25,shop_minmax_q0.5,shop_minmax_q0.75,cat_max_units_block,cat_min_units_block,cat_minmax_q0.25,cat_minmax_q0.5,cat_minmax_q0.75,shop_cat_units,shop_cat_max_units_block,shop_cat_min_units_block,shop_cat_minmax_q0.25,shop_cat_minmax_q0.5,shop_cat_minmax_q0.75,shop_item_max_units_block,shop_item_min_units_block,shop_item_minmax_q0.25,shop_item_minmax_q0.5,shop_item_minmax_q0.75,item_block_units_rolling_3,item_block_mean_rolling_3,shop_block_units_rolling_3,shop_block_mean_rolling_3,cat_block_units_rolling_3,cat_block_mean_rolling_3,shop_cat_block_units_rolling_3,shop_cat_block_mean_rolling_3,shop_item_block_mean_rolling_3,item_block_units_lag_1,item_block_mean_lag_1,shop_block_units_lag_1,shop_block_mean_lag_1,cat_block_units_lag_1,cat_block_mean_lag_1,shop_cat_block_units_lag_1,shop_cat_block_mean_lag_1,shop_item_block_units_lag_1,shop_item_block_mean_lag_1,shop_block_units_lag_comp1,item_block_units_lag_comp1
2416946,20335,45,29,0,72,6,17,0.395349,533,0.169799,1147,0.264103,4,0.039604,0,0,2177047,378130476,158924431,104.391045,1655.887817,1442.138672,50629,2.427699,0.031857,0.031857,0.041616,0.307388,0.259554,0.428994,0.439548,0.375997,36,1,9.75,18.5,27.25,1114,485,642.25,799.5,956.75,3137,931,1482.5,2034.0,2585.5,8793732,30,0,7.5,15.0,22.5,1,0,0.25,0.5,0.75,8.666667,0.19062,584.333313,0.1966,1113.333374,0.286286,12.666667,0.150409,0.0,10,0.227273,605,0.198948,1010,0.252248,8,0.087912,0,0.0,19.273653,0.318573
923048,7781,50,21,0,31,10,9,0.173077,813,0.352865,353,0.062279,0,0.0,0,0,3112980,382022004,137928960,149.26973,1842.489502,435.390015,59865,2.870572,0.043403,0.043403,0.036321,0.102625,0.384732,0.076184,0.399399,0.49711,16,1,4.75,8.5,12.25,1554,665,887.25,1109.5,1331.75,675,171,297.0,423.0,549.0,7346577,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,879.333313,0.436843,301.666656,0.068214,0.0,0.0,0.0,16,0.32,805,0.379538,340,0.074725,0,0.0,0,0.0,34.939236,0.694444
882197,7159,58,25,0,55,2,16,0.340426,1683,0.611778,6820,0.294932,120,0.243902,0,0,2779674,396919512,162699642,133.28746,497.367035,1623.162842,59142,2.835903,0.03635,0.03635,0.034033,0.466102,0.558642,0.3145,0.513107,0.457371,111,2,29.25,56.5,83.75,3219,955,1521.0,2087.0,2653.0,11613,4757,6471.0,8185.0,9899.0,8445096,244,99,135.25,171.5,207.75,1,0,0.25,0.5,0.75,63.333332,1.266667,2135.333252,0.803508,9203.666992,0.383218,166.666672,0.345794,0.333333,19,0.38,1760,0.648728,7942,0.325492,145,0.297131,0,0.0,63.976734,0.690658
1960486,16122,27,16,1,64,5,11,0.22449,2132,1.221077,787,0.391737,31,0.756098,1,1,2173738,241470089,77456052,104.232376,1281.326782,1654.606812,44362,2.127191,0.057274,0.057274,0.043519,0.280385,0.958314,0.38458,0.426691,0.516015,32,5,11.75,18.5,25.25,4617,0,1154.25,2308.5,3462.75,2733,763,1255.5,1748.0,2240.5,4927961,149,0,37.25,74.5,111.75,4,0,1.0,2.0,3.0,14.333333,0.301298,1695.666626,1.042771,782.333313,0.443025,31.333334,0.84258,1.0,13,0.265306,1490,0.875955,763,0.379791,30,0.731707,1,1.0,85.337914,0.744559
2041364,16730,34,24,0,37,1,23,0.46,368,0.135643,2493,0.273956,7,0.038462,0,0,3586650,490180900,194611629,171.982574,850.382507,1093.897461,71733,3.439651,0.03686,0.03686,0.040363,0.454545,0.113049,0.27821,0.564638,0.526413,132,1,33.75,66.5,99.25,481,1,121.0,241.0,361.0,3796,1376,1981.0,2586.0,3191.0,9803618,39,0,9.75,19.5,29.25,1,0,0.25,0.5,0.75,11.0,0.218205,343.0,0.135041,2552.666748,0.298666,11.333333,0.066954,0.333333,21,0.42,481,0.177033,3134,0.362312,17,0.098266,0,0.0,17.72945,0.774051
2380064,19908,3,20,0,55,9,3,0.06,565,0.266384,6443,0.298979,42,0.097448,0,0,2504800,300603750,106253616,120.107048,2057.361084,976.002563,50096,2.402141,0.047148,0.047148,0.043519,0.138955,0.225069,0.3145,0.395493,0.472199,13,2,4.75,7.5,10.25,933,343,490.5,638.0,785.5,11613,4757,6471.0,8185.0,9899.0,6012075,119,37,57.5,78.0,98.5,1,0,0.25,0.5,0.75,7.333333,0.147782,525.666687,0.273074,7977.0,0.395925,74.0,0.182962,0.0,5,0.098039,615,0.30597,8409,0.396352,98,0.235577,0,0.0,28.995756,0.235738
2585378,22139,57,19,0,38,8,15,0.294118,1974,0.98209,799,0.261111,26,0.433333,0,0,2909805,378932448,114680550,139.527344,1694.342896,1380.080444,57055,2.73583,0.049751,0.049751,0.043519,0.187576,0.818999,0.225859,0.458961,0.556394,22,1,6.25,11.5,16.75,4070,1645,2251.25,2857.5,3463.75,1891,374,753.25,1132.5,1511.75,7430048,60,9,21.75,34.5,47.25,1,0,0.25,0.5,0.75,13.666667,0.277415,1706.333374,0.930946,415.666656,0.149479,18.333334,0.323397,0.666667,11,0.22,1807,0.937727,475,0.172727,29,0.527273,1,1.0,89.900497,0.547264
1259346,10653,54,18,0,67,7,4,0.08,2373,1.231448,556,0.191724,47,0.810345,0,0,2371450,309332150,91395683,113.712814,416.425415,263.553497,47429,2.274256,0.051894,0.051894,0.03835,0.064695,1.228666,0.240528,0.416787,0.498956,10,1,3.25,5.5,7.75,4457,571,1542.5,2514.0,3485.5,1694,346,683.0,1020.0,1357.0,6186643,141,15,46.5,78.0,109.5,2,0,0.5,1.0,1.5,0.0,0.0,2305.333252,1.308865,662.666687,0.274565,49.666668,1.007257,0.0,4,0.081633,2569,1.40613,610,0.230537,46,0.851852,1,1.0,133.31604,0.207577
1621105,13635,6,33,1,69,10,18,0.409091,1608,0.402907,834,0.225649,19,0.22619,1,1,2573868,530591380,233461527,123.418907,728.626282,897.310242,58497,2.804975,0.025056,0.025056,0.025056,0.297297,0.571107,0.330143,0.399399,0.332587,18,18,18.0,18.0,18.0,2698,1164,1547.5,1931.0,2314.5,1237,104,387.25,670.5,953.75,12058895,41,0,10.25,20.5,30.75,1,1,1.0,1.0,1.0,0.0,0.0,1381.333374,0.397939,816.666687,0.286008,16.333334,0.243571,0.0,0,0.0,1484,0.408478,593,0.199865,13,0.188406,0,0.0,37.183662,0.0
2176135,18209,4,23,0,57,12,7,0.14,1438,0.52926,656,0.144176,0,0.0,0,0,5289500,692500650,287431430,253.635513,254.421066,1425.738525,105790,5.07271,0.036805,0.036805,0.036109,0.09854,0.292574,0.112434,0.779338,0.779338,13,1,4.0,7.0,10.0,1438,522,751.0,980.0,1209.0,656,302,390.5,479.0,567.5,13850013,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,4.333333,0.085128,773.666687,0.334986,458.666656,0.102082,0.0,0.0,0.0,2,0.04,906,0.366357,446,0.098022,0,0.0,0,0.0,33.3456,0.073611


In [95]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'item_block_units', 'item_block_mean',
       'shop_block_units', 'shop_block_mean', 'cat_block_units',
       'cat_block_mean', 'shop_cat_block_units', 'shop_cat_block_mean',
       'shop_item_block_units', 'shop_item_block_mean', 'item_units',
       'cat_units', 'shop_units', 'item_share_of_total_units',
       'category_share_of_total_units', 'shop_share_of_units',
       'shop_item_units', 'shop_item_share_of_total_units',
       'shop_item_share_of_shop_units', 'item_share_of_shop_units',
       'shop_item_share_of_shop_units_mean', 'item_id_mean_encoding',
       'shop_id_mean_encoding', 'item_category_id_mean_encoding',
       'month_mean_encoding', 'date_block_num_mean_encoding',
       'item_max_units_block', 'item_min_units_block',
       'item_minmax_q0.25', 'item_minmax_q0.5', 'item_minmax_q0.75',
       'shop_max_units_block', 'shop_min_units_block',
       'shop_minmax_q0

In [93]:
gc.collect()

ZEROS_KEEP=0.2


#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['item_cnt_block']





x_val = training[training['date_block_num'] == 33]
y_val = x_val['item_cnt_block']

pos_val_len = len(y_val[y_val != 0])
print("pos_val_len", pos_val_len)

zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
print("zeros_keep_indices_val", len(zeros_keep_indices_val))
non_zeros_val_indices = y_val[y_val != 0].index
print("non_zeros_val_indices", len(non_zeros_val_indices))

val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

y_val = y_val.loc[val_indices]
x_val = x_val.loc[val_indices]



pos_val_len 29202
zeros_keep_indices_val 146010
non_zeros_val_indices 29202


In [86]:
features = [
    
    'item_units',
       'cat_units', 'shop_units', 'item_share_of_total_units',
       'category_share_of_total_units', 'shop_share_of_units',
       'shop_item_units', 'shop_item_share_of_total_units',
       'shop_item_share_of_shop_units', 'item_share_of_shop_units',
       'item_me', 'shop_me', 'category_me', 'shop_category_me',
       'shop_item_me', 'month_me', 'block_me', 'item_max_units_block',
       'item_min_units_block', 'item_minmax_q0.25', 'item_minmax_q0.5',
       'item_minmax_q0.75', 'shop_max_units_block',
       'shop_min_units_block', 'shop_minmax_q0.25', 'shop_minmax_q0.5',
       'shop_minmax_q0.75', 'cat_max_units_block', 'cat_min_units_block',
       'cat_minmax_q0.25', 'cat_minmax_q0.5', 'cat_minmax_q0.75',
       'shop_cat_units', 'shop_cat_max_units_block',
       'shop_cat_min_units_block', 'shop_cat_minmax_q0.25',
       'shop_cat_minmax_q0.5', 'shop_cat_minmax_q0.75',
       'shop_item_max_units_block', 'shop_item_min_units_block',
       'shop_item_minmax_q0.25', 'shop_item_minmax_q0.5',
       'shop_item_minmax_q0.75', 'item_block_units_rolling_3',
       'item_block_mean_rolling_3', 'shop_block_units_rolling_3',
       'shop_block_mean_rolling_3', 'cat_block_units_rolling_3',
       'cat_block_mean_rolling_3', 'shop_cat_block_units_rolling_3',
       'shop_cat_block_mean_rolling_3', 'shop_item_block_mean_rolling_3',
       'item_block_units_lag_1', 'item_block_mean_lag_1',
       'shop_block_units_lag_1', 'shop_block_mean_lag_1',
       'cat_block_units_lag_1', 'cat_block_mean_lag_1',
       'shop_cat_block_units_lag_1', 'shop_cat_block_mean_lag_1',
       'shop_item_block_units_lag_1', 'shop_item_block_mean_lag_1',
       'shop_block_units_lag_comp1', 'item_block_units_lag_comp1',
    'shop_item_share_of_shop_units_mean'


]

In [117]:

features = [
    
    
        'item_category_id',
       'item_block_mean_rolling_3',
       'shop_block_mean_rolling_3',
           'shop_cat_block_mean_rolling_3',



      'item_block_mean_lag_1',
        'shop_block_mean_lag_1',
            'shop_cat_block_mean_lag_1',
    
    'shop_item_share_of_shop_units_mean',
    'shop_item_block_mean_rolling_3',
    'shop_item_block_mean_lag_1',
    
#'item_id_mean_encoding',
       #'shop_id_mean_encoding',
    'item_category_id_mean_encoding',  
    #'month_mean_encoding', 'date_block_num_mean_encoding'

]




In [118]:
cb_model = CatBoostRegressor(iterations=1000,
                             #learning_rate=0.05,
                             eval_metric='RMSE',
                             task_type = "GPU",
                             use_best_model=True,
                             od_type = "Iter",
                             od_wait = 1,
                             bagging_temperature = 30,
                             cat_features=[0],
                             random_seed = 42)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


cb_model.fit(x_train[features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

scores = {}
for i,score in enumerate(cb_model.get_feature_importance()):
    scores[features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

0:	learn: 1.6489929	test: 1.3255770	best: 1.3255770 (0)	total: 131ms	remaining: 2m 10s
1:	learn: 1.6358324	test: 1.3165157	best: 1.3165157 (1)	total: 229ms	remaining: 1m 54s
2:	learn: 1.6212915	test: 1.3073018	best: 1.3073018 (2)	total: 318ms	remaining: 1m 45s
3:	learn: 1.6066127	test: 1.2981139	best: 1.2981139 (3)	total: 400ms	remaining: 1m 39s
4:	learn: 1.5929633	test: 1.2892429	best: 1.2892429 (4)	total: 463ms	remaining: 1m 32s
5:	learn: 1.5804808	test: 1.2811477	best: 1.2811477 (5)	total: 554ms	remaining: 1m 31s
6:	learn: 1.5668714	test: 1.2725177	best: 1.2725177 (6)	total: 656ms	remaining: 1m 33s
7:	learn: 1.5562899	test: 1.2661230	best: 1.2661230 (7)	total: 730ms	remaining: 1m 30s
8:	learn: 1.5452011	test: 1.2612956	best: 1.2612956 (8)	total: 810ms	remaining: 1m 29s
9:	learn: 1.5326998	test: 1.2537909	best: 1.2537909 (9)	total: 897ms	remaining: 1m 28s
10:	learn: 1.5225935	test: 1.2485987	best: 1.2485987 (10)	total: 1.02s	remaining: 1m 31s
11:	learn: 1.5115584	test: 1.2403208	best

95:	learn: 1.2422504	test: 1.0841383	best: 1.0841383 (95)	total: 8.59s	remaining: 1m 20s
96:	learn: 1.2413075	test: 1.0836275	best: 1.0836275 (96)	total: 8.67s	remaining: 1m 20s
97:	learn: 1.2408516	test: 1.0831979	best: 1.0831979 (97)	total: 8.73s	remaining: 1m 20s
98:	learn: 1.2404391	test: 1.0830890	best: 1.0830890 (98)	total: 8.82s	remaining: 1m 20s
99:	learn: 1.2395378	test: 1.0827974	best: 1.0827974 (99)	total: 8.89s	remaining: 1m 20s
100:	learn: 1.2392490	test: 1.0826679	best: 1.0826679 (100)	total: 8.99s	remaining: 1m 20s
101:	learn: 1.2384206	test: 1.0821185	best: 1.0821185 (101)	total: 9.1s	remaining: 1m 20s
102:	learn: 1.2372739	test: 1.0813764	best: 1.0813764 (102)	total: 9.21s	remaining: 1m 20s
103:	learn: 1.2355550	test: 1.0801477	best: 1.0801477 (103)	total: 9.29s	remaining: 1m 20s
104:	learn: 1.2341604	test: 1.0792286	best: 1.0792286 (104)	total: 9.44s	remaining: 1m 20s
105:	learn: 1.2337002	test: 1.0790760	best: 1.0790760 (105)	total: 9.54s	remaining: 1m 20s
106:	learn

186:	learn: 1.1952306	test: 1.0560316	best: 1.0560316 (186)	total: 16.4s	remaining: 1m 11s
187:	learn: 1.1950731	test: 1.0560501	best: 1.0560316 (186)	total: 16.4s	remaining: 1m 11s
bestTest = 1.056031568
bestIteration = 186
Shrink model to first 187 iterations.


[('item_block_mean_lag_1', 23.161935713285793),
 ('shop_item_block_mean_lag_1', 22.28052739140328),
 ('shop_item_block_mean_rolling_3', 10.472826614801972),
 ('item_block_mean_rolling_3', 7.558728394492474),
 ('shop_cat_block_mean_lag_1', 7.14785682413922),
 ('shop_cat_block_mean_rolling_3', 6.817373206720902),
 ('item_category_id_mean_encoding', 6.216066287911568),
 ('item_category_id', 4.960865561426438),
 ('shop_block_mean_lag_1', 4.389681532787209),
 ('shop_item_share_of_shop_units_mean', 4.012213756607869),
 ('shop_block_mean_rolling_3', 2.9819247164232574)]

In [None]:
features = [item[0] for item in scores.items() if item[1] > 2000]

In [98]:
test            = pd.read_csv('test.csv.gz')
test = test.set_index('item_id').join(items.set_index('item_id'))
test.reset_index(inplace=True)

In [99]:
item_features = [ 
    'shop_item_share_of_shop_units_mean','item_id_mean_encoding'
                ]

merge_col = ['item_id']
cols=item_features+merge_col

test = test.merge(training.drop_duplicates('item_id')[cols], on=merge_col, how='left')

In [108]:
shop_features = [
        'shop_id_mean_encoding'
]

merge_col = ['shop_id']
cols=shop_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [120]:
cat_features = [
        'item_category_id_mean_encoding'#,'cat_me_real'
]

merge_col = ['item_category_id']
cols=cat_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [102]:
def add_rolls_test(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name})\
            [cols+[roll_name]]
        
        print([cols[0:len(cols)-1]+[roll_name]])
        thirty_three = block_units_rolling_temp[block_units_rolling_temp['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1]+[roll_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')
    

        del block_units_rolling_temp
        gc.collect()
        

    
    return df
    

test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_rolls_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')


item_block_mean 3
[['item_id', 'item_block_mean_rolling_3']]
shop_block_mean 3
[['shop_id', 'shop_block_mean_rolling_3']]
shop_cat_block_mean 3
[['shop_id', 'item_category_id', 'shop_cat_block_mean_rolling_3']]


In [103]:
test = add_rolls_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3
[['shop_id', 'item_id', 'shop_item_block_mean_rolling_3']]


In [104]:
def add_lags_test(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()
        
        thirty_three = result[result['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1] + [lag_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')

        gc.collect()
    
    return df
                                         

                                        
test = add_lags_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_lags_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_lags_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')


item_block_mean 1
shop_block_mean 1
shop_cat_block_mean 1


In [105]:
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 1


In [121]:
test.fillna(0, inplace=True)

In [None]:
test.sample(10)

In [122]:
cb_preds = cb_model.predict(test[features])
cb_preds.clip(0,20,out=cb_preds)

array([0.094729  , 0.07426158, 0.19876491, ..., 0.10729751, 0.13896821,
       0.10729751])

In [123]:
print(np.mean(cb_preds))
print(np.max(cb_preds))

0.3577849171460316
12.209351871380932


In [None]:
cb_preds[0:100]

In [124]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = cb_preds

submission.to_csv('submission.csv', index=False)

In [None]:
training['shop_me_real']= training.groupby('shop_id')['shop_me'].transform(np.mean)
training['item_me_real']= training.groupby('item_id')['item_me'].transform(np.mean)
training['cat_me_real']= training.groupby('item_category_id')['item_me'].transform(np.mean)

In [None]:
training.groupby('item_id')['shop_item_share_of_shop_units'].transform(np.mean)

