In [23]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [24]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd
from sklearn.model_selection import KFold
from itertools import product

In [25]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [26]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'] != 2013]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [27]:
train_item_ids = sales_train['item_id'].unique()
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [28]:
combinations = []

for dbn in range(np.min(train_blocks), np.max(train_blocks)+1):
    sales = sales_train[sales_train.date_block_num==dbn]
    item_ids = np.intersect1d(sales.item_id.unique(), test_item_ids)
    dbn_combos = list(product(sales.shop_id.unique(), item_ids, [dbn]))
    for combo in dbn_combos:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])

In [29]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"item_cnt_block"})

training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)


training['item_cnt_block'] = training['item_cnt_block'].clip(0,20).astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

for col in ['item_id', 'shop_id', 'item_category_id']:
    training[col] = pd.to_numeric(training[col], downcast='unsigned')

In [30]:
dates = sales_train[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
training['month'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')


In [31]:
cols = ['item_id','date_block_num']

training.groupby(cols,as_index=False)['item_cnt_block'].mean().sample(10)

Unnamed: 0,item_id,date_block_num,item_cnt_block
52458,20607,22,1.86
26872,10642,18,0.14
39752,15447,29,0.116279
49212,19127,20,0.04
5289,2423,32,2.465116
15935,5995,33,0.022727
36235,14222,29,0.069767
40910,15897,30,0.27907
5102,2317,27,0.06383
31069,12363,23,0.66


In [32]:
def add_block_units_mean(df, cols, name):
    print(name)
    name_units = name + '_units'
    name_mean = name + '_mean'
    
    try:
        df.drop(columns=[name_units, name_mean],inplace=True)
    except:
        pass

    
    block_units = df.groupby(cols,as_index=False)['item_cnt_block'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_units})
    df = df.merge(block_units, on=cols, how='left')
    df[name_units].fillna(0,inplace=True)
    df[name_units] = pd.to_numeric(df[name_units].astype(int),downcast='unsigned')
    del block_units
    
    block_means = df.groupby(cols,as_index=False)['item_cnt_block'].mean()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_mean})
    df = df.merge(block_means, on=cols, how='left')
    df[name_mean].fillna(0,inplace=True)
    df[name_mean] = pd.to_numeric(df[name_mean],downcast='float')
    del block_means
    
    gc.collect()
    return df


training = add_block_units_mean(training, ['item_id','date_block_num'], 'item_block')
training = add_block_units_mean(training, ['shop_id','date_block_num'], 'shop_block')
training = add_block_units_mean(training, ['item_category_id','date_block_num'], 'cat_block')
training = add_block_units_mean(training, ['shop_id', 'item_category_id','date_block_num'], 'shop_cat_block')
training = add_block_units_mean(training, ['shop_id', 'item_id','date_block_num'], 'shop_item_block')

item_block
shop_block
cat_block
shop_cat_block
shop_item_block


In [33]:
number_of_items = sales_train['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = sales_train['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = sales_train['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = sales_train['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = sales_train['item_cnt_day'].sum()
print("total_sales:", total_sales)
average_price = sales_train['item_price'].mean()
print("average_price:", average_price)

training['item_units'] = pd.to_numeric(training.groupby(['date_block_num'])['item_block_units'].transform(np.sum),downcast='unsigned')
training['cat_units'] = pd.to_numeric(training.groupby(['date_block_num'])['cat_block_units'].transform(np.sum),downcast='unsigned')
training['shop_units'] = pd.to_numeric(training.groupby(['date_block_num'])['shop_block_units'].transform(np.sum),downcast='unsigned')

training['item_share_of_total_units'] = pd.to_numeric(training['item_units'] * 100 / total_sales,downcast='float')
training['category_share_of_total_units'] = pd.to_numeric(training['cat_units'] * 100 / total_sales,downcast='float')
training['shop_share_of_units'] = pd.to_numeric(training['shop_units'] * 100 / total_sales,downcast='float')
training['shop_item_units'] = pd.to_numeric(training.groupby(['date_block_num'])\
                                            ['shop_item_block_units'].transform(np.sum),downcast='unsigned')

training['shop_item_share_of_total_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / total_sales,downcast='float')
training['shop_item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / training['shop_units'],downcast='float')


training['item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100 / training['shop_units'],downcast='float')

training['shop_item_share_of_shop_units_mean'] = training.groupby('item_id')['shop_item_share_of_shop_units'].transform(np.mean)


number_of_items: 17054
number_of_categories: 79
number_of_shops: 55
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473.0
average_price: 1015.5023073772021


In [34]:
def get_mean_encoding(df, group_cols, target):
    cumsum = df.groupby(group_cols)[target].cumsum() - df[target]
    cumcnt = df.groupby(group_cols).cumcount()
    return pd.to_numeric(cumsum/cumcnt, downcast='float')

training['item_me'] = get_mean_encoding(training, ['item_id'], 'item_cnt_block')
training['shop_me'] = get_mean_encoding(training, ['shop_id'], 'item_cnt_block')
training['category_me'] = get_mean_encoding(training, ['item_category_id'], 'item_cnt_block')
training['shop_category_me'] = get_mean_encoding(training, ['shop_id', 'item_category_id'], 'item_cnt_block')
training['shop_item_me'] = get_mean_encoding(training, ['shop_id', 'item_id'], 'item_cnt_block')
training['month_me'] = get_mean_encoding(training, ['month'], 'item_cnt_block')
training['block_me'] = get_mean_encoding(training, ['date_block_num'], 'item_cnt_block')



training.fillna(0,inplace=True)

In [35]:
def add_min_max_quantiles(df, cols, name):
    print(name)

    block_name = name+'_block_units'
    units_name = name+'_units'
    max_name = name+'_max_units_block'
    min_name = name+'_min_units_block'
    
    try:
        df.drop(columns=[units_name, max_name, min_name, min_max_name],inplace=True)
    except:
        pass


    df[units_name] = pd.to_numeric(df.groupby(['date_block_num'])[block_name].transform(np.sum), downcast='unsigned')
    df[max_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.max), downcast='unsigned')
    df[min_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.min), downcast='unsigned')
    


    for q in [0.25,0.50,0.75]:
        qname = name+'_minmax_q' + str(q)
        try:
            df.drop(columns=[qname],inplace=True)
        except:
            pass
        df[qname] =  pd.to_numeric(df[[min_name,max_name]].quantile(q,axis=1), downcast='unsigned')
        
    return df

training = add_min_max_quantiles(training, ['item_id'], 'item')
training = add_min_max_quantiles(training, ['shop_id'], 'shop')
training = add_min_max_quantiles(training, ['item_category_id'], 'cat')
training = add_min_max_quantiles(training, ['shop_id','item_category_id'], 'shop_cat')
training = add_min_max_quantiles(training, ['shop_id','item_id'], 'shop_item')

item
shop
cat
shop_cat
shop_item


In [36]:
def add_rolls(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name_tmp})\
            [cols+[roll_name_tmp]]
        
    
        df = df.merge(block_units_rolling_temp, on=cols, how='left')
        #print(df.columns.values)
        del block_units_rolling_temp
        gc.collect()
        

        block_units_rolling = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [roll_name_tmp].shift(1)\
            .rename(columns={roll_name_tmp:roll_name}).reset_index()

        df = df.merge(block_units_rolling, on=cols, how='left')
        df[roll_name].fillna(0,inplace=True)
        df[roll_name] = pd.to_numeric(df[roll_name], downcast='float')
        df.drop(columns=[roll_name_tmp], inplace=True)
        del block_units_rolling
        gc.collect()
    
    return df
    

training = add_rolls(training, ['item_id','date_block_num'], 'item_block_units')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item')

item_block_units 3
item_block_mean 3
shop_block_units 3
shop_block_mean 3
cat_block_units 3
cat_block_mean 3
shop_cat_block_units 3
shop_cat_block_mean 3


In [37]:
training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3


In [38]:
def add_lags(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()

        df = df.merge(result, on=cols, how='left')
        df[lag_name].fillna(0,inplace=True)
        if "mean" in name:
            df[lag_name] = pd.to_numeric(df[lag_name], downcast='float')
        else:
            df[lag_name] = pd.to_numeric(df[lag_name].astype(int), downcast='unsigned')
        del result
        gc.collect()
    
    return df
                                         

                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_block_units')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

item_block_units 1
item_block_mean 1
shop_block_units 1
shop_block_mean 1
cat_block_units 1
cat_block_mean 1
shop_cat_block_units 1
shop_cat_block_mean 1
shop_item_block_units 1
shop_item_block_mean 1


In [39]:
training['shop_block_units_lag_comp1'] = pd.to_numeric(training['shop_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

#training['shop_share_item_units_comp'] = training['item_units'] * training['shop_share_of_units']
training['item_block_units_lag_comp1'] = pd.to_numeric(training['item_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

In [40]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'item_block_units', 'item_block_mean',
       'shop_block_units', 'shop_block_mean', 'cat_block_units',
       'cat_block_mean', 'shop_cat_block_units', 'shop_cat_block_mean',
       'shop_item_block_units', 'shop_item_block_mean', 'item_units',
       'cat_units', 'shop_units', 'item_share_of_total_units',
       'category_share_of_total_units', 'shop_share_of_units',
       'shop_item_units', 'shop_item_share_of_total_units',
       'shop_item_share_of_shop_units', 'item_share_of_shop_units',
       'shop_item_share_of_shop_units_mean', 'item_me', 'shop_me',
       'category_me', 'shop_category_me', 'shop_item_me', 'month_me',
       'block_me', 'item_max_units_block', 'item_min_units_block',
       'item_minmax_q0.25', 'item_minmax_q0.5', 'item_minmax_q0.75',
       'shop_max_units_block', 'shop_min_units_block',
       'shop_minmax_q0.25', 'shop_minmax_q0.5', 'shop_minmax_q0.75',
  

In [41]:

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_block,item_category_id,month,item_block_units,item_block_mean,shop_block_units,shop_block_mean,cat_block_units,cat_block_mean,shop_cat_block_units,shop_cat_block_mean,shop_item_block_units,shop_item_block_mean,item_units,cat_units,shop_units,item_share_of_total_units,category_share_of_total_units,shop_share_of_units,shop_item_units,shop_item_share_of_total_units,shop_item_share_of_shop_units,item_share_of_shop_units,shop_item_share_of_shop_units_mean,item_me,shop_me,category_me,shop_category_me,shop_item_me,month_me,block_me,item_max_units_block,item_min_units_block,item_minmax_q0.25,item_minmax_q0.5,item_minmax_q0.75,shop_max_units_block,shop_min_units_block,shop_minmax_q0.25,shop_minmax_q0.5,shop_minmax_q0.75,cat_max_units_block,cat_min_units_block,cat_minmax_q0.25,cat_minmax_q0.5,cat_minmax_q0.75,shop_cat_units,shop_cat_max_units_block,shop_cat_min_units_block,shop_cat_minmax_q0.25,shop_cat_minmax_q0.5,shop_cat_minmax_q0.75,shop_item_max_units_block,shop_item_min_units_block,shop_item_minmax_q0.25,shop_item_minmax_q0.5,shop_item_minmax_q0.75,item_block_units_rolling_3,item_block_mean_rolling_3,shop_block_units_rolling_3,shop_block_mean_rolling_3,cat_block_units_rolling_3,cat_block_mean_rolling_3,shop_cat_block_units_rolling_3,shop_cat_block_mean_rolling_3,shop_item_block_mean_rolling_3,item_block_units_lag_1,item_block_mean_lag_1,shop_block_units_lag_1,shop_block_mean_lag_1,cat_block_units_lag_1,cat_block_mean_lag_1,shop_cat_block_units_lag_1,shop_cat_block_mean_lag_1,shop_item_block_units_lag_1,shop_item_block_mean_lag_1,shop_block_units_lag_comp1,item_block_units_lag_comp1
584204,4694,59,13,0,57,2,4,0.086957,656,0.417568,457,0.139927,0,0.0,0,0,2049254,175652748,69986479,98.263275,184.804901,1296.435181,44549,2.136158,0.063654,0.063654,0.043519,0.102161,0.438426,0.096746,0.0,0.0,0.674299,0.888021,10,2,4.0,6.0,8.0,1402,519,739.75,960.5,1181.25,656,302,390.5,479.0,567.5,3818538,3,0,0.75,1.5,2.25,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.130435,640,0.419397,473,0.144825,1,0.014085,0,0.0,40.73838,0.381922
1716943,14386,26,19,0,40,8,1,0.019608,868,0.431841,5971,0.460939,90,0.354331,0,0,2909805,378932448,114680550,139.527344,1694.342896,1380.080444,57055,2.73583,0.049751,0.049751,0.042838,0.111111,0.447093,0.401876,0.376605,0.0,0.496476,0.596879,23,1,6.5,12.0,17.5,1984,616,958.0,1300.0,1642.0,9071,3834,5143.25,6452.5,7761.75,7430048,179,78,103.25,128.5,153.75,1,0,0.25,0.5,0.75,1.666667,0.033605,789.0,0.431341,4720.0,0.451115,96.666664,0.455797,0.0,3,0.06,757,0.392839,4800,0.393443,97,0.397541,0,0.0,37.661694,0.149254
1829101,15287,26,17,0,63,6,86,1.755102,824,0.451013,735,0.405405,4,0.108108,0,0,2331175,276254895,86919525,111.781593,889.815247,48.918354,47575,2.281257,0.054735,0.054735,0.043519,1.02963,0.444487,0.558809,0.279197,1.4,0.474436,0.589499,137,13,44.0,75.0,106.0,1984,616,958.0,1300.0,1642.0,3069,728,1313.25,1898.5,2483.75,5637855,44,4,14.0,24.0,34.0,4,0,1.0,2.0,3.0,105.333336,2.169076,761.333313,0.45214,854.0,0.488108,11.333333,0.315723,1.666667,92,1.877551,786,0.450172,785,0.457726,11,0.314286,1,1.0,43.021347,5.035578
680975,5467,24,17,0,56,6,2,0.040816,941,0.515052,68,0.138776,0,0.0,0,0,2331175,276254895,86919525,111.781593,889.815247,48.918354,47575,2.281257,0.054735,0.054735,0.044646,0.060423,0.697377,0.262281,0.241611,0.4,0.55247,0.675763,8,1,2.75,4.5,6.25,1971,794,1088.25,1382.5,1676.75,736,33,208.75,384.5,560.25,5637855,13,0,3.25,6.5,9.75,2,0,0.5,1.0,1.5,3.0,0.061366,893.0,0.530502,47.666668,0.122489,2.0,0.271429,0.666667,3,0.061224,836,0.478809,60,0.122449,1,0.1,0,0.0,45.758076,0.164204
1020004,8513,58,32,0,43,9,2,0.046512,1256,0.34572,122,0.048088,0,0.0,0,0,2293921,415853516,193809651,109.995239,1405.266724,1055.442017,53347,2.558029,0.027525,0.027525,0.044335,0.077081,0.702703,0.116304,0.0,0.0,0.555993,0.458369,8,1,2.75,4.5,6.25,3219,955,1521.0,2087.0,2653.0,426,122,198.0,274.0,350.0,9671012,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,2.0,0.046696,1455.333374,0.440169,155.333328,0.062011,0.0,0.0,0.0,1,0.02381,1584,0.458333,122,0.058095,0,0.0,0,0.0,43.60033,0.027525
923014,7781,47,20,0,31,9,16,0.32,1165,0.549269,340,0.074725,0,0.0,0,0,2504800,300603750,106253616,120.107048,2057.361084,976.002563,50096,2.402141,0.047148,0.047148,0.036321,0.0,0.857975,0.079126,0.0,0.0,0.564999,0.696472,16,1,4.75,8.5,12.25,2406,803,1203.75,1604.5,2005.25,675,171,297.0,423.0,549.0,6012075,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,1107.666626,0.574575,293.333344,0.066597,0.0,0.0,0.0,0,0.0,1294,0.643781,305,0.065719,0,0.0,0,0.0,61.008957,0.0
1967631,16138,42,23,6,65,12,53,1.06,4116,1.514906,3557,1.580889,156,3.466667,6,6,5289500,692500650,287431430,253.635513,254.421066,1425.738525,105790,5.07271,0.036805,0.036805,0.03504,0.523923,0.9608,0.71634,2.336927,1.666667,0.823613,0.823613,53,8,19.25,30.5,41.75,4116,1392,2073.0,2754.0,3435.0,3557,882,1550.75,2219.5,2888.25,13850013,195,34,74.25,114.5,154.75,6,0,1.5,3.0,4.5,20.0,0.393333,2055.0,0.889812,1558.666626,0.776703,66.666664,1.662616,1.666667,26,0.52,2359,0.953902,1833,0.814667,93,2.066667,1,1.0,86.8237,0.956938
1036872,8690,59,31,0,55,8,11,0.261905,1065,0.30816,5742,0.245448,76,0.136445,0,0,2373336,435573768,195291648,113.803246,291.39856,1126.504883,56508,2.709601,0.028935,0.028935,0.043519,0.267375,0.470798,0.359622,0.209109,0.368421,0.626173,0.498929,22,5,9.25,13.5,17.75,1402,519,739.75,960.5,1181.25,11613,4757,6471.0,8185.0,9899.0,10370804,126,59,75.75,92.5,109.25,3,0,0.75,1.5,2.25,11.333333,0.26198,803.666687,0.25371,5819.333496,0.248065,92.333336,0.170638,0.0,15,0.348837,876,0.264493,6036,0.255222,96,0.174545,0,0.0,25.347221,0.434028
2123510,17756,55,20,0,43,9,7,0.14,793,0.37388,259,0.094182,0,0.0,0,0,2504800,300603750,106253616,120.107048,2057.361084,976.002563,50096,2.402141,0.047148,0.047148,0.043519,0.138741,0.342578,0.102473,0.0,0.0,0.422916,0.505643,12,1,3.75,6.5,9.25,1203,427,621.0,815.0,1009.0,426,122,198.0,274.0,350.0,6012075,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,8.0,0.160979,555.0,0.28924,336.0,0.125365,0.0,0.0,0.0,5,0.098039,581,0.289055,314,0.114016,0,0.0,0,0.0,27.39274,0.235738
1325138,11069,50,33,0,40,10,8,0.181818,900,0.225507,7106,0.294708,107,0.195255,0,0,2573868,530591380,233461527,123.418907,728.626282,897.310242,58497,2.804975,0.025056,0.025056,0.029407,0.540284,0.484621,0.394343,0.345285,0.4,0.45655,0.358953,68,8,23.0,38.0,53.0,1554,665,887.25,1109.5,1331.75,9071,3834,5143.25,6452.5,7761.75,12058895,181,62,91.75,121.5,151.25,2,0,0.5,1.0,1.5,16.333334,0.383721,942.333313,0.272782,6467.333496,0.326357,99.333336,0.213819,0.0,12,0.27907,848,0.233416,5930,0.272543,100,0.197628,0,0.0,21.247808,0.300677


In [42]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'item_block_units', 'item_block_mean',
       'shop_block_units', 'shop_block_mean', 'cat_block_units',
       'cat_block_mean', 'shop_cat_block_units', 'shop_cat_block_mean',
       'shop_item_block_units', 'shop_item_block_mean', 'item_units',
       'cat_units', 'shop_units', 'item_share_of_total_units',
       'category_share_of_total_units', 'shop_share_of_units',
       'shop_item_units', 'shop_item_share_of_total_units',
       'shop_item_share_of_shop_units', 'item_share_of_shop_units',
       'shop_item_share_of_shop_units_mean', 'item_me', 'shop_me',
       'category_me', 'shop_category_me', 'shop_item_me', 'month_me',
       'block_me', 'item_max_units_block', 'item_min_units_block',
       'item_minmax_q0.25', 'item_minmax_q0.5', 'item_minmax_q0.75',
       'shop_max_units_block', 'shop_min_units_block',
       'shop_minmax_q0.25', 'shop_minmax_q0.5', 'shop_minmax_q0.75',
  

In [49]:
gc.collect()

ZEROS_KEEP=0.2


#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['item_cnt_block']





x_val = training[training['date_block_num'] == 33]
y_val = x_val['item_cnt_block']

pos_val_len = len(y_val[y_val != 0])
print("pos_val_len", pos_val_len)

zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
print("zeros_keep_indices_val", len(zeros_keep_indices_val))
non_zeros_val_indices = y_val[y_val != 0].index
print("non_zeros_val_indices", len(non_zeros_val_indices))

val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

y_val = y_val.loc[val_indices]
x_val = x_val.loc[val_indices]



pos_val_len 29202
zeros_keep_indices_val 146010
non_zeros_val_indices 29202


In [44]:
features = [
    
    'item_units',
       'cat_units', 'shop_units', 'item_share_of_total_units',
       'category_share_of_total_units', 'shop_share_of_units',
       'shop_item_units', 'shop_item_share_of_total_units',
       'shop_item_share_of_shop_units', 'item_share_of_shop_units',
       'item_me', 'shop_me', 'category_me', 'shop_category_me',
       'shop_item_me', 'month_me', 'block_me', 'item_max_units_block',
       'item_min_units_block', 'item_minmax_q0.25', 'item_minmax_q0.5',
       'item_minmax_q0.75', 'shop_max_units_block',
       'shop_min_units_block', 'shop_minmax_q0.25', 'shop_minmax_q0.5',
       'shop_minmax_q0.75', 'cat_max_units_block', 'cat_min_units_block',
       'cat_minmax_q0.25', 'cat_minmax_q0.5', 'cat_minmax_q0.75',
       'shop_cat_units', 'shop_cat_max_units_block',
       'shop_cat_min_units_block', 'shop_cat_minmax_q0.25',
       'shop_cat_minmax_q0.5', 'shop_cat_minmax_q0.75',
       'shop_item_max_units_block', 'shop_item_min_units_block',
       'shop_item_minmax_q0.25', 'shop_item_minmax_q0.5',
       'shop_item_minmax_q0.75', 'item_block_units_rolling_3',
       'item_block_mean_rolling_3', 'shop_block_units_rolling_3',
       'shop_block_mean_rolling_3', 'cat_block_units_rolling_3',
       'cat_block_mean_rolling_3', 'shop_cat_block_units_rolling_3',
       'shop_cat_block_mean_rolling_3', 'shop_item_block_mean_rolling_3',
       'item_block_units_lag_1', 'item_block_mean_lag_1',
       'shop_block_units_lag_1', 'shop_block_mean_lag_1',
       'cat_block_units_lag_1', 'cat_block_mean_lag_1',
       'shop_cat_block_units_lag_1', 'shop_cat_block_mean_lag_1',
       'shop_item_block_units_lag_1', 'shop_item_block_mean_lag_1',
       'shop_block_units_lag_comp1', 'item_block_units_lag_comp1',
    'shop_item_share_of_shop_units_mean'


]

In [45]:

features = [
    
    
        'item_category_id',
       'item_block_mean_rolling_3',
       'shop_block_mean_rolling_3',
           'shop_cat_block_mean_rolling_3',



      'item_block_mean_lag_1',
        'shop_block_mean_lag_1',
            'shop_cat_block_mean_lag_1',
    
    'shop_item_share_of_shop_units_mean',
    'shop_item_block_mean_rolling_3',
    'shop_item_block_mean_lag_1',
    
    'cat_me_real'

]




In [50]:
cb_model = CatBoostRegressor(iterations=1000,
                             #learning_rate=0.05,
                             eval_metric='RMSE',
                             task_type = "GPU",
                             use_best_model=True,
                             od_type = "Iter",
                             od_wait = 1,
                             bagging_temperature = 30,
                             cat_features=[0],
                             random_seed = 42)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


cb_model.fit(x_train[features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

scores = {}
for i,score in enumerate(cb_model.get_feature_importance()):
    scores[features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

0:	learn: 1.6489952	test: 1.3255744	best: 1.3255744 (0)	total: 133ms	remaining: 2m 12s
1:	learn: 1.6360833	test: 1.3161121	best: 1.3161121 (1)	total: 223ms	remaining: 1m 51s
2:	learn: 1.6210921	test: 1.3071854	best: 1.3071854 (2)	total: 309ms	remaining: 1m 42s
3:	learn: 1.6072970	test: 1.2987835	best: 1.2987835 (3)	total: 421ms	remaining: 1m 44s
4:	learn: 1.5932814	test: 1.2899082	best: 1.2899082 (4)	total: 484ms	remaining: 1m 36s
5:	learn: 1.5815314	test: 1.2830533	best: 1.2830533 (5)	total: 576ms	remaining: 1m 35s
6:	learn: 1.5668766	test: 1.2735402	best: 1.2735402 (6)	total: 634ms	remaining: 1m 29s
7:	learn: 1.5574623	test: 1.2674831	best: 1.2674831 (7)	total: 707ms	remaining: 1m 27s
8:	learn: 1.5461364	test: 1.2621023	best: 1.2621023 (8)	total: 790ms	remaining: 1m 26s
9:	learn: 1.5376491	test: 1.2562009	best: 1.2562009 (9)	total: 879ms	remaining: 1m 27s
10:	learn: 1.5271767	test: 1.2509335	best: 1.2509335 (10)	total: 998ms	remaining: 1m 29s
11:	learn: 1.5157625	test: 1.2424675	best

95:	learn: 1.2425788	test: 1.0814862	best: 1.0814862 (95)	total: 8.5s	remaining: 1m 20s
96:	learn: 1.2422376	test: 1.0814215	best: 1.0814215 (96)	total: 8.58s	remaining: 1m 19s
97:	learn: 1.2393810	test: 1.0795945	best: 1.0795945 (97)	total: 8.66s	remaining: 1m 19s
98:	learn: 1.2391957	test: 1.0795083	best: 1.0795083 (98)	total: 8.73s	remaining: 1m 19s
99:	learn: 1.2380537	test: 1.0788017	best: 1.0788017 (99)	total: 8.8s	remaining: 1m 19s
100:	learn: 1.2376230	test: 1.0785621	best: 1.0785621 (100)	total: 8.88s	remaining: 1m 19s
101:	learn: 1.2373340	test: 1.0783323	best: 1.0783323 (101)	total: 8.96s	remaining: 1m 18s
102:	learn: 1.2366965	test: 1.0779288	best: 1.0779288 (102)	total: 9.04s	remaining: 1m 18s
103:	learn: 1.2360018	test: 1.0774977	best: 1.0774977 (103)	total: 9.14s	remaining: 1m 18s
104:	learn: 1.2346561	test: 1.0766614	best: 1.0766614 (104)	total: 9.29s	remaining: 1m 19s
105:	learn: 1.2341217	test: 1.0764274	best: 1.0764274 (105)	total: 9.4s	remaining: 1m 19s
106:	learn: 

[('shop_item_block_mean_lag_1', 23.71565136501142),
 ('item_block_mean_lag_1', 23.215630417612566),
 ('shop_item_block_mean_rolling_3', 11.333917681025751),
 ('shop_cat_block_mean_rolling_3', 8.141379456734041),
 ('shop_cat_block_mean_lag_1', 7.85224613076059),
 ('item_block_mean_rolling_3', 7.413731008528507),
 ('item_category_id', 5.373347189247112),
 ('shop_item_share_of_shop_units_mean', 3.9771815410886022),
 ('shop_block_mean_lag_1', 3.7014969239778712),
 ('shop_block_mean_rolling_3', 2.7073566374972673),
 ('cat_me_real', 2.5680616485162626)]

In [None]:
features = [item[0] for item in scores.items() if item[1] > 2000]

In [51]:
test            = pd.read_csv('test.csv.gz')
test = test.set_index('item_id').join(items.set_index('item_id'))
test.reset_index(inplace=True)

In [52]:
item_features = [ 
    'shop_item_share_of_shop_units_mean'
                ]

merge_col = ['item_id']
cols=item_features+merge_col

test = test.merge(training.drop_duplicates('item_id')[cols], on=merge_col, how='left')

In [53]:
shop_features = [
        'shop_me'
]

merge_col = ['shop_id']
cols=shop_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [54]:
cat_features = [
        'cat_me_real'
]

merge_col = ['item_category_id']
cols=cat_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [55]:
def add_rolls_test(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name})\
            [cols+[roll_name]]
        
        print([cols[0:len(cols)-1]+[roll_name]])
        thirty_three = block_units_rolling_temp[block_units_rolling_temp['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1]+[roll_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')
    

        del block_units_rolling_temp
        gc.collect()
        

    
    return df
    

test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_rolls_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')


item_block_mean 3
[['item_id', 'item_block_mean_rolling_3']]
shop_block_mean 3
[['shop_id', 'shop_block_mean_rolling_3']]
shop_cat_block_mean 3
[['shop_id', 'item_category_id', 'shop_cat_block_mean_rolling_3']]


In [56]:
test = add_rolls_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3
[['shop_id', 'item_id', 'shop_item_block_mean_rolling_3']]


In [57]:
def add_lags_test(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()
        
        thirty_three = result[result['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1] + [lag_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')

        gc.collect()
    
    return df
                                         

                                        
test = add_lags_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_lags_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_lags_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')


item_block_mean 1
shop_block_mean 1
shop_cat_block_mean 1


In [58]:
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 1


In [59]:
test.fillna(0, inplace=True)

In [None]:
test.sample(10)

In [60]:
cb_preds = cb_model.predict(test[features])
cb_preds.clip(0,20,out=cb_preds)

array([0.08082375, 0.05481306, 0.20990818, ..., 0.13196174, 0.16219354,
       0.13532393])

In [61]:
print(np.mean(cb_preds))
print(np.max(cb_preds))

0.3648159684964606
11.598489077980048


In [None]:
cb_preds[0:100]

In [62]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = cb_preds

submission.to_csv('submission.csv', index=False)

In [47]:
training['shop_me_real']= training.groupby('shop_id')['shop_me'].transform(np.mean)
training['item_me_real']= training.groupby('item_id')['item_me'].transform(np.mean)
training['cat_me_real']= training.groupby('item_category_id')['item_me'].transform(np.mean)

In [None]:
training.groupby('item_id')['shop_item_share_of_shop_units'].transform(np.mean)

