In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd
from sklearn.model_selection import KFold
from itertools import product

In [3]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [4]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'] != 2013]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [5]:
# Якутск Орджоникидзе, 56
sales_train.loc[sales_train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
sales_train.loc[sales_train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
sales_train.loc[sales_train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

In [6]:
sums = sales_train.groupby('item_id')['item_cnt_day'].sum().reset_index().rename(columns={"item_cnt_day":"item_total_sales"}).sort_values(by='item_total_sales')

ids_reject = sums[(sums['item_total_sales'] > 0) & (sums['item_total_sales'] < 1000)]['item_id'].unique()

In [7]:
train_item_ids = sales_train['item_id'].unique()
train_item_ids = np.setdiff1d(train_item_ids, ids_reject)
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [8]:
combinations = []

for dbn in range(np.min(train_blocks), np.max(train_blocks)+1):
    sales = sales_train[sales_train.date_block_num==dbn]
    item_ids = np.intersect1d(sales.item_id.unique(), test_item_ids)
    dbn_combos = list(product(sales.shop_id.unique(), item_ids, [dbn]))
    for combo in dbn_combos:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])

In [9]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"item_cnt_block"})

training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)


training['item_cnt_block'] = training['item_cnt_block'].clip(0,20).astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

for col in ['item_id', 'shop_id', 'item_category_id']:
    training[col] = pd.to_numeric(training[col], downcast='unsigned')

In [10]:
dates = sales_train[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
training['month'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')


In [11]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
#columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]
columns = ["item_id", "shop_id", "item_category_id", "month",  "date_block_num"]



y_train = training["item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1

fold 1
fold 2
fold 3
fold 4
fold 5


In [12]:
cols = ['item_id','date_block_num']

training.groupby(cols,as_index=False)['item_cnt_block'].mean().sample(10)

Unnamed: 0,item_id,date_block_num,item_cnt_block
41112,16009,12,0.326087
21768,8508,21,0.269231
49217,19127,28,0.068182
7943,3328,23,0.04
38512,15161,33,0.022727
28029,10943,20,0.18
47293,18425,23,0.14
50860,19888,22,0.1
25472,10224,27,0.042553
19933,7834,18,7.74


In [13]:
def add_block_units_stats(df, cols, name):
    print(name)
    name_units = name + '_units'
    name_mean = name + '_mean'
    name_median = name + '_median'
    name_max = name + '_max'
    name_min = name + '_min'
    name_std = name + '_std'
    
    
    try:
        df.drop(columns=[name_units, name_mean, name_median],inplace=True)
    except:
        pass

    
    block_units = df.groupby(cols,as_index=False)['item_cnt_block'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_units})
    df = df.merge(block_units, on=cols, how='left')
    df[name_units].fillna(0,inplace=True)
    df[name_units] = pd.to_numeric(df[name_units].astype(int),downcast='unsigned')
    del block_units
    
    block_units_med = df.groupby(cols,as_index=False)['item_cnt_block'].median()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_median})
    df = df.merge(block_units_med, on=cols, how='left')
    df[name_median].fillna(0,inplace=True)
    df[name_median] = pd.to_numeric(df[name_median].astype(int),downcast='float')
    del block_units_med
    
    block_means = df.groupby(cols,as_index=False)['item_cnt_block'].mean()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_mean})
    df = df.merge(block_means, on=cols, how='left')
    df[name_mean].fillna(0,inplace=True)
    df[name_mean] = pd.to_numeric(df[name_mean],downcast='float')
    del block_means
    
    block_max = df.groupby(cols,as_index=False)['item_cnt_block'].max()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_max})
    df = df.merge(block_max, on=cols, how='left')
    df[name_max].fillna(0,inplace=True)
    df[name_max] = pd.to_numeric(df[name_max],downcast='float')
    del block_max
    
    block_min = df.groupby(cols,as_index=False)['item_cnt_block'].min()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_min})
    df = df.merge(block_min, on=cols, how='left')
    df[name_min].fillna(0,inplace=True)
    df[name_min] = pd.to_numeric(df[name_min],downcast='float')
    del block_min
    
    block_std = df.groupby(cols,as_index=False)['item_cnt_block'].std()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_std})
    df = df.merge(block_std, on=cols, how='left')
    df[name_std].fillna(0,inplace=True)
    df[name_std] = pd.to_numeric(df[name_std],downcast='float')
    del block_std
    
    gc.collect()
    return df


training = add_block_units_stats(training, ['item_id','date_block_num'], 'item_block')
training = add_block_units_stats(training, ['shop_id','date_block_num'], 'shop_block')
training = add_block_units_stats(training, ['item_category_id','date_block_num'], 'cat_block')
training = add_block_units_stats(training, ['shop_id', 'item_category_id','date_block_num'], 'shop_cat_block')
training = add_block_units_stats(training, ['shop_id', 'item_id','date_block_num'], 'shop_item_block')

item_block




shop_block
cat_block
shop_cat_block
shop_item_block


In [14]:
number_of_items = sales_train['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = sales_train['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = sales_train['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = sales_train['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = sales_train['item_cnt_day'].sum()
print("total_sales:", total_sales)
average_price = sales_train['item_price'].mean()
print("average_price:", average_price)

training['item_units'] = pd.to_numeric(training.groupby(['date_block_num'])['item_block_units'].transform(np.sum),downcast='unsigned')
training['cat_units'] = pd.to_numeric(training.groupby(['date_block_num'])['cat_block_units'].transform(np.sum),downcast='unsigned')
training['shop_units'] = pd.to_numeric(training.groupby(['date_block_num'])['shop_block_units'].transform(np.sum),downcast='unsigned')

training['item_share_of_total_units'] = pd.to_numeric(training['item_units'] * 100 / total_sales,downcast='float')
training['category_share_of_total_units'] = pd.to_numeric(training['cat_units'] * 100 / total_sales,downcast='float')
training['shop_share_of_units'] = pd.to_numeric(training['shop_units'] * 100 / total_sales,downcast='float')
training['shop_item_units'] = pd.to_numeric(training.groupby(['date_block_num'])\
                                            ['shop_item_block_units'].transform(np.sum),downcast='unsigned')

training['shop_item_share_of_total_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / total_sales,downcast='float')
training['shop_item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / training['shop_units'],downcast='float')


training['item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100 / training['shop_units'],downcast='float')

training['shop_item_share_of_shop_units_mean'] = training.groupby('item_id')['shop_item_share_of_shop_units'].transform(np.mean)


number_of_items: 17054
number_of_categories: 79
number_of_shops: 54
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473.0
average_price: 1015.5023073772021


In [15]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
#columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]
columns = ["item_id", "shop_id", "item_category_id", "month",  "date_block_num"]



y_train = training["item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1


training.fillna(0,inplace=True)

fold 1
fold 2
fold 3
fold 4
fold 5


In [16]:
def add_min_max_quantiles(df, cols, name):
    print(name)

    block_name = name+'_block_units'
    units_name = name+'_units'
    max_name = name+'_max_units_block'
    min_name = name+'_min_units_block'
    
    try:
        df.drop(columns=[units_name, max_name, min_name, min_max_name],inplace=True)
    except:
        pass


    df[units_name] = pd.to_numeric(df.groupby(['date_block_num'])[block_name].transform(np.sum), downcast='unsigned')
    df[max_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.max), downcast='unsigned')
    df[min_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.min), downcast='unsigned')
    


    for q in [0.25,0.50,0.75]:
        qname = name+'_minmax_q' + str(q)
        try:
            df.drop(columns=[qname],inplace=True)
        except:
            pass
        df[qname] =  pd.to_numeric(df[[min_name,max_name]].quantile(q,axis=1), downcast='unsigned')
        
    return df

training = add_min_max_quantiles(training, ['item_id'], 'item')
training = add_min_max_quantiles(training, ['shop_id'], 'shop')
training = add_min_max_quantiles(training, ['item_category_id'], 'cat')
training = add_min_max_quantiles(training, ['shop_id','item_category_id'], 'shop_cat')
training = add_min_max_quantiles(training, ['shop_id','item_id'], 'shop_item')

item
shop
cat
shop_cat
shop_item


In [17]:
def add_rolls(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name_tmp})\
            [cols+[roll_name_tmp]]
        
    
        df = df.merge(block_units_rolling_temp, on=cols, how='left')
        #print(df.columns.values)
        del block_units_rolling_temp
        gc.collect()
        

        block_units_rolling = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [roll_name_tmp].shift(1)\
            .rename(columns={roll_name_tmp:roll_name}).reset_index()

        df = df.merge(block_units_rolling, on=cols, how='left')
        df[roll_name].fillna(0,inplace=True)
        df[roll_name] = pd.to_numeric(df[roll_name], downcast='float')
        df.drop(columns=[roll_name_tmp], inplace=True)
        del block_units_rolling
        gc.collect()
    
    return df
    

training = add_rolls(training, ['item_id','date_block_num'], 'item_block_units')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_median')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_min')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_max')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_std')

training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_median')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_min')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_max')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_std')

training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_median')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_min')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_max')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_std')

training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_min')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_max')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_std')
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item')

item_block_units 3
item_block_mean 3
item_block_median 3
item_block_min 3
item_block_max 3
item_block_std 3
shop_block_units 3
shop_block_mean 3
shop_block_median 3
shop_block_min 3
shop_block_max 3
shop_block_std 3
cat_block_units 3
cat_block_mean 3
cat_block_median 3
cat_block_min 3
cat_block_max 3
cat_block_std 3
shop_cat_block_units 3
shop_cat_block_mean 3
shop_cat_block_median 3
shop_cat_block_min 3
shop_cat_block_max 3
shop_cat_block_std 3


In [18]:
training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')


shop_item_block_mean 3


In [19]:
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

training['block_total'] = training.groupby(['date_block_num'])['item_cnt_block'].transform(np.sum)

training['item_share_block'] = training['item_block_units'] * 100 / training['block_total']
training['shop_share_block'] = training['shop_block_units'] * 100 / training['block_total']
training['comp2'] = training['item_share_block'] * training['shop_share_block']

In [20]:
def add_lags(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()

        df = df.merge(result, on=cols, how='left')
        df[lag_name].fillna(0,inplace=True)
        if "units" in name:
            df[lag_name] = pd.to_numeric(df[lag_name].astype(int), downcast='unsigned')
        else:
            df[lag_name] = pd.to_numeric(df[lag_name], downcast='float')
        del result
        gc.collect()
    
    return df
                                         

                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_block_units')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_median')                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_block_min')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_max')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_std')

training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_median')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_min')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_max')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_std')

training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_median')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_min')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_max')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_std')

training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_min')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_max')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_std')

training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_median')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_min')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_max')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_std')

item_block_units 1
item_block_mean 1
item_block_median 1
item_block_min 1
item_block_max 1
item_block_std 1
shop_block_units 1
shop_block_mean 1
shop_block_median 1
shop_block_min 1
shop_block_max 1
shop_block_std 1
cat_block_units 1
cat_block_mean 1
cat_block_median 1
cat_block_min 1
cat_block_max 1
cat_block_std 1
shop_cat_block_units 1
shop_cat_block_mean 1
shop_cat_block_median 1
shop_cat_block_min 1
shop_cat_block_max 1
shop_cat_block_std 1
shop_item_block_units 1


KeyError: Index(['date_blocbk_num'], dtype='object')

In [21]:
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_median')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_min')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_max')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_std')

shop_item_block_units 1
shop_item_block_mean 1
shop_item_block_median 1
shop_item_block_min 1
shop_item_block_max 1
shop_item_block_std 1


In [22]:
training = add_lags(training, ['item_id','date_block_num'], 'item_share_block')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_share_block')
training = add_lags(training, ['shop_id', 'item_id', 'date_block_num'], 'comp2')

item_share_block 1
shop_share_block 1
comp2 1


In [23]:
total_sum_shops = training.groupby('shop_id')['item_cnt_block'].sum().sum()
training['shop_share'] = training.groupby('shop_id')['item_cnt_block'].transform(np.sum) *100 / total_sum_shops

total_sum_items = training.groupby('item_id')['item_cnt_block'].sum().sum()
training['item_share'] = training.groupby('item_id')['item_cnt_block'].transform(np.sum) *100 / total_sum_items

training['comp1'] = training['shop_share'] * training['item_share']

In [24]:
training['shop_block_units_lag_comp1'] = pd.to_numeric(training['shop_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

#training['shop_share_item_units_comp'] = training['item_units'] * training['shop_share_of_units']
training['item_block_units_lag_comp1'] = pd.to_numeric(training['item_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

In [25]:
training['blocks_without_sales'] = training['item_id'].map(training[training['item_cnt_block'] == 0].groupby(['item_id'])['date_block_num'].unique().apply(lambda x: len(x)))

In [26]:
sales_train['item_days_of_activity'] = pd.to_numeric(sales_train.groupby(['item_id'])['date'].transform("nunique"), downcast='unsigned') 
sales_train['item_blocks_of_activity'] = pd.to_numeric(sales_train.groupby(['item_id'])['date_block_num'].transform("nunique"), downcast='unsigned') 

def get_number_of_days_since_start(day,month, year):
    days = 0
    if year == 2015:
        days = 365
    def is_even(num):
        return num % 2 == 0
    half_of_month = int(month/2)
    even = (30*half_of_month) + (31*half_of_month)
    if is_even(month):
        days = days + even - 30 - day
    else:
        days = days + even + day
    return days

sales_train['item_days_since_start'] = pd.to_numeric(sales_train.apply(lambda row: get_number_of_days_since_start(row['day'],row['month'], row['year']),axis=1), downcast='unsigned') 

def get_average_days_between_sales(days):
    days = sorted(np.unique(days))
    if len(days) == 0:
        return 9999
    if len(days) == 1:
        return 999
    return np.mean(np.ediff1d(days)) / len(days)

average_days_between_sales = sales_train.groupby(['item_id'])['item_days_since_start'].apply(list).apply(lambda x: get_average_days_between_sales(x))

sales_train['item_mean_day_between_activity'] = pd.to_numeric(sales_train['item_id'].map(average_days_between_sales), downcast='unsigned')

training['item_mean_day_between_activity'] = training['item_id'].map(sales_train.drop_duplicates('item_id').set_index('item_id')['item_mean_day_between_activity'])

In [27]:

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_block,item_category_id,month,item_id_mean_encoding,shop_id_mean_encoding,item_category_id_mean_encoding,month_mean_encoding,date_block_num_mean_encoding,item_block_units,item_block_median,item_block_mean,item_block_max,item_block_min,item_block_std,shop_block_units,shop_block_median,shop_block_mean,shop_block_max,shop_block_min,shop_block_std,cat_block_units,cat_block_median,cat_block_mean,cat_block_max,cat_block_min,cat_block_std,shop_cat_block_units,shop_cat_block_median,shop_cat_block_mean,shop_cat_block_max,shop_cat_block_min,shop_cat_block_std,shop_item_block_units,shop_item_block_median,shop_item_block_mean,shop_item_block_max,shop_item_block_min,shop_item_block_std,item_units,cat_units,shop_units,item_share_of_total_units,category_share_of_total_units,shop_share_of_units,shop_item_units,shop_item_share_of_total_units,shop_item_share_of_shop_units,item_share_of_shop_units,shop_item_share_of_shop_units_mean,item_max_units_block,item_min_units_block,item_minmax_q0.25,item_minmax_q0.5,item_minmax_q0.75,shop_max_units_block,shop_min_units_block,shop_minmax_q0.25,shop_minmax_q0.5,shop_minmax_q0.75,cat_max_units_block,cat_min_units_block,cat_minmax_q0.25,cat_minmax_q0.5,cat_minmax_q0.75,shop_cat_units,shop_cat_max_units_block,shop_cat_min_units_block,shop_cat_minmax_q0.25,shop_cat_minmax_q0.5,shop_cat_minmax_q0.75,shop_item_max_units_block,shop_item_min_units_block,shop_item_minmax_q0.25,shop_item_minmax_q0.5,shop_item_minmax_q0.75,item_block_units_rolling_3,item_block_mean_rolling_3,item_block_median_rolling_3,item_block_min_rolling_3,item_block_max_rolling_3,item_block_std_rolling_3,shop_block_units_rolling_3,shop_block_mean_rolling_3,shop_block_median_rolling_3,shop_block_min_rolling_3,shop_block_max_rolling_3,shop_block_std_rolling_3,cat_block_units_rolling_3,cat_block_mean_rolling_3,cat_block_median_rolling_3,cat_block_min_rolling_3,cat_block_max_rolling_3,cat_block_std_rolling_3,shop_cat_block_units_rolling_3,shop_cat_block_mean_rolling_3,shop_cat_block_median_rolling_3,shop_cat_block_min_rolling_3,shop_cat_block_max_rolling_3,shop_cat_block_std_rolling_3,shop_item_block_mean_rolling_3,block_total,item_share_block,shop_share_block,comp2,item_block_units_lag_1,item_block_mean_lag_1,item_block_median_lag_1,item_block_min_lag_1,item_block_max_lag_1,item_block_std_lag_1,shop_block_units_lag_1,shop_block_mean_lag_1,shop_block_median_lag_1,shop_block_min_lag_1,shop_block_max_lag_1,shop_block_std_lag_1,cat_block_units_lag_1,cat_block_mean_lag_1,cat_block_median_lag_1,cat_block_min_lag_1,cat_block_max_lag_1,cat_block_std_lag_1,shop_cat_block_units_lag_1,shop_cat_block_mean_lag_1,shop_cat_block_median_lag_1,shop_cat_block_min_lag_1,shop_cat_block_max_lag_1,shop_cat_block_std_lag_1,shop_item_block_units_lag_1,shop_item_block_mean_lag_1,shop_item_block_median_lag_1,shop_item_block_min_lag_1,shop_item_block_max_lag_1,shop_item_block_std_lag_1,item_share_block_lag_1,shop_share_block_lag_1,comp2_lag_1,shop_share,item_share,comp1,shop_block_units_lag_comp1,item_block_units_lag_comp1,blocks_without_sales,item_mean_day_between_activity
1243271,10578,43,12,0,57,1,0.085308,0.514956,0.112724,0.567063,0.63953,6,0.0,0.130435,3,0,0.0,711,0.0,0.465924,20,0,0.0,473,0.0,0.144825,8,0,0.0,0,0.0,0.0,0,0,0.0,0,0.0,0,0,0,0.0,2059098,183422884,68308338,98.735298,557.388733,1215.967041,44763,2.14642,0.065531,0.065531,0.043519,10,1,3.25,5.5,7.75,2089,711,1055.5,1400.0,1744.5,656,302,390.5,479.0,567.5,3987454,2,0,0.5,1.0,1.5,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44763.0,0.013404,0.124299,0.001666,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.08355,0.00797,0.008636,0.0,0.0,22,0.090792
1829696,15287,58,17,0,63,6,1.277443,0.562055,0.440359,0.436443,0.532963,86,1.0,1.755102,8,0,0.0,1086,0.0,0.594417,20,0,0.0,735,0.0,0.405405,8,0,0.0,17,0.0,0.459459,2,0,0.0,0,0.0,0,0,0,0.0,2331175,276254895,86919525,111.781593,889.815247,48.918354,47575,2.281257,0.054735,0.054735,0.043519,137,13,44.0,75.0,106.0,3219,955,1521.0,2087.0,2653.0,3069,728,1313.25,1898.5,2483.75,5637855,118,6,34.0,62.0,90.0,4,0,1.0,2.0,3.0,105.333336,2.169076,1.333333,0.0,10.666667,0.0,1095.666626,0.651522,0.0,0.0,20.0,0.0,854.0,0.488108,0.0,0.0,12.0,0.0,16.0,0.445395,0.0,0.0,2.333333,0.0,0.666667,47575.0,0.180767,0.905181,0.163627,92,1.877551,1.0,0.0,10.0,0.0,955,0.546964,0.0,0.0,20.0,0.0,785,0.457726,0.0,0.0,10.0,0.0,14,0.4,0.0,0.0,2.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.207385,0.675443,0.140077,2.549017,0.108873,0.277518,52.271484,5.035578,22,0.006791
614170,4907,31,24,7,23,1,1.315175,1.701997,0.834251,0.567162,0.530636,89,1.0,1.78,8,0,0.0,5239,1.0,1.931073,20,0,0.0,4426,0.0,0.972747,20,0,0.0,214,2.0,2.351648,20,0,0.0,7,7.0,7,7,7,0.0,3586650,490180900,194611629,171.982574,850.382507,1093.897461,71733,3.439651,0.03686,0.03686,0.044209,119,4,32.75,61.5,90.25,8065,3047,4301.5,5556.0,6810.5,7475,1923,3311.0,4699.0,6087.0,9803618,355,97,161.5,226.0,290.5,7,0,1.75,3.5,5.25,77.666664,1.537436,1.333333,0.0,7.333333,0.0,5706.0,2.250844,1.0,0.0,20.0,0.0,4708.333496,1.086746,0.0,0.0,20.0,0.0,243.0,2.830835,1.666667,0.0,18.666666,0.0,4.666667,71733.0,0.124071,0.908201,0.112682,119,2.38,2.0,0.0,12.0,0.0,8065,2.968348,1.0,0.0,20.0,0.0,7475,1.625,0.0,0.0,20.0,0.0,355,3.858696,2.0,0.0,20.0,0.0,6,6.0,6.0,6.0,6.0,0.0,0.112487,0.189697,0.021338,7.735218,0.106654,0.824993,297.2724,4.386288,21,0.00695
748108,6005,47,25,1,55,2,0.484887,0.54328,0.313385,0.511585,0.455522,17,0.0,0.361702,1,0,0.0,1595,0.0,0.579789,20,0,0.0,6820,0.0,0.294932,15,0,0.0,142,0.0,0.288618,3,0,0.0,1,1.0,1,1,1,0.0,2779674,396919512,162699642,133.28746,497.367035,1623.162842,59142,2.835903,0.03635,0.03635,0.044209,37,3,11.5,20.0,28.5,2406,803,1203.75,1604.5,2005.25,11613,4757,6471.0,8185.0,9899.0,8445096,229,66,106.75,147.5,188.25,2,0,0.5,1.0,1.5,29.666666,0.593333,0.0,0.0,3.333333,0.0,1917.666626,0.725785,0.0,0.0,20.0,0.0,9203.666992,0.383218,0.0,0.0,20.0,0.0,189.666672,0.39513,0.0,0.0,9.333333,0.0,0.666667,59142.0,0.028744,0.480674,0.013817,29,0.58,0.0,0.0,2.0,0.0,1722,0.634722,0.0,0.0,20.0,0.0,7942,0.325492,0.0,0.0,20.0,0.0,166,0.340164,0.0,0.0,8.0,0.0,1,1.0,1.0,1.0,1.0,0.0,0.040428,0.573348,0.023179,2.484515,0.040098,0.099624,62.595421,1.054162,21,0.011389
1498830,12686,24,18,1,40,7,0.331586,0.419158,0.406684,0.412798,0.488332,21,0.0,0.42,2,0,0.0,901,0.0,0.467566,20,0,0.0,4800,0.0,0.393443,20,0,0.0,70,0.0,0.286885,5,0,0.0,1,1.0,1,1,1,0.0,2371450,309332150,91395683,113.712814,416.425415,263.553497,47429,2.274256,0.051894,0.051894,0.041799,41,1,11.0,21.0,31.0,1971,794,1088.25,1382.5,1676.75,9071,3834,5143.25,6452.5,7761.75,6186643,141,32,59.25,86.5,113.75,1,0,0.25,0.5,0.75,4.0,0.082076,0.0,0.0,1.333333,0.0,871.0,0.495112,0.0,0.0,20.0,0.0,4835.0,0.501314,0.0,0.0,20.0,0.0,82.666664,0.420462,0.0,0.0,11.333333,0.0,0.0,47429.0,0.044277,0.517911,0.022931,9,0.183673,0.0,0.0,2.0,0.0,941,0.515052,0.0,0.0,20.0,0.0,4806,0.492873,0.0,0.0,20.0,0.0,83,0.417085,0.0,0.0,9.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.018917,0.600399,0.011358,1.922239,0.024157,0.046436,48.832382,0.467047,20,0.023014
515593,4244,46,30,1,23,7,2.401949,0.495851,0.834251,0.412718,0.358381,77,1.0,1.790698,13,0,0.0,1438,0.0,0.434179,20,0,0.0,2598,0.0,0.525379,18,0,0.0,69,0.0,0.6,8,0,0.0,1,1.0,1,1,1,0.0,2212522,407190478,170415648,106.092094,989.867615,1993.151123,51454,2.467258,0.030193,0.030193,0.043519,384,17,108.75,200.5,292.25,2273,858,1211.75,1565.5,1919.25,7475,1923,3311.0,4699.0,6087.0,9469546,206,58,95.0,132.0,169.0,7,0,1.75,3.5,5.25,27.666666,0.624582,0.333333,0.0,4.0,0.0,1171.333374,0.383704,0.0,0.0,20.0,0.0,2308.666748,0.493165,0.0,0.0,16.0,0.0,73.0,0.693355,0.0,0.0,7.0,0.0,0.666667,51454.0,0.149648,0.247367,0.037018,42,0.976744,1.0,0.0,6.0,0.0,1295,0.412552,0.0,0.0,20.0,0.0,2724,0.621067,0.0,0.0,19.0,0.0,96,0.941176,0.0,0.0,10.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.082956,1.263387,0.104806,2.270878,0.214376,0.486823,39.100243,1.268116,22,0.006059
1779426,15063,26,27,0,30,4,1.315101,0.395247,1.520122,0.403327,0.360533,27,0.0,0.574468,4,0,0.0,1088,0.0,0.366948,20,0,0.0,6827,0.0,1.200457,20,0,0.0,181,0.0,1.495868,20,0,0.0,0,0.0,0,0,0,0.0,2367014,412107515,149323330,113.500099,1225.643188,981.758606,50362,2.414896,0.033727,0.033727,0.043519,159,4,42.75,81.5,120.25,1984,616,958.0,1300.0,1642.0,10108,3862,5423.5,6985.0,8546.5,8768245,263,84,128.75,173.5,218.25,4,0,1.0,2.0,3.0,63.666668,1.323867,1.0,0.0,7.0,0.0,1088.666626,0.3905,0.0,0.0,20.0,0.0,7300.333496,1.348205,0.333333,0.0,20.0,0.0,126.0,1.109931,1.0,0.0,13.333333,0.0,0.333333,50362.0,0.053612,0.85906,0.046056,49,1.065217,1.0,0.0,5.0,0.0,1110,0.381706,0.0,0.0,20.0,0.0,6558,1.218506,0.0,0.0,20.0,0.0,124,1.059829,1.0,0.0,11.0,0.0,1,1.0,1.0,1.0,1.0,0.0,0.083525,0.774977,0.06473,1.807697,0.112735,0.20379,37.43676,1.652614,22,0.007356
1759156,14846,28,17,12,65,6,1.067692,1.111086,0.772835,0.436443,0.532963,181,1.0,3.693877,20,0,0.0,2644,0.0,1.447181,20,0,0.0,1109,0.0,0.984028,20,0,0.0,71,2.0,3.086957,12,0,0.0,12,12.0,12,12,12,0.0,2331175,276254895,86919525,111.781593,889.815247,48.918354,47575,2.281257,0.054735,0.054735,0.038234,181,3,47.5,92.0,136.5,4524,1882,2542.5,3203.0,3863.5,3557,882,1550.75,2219.5,2888.25,5637855,209,34,77.75,121.5,165.25,15,0,3.75,7.5,11.25,0.0,0.0,0.0,0.0,0.0,0.0,2150.0,1.276341,0.0,0.0,20.0,0.0,1008.333313,0.956137,0.0,0.0,20.0,0.0,45.0,2.069986,1.0,0.0,9.0,0.0,0.0,47575.0,0.380452,0.04742,0.018041,0,0.0,0.0,0.0,0.0,0.0,2160,1.237113,0.0,0.0,20.0,0.0,936,0.868275,0.0,0.0,20.0,0.0,49,2.227273,1.0,0.0,13.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.437131,0.0,5.070755,0.070336,0.356656,118.226608,0.0,17,0.011801
367057,3280,19,32,0,55,9,0.167658,0.474546,0.314227,0.394956,0.344902,3,0.0,0.069767,1,0,0.0,1127,0.0,0.310212,20,0,0.0,4757,0.0,0.192062,10,0,0.0,124,0.0,0.215278,4,0,0.0,0,0.0,0,0,0,0.0,2293921,415853516,193809651,109.995239,1405.266724,1055.442017,53347,2.558029,0.027525,0.027525,0.043519,13,1,4.0,7.0,10.0,2312,808,1184.0,1560.0,1936.0,11613,4757,6471.0,8185.0,9899.0,9671012,350,97,160.25,223.5,286.75,1,0,0.25,0.5,0.75,4.0,0.093577,0.0,0.0,1.666667,0.0,1238.0,0.374171,0.0,0.0,20.0,0.0,5802.333496,0.247697,0.0,0.0,15.666667,0.0,142.666672,0.259188,0.0,0.0,5.666667,0.0,0.0,53347.0,0.005624,0.884098,0.004972,3,0.071429,0.0,0.0,2.0,0.0,1378,0.398727,0.0,0.0,20.0,0.0,5742,0.245448,0.0,0.0,13.0,0.0,170,0.305206,0.0,0.0,5.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.005309,0.119063,0.000632,2.158226,0.014379,0.031034,37.930084,0.082576,22,0.034826
865731,7006,12,20,0,20,9,0.985677,0.352225,1.769135,0.393589,0.470466,2,0.0,0.04,1,0,0.0,690,0.0,0.325318,20,0,0.0,4646,1.0,3.318571,20,0,0.0,96,0.0,3.428571,20,0,0.0,0,0.0,0,0,0,0.0,2504800,300603750,106253616,120.107048,2057.361084,976.002563,50096,2.402141,0.047148,0.047148,0.041472,374,2,95.0,188.0,281.0,1783,453,785.5,1118.0,1450.5,8130,1407,3087.75,4768.5,6449.25,6012075,256,17,76.75,136.5,196.25,20,0,5.0,10.0,15.0,29.0,0.580656,0.333333,0.0,3.0,0.0,508.333344,0.264729,0.0,0.0,20.0,0.0,2974.0,2.817565,1.0,0.0,20.0,0.0,44.666668,2.120155,0.666667,0.0,20.0,0.0,0.0,50096.0,0.003992,0.069147,0.000276,21,0.411765,0.0,0.0,3.0,0.0,528,0.262687,0.0,0.0,20.0,0.0,3551,3.027281,1.0,0.0,20.0,0.0,57,2.478261,1.0,0.0,20.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.036807,0.925423,0.034062,1.598251,0.071897,0.11491,24.893917,0.990099,20,0.011615


In [28]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'item_id_mean_encoding',
       'shop_id_mean_encoding', 'item_category_id_mean_encoding',
       'month_mean_encoding', 'date_block_num_mean_encoding',
       'item_block_units', 'item_block_median', 'item_block_mean',
       'item_block_max', 'item_block_min', 'item_block_std',
       'shop_block_units', 'shop_block_median', 'shop_block_mean',
       'shop_block_max', 'shop_block_min', 'shop_block_std',
       'cat_block_units', 'cat_block_median', 'cat_block_mean',
       'cat_block_max', 'cat_block_min', 'cat_block_std',
       'shop_cat_block_units', 'shop_cat_block_median',
       'shop_cat_block_mean', 'shop_cat_block_max', 'shop_cat_block_min',
       'shop_cat_block_std', 'shop_item_block_units',
       'shop_item_block_median', 'shop_item_block_mean',
       'shop_item_block_max', 'shop_item_block_min',
       'shop_item_block_std', 'item_units', 'cat_units', 'shop_units',
   

In [29]:
gc.collect()

ZEROS_KEEP=0.2


#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['item_cnt_block']





x_val = training[training['date_block_num'] == 33]
y_val = x_val['item_cnt_block']

pos_val_len = len(y_val[y_val != 0])
print("pos_val_len", pos_val_len)

zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
print("zeros_keep_indices_val", len(zeros_keep_indices_val))
non_zeros_val_indices = y_val[y_val != 0].index
print("non_zeros_val_indices", len(non_zeros_val_indices))

val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

y_val = y_val.loc[val_indices]
x_val = x_val.loc[val_indices]



pos_val_len 29202
zeros_keep_indices_val 146010
non_zeros_val_indices 29202


In [53]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'item_id_mean_encoding',
       'shop_id_mean_encoding', 'item_category_id_mean_encoding',
       'month_mean_encoding', 'date_block_num_mean_encoding',
       'item_block_units', 'item_block_median', 'item_block_mean',
       'item_block_max', 'item_block_min', 'item_block_std',
       'shop_block_units', 'shop_block_median', 'shop_block_mean',
       'shop_block_max', 'shop_block_min', 'shop_block_std',
       'cat_block_units', 'cat_block_median', 'cat_block_mean',
       'cat_block_max', 'cat_block_min', 'cat_block_std',
       'shop_cat_block_units', 'shop_cat_block_median',
       'shop_cat_block_mean', 'shop_cat_block_max', 'shop_cat_block_min',
       'shop_cat_block_std', 'shop_item_block_units',
       'shop_item_block_median', 'shop_item_block_mean',
       'shop_item_block_max', 'shop_item_block_min',
       'shop_item_block_std', 'item_units', 'cat_units', 'shop_units',
   

In [54]:
features = [
    
'item_id', 'shop_id', 'date_block_num', 
       'item_category_id', 'month', 
    
    'item_id_mean_encoding',
       'shop_id_mean_encoding', 
    'item_category_id_mean_encoding',
       'month_mean_encoding', 
    'date_block_num_mean_encoding',
    

       'item_block_units_rolling_3', 'item_block_mean_rolling_3',
       'item_block_median_rolling_3', 'item_block_min_rolling_3',
       'item_block_max_rolling_3', 'item_block_std_rolling_3',
    
       'shop_block_units_rolling_3', 'shop_block_mean_rolling_3',
       'shop_block_median_rolling_3', 'shop_block_min_rolling_3',
       'shop_block_max_rolling_3', 'shop_block_std_rolling_3',
    
       'cat_block_units_rolling_3', 'cat_block_mean_rolling_3',
       'cat_block_median_rolling_3', 'cat_block_min_rolling_3',
       'cat_block_max_rolling_3', 'cat_block_std_rolling_3',
    
       'shop_cat_block_units_rolling_3', 'shop_cat_block_mean_rolling_3',
       'shop_cat_block_median_rolling_3', 'shop_cat_block_min_rolling_3',
       'shop_cat_block_max_rolling_3', 'shop_cat_block_std_rolling_3',
    
       'shop_item_block_mean_rolling_3',
    
       'item_block_units_lag_1', 'item_block_mean_lag_1',
       'item_block_median_lag_1', 'item_block_min_lag_1',
       'item_block_max_lag_1', 'item_block_std_lag_1',
    
       'shop_block_units_lag_1', 'shop_block_mean_lag_1',
       'shop_block_median_lag_1', 'shop_block_min_lag_1',
       'shop_block_max_lag_1', 'shop_block_std_lag_1',
    
       'cat_block_units_lag_1', 'cat_block_mean_lag_1',
       'cat_block_median_lag_1', 'cat_block_min_lag_1',
       'cat_block_max_lag_1', 'cat_block_std_lag_1',
    
       'shop_cat_block_units_lag_1', 'shop_cat_block_mean_lag_1',
       'shop_cat_block_median_lag_1', 'shop_cat_block_min_lag_1',
       'shop_cat_block_max_lag_1', 'shop_cat_block_std_lag_1',
       'shop_item_block_units_lag_1', 'shop_item_block_mean_lag_1',
       'shop_item_block_median_lag_1', 'shop_item_block_min_lag_1',
       'shop_item_block_max_lag_1', 'shop_item_block_std_lag_1',
    
       'item_share_block_lag_1', 'shop_share_block_lag_1', 'comp2_lag_1',
       'shop_share', 'item_share', 
   'shop_block_units_lag_comp1',
       'item_block_units_lag_comp1', 'blocks_without_sales',
       'item_mean_day_between_activity'


]

In [89]:

features = [
    
    
        'item_category_id',
       'item_block_mean_rolling_3',

       'shop_block_mean_rolling_3',

           'shop_cat_block_mean_rolling_3',
               'shop_cat_block_median_rolling_3',

      'item_block_mean_lag_1',

        'shop_block_mean_lag_1',

            'shop_cat_block_mean_lag_1',
               # 'shop_cat_block_median_lag_1',


    
    'shop_item_share_of_shop_units_mean',
    'shop_item_block_mean_rolling_3',

    'shop_item_block_mean_lag_1',

    
#'item_id_mean_encoding',
       #'shop_id_mean_encoding',
    'item_category_id_mean_encoding',  
    #'month_mean_encoding', 'date_block_num_mean_encoding'
    
    'shop_share',
    
    #'item_mean_day_between_activity',
    #'comp1'

]




In [94]:
features = ['item_id',
 'shop_id',
 'date_block_num',
 'item_id_mean_encoding',
 'item_block_mean_rolling_3',
 'item_block_mean_lag_1',
 'item_block_max_lag_1',
 'shop_cat_block_mean_lag_1',
 'shop_cat_block_median_lag_1',
]

In [61]:
features

['item_id',
 'shop_id',
 'date_block_num',
 'item_id_mean_encoding',
 'item_block_mean_rolling_3',
 'item_block_mean_lag_1',
 'item_block_max_lag_1',
 'shop_cat_block_mean_lag_1',
 'shop_cat_block_median_lag_1',
 'shop_item_block_units_lag_1',
 'shop_item_block_mean_lag_1',
 'shop_item_block_median_lag_1',
 'shop_item_block_min_lag_1',
 'shop_item_block_max_lag_1']

In [115]:
cb_model = CatBoostRegressor(iterations=1000,
                             #learning_rate=0.05, #default is 0.03
                             objective='RMSE',
                             eval_metric='RMSE',
                             task_type = "GPU",
                             use_best_model=True,
                             early_stopping_rounds = 3,
                             border_count=254,
                             #bagging_temperature = 5, #default 1 intensity of bootstrap
                             #l2_leaf_reg = 300, #default 3 seems useless
                             #random_strength = 4,#default 1  adds randomness to the split score
                             #depth=4,  #default 6
                             #rsm=0.7, #feature %age default 1 no GPU
                             cat_features=[0,1,2],
                             #cat_features=[0],
                             random_seed = 42)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


cb_model.fit(x_train[features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

scores = {}
for i,score in enumerate(cb_model.get_feature_importance()):
    scores[features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

0:	learn: 1.6455941	test: 1.3243057	best: 1.3243057 (0)	total: 307ms	remaining: 5m 6s
1:	learn: 1.6235147	test: 1.3040336	best: 1.3040336 (1)	total: 559ms	remaining: 4m 38s
2:	learn: 1.6029963	test: 1.2924308	best: 1.2924308 (2)	total: 838ms	remaining: 4m 38s
3:	learn: 1.5827702	test: 1.2741803	best: 1.2741803 (3)	total: 1.09s	remaining: 4m 31s
4:	learn: 1.5635732	test: 1.2609721	best: 1.2609721 (4)	total: 1.36s	remaining: 4m 30s
5:	learn: 1.5449793	test: 1.2472201	best: 1.2472201 (5)	total: 1.64s	remaining: 4m 31s
6:	learn: 1.5267673	test: 1.2350562	best: 1.2350562 (6)	total: 1.89s	remaining: 4m 28s
7:	learn: 1.5097894	test: 1.2204463	best: 1.2204463 (7)	total: 2.19s	remaining: 4m 31s
8:	learn: 1.4930030	test: 1.2084267	best: 1.2084267 (8)	total: 2.5s	remaining: 4m 35s
9:	learn: 1.4767318	test: 1.1974557	best: 1.1974557 (9)	total: 2.67s	remaining: 4m 24s
10:	learn: 1.4613408	test: 1.1862020	best: 1.1862020 (10)	total: 2.83s	remaining: 4m 14s
11:	learn: 1.4459298	test: 1.1754934	best: 

[('item_id_mean_encoding', 28.222244932583557),
 ('shop_cat_block_mean_lag_1', 17.214229566790102),
 ('shop_id', 11.966290369157145),
 ('item_block_mean_rolling_3', 11.487908383286117),
 ('item_block_mean_lag_1', 9.506315231173872),
 ('item_block_max_lag_1', 6.910411267055416),
 ('item_id', 5.890034339294305),
 ('shop_cat_block_median_lag_1', 5.2236932089301735),
 ('date_block_num', 3.57887270172931)]

In [60]:
features = [item[0] for item in scores.items() if item[1] > 2]

In [96]:
#test            = pd.read_csv('test.csv.gz')
test = test.set_index('item_id').join(items.set_index('item_id'))
test.reset_index(inplace=True)

In [97]:
item_features = [ 
    'shop_item_share_of_shop_units_mean','item_id_mean_encoding'
                ]

merge_col = ['item_id']
cols=item_features+merge_col

test = test.merge(training.drop_duplicates('item_id')[cols], on=merge_col, how='left')

In [98]:
shop_features = [
        'shop_id_mean_encoding','shop_share'
]

merge_col = ['shop_id']
cols=shop_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [99]:
cat_features = [
        'item_category_id_mean_encoding'#,'cat_me_real'
]

merge_col = ['item_category_id']
cols=cat_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [100]:
shop_item_features = [
        'shop_item_share_of_shop_units_mean'#,'cat_me_real'
]

merge_col = ['shop_id','item_id']
cols=shop_item_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [101]:
def add_rolls_test(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name})\
            [cols+[roll_name]]
        
        print([cols[0:len(cols)-1]+[roll_name]])
        thirty_three = block_units_rolling_temp[block_units_rolling_temp['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1]+[roll_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')
    

        del block_units_rolling_temp
        gc.collect()
        

    
    return df
    

test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_mean')
#test = add_rolls_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
#test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
#test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')





item_block_mean 3
[['item_id', 'item_block_mean_rolling_3']]


In [203]:
test = add_rolls_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3
[['shop_id', 'item_id', 'shop_item_block_mean_rolling_3']]


In [103]:
def add_lags_test(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()
        
        thirty_three = result[result['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1] + [lag_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')

        gc.collect()
    
    return df
                                         

                                        
test = add_lags_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_lags_test(test, ['item_id','date_block_num'], 'item_block_max')
#est = add_lags_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_lags_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
test = add_lags_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')





item_block_mean 1
item_block_max 1
shop_cat_block_mean 1
shop_cat_block_median 1


In [205]:
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 1


In [107]:
test['date_block_num'] = 34

In [104]:
test.fillna(0, inplace=True)

In [105]:
test.sample(10)

Unnamed: 0,item_id,ID,shop_id,item_category_id,shop_item_share_of_shop_units_mean_x,item_id_mean_encoding,shop_id_mean_encoding,shop_share,item_category_id_mean_encoding,shop_item_share_of_shop_units_mean_y,item_block_mean_rolling_3,item_block_mean_lag_1,item_block_max_lag_1,shop_cat_block_mean_lag_1,shop_cat_block_median_lag_1
38022,3935,64907,22,23,0.043519,0.619883,0.369184,1.687157,0.835511,0.043519,0.202884,0.209302,2.0,0.229508,0.0
30871,3351,5159,4,20,0.026277,12.507042,0.29307,1.32496,1.803645,0.026277,12.729123,17.16279,20.0,0.366337,0.0
33881,3579,150005,59,55,0.0,0.0,0.320352,1.463167,0.312997,0.0,0.0,0.0,0.0,0.104167,0.0
108968,11382,104389,42,43,0.044094,0.076923,0.919194,4.181697,0.102186,0.044094,0.038953,0.046512,1.0,0.0,0.0
96489,10203,78139,15,20,0.0,0.0,0.445535,2.035878,1.803645,0.0,0.0,0.0,0.0,1.29703,0.0
61172,6175,106361,42,31,0.039423,0.034711,0.919194,4.181697,0.076758,0.039423,0.031016,0.046512,2.0,0.0,0.0
101853,10661,18302,3,67,0.043519,0.302774,0.226619,1.028662,0.242629,0.043519,0.226333,0.209302,1.0,0.0,0.0
58276,5905,113193,49,30,0.025056,1.34375,0.219211,0.999492,1.51438,0.025056,0.0,0.0,0.0,0.319672,0.0
28491,3150,81228,15,76,0.041862,0.034417,0.445535,2.035878,0.07936,0.041862,0.023264,0.023256,1.0,0.0,0.0
159567,16037,49278,31,65,0.034742,0.484234,1.701997,7.735218,0.776279,0.034742,0.0,0.0,0.0,2.103896,1.0


In [116]:
cb_preds = cb_model.predict(test[features])
cb_preds.clip(0,20,out=cb_preds)

array([0.10450991, 0.09955971, 0.15285815, ..., 0.28464604, 0.30054303,
       0.28464604])

In [117]:
print(np.mean(cb_preds))
print(np.max(cb_preds))

0.39439764904288827
15.855151161027607


In [None]:
cb_preds[0:100]

In [118]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = cb_preds

submission.to_csv('submission.csv', index=False)

In [None]:
training['shop_me_real']= training.groupby('shop_id')['shop_me'].transform(np.mean)
training['item_me_real']= training.groupby('item_id')['item_me'].transform(np.mean)
training['cat_me_real']= training.groupby('item_category_id')['item_me'].transform(np.mean)

In [None]:
training.groupby('item_id')['shop_item_share_of_shop_units'].transform(np.mean)

