In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd
from sklearn.model_selection import KFold
from itertools import product

In [153]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [4]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'] != 2013]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [154]:
# Якутск Орджоникидзе, 56
sales_train.loc[sales_train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
sales_train.loc[sales_train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
sales_train.loc[sales_train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

In [6]:
sums = sales_train.groupby('item_id')['item_cnt_day'].sum().reset_index().rename(columns={"item_cnt_day":"item_total_sales"}).sort_values(by='item_total_sales')

ids_reject = sums[(sums['item_total_sales'] > 0) & (sums['item_total_sales'] < 1000)]['item_id'].unique()

In [7]:
train_item_ids = sales_train['item_id'].unique()
train_item_ids = np.setdiff1d(train_item_ids, ids_reject)
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [8]:
combinations = []

for dbn in range(np.min(train_blocks), np.max(train_blocks)+1):
    sales = sales_train[sales_train.date_block_num==dbn]
    item_ids = np.intersect1d(sales.item_id.unique(), test_item_ids)
    dbn_combos = list(product(sales.shop_id.unique(), item_ids, [dbn]))
    for combo in dbn_combos:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])

In [9]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"item_cnt_block"})

training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)


training['item_cnt_block'] = training['item_cnt_block'].clip(0,20).astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

for col in ['item_id', 'shop_id', 'item_category_id']:
    training[col] = pd.to_numeric(training[col], downcast='unsigned')

In [10]:
dates = sales_train[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
training['month'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')


In [11]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
#columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]
columns = ["item_id", "shop_id", "item_category_id", "month",  "date_block_num"]



y_train = training["item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1

fold 1
fold 2
fold 3
fold 4
fold 5


In [12]:
cols = ['item_id','date_block_num']

training.groupby(cols,as_index=False)['item_cnt_block'].mean().sample(10)

Unnamed: 0,item_id,date_block_num,item_cnt_block
4104,1857,17,1.938776
17794,6689,21,0.326923
29018,11355,33,0.022727
39284,15299,15,0.44898
5612,2574,25,0.553191
34552,13573,27,0.021277
11241,4336,22,0.84
49209,19127,17,0.020408
25528,10237,33,0.068182
23672,9403,24,0.84


In [13]:
def add_block_units_stats(df, cols, name):
    print(name)
    name_units = name + '_units'
    name_mean = name + '_mean'
    name_median = name + '_median'
    name_max = name + '_max'
    name_min = name + '_min'
    name_std = name + '_std'
    
    
    try:
        df.drop(columns=[name_units, name_mean, name_median],inplace=True)
    except:
        pass

    
    block_units = df.groupby(cols,as_index=False)['item_cnt_block'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_units})
    df = df.merge(block_units, on=cols, how='left')
    df[name_units].fillna(0,inplace=True)
    df[name_units] = pd.to_numeric(df[name_units].astype(int),downcast='unsigned')
    del block_units
    
    block_units_med = df.groupby(cols,as_index=False)['item_cnt_block'].median()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_median})
    df = df.merge(block_units_med, on=cols, how='left')
    df[name_median].fillna(0,inplace=True)
    df[name_median] = pd.to_numeric(df[name_median].astype(int),downcast='float')
    del block_units_med
    
    block_means = df.groupby(cols,as_index=False)['item_cnt_block'].mean()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_mean})
    df = df.merge(block_means, on=cols, how='left')
    df[name_mean].fillna(0,inplace=True)
    df[name_mean] = pd.to_numeric(df[name_mean],downcast='float')
    del block_means
    
    block_max = df.groupby(cols,as_index=False)['item_cnt_block'].max()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_max})
    df = df.merge(block_max, on=cols, how='left')
    df[name_max].fillna(0,inplace=True)
    df[name_max] = pd.to_numeric(df[name_max],downcast='float')
    del block_max
    
    block_min = df.groupby(cols,as_index=False)['item_cnt_block'].min()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_min})
    df = df.merge(block_min, on=cols, how='left')
    df[name_min].fillna(0,inplace=True)
    df[name_min] = pd.to_numeric(df[name_min],downcast='float')
    del block_min
    
    block_std = df.groupby(cols,as_index=False)['item_cnt_block'].std()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_std})
    df = df.merge(block_std, on=cols, how='left')
    df[name_std].fillna(0,inplace=True)
    df[name_std] = pd.to_numeric(df[name_std],downcast='float')
    del block_std
    
    gc.collect()
    return df


training = add_block_units_stats(training, ['item_id','date_block_num'], 'item_block')
training = add_block_units_stats(training, ['shop_id','date_block_num'], 'shop_block')
training = add_block_units_stats(training, ['item_category_id','date_block_num'], 'cat_block')
training = add_block_units_stats(training, ['shop_id', 'item_category_id','date_block_num'], 'shop_cat_block')
training = add_block_units_stats(training, ['shop_id', 'item_id','date_block_num'], 'shop_item_block')

item_block




shop_block
cat_block
shop_cat_block
shop_item_block


In [14]:
number_of_items = sales_train['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = sales_train['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = sales_train['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = sales_train['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = sales_train['item_cnt_day'].sum()
print("total_sales:", total_sales)
average_price = sales_train['item_price'].mean()
print("average_price:", average_price)

training['item_units'] = pd.to_numeric(training.groupby(['date_block_num'])['item_block_units'].transform(np.sum),downcast='unsigned')
training['cat_units'] = pd.to_numeric(training.groupby(['date_block_num'])['cat_block_units'].transform(np.sum),downcast='unsigned')
training['shop_units'] = pd.to_numeric(training.groupby(['date_block_num'])['shop_block_units'].transform(np.sum),downcast='unsigned')

training['item_share_of_total_units'] = pd.to_numeric(training['item_units'] * 100 / total_sales,downcast='float')
training['category_share_of_total_units'] = pd.to_numeric(training['cat_units'] * 100 / total_sales,downcast='float')
training['shop_share_of_units'] = pd.to_numeric(training['shop_units'] * 100 / total_sales,downcast='float')
training['shop_item_units'] = pd.to_numeric(training.groupby(['date_block_num'])\
                                            ['shop_item_block_units'].transform(np.sum),downcast='unsigned')

training['shop_item_share_of_total_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / total_sales,downcast='float')
training['shop_item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / training['shop_units'],downcast='float')


training['item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100 / training['shop_units'],downcast='float')

training['shop_item_share_of_shop_units_mean'] = training.groupby('item_id')['shop_item_share_of_shop_units'].transform(np.mean)


number_of_items: 17054
number_of_categories: 79
number_of_shops: 54
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473.0
average_price: 1015.5023073772021


In [15]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
#columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]
columns = ["item_id", "shop_id", "item_category_id", "month",  "date_block_num"]



y_train = training["item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1


training.fillna(0,inplace=True)

fold 1
fold 2
fold 3
fold 4
fold 5


In [16]:
def add_min_max_quantiles(df, cols, name):
    print(name)

    block_name = name+'_block_units'
    units_name = name+'_units'
    max_name = name+'_max_units_block'
    min_name = name+'_min_units_block'
    
    try:
        df.drop(columns=[units_name, max_name, min_name, min_max_name],inplace=True)
    except:
        pass


    df[units_name] = pd.to_numeric(df.groupby(['date_block_num'])[block_name].transform(np.sum), downcast='unsigned')
    df[max_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.max), downcast='unsigned')
    df[min_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.min), downcast='unsigned')
    


    for q in [0.25,0.50,0.75]:
        qname = name+'_minmax_q' + str(q)
        try:
            df.drop(columns=[qname],inplace=True)
        except:
            pass
        df[qname] =  pd.to_numeric(df[[min_name,max_name]].quantile(q,axis=1), downcast='unsigned')
        
    return df

training = add_min_max_quantiles(training, ['item_id'], 'item')
training = add_min_max_quantiles(training, ['shop_id'], 'shop')
training = add_min_max_quantiles(training, ['item_category_id'], 'cat')
training = add_min_max_quantiles(training, ['shop_id','item_category_id'], 'shop_cat')
training = add_min_max_quantiles(training, ['shop_id','item_id'], 'shop_item')

item
shop
cat
shop_cat
shop_item


In [17]:
def add_rolls(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name_tmp})\
            [cols+[roll_name_tmp]]
        
    
        df = df.merge(block_units_rolling_temp, on=cols, how='left')
        #print(df.columns.values)
        del block_units_rolling_temp
        gc.collect()
        

        block_units_rolling = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [roll_name_tmp].shift(1)\
            .rename(columns={roll_name_tmp:roll_name}).reset_index()

        df = df.merge(block_units_rolling, on=cols, how='left')
        df[roll_name].fillna(0,inplace=True)
        df[roll_name] = pd.to_numeric(df[roll_name], downcast='float')
        df.drop(columns=[roll_name_tmp], inplace=True)
        del block_units_rolling
        gc.collect()
    
    return df
    

training = add_rolls(training, ['item_id','date_block_num'], 'item_block_units')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_median')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_min')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_max')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_std')

training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_median')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_min')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_max')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_std')

training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_median')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_min')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_max')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_std')

training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_min')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_max')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_std')
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item')

item_block_units 3
item_block_mean 3
item_block_median 3
item_block_min 3
item_block_max 3
item_block_std 3
shop_block_units 3
shop_block_mean 3
shop_block_median 3
shop_block_min 3
shop_block_max 3
shop_block_std 3
cat_block_units 3
cat_block_mean 3
cat_block_median 3
cat_block_min 3
cat_block_max 3
cat_block_std 3
shop_cat_block_units 3
shop_cat_block_mean 3
shop_cat_block_median 3
shop_cat_block_min 3
shop_cat_block_max 3
shop_cat_block_std 3


In [18]:
training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')


shop_item_block_mean 3


In [19]:
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

training['block_total'] = training.groupby(['date_block_num'])['item_cnt_block'].transform(np.sum)

training['item_share_block'] = training['item_block_units'] * 100 / training['block_total']
training['shop_share_block'] = training['shop_block_units'] * 100 / training['block_total']
training['comp2'] = training['item_share_block'] * training['shop_share_block']

In [20]:
def add_lags(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()

        df = df.merge(result, on=cols, how='left')
        df[lag_name].fillna(0,inplace=True)
        if "units" in name:
            df[lag_name] = pd.to_numeric(df[lag_name].astype(int), downcast='unsigned')
        else:
            df[lag_name] = pd.to_numeric(df[lag_name], downcast='float')
        del result
        gc.collect()
    
    return df
                                         

                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_block_units')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_median')                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_block_min')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_max')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_std')

training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_median')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_min')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_max')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_std')

training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_median')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_min')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_max')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_std')

training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_min')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_max')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_std')

training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_median')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_min')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_max')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_std')

item_block_units 1
item_block_mean 1
item_block_median 1
item_block_min 1
item_block_max 1
item_block_std 1
shop_block_units 1
shop_block_mean 1
shop_block_median 1
shop_block_min 1
shop_block_max 1
shop_block_std 1
cat_block_units 1
cat_block_mean 1
cat_block_median 1
cat_block_min 1
cat_block_max 1
cat_block_std 1
shop_cat_block_units 1
shop_cat_block_mean 1
shop_cat_block_median 1
shop_cat_block_min 1
shop_cat_block_max 1
shop_cat_block_std 1
shop_item_block_units 1
shop_item_block_mean 1
shop_item_block_median 1
shop_item_block_min 1
shop_item_block_max 1
shop_item_block_std 1


In [21]:
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_median')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_min')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_max')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_std')

shop_item_block_units 1
shop_item_block_mean 1
shop_item_block_median 1
shop_item_block_min 1
shop_item_block_max 1
shop_item_block_std 1


In [22]:
training = add_lags(training, ['item_id','date_block_num'], 'item_share_block')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_share_block')
training = add_lags(training, ['shop_id', 'item_id', 'date_block_num'], 'comp2')

item_share_block 1
shop_share_block 1
comp2 1


In [23]:
total_sum_shops = training.groupby('shop_id')['item_cnt_block'].sum().sum()
training['shop_share'] = training.groupby('shop_id')['item_cnt_block'].transform(np.sum) *100 / total_sum_shops

total_sum_items = training.groupby('item_id')['item_cnt_block'].sum().sum()
training['item_share'] = training.groupby('item_id')['item_cnt_block'].transform(np.sum) *100 / total_sum_items

training['comp1'] = training['shop_share'] * training['item_share']

In [24]:
training['shop_block_units_lag_comp1'] = pd.to_numeric(training['shop_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

#training['shop_share_item_units_comp'] = training['item_units'] * training['shop_share_of_units']
training['item_block_units_lag_comp1'] = pd.to_numeric(training['item_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

In [25]:
training['blocks_without_sales'] = training['item_id'].map(training[training['item_cnt_block'] == 0].groupby(['item_id'])['date_block_num'].unique().apply(lambda x: len(x)))

In [26]:
sales_train['item_days_of_activity'] = pd.to_numeric(sales_train.groupby(['item_id'])['date'].transform("nunique"), downcast='unsigned') 
sales_train['item_blocks_of_activity'] = pd.to_numeric(sales_train.groupby(['item_id'])['date_block_num'].transform("nunique"), downcast='unsigned') 

def get_number_of_days_since_start(day,month, year):
    days = 0
    if year == 2015:
        days = 365
    def is_even(num):
        return num % 2 == 0
    half_of_month = int(month/2)
    even = (30*half_of_month) + (31*half_of_month)
    if is_even(month):
        days = days + even - 30 - day
    else:
        days = days + even + day
    return days

sales_train['item_days_since_start'] = pd.to_numeric(sales_train.apply(lambda row: get_number_of_days_since_start(row['day'],row['month'], row['year']),axis=1), downcast='unsigned') 

def get_average_days_between_sales(days):
    days = sorted(np.unique(days))
    if len(days) == 0:
        return 9999
    if len(days) == 1:
        return 999
    return np.mean(np.ediff1d(days)) / len(days)

average_days_between_sales = sales_train.groupby(['item_id'])['item_days_since_start'].apply(list).apply(lambda x: get_average_days_between_sales(x))

sales_train['item_mean_day_between_activity'] = pd.to_numeric(sales_train['item_id'].map(average_days_between_sales), downcast='unsigned')

training['item_mean_day_between_activity'] = training['item_id'].map(sales_train.drop_duplicates('item_id').set_index('item_id')['item_mean_day_between_activity'])

In [48]:
training.to_pickle("./training_project_one_cb.pickle")

In [49]:

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_block,item_category_id,month,item_id_mean_encoding,shop_id_mean_encoding,item_category_id_mean_encoding,month_mean_encoding,date_block_num_mean_encoding,item_block_units,item_block_median,item_block_mean,item_block_max,item_block_min,item_block_std,shop_block_units,shop_block_median,shop_block_mean,shop_block_max,shop_block_min,shop_block_std,cat_block_units,cat_block_median,cat_block_mean,cat_block_max,cat_block_min,cat_block_std,shop_cat_block_units,shop_cat_block_median,shop_cat_block_mean,shop_cat_block_max,shop_cat_block_min,shop_cat_block_std,shop_item_block_units,shop_item_block_median,shop_item_block_mean,shop_item_block_max,shop_item_block_min,shop_item_block_std,item_units,cat_units,shop_units,item_share_of_total_units,category_share_of_total_units,shop_share_of_units,shop_item_units,shop_item_share_of_total_units,shop_item_share_of_shop_units,item_share_of_shop_units,shop_item_share_of_shop_units_mean,item_max_units_block,item_min_units_block,item_minmax_q0.25,item_minmax_q0.5,item_minmax_q0.75,shop_max_units_block,shop_min_units_block,shop_minmax_q0.25,shop_minmax_q0.5,shop_minmax_q0.75,cat_max_units_block,cat_min_units_block,cat_minmax_q0.25,cat_minmax_q0.5,cat_minmax_q0.75,shop_cat_units,shop_cat_max_units_block,shop_cat_min_units_block,shop_cat_minmax_q0.25,shop_cat_minmax_q0.5,shop_cat_minmax_q0.75,shop_item_max_units_block,shop_item_min_units_block,shop_item_minmax_q0.25,shop_item_minmax_q0.5,shop_item_minmax_q0.75,item_block_units_rolling_3,item_block_mean_rolling_3,item_block_median_rolling_3,item_block_min_rolling_3,item_block_max_rolling_3,item_block_std_rolling_3,shop_block_units_rolling_3,shop_block_mean_rolling_3,shop_block_median_rolling_3,shop_block_min_rolling_3,shop_block_max_rolling_3,shop_block_std_rolling_3,cat_block_units_rolling_3,cat_block_mean_rolling_3,cat_block_median_rolling_3,cat_block_min_rolling_3,cat_block_max_rolling_3,cat_block_std_rolling_3,shop_cat_block_units_rolling_3,shop_cat_block_mean_rolling_3,shop_cat_block_median_rolling_3,shop_cat_block_min_rolling_3,shop_cat_block_max_rolling_3,shop_cat_block_std_rolling_3,shop_item_block_mean_rolling_3,block_total,item_share_block,shop_share_block,comp2,item_block_units_lag_1,item_block_mean_lag_1,item_block_median_lag_1,item_block_min_lag_1,item_block_max_lag_1,item_block_std_lag_1,shop_block_units_lag_1,shop_block_mean_lag_1,shop_block_median_lag_1,shop_block_min_lag_1,shop_block_max_lag_1,shop_block_std_lag_1,cat_block_units_lag_1,cat_block_mean_lag_1,cat_block_median_lag_1,cat_block_min_lag_1,cat_block_max_lag_1,cat_block_std_lag_1,shop_cat_block_units_lag_1,shop_cat_block_mean_lag_1,shop_cat_block_median_lag_1,shop_cat_block_min_lag_1,shop_cat_block_max_lag_1,shop_cat_block_std_lag_1,shop_item_block_units_lag_1,shop_item_block_mean_lag_1,shop_item_block_median_lag_1,shop_item_block_min_lag_1,shop_item_block_max_lag_1,shop_item_block_std_lag_1,item_share_block_lag_1,shop_share_block_lag_1,comp2_lag_1,shop_share,item_share,comp1,shop_block_units_lag_comp1,item_block_units_lag_comp1,blocks_without_sales,item_mean_day_between_activity
989968,8256,50,15,0,38,4,0.072378,0.383495,0.223361,0.403369,0.473724,4,0.0,0.081633,2,0,0.0,974,0.0,0.572604,20,0,0.0,520,0.0,0.171165,14,0,0.0,2,0.0,0.032258,1,0,0.0,0,0.0,0,0,0,0.0,1935990,218971298,67206510,92.832176,202.492828,1163.133545,39510,1.894534,0.058789,0.058789,0.041185,17,1,5.0,9.0,13.0,1554,665,887.25,1109.5,1331.75,1891,374,753.25,1132.5,1511.75,4468802,30,2,9.0,16.0,23.0,1,0,0.25,0.5,0.75,1.666667,0.035628,0.0,0.0,1.0,0.0,1260.0,0.800871,0.0,0.0,20.0,0.0,531.333313,0.184225,0.0,0.0,10.666667,0.0,16.666666,0.265397,0.0,0.0,3.333333,0.0,0.0,39510.0,0.010124,0.806479,0.008165,2,0.041667,0.0,0.0,1.0,0.0,1332,0.821715,0.0,0.0,20.0,0.0,608,0.20765,0.0,0.0,20.0,0.0,12,0.196721,0.0,0.0,4.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.004061,0.043206,0.000175,1.754699,0.005423,0.009516,78.306877,0.117578,18,0.198308
1391125,11682,25,32,1,41,9,0.066176,1.356188,0.230487,0.392586,0.337746,2,0.0,0.046512,1,0,0.0,4157,0.0,1.144233,20,0,0.0,307,0.0,0.166036,6,0,0.0,21,0.0,0.488372,6,0,0.0,1,1.0,1,1,1,0.0,2293921,415853516,193809651,109.995239,1405.266724,1055.442017,53347,2.558029,0.027525,0.027525,0.037784,7,1,2.5,4.0,5.5,6738,2275,3390.75,4506.5,5622.25,700,131,273.25,415.5,557.75,9671012,55,3,16.0,29.0,42.0,1,0,0.25,0.5,0.75,2.666667,0.060775,0.0,0.0,1.0,0.0,3581.666748,1.08591,0.0,0.0,20.0,0.0,398.0,0.213457,0.0,0.0,7.333333,0.0,28.333334,0.644806,0.0,0.0,4.666667,0.0,0.333333,53347.0,0.003749,0.421467,0.00158,4,0.095238,0.0,0.0,1.0,0.0,3694,1.068866,0.0,0.0,20.0,0.0,387,0.204762,0.0,0.0,5.0,0.0,31,0.688889,0.0,0.0,5.0,0.0,1,1.0,1.0,1.0,1.0,0.0,0.007079,0.738303,0.005226,6.170246,0.003862,0.023829,101.679054,0.110102,14,0.278049
460887,3891,22,32,0,55,9,0.268793,0.370254,0.31405,0.393447,0.34037,5,0.0,0.116279,2,0,0.0,1131,0.0,0.311313,20,0,0.0,4757,0.0,0.192062,10,0,0.0,72,0.0,0.125,2,0,0.0,0,0.0,0,0,0,0.0,2293921,415853516,193809651,109.995239,1405.266724,1055.442017,53347,2.558029,0.027525,0.027525,0.033146,42,1,11.25,21.5,31.75,1592,636,875.0,1114.0,1353.0,11613,4757,6471.0,8185.0,9899.0,9671012,126,36,58.5,81.0,103.5,2,0,0.5,1.0,1.5,4.666667,0.10945,0.0,0.0,1.333333,0.0,877.666687,0.266176,0.0,0.0,20.0,0.0,5802.333496,0.247697,0.0,0.0,15.666667,0.0,75.333336,0.137319,0.0,0.0,4.666667,0.0,0.0,53347.0,0.009373,0.891597,0.008357,5,0.119048,0.0,0.0,1.0,0.0,901,0.260706,0.0,0.0,20.0,0.0,5742,0.245448,0.0,0.0,13.0,0.0,72,0.129264,0.0,0.0,3.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.008848,0.4347,0.003846,1.687157,0.011586,0.019547,24.80044,0.137627,12,0.045691
183689,1775,35,20,0,55,9,0.092417,0.469264,0.31405,0.393447,0.471686,5,0.0,0.1,1,0,0.0,1111,0.0,0.52381,20,0,0.0,6443,0.0,0.298979,20,0,0.0,175,0.0,0.406032,5,0,0.0,0,0.0,0,0,0,0.0,2504800,300603750,106253616,120.107048,2057.361084,976.002563,50096,2.402141,0.047148,0.047148,0.043519,8,1,2.75,4.5,6.25,2364,744,1149.0,1554.0,1959.0,11613,4757,6471.0,8185.0,9899.0,6012075,274,99,142.75,186.5,230.25,1,0,0.25,0.5,0.75,3.333333,0.066819,0.0,0.0,1.0,0.0,1076.333374,0.559587,0.0,0.0,20.0,0.0,7977.0,0.395925,0.0,0.0,20.0,0.0,198.333328,0.492236,0.0,0.0,8.333333,0.0,0.0,50096.0,0.009981,0.909534,0.009078,3,0.058824,0.0,0.0,1.0,0.0,1206,0.6,0.0,0.0,20.0,0.0,8409,0.396352,0.0,0.0,20.0,0.0,210,0.504808,0.0,0.0,5.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.005258,0.965104,0.005075,2.164471,0.007313,0.015829,56.85997,0.141443,22,0.10851
24473,253,19,27,0,45,4,0.053333,0.479507,0.074502,0.403369,0.361315,2,0.0,0.042553,1,0,0.0,1077,0.0,0.363238,20,0,0.0,60,0.0,0.06079,2,0,0.0,0,0.0,0.0,0,0,0.0,0,0.0,0,0,0,0.0,2367014,412107515,149323330,113.500099,1225.643188,981.758606,50362,2.414896,0.033727,0.033727,0.044767,7,1,2.5,4.0,5.5,2312,808,1184.0,1560.0,1936.0,97,39,53.5,68.0,82.5,8768245,1,0,0.25,0.5,0.75,0,0,0.0,0.0,0.0,2.666667,0.055769,0.0,0.0,1.0,0.0,1373.333374,0.493932,0.0,0.0,20.0,0.0,67.0,0.077078,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50362.0,0.003971,0.837219,0.003325,2,0.043478,0.0,0.0,1.0,0.0,1209,0.41575,0.0,0.0,20.0,0.0,46,0.0625,0.0,0.0,4.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.003409,0.943731,0.003217,2.158226,0.004601,0.009931,40.775715,0.067454,20,0.235671
550382,4466,59,32,0,19,9,0.512281,0.320258,0.972023,0.396296,0.342893,8,0.0,0.186047,2,0,0.0,857,0.0,0.235893,19,0,0.0,3777,0.0,0.738128,20,0,0.0,73,0.0,0.613445,13,0,0.0,0,0.0,0,0,0,0.0,2293921,415853516,193809651,109.995239,1405.266724,1055.442017,53347,2.558029,0.027525,0.027525,0.030635,40,8,16.0,24.0,32.0,1402,519,739.75,960.5,1181.25,7711,2148,3538.75,4929.5,6320.25,9671012,114,20,43.5,67.0,90.5,1,0,0.25,0.5,0.75,20.0,0.4677,0.0,0.0,3.0,0.0,911.0,0.274987,0.0,0.0,20.0,0.0,3557.333252,0.740831,0.0,0.0,18.0,0.0,76.0,0.670796,0.0,0.0,8.666667,0.0,0.333333,53347.0,0.014996,0.377978,0.005668,14,0.333333,0.0,0.0,2.0,0.0,1065,0.30816,0.0,0.0,20.0,0.0,4516,0.919007,0.0,0.0,20.0,0.0,114,0.974359,0.0,0.0,12.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.024775,0.724924,0.01796,1.463167,0.014051,0.020559,29.314615,0.385356,8,0.030264
281831,2703,11,16,0,30,5,0.798561,0.169306,1.517999,0.429582,0.51791,97,1.0,1.979592,13,0,0.0,374,0.0,0.214204,20,0,0.0,6847,1.0,1.863129,20,0,0.0,70,0.0,0.933333,10,0,0.0,0,0.0,0,0,0,0.0,2173738,241470089,77456052,104.232376,1281.326782,1654.606812,44362,2.127191,0.057274,0.057274,0.043519,113,1,29.0,57.0,85.0,734,311,416.75,522.5,628.25,10108,3862,5423.5,6985.0,8546.5,4927961,112,48,64.0,80.0,96.0,2,0,0.5,1.0,1.5,105.333336,2.209153,2.0,0.0,7.666667,0.0,382.0,0.235261,0.0,0.0,20.0,0.0,6914.666504,2.335464,1.0,0.0,20.0,0.0,81.333336,1.302536,0.666667,0.0,12.666667,0.0,1.333333,44362.0,0.218656,0.843064,0.184341,105,2.142857,2.0,0.0,7.0,0.0,311,0.182834,0.0,0.0,20.0,0.0,6071,1.906122,1.0,0.0,20.0,0.0,79,1.215385,0.0,0.0,12.0,0.0,1,1.0,1.0,1.0,1.0,0.0,0.265756,0.787143,0.209187,0.761451,0.067706,0.051555,17.812141,6.013746,22,0.012943
923036,7781,49,20,0,31,9,0.092807,0.218916,0.077158,0.392586,0.473518,16,0.0,0.32,16,0,0.0,482,0.0,0.227251,20,0,0.0,340,0.0,0.074725,20,0,0.0,0,0.0,0.0,0,0,0.0,0,0.0,0,0,0,0.0,2504800,300603750,106253616,120.107048,2057.361084,976.002563,50096,2.402141,0.047148,0.047148,0.036321,16,1,4.75,8.5,12.25,920,292,449.0,606.0,763.0,675,171,297.0,423.0,549.0,6012075,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,483.333344,0.250844,0.0,0.0,20.0,0.0,293.333344,0.066597,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50096.0,0.031939,0.962153,0.03073,0,0.0,0.0,0.0,0.0,0.0,539,0.268159,0.0,0.0,20.0,0.0,305,0.065719,0.0,0.0,20.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.944703,0.0,0.999492,0.003533,0.003531,25.41254,0.0,11,0.413793
2058583,16937,14,17,0,55,6,0.264059,0.296793,0.312592,0.435613,0.527369,20,0.0,0.408163,3,0,0.0,575,0.0,0.314724,20,0,0.0,7441,0.0,0.388381,20,0,0.0,95,0.0,0.242967,6,0,0.0,0,0.0,0,0,0,0.0,2331175,276254895,86919525,111.781593,889.815247,48.918354,47575,2.281257,0.054735,0.054735,0.043519,25,4,9.25,14.5,19.75,1272,444,651.0,858.0,1065.0,11613,4757,6471.0,8185.0,9899.0,5637855,164,56,83.0,110.0,137.0,1,0,0.25,0.5,0.75,19.0,0.39059,0.0,0.0,3.333333,0.0,551.666687,0.327551,0.0,0.0,20.0,0.0,6342.666504,0.358625,0.0,0.0,19.666666,0.0,75.666664,0.208258,0.0,0.0,6.333333,0.0,0.0,47575.0,0.042039,1.208618,0.050809,21,0.428571,0.0,0.0,3.0,0.0,576,0.329897,0.0,0.0,20.0,0.0,6497,0.360304,0.0,0.0,20.0,0.0,68,0.184783,0.0,0.0,6.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.047338,1.298409,0.061464,1.368098,0.023664,0.032375,31.527094,1.149425,22,0.018829
501309,4181,52,14,7,75,3,6.138142,0.325427,0.666828,0.50963,0.632368,400,6.0,8.333333,20,0,0.0,789,0.0,0.486737,20,0,0.0,1123,0.0,1.01721,20,0,0.0,10,0.0,0.434783,7,0,0.0,7,7.0,7,7,7,0.0,2364096,236452752,79837492,113.360184,1040.741699,1768.798706,49252,2.36167,0.06169,0.06169,0.043519,488,179,256.25,333.5,410.75,1600,587,840.25,1093.5,1346.75,1382,539,749.75,960.5,1171.25,4926099,17,4,7.25,10.5,13.75,7,0,1.75,3.5,5.25,443.5,9.641304,8.0,0.0,20.0,0.0,686.0,0.443018,0.0,0.0,20.0,0.0,1265.0,1.195652,0.0,0.0,20.0,0.0,13.5,0.586957,0.0,0.0,5.5,0.0,5.0,49252.0,0.81215,0.271339,0.220368,399,8.673913,7.0,0.0,20.0,0.0,695,0.442393,0.0,0.0,20.0,0.0,1148,1.085066,0.0,0.0,20.0,0.0,17,0.73913,0.0,0.0,7.0,0.0,7,7.0,7.0,7.0,7.0,0.0,0.895643,0.088981,0.079695,1.489296,0.515687,0.76801,42.874771,24.614435,22,0.005439


In [28]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'item_id_mean_encoding',
       'shop_id_mean_encoding', 'item_category_id_mean_encoding',
       'month_mean_encoding', 'date_block_num_mean_encoding',
       'item_block_units', 'item_block_median', 'item_block_mean',
       'item_block_max', 'item_block_min', 'item_block_std',
       'shop_block_units', 'shop_block_median', 'shop_block_mean',
       'shop_block_max', 'shop_block_min', 'shop_block_std',
       'cat_block_units', 'cat_block_median', 'cat_block_mean',
       'cat_block_max', 'cat_block_min', 'cat_block_std',
       'shop_cat_block_units', 'shop_cat_block_median',
       'shop_cat_block_mean', 'shop_cat_block_max', 'shop_cat_block_min',
       'shop_cat_block_std', 'shop_item_block_units',
       'shop_item_block_median', 'shop_item_block_mean',
       'shop_item_block_max', 'shop_item_block_min',
       'shop_item_block_std', 'item_units', 'cat_units', 'shop_units',
   

In [145]:
gc.collect()

ZEROS_KEEP=0.2


#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['item_cnt_block']





x_val = training[training['date_block_num'] == 33]
y_val = x_val['item_cnt_block']

pos_val_len = len(y_val[y_val != 0])
print("pos_val_len", pos_val_len)

zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
print("zeros_keep_indices_val", len(zeros_keep_indices_val))
non_zeros_val_indices = y_val[y_val != 0].index
print("non_zeros_val_indices", len(non_zeros_val_indices))

val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

y_val = y_val.loc[val_indices]
x_val = x_val.loc[val_indices]



pos_val_len 29202
zeros_keep_indices_val 146010
non_zeros_val_indices 29202


In [30]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'item_id_mean_encoding',
       'shop_id_mean_encoding', 'item_category_id_mean_encoding',
       'month_mean_encoding', 'date_block_num_mean_encoding',
       'item_block_units', 'item_block_median', 'item_block_mean',
       'item_block_max', 'item_block_min', 'item_block_std',
       'shop_block_units', 'shop_block_median', 'shop_block_mean',
       'shop_block_max', 'shop_block_min', 'shop_block_std',
       'cat_block_units', 'cat_block_median', 'cat_block_mean',
       'cat_block_max', 'cat_block_min', 'cat_block_std',
       'shop_cat_block_units', 'shop_cat_block_median',
       'shop_cat_block_mean', 'shop_cat_block_max', 'shop_cat_block_min',
       'shop_cat_block_std', 'shop_item_block_units',
       'shop_item_block_median', 'shop_item_block_mean',
       'shop_item_block_max', 'shop_item_block_min',
       'shop_item_block_std', 'item_units', 'cat_units', 'shop_units',
   

In [53]:
features = [
    
#'item_id', 'shop_id', 'date_block_num', 
     #  'item_category_id', 'month', 
    
    'item_id_mean_encoding',
       'shop_id_mean_encoding', 
    'item_category_id_mean_encoding',
       'month_mean_encoding', 
    #'date_block_num_mean_encoding',
    

       'item_block_units_rolling_3', 'item_block_mean_rolling_3',
       'item_block_median_rolling_3', 'item_block_min_rolling_3',
       'item_block_max_rolling_3', 'item_block_std_rolling_3',
    
       'shop_block_units_rolling_3', 'shop_block_mean_rolling_3',
       'shop_block_median_rolling_3', 'shop_block_min_rolling_3',
       'shop_block_max_rolling_3', 'shop_block_std_rolling_3',
    
       'cat_block_units_rolling_3', 'cat_block_mean_rolling_3',
       'cat_block_median_rolling_3', 'cat_block_min_rolling_3',
       'cat_block_max_rolling_3', 'cat_block_std_rolling_3',
    
       'shop_cat_block_units_rolling_3', 'shop_cat_block_mean_rolling_3',
       'shop_cat_block_median_rolling_3', 'shop_cat_block_min_rolling_3',
       'shop_cat_block_max_rolling_3', 'shop_cat_block_std_rolling_3',
    
       'shop_item_block_mean_rolling_3',
    
       'item_block_units_lag_1', 'item_block_mean_lag_1',
       'item_block_median_lag_1', 'item_block_min_lag_1',
       'item_block_max_lag_1', 'item_block_std_lag_1',
    
       'shop_block_units_lag_1', 'shop_block_mean_lag_1',
       'shop_block_median_lag_1', 'shop_block_min_lag_1',
       'shop_block_max_lag_1', 'shop_block_std_lag_1',
    
       'cat_block_units_lag_1', 'cat_block_mean_lag_1',
       'cat_block_median_lag_1', 'cat_block_min_lag_1',
       'cat_block_max_lag_1', 'cat_block_std_lag_1',
    
       'shop_cat_block_units_lag_1', 'shop_cat_block_mean_lag_1',
       'shop_cat_block_median_lag_1', 'shop_cat_block_min_lag_1',
       'shop_cat_block_max_lag_1', 'shop_cat_block_std_lag_1',
       'shop_item_block_units_lag_1', 'shop_item_block_mean_lag_1',
       'shop_item_block_median_lag_1', 'shop_item_block_min_lag_1',
       'shop_item_block_max_lag_1', 'shop_item_block_std_lag_1',
    
       'item_share_block_lag_1', 'shop_share_block_lag_1', 'comp2_lag_1',
       'shop_share', 'item_share', 
   'shop_block_units_lag_comp1',
       'item_block_units_lag_comp1', 'blocks_without_sales',
       'item_mean_day_between_activity'


]

In [146]:

features = [
    
    
        'item_category_id',
       'item_block_mean_rolling_3',

       'shop_block_mean_rolling_3',

           'shop_cat_block_mean_rolling_3',
               'shop_cat_block_median_rolling_3',

      'item_block_mean_lag_1',

        'shop_block_mean_lag_1',

            'shop_cat_block_mean_lag_1',
               # 'shop_cat_block_median_lag_1',


    
    'shop_item_share_of_shop_units_mean',
    'shop_item_block_mean_rolling_3',

    'shop_item_block_mean_lag_1',

    
#'item_id_mean_encoding',
       #'shop_id_mean_encoding',
    'item_category_id_mean_encoding',  
    #'month_mean_encoding', 'date_block_num_mean_encoding'
    
    'shop_share',
    
    #'item_mean_day_between_activity',
    #'comp1'

]




In [138]:
features = ['item_id_mean_encoding',
 'shop_id_mean_encoding',
 'month_mean_encoding',
 'item_block_units_rolling_3',
 'item_block_mean_rolling_3',
 'item_block_max_rolling_3',
 'shop_cat_block_mean_rolling_3',
 #'shop_item_block_mean_rolling_3',
 'shop_cat_block_mean_lag_1',
 'shop_cat_block_median_lag_1',
 #'shop_item_block_units_lag_1',
 #'shop_item_block_mean_lag_1',
 #'shop_item_block_median_lag_1',
# 'shop_item_block_min_lag_1',
 #'shop_item_block_max_lag_1',
 'item_share_block_lag_1',
 'shop_share',
 'item_share',
 'item_mean_day_between_activity']

In [156]:
cb_model = CatBoostRegressor(iterations=6000,
                             #learning_rate=0.05, #default is 0.03
                             objective='RMSE',
                             eval_metric='RMSE',
                             task_type = "GPU",
                             use_best_model=True,
                             early_stopping_rounds = 3,
                             #border_count=32, #number of splits for num features (default 128 on GPU)
                             bagging_temperature = 20, #default 1 intensity of bootstrap
                             #l2_leaf_reg = 300, #default 3 seems useless
                             #random_strength = 100,#default 1  adds randomness to the split score
                             #depth=8,  #default 6
                             #rsm=0.7, #feature %age default 1 no GPU
                             #cat_features=[0],
                            #learning_rate=0.001, #default is 0.03
                             random_seed = 42)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


cb_model.fit(x_train[features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

scores = {}
for i,score in enumerate(cb_model.get_feature_importance()):
    scores[features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

0:	learn: 1.6493981	test: 1.3244779	best: 1.3244779 (0)	total: 20.7ms	remaining: 2m 3s
1:	learn: 1.6313728	test: 1.3130638	best: 1.3130638 (1)	total: 43.3ms	remaining: 2m 9s
2:	learn: 1.6136515	test: 1.3022564	best: 1.3022564 (2)	total: 63.7ms	remaining: 2m 7s
3:	learn: 1.5955075	test: 1.2912316	best: 1.2912316 (3)	total: 85.5ms	remaining: 2m 8s
4:	learn: 1.5805589	test: 1.2802674	best: 1.2802674 (4)	total: 107ms	remaining: 2m 8s
5:	learn: 1.5643273	test: 1.2710383	best: 1.2710383 (5)	total: 130ms	remaining: 2m 9s
6:	learn: 1.5496325	test: 1.2612003	best: 1.2612003 (6)	total: 152ms	remaining: 2m 10s
7:	learn: 1.5361445	test: 1.2528600	best: 1.2528600 (7)	total: 175ms	remaining: 2m 11s
8:	learn: 1.5228613	test: 1.2458309	best: 1.2458309 (8)	total: 197ms	remaining: 2m 11s
9:	learn: 1.5094367	test: 1.2377000	best: 1.2377000 (9)	total: 219ms	remaining: 2m 11s
10:	learn: 1.4948939	test: 1.2284582	best: 1.2284582 (10)	total: 242ms	remaining: 2m 11s
11:	learn: 1.4835247	test: 1.2210336	best: 

101:	learn: 1.1774600	test: 1.0422950	best: 1.0422950 (101)	total: 2.28s	remaining: 2m 11s
102:	learn: 1.1768343	test: 1.0419913	best: 1.0419913 (102)	total: 2.3s	remaining: 2m 11s
103:	learn: 1.1759930	test: 1.0416629	best: 1.0416629 (103)	total: 2.32s	remaining: 2m 11s
104:	learn: 1.1755487	test: 1.0416239	best: 1.0416239 (104)	total: 2.35s	remaining: 2m 11s
105:	learn: 1.1751319	test: 1.0413758	best: 1.0413758 (105)	total: 2.37s	remaining: 2m 11s
106:	learn: 1.1748395	test: 1.0413539	best: 1.0413539 (106)	total: 2.39s	remaining: 2m 11s
107:	learn: 1.1744898	test: 1.0409730	best: 1.0409730 (107)	total: 2.41s	remaining: 2m 11s
108:	learn: 1.1739344	test: 1.0407415	best: 1.0407415 (108)	total: 2.44s	remaining: 2m 11s
109:	learn: 1.1736687	test: 1.0405878	best: 1.0405878 (109)	total: 2.46s	remaining: 2m 11s
110:	learn: 1.1734467	test: 1.0405209	best: 1.0405209 (110)	total: 2.48s	remaining: 2m 11s
111:	learn: 1.1730431	test: 1.0403115	best: 1.0403115 (111)	total: 2.5s	remaining: 2m 11s
1

193:	learn: 1.1461467	test: 1.0297612	best: 1.0297612 (193)	total: 4.33s	remaining: 2m 9s
194:	learn: 1.1459229	test: 1.0297348	best: 1.0297348 (194)	total: 4.36s	remaining: 2m 9s
195:	learn: 1.1458853	test: 1.0296892	best: 1.0296892 (195)	total: 4.38s	remaining: 2m 9s
196:	learn: 1.1457980	test: 1.0296997	best: 1.0296892 (195)	total: 4.4s	remaining: 2m 9s
197:	learn: 1.1455346	test: 1.0296152	best: 1.0296152 (197)	total: 4.43s	remaining: 2m 9s
198:	learn: 1.1453973	test: 1.0295508	best: 1.0295508 (198)	total: 4.45s	remaining: 2m 9s
199:	learn: 1.1453295	test: 1.0295507	best: 1.0295507 (199)	total: 4.47s	remaining: 2m 9s
200:	learn: 1.1451756	test: 1.0295893	best: 1.0295507 (199)	total: 4.49s	remaining: 2m 9s
201:	learn: 1.1450768	test: 1.0295519	best: 1.0295507 (199)	total: 4.51s	remaining: 2m 9s
202:	learn: 1.1448159	test: 1.0294663	best: 1.0294663 (202)	total: 4.54s	remaining: 2m 9s
203:	learn: 1.1447731	test: 1.0295187	best: 1.0294663 (202)	total: 4.56s	remaining: 2m 9s
204:	learn:

[('item_block_mean_lag_1', 21.079370485598993),
 ('shop_item_block_mean_lag_1', 20.308547209551687),
 ('shop_item_block_mean_rolling_3', 10.209869920560285),
 ('item_category_id_mean_encoding', 7.822759145367325),
 ('shop_cat_block_mean_lag_1', 7.394379603125442),
 ('item_block_mean_rolling_3', 7.00595342818318),
 ('item_category_id', 6.200784214745123),
 ('shop_cat_block_mean_rolling_3', 5.250466558528337),
 ('shop_share', 4.971008061015289),
 ('shop_item_share_of_shop_units_mean', 4.649825318718489),
 ('shop_cat_block_median_rolling_3', 2.0259988569350518),
 ('shop_block_mean_rolling_3', 1.6335935967594097),
 ('shop_block_mean_lag_1', 1.447443600911394)]

In [70]:
features = [item[0] for item in scores.items() if item[1] > 4]

In [157]:
#test            = pd.read_csv('test.csv.gz')
test = test.set_index('item_id').join(items.set_index('item_id'))
test.reset_index(inplace=True)

In [158]:
test['month'] = 11

In [159]:
item_features = [ 
    'shop_item_share_of_shop_units_mean','item_id_mean_encoding','item_share','item_mean_day_between_activity'
                ]

merge_col = ['item_id']
cols=item_features+merge_col

test = test.merge(training.drop_duplicates('item_id')[cols], on=merge_col, how='left')


In [160]:
shop_features = [
        'shop_id_mean_encoding','shop_share'
]

merge_col = ['shop_id']
cols=shop_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [161]:
cat_features = [
        'item_category_id_mean_encoding'#,'cat_me_real'
]

merge_col = ['item_category_id']
cols=cat_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [172]:
shop_item_features = [
        'shop_item_share_of_shop_units_mean','shop_item_share_of_shop_units_mean'
]

merge_col = ['shop_id','item_id']
cols=shop_item_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')



In [163]:
month_features = [
        'month_mean_encoding'
]

merge_col = ['month']
cols=month_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [164]:
def add_rolls_test(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name})\
            [cols+[roll_name]]
        
        print([cols[0:len(cols)-1]+[roll_name]])
        thirty_three = block_units_rolling_temp[block_units_rolling_temp['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1]+[roll_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')
    

        del block_units_rolling_temp
        gc.collect()
        

    
    return df
    

test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_units')
test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_max')

test = add_rolls_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')






item_block_mean 3
[['item_id', 'item_block_mean_rolling_3']]
item_block_units 3
[['item_id', 'item_block_units_rolling_3']]
item_block_max 3
[['item_id', 'item_block_max_rolling_3']]
shop_block_mean 3
[['shop_id', 'shop_block_mean_rolling_3']]
shop_cat_block_mean 3
[['shop_id', 'item_category_id', 'shop_cat_block_mean_rolling_3']]
shop_cat_block_median 3
[['shop_id', 'item_category_id', 'shop_cat_block_median_rolling_3']]


In [165]:
test = add_rolls_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3
[['shop_id', 'item_id', 'shop_item_block_mean_rolling_3']]


In [166]:
def add_lags_test(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()
        
        thirty_three = result[result['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1] + [lag_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')

        gc.collect()
    
    return df
                                         

                                        
test = add_lags_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_lags_test(test, ['item_id','date_block_num'], 'item_block_max')
test = add_lags_test(test, ['item_id','date_block_num'], 'item_share_block')
test = add_lags_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_lags_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
test = add_lags_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')




item_block_mean 1
item_block_max 1
item_share_block 1
shop_block_mean 1
shop_cat_block_mean 1
shop_cat_block_median 1


In [167]:
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_min')
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_max')
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_median')
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')

shop_item_block_min 1
shop_item_block_max 1
shop_item_block_median 1
shop_item_block_units 1


In [168]:
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 1


In [169]:
test['date_block_num'] = 34

In [177]:
test.fillna(0, inplace=True)

In [173]:
test.sample(10)

Unnamed: 0,item_id,ID,shop_id,item_category_id,month,shop_item_share_of_shop_units_mean_x,item_id_mean_encoding,item_share,item_mean_day_between_activity,shop_id_mean_encoding,shop_share,item_category_id_mean_encoding,shop_item_share_of_shop_units_mean_y,month_mean_encoding,item_block_mean_rolling_3,item_block_units_rolling_3,item_block_max_rolling_3,shop_block_mean_rolling_3,shop_cat_block_mean_rolling_3,shop_cat_block_median_rolling_3,shop_item_block_mean_rolling_3,item_block_mean_lag_1,item_block_max_lag_1,item_share_block_lag_1,shop_block_mean_lag_1,shop_cat_block_mean_lag_1,shop_cat_block_median_lag_1,shop_item_block_min_lag_1,shop_item_block_max_lag_1,shop_item_block_median_lag_1,shop_item_block_units_lag_1,shop_item_block_mean_lag_1,date_block_num,shop_item_share_of_shop_units_mean
212383,21978,162530,56,44,11,0.042758,0.028986,0.00074,7.513889,0.447209,2.052476,0.0299,0.042758,0.540211,,,,0.32221,,,,,,,0.284613,,,,,,,,34,0.042758
122831,13080,121645,53,54,11,0.044603,0.067432,0.004519,0.248163,0.424917,1.925773,0.1792,0.044603,0.540211,0.039137,1.666667,1.666667,0.298138,0.0,0.0,0.0,0.023256,1.0,0.001875,0.307735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34,0.044603
84652,8801,114094,49,40,11,0.029407,0.125628,0.002794,0.183498,0.217389,0.999492,0.404912,0.029407,0.540211,0.132177,5.666667,1.0,0.189636,0.085832,0.0,0.0,0.209302,1.0,0.016871,0.149188,0.061265,0.0,0.0,0.0,0.0,0.0,0.0,34,0.029407
176723,17981,152746,59,37,11,0.056003,0.044728,0.001561,1.661765,0.321989,1.463167,0.280417,0.056003,0.540211,,,,0.245579,0.214552,0.0,,,,,0.235893,0.191919,0.0,,,,,,34,0.056003
185657,18916,90797,18,40,11,0.03635,0.030303,8.2e-05,999.0,0.377975,1.715012,0.404912,0.03635,0.540211,,,,0.314204,0.179741,0.0,,,,,0.346546,0.189723,0.0,,,,,,34,0.03635
28069,3108,68429,24,55,11,0.043519,1.301587,0.109366,0.008213,0.423885,1.922239,0.315593,0.043519,0.540211,0.271226,11.666667,1.666667,0.322687,0.218151,0.0,0.0,0.325581,2.0,0.026243,0.360033,0.223958,0.0,0.0,0.0,0.0,0.0,0.0,34,0.043519
152320,15411,144462,58,63,11,0.025056,0.125,0.000329,999.0,0.559214,2.549017,0.437777,0.025056,0.540211,,,,0.382358,0.330438,0.0,,,,,0.34572,0.284314,0.0,,,,,,34,0.025056
137539,13968,160248,56,55,11,0.033146,0.325,0.014462,0.030009,0.447209,2.052476,0.315593,0.033146,0.540211,0.216593,9.333333,2.0,0.32221,0.219964,0.0,0.333333,0.209302,2.0,0.016871,0.284613,0.157986,0.0,0.0,0.0,0.0,0.0,0.0,34,0.033146
127713,13290,169914,37,47,11,0.027142,0.276596,0.002547,0.218421,0.294127,1.345255,0.363151,0.027142,0.540211,0.241845,10.333333,2.0,0.252457,0.085906,0.0,0.0,0.209302,2.0,0.016871,0.249656,0.067961,0.0,0.0,0.0,0.0,0.0,0.0,34,0.027142
75908,7877,74285,21,7,11,0.03504,0.170455,0.009038,0.067368,0.482022,2.181562,0.429169,0.03504,0.540211,0.443245,19.0,4.0,0.423002,0.467305,0.0,0.666667,0.651163,5.0,0.052487,0.434077,0.263158,0.0,0.0,0.0,0.0,0.0,0.0,34,0.03504


In [178]:
cb_preds = cb_model.predict(test[features])
cb_preds.clip(0,20,out=cb_preds)

array([0.08785541, 0.06263798, 0.25567532, ..., 0.12961565, 0.15522908,
       0.13115323])

In [179]:
print(np.mean(cb_preds))
print(np.max(cb_preds))

0.38732796577243567
18.153668524842942


In [None]:
cb_preds[0:100]

In [180]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = cb_preds

submission.to_csv('submission.csv', index=False)

In [None]:
training['shop_me_real']= training.groupby('shop_id')['shop_me'].transform(np.mean)
training['item_me_real']= training.groupby('item_id')['item_me'].transform(np.mean)
training['cat_me_real']= training.groupby('item_category_id')['item_me'].transform(np.mean)

In [None]:
training.groupby('item_id')['shop_item_share_of_shop_units'].transform(np.mean)



In [182]:
predictions = []


for i in range(10):

    cb_model = CatBoostRegressor(iterations=6000,
                             #learning_rate=0.05, #default is 0.03
                             objective='RMSE',
                             eval_metric='RMSE',
                             task_type = "GPU",
                             use_best_model=True,
                             early_stopping_rounds = 3,
                             #border_count=32, #number of splits for num features (default 128 on GPU)
                             bagging_temperature = 20, #default 1 intensity of bootstrap
                             #l2_leaf_reg = 300, #default 3 seems useless
                             #random_strength = 100,#default 1  adds randomness to the split score
                             #depth=8,  #default 6
                             #rsm=0.7, #feature %age default 1 no GPU
                             #cat_features=[0],
                            #learning_rate=0.001, #default is 0.03
                             random_seed = i)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


    cb_model.fit(x_train[features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=False)

    cb_preds = cb_model.predict(test[features])
    cb_preds.clip(0,20,out=cb_preds)
    predictions.append(cb_preds)

In [183]:
prediction = np.mean(predictions, axis=0)

In [184]:
print(np.mean(prediction))
print(np.max(prediction))

0.38936379395844056
16.943752498637696


In [185]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = prediction

submission.to_csv('submission.csv', index=False)