In [41]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [42]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd
from sklearn.model_selection import KFold
from itertools import product

In [43]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [44]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'] != 2013]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [45]:
# Якутск Орджоникидзе, 56
sales_train.loc[sales_train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
sales_train.loc[sales_train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
sales_train.loc[sales_train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

In [46]:
sums = sales_train.groupby('item_id')['item_cnt_day'].sum().reset_index().rename(columns={"item_cnt_day":"item_total_sales"}).sort_values(by='item_total_sales')

ids_reject = sums[(sums['item_total_sales'] > 0) & (sums['item_total_sales'] < 1000)]['item_id'].unique()

In [47]:
train_item_ids = sales_train['item_id'].unique()
train_item_ids = np.setdiff1d(train_item_ids, ids_reject)
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [48]:
combinations = []

for dbn in range(np.min(train_blocks), np.max(train_blocks)+1):
    sales = sales_train[sales_train.date_block_num==dbn]
    item_ids = np.intersect1d(sales.item_id.unique(), test_item_ids)
    dbn_combos = list(product(sales.shop_id.unique(), item_ids, [dbn]))
    for combo in dbn_combos:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])

In [49]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"item_cnt_block"})

training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)


training['item_cnt_block'] = training['item_cnt_block'].clip(0,20).astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

for col in ['item_id', 'shop_id', 'item_category_id']:
    training[col] = pd.to_numeric(training[col], downcast='unsigned')

In [50]:
dates = sales_train[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
training['month'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')


In [51]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
#columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]
columns = ["item_id", "shop_id", "item_category_id", "month",  "date_block_num"]



y_train = training["item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1

fold 1
fold 2
fold 3
fold 4
fold 5


In [52]:
cols = ['item_id','date_block_num']

training.groupby(cols,as_index=False)['item_cnt_block'].mean().sample(10)

Unnamed: 0,item_id,date_block_num,item_cnt_block
17405,6472,25,1.787234
1471,803,32,0.069767
43553,16586,33,0.136364
26682,10583,19,0.039216
35912,14163,29,0.046512
33686,13253,32,0.069767
32900,12990,32,0.116279
23624,9400,23,0.62
39415,15319,21,0.480769
27222,10697,14,0.041667


In [53]:
def add_block_units_stats(df, cols, name):
    print(name)
    name_units = name + '_units'
    name_mean = name + '_mean'
    name_median = name + '_median'
    name_max = name + '_max'
    name_min = name + '_min'
    name_std = name + '_std'
    
    
    try:
        df.drop(columns=[name_units, name_mean, name_median],inplace=True)
    except:
        pass

    
    block_units = df.groupby(cols,as_index=False)['item_cnt_block'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_units})
    df = df.merge(block_units, on=cols, how='left')
    df[name_units].fillna(0,inplace=True)
    df[name_units] = pd.to_numeric(df[name_units].astype(int),downcast='unsigned')
    del block_units
    
    block_units_med = df.groupby(cols,as_index=False)['item_cnt_block'].median()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_median})
    df = df.merge(block_units_med, on=cols, how='left')
    df[name_median].fillna(0,inplace=True)
    df[name_median] = pd.to_numeric(df[name_median].astype(int),downcast='unsigned')
    del block_units_med
    
    block_means = df.groupby(cols,as_index=False)['item_cnt_block'].mean()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_mean})
    df = df.merge(block_means, on=cols, how='left')
    df[name_mean].fillna(0,inplace=True)
    df[name_mean] = pd.to_numeric(df[name_mean],downcast='float')
    del block_means
    
    block_max = df.groupby(cols,as_index=False)['item_cnt_block'].max()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_max})
    df = df.merge(block_max, on=cols, how='left')
    df[name_max].fillna(0,inplace=True)
    df[name_max] = pd.to_numeric(df[name_max],downcast='float')
    del block_max
    
    block_min = df.groupby(cols,as_index=False)['item_cnt_block'].min()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_min})
    df = df.merge(block_min, on=cols, how='left')
    df[name_min].fillna(0,inplace=True)
    df[name_min] = pd.to_numeric(df[name_min],downcast='float')
    del block_min
    
    block_std = df.groupby(cols,as_index=False)['item_cnt_block'].std()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_std})
    df = df.merge(block_std, on=cols, how='left')
    df[name_std].fillna(0,inplace=True)
    df[name_std] = pd.to_numeric(df[name_std],downcast='float')
    del block_std
    
    gc.collect()
    return df


training = add_block_units_stats(training, ['item_id','date_block_num'], 'item_block')
training = add_block_units_stats(training, ['shop_id','date_block_num'], 'shop_block')
training = add_block_units_stats(training, ['item_category_id','date_block_num'], 'cat_block')
training = add_block_units_stats(training, ['shop_id', 'item_category_id','date_block_num'], 'shop_cat_block')
training = add_block_units_stats(training, ['shop_id', 'item_id','date_block_num'], 'shop_item_block')

item_block




shop_block
cat_block
shop_cat_block
shop_item_block


In [54]:
number_of_items = sales_train['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = sales_train['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = sales_train['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = sales_train['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = sales_train['item_cnt_day'].sum()
print("total_sales:", total_sales)
average_price = sales_train['item_price'].mean()
print("average_price:", average_price)

training['item_units'] = pd.to_numeric(training.groupby(['date_block_num'])['item_block_units'].transform(np.sum),downcast='unsigned')
training['cat_units'] = pd.to_numeric(training.groupby(['date_block_num'])['cat_block_units'].transform(np.sum),downcast='unsigned')
training['shop_units'] = pd.to_numeric(training.groupby(['date_block_num'])['shop_block_units'].transform(np.sum),downcast='unsigned')

training['item_share_of_total_units'] = pd.to_numeric(training['item_units'] * 100 / total_sales,downcast='float')
training['category_share_of_total_units'] = pd.to_numeric(training['cat_units'] * 100 / total_sales,downcast='float')
training['shop_share_of_units'] = pd.to_numeric(training['shop_units'] * 100 / total_sales,downcast='float')
training['shop_item_units'] = pd.to_numeric(training.groupby(['date_block_num'])\
                                            ['shop_item_block_units'].transform(np.sum),downcast='unsigned')

training['shop_item_share_of_total_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / total_sales,downcast='float')
training['shop_item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / training['shop_units'],downcast='float')


training['item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100 / training['shop_units'],downcast='float')

training['shop_item_share_of_shop_units_mean'] = training.groupby('item_id')['shop_item_share_of_shop_units'].transform(np.mean)


number_of_items: 17054
number_of_categories: 79
number_of_shops: 54
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473.0
average_price: 1015.5023073772021


In [55]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
#columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]
columns = ["item_id", "shop_id", "item_category_id", "month",  "date_block_num"]



y_train = training["item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1


training.fillna(0,inplace=True)

fold 1
fold 2
fold 3
fold 4
fold 5


In [56]:
def add_min_max_quantiles(df, cols, name):
    print(name)

    block_name = name+'_block_units'
    units_name = name+'_units'
    max_name = name+'_max_units_block'
    min_name = name+'_min_units_block'
    
    try:
        df.drop(columns=[units_name, max_name, min_name, min_max_name],inplace=True)
    except:
        pass


    df[units_name] = pd.to_numeric(df.groupby(['date_block_num'])[block_name].transform(np.sum), downcast='unsigned')
    df[max_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.max), downcast='unsigned')
    df[min_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.min), downcast='unsigned')
    


    for q in [0.25,0.50,0.75]:
        qname = name+'_minmax_q' + str(q)
        try:
            df.drop(columns=[qname],inplace=True)
        except:
            pass
        df[qname] =  pd.to_numeric(df[[min_name,max_name]].quantile(q,axis=1), downcast='unsigned')
        
    return df

training = add_min_max_quantiles(training, ['item_id'], 'item')
training = add_min_max_quantiles(training, ['shop_id'], 'shop')
training = add_min_max_quantiles(training, ['item_category_id'], 'cat')
training = add_min_max_quantiles(training, ['shop_id','item_category_id'], 'shop_cat')
training = add_min_max_quantiles(training, ['shop_id','item_id'], 'shop_item')

item
shop
cat
shop_cat
shop_item


In [57]:
def add_rolls(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name_tmp})\
            [cols+[roll_name_tmp]]
        
    
        df = df.merge(block_units_rolling_temp, on=cols, how='left')
        #print(df.columns.values)
        del block_units_rolling_temp
        gc.collect()
        

        block_units_rolling = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [roll_name_tmp].shift(1)\
            .rename(columns={roll_name_tmp:roll_name}).reset_index()

        df = df.merge(block_units_rolling, on=cols, how='left')
        df[roll_name].fillna(0,inplace=True)
        df[roll_name] = pd.to_numeric(df[roll_name], downcast='float')
        df.drop(columns=[roll_name_tmp], inplace=True)
        del block_units_rolling
        gc.collect()
    
    return df
    

training = add_rolls(training, ['item_id','date_block_num'], 'item_block_units')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_median')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_min')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_max')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_std')

training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_median')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_min')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_max')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_std')

training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_median')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_min')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_max')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_std')

training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_min')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_max')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_std')
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item')

item_block_units 3
item_block_mean 3
item_block_median 3
item_block_min 3
item_block_max 3
item_block_std 3
shop_block_units 3
shop_block_mean 3
shop_block_median 3
shop_block_min 3
shop_block_max 3
shop_block_std 3
cat_block_units 3
cat_block_mean 3
cat_block_median 3
cat_block_min 3
cat_block_max 3
cat_block_std 3
shop_cat_block_units 3
shop_cat_block_mean 3
shop_cat_block_median 3
shop_cat_block_min 3
shop_cat_block_max 3
shop_cat_block_std 3


In [77]:
training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')


shop_item_block_mean 3


In [58]:
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

training['block_total'] = training.groupby(['date_block_num'])['item_cnt_block'].transform(np.sum)

training['item_share_block'] = training['item_block_units'] * 100 / training['block_total']
training['shop_share_block'] = training['shop_block_units'] * 100 / training['block_total']
training['comp2'] = training['item_share_block'] * training['shop_share_block']

In [59]:
def add_lags(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()

        df = df.merge(result, on=cols, how='left')
        df[lag_name].fillna(0,inplace=True)
        if "mean" in name:
            df[lag_name] = pd.to_numeric(df[lag_name], downcast='float')
        else:
            df[lag_name] = pd.to_numeric(df[lag_name].astype(int), downcast='unsigned')
        del result
        gc.collect()
    
    return df
                                         

                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_block_units')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_median')                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_block_min')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_max')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_std')

training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_median')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_min')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_max')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_std')

training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_median')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_min')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_max')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_std')

training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_min')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_max')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_std')

training = add_lags(training, ['shop_id','item_id','date_blocbk_num'], 'shop_item_block_units')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_median')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_min')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_max')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_std')

item_block_units 1
item_block_mean 1
item_block_median 1
item_block_min 1
item_block_max 1
item_block_std 1
shop_block_units 1
shop_block_mean 1
shop_block_median 1
shop_block_min 1
shop_block_max 1
shop_block_std 1
cat_block_units 1
cat_block_mean 1
cat_block_median 1
cat_block_min 1
cat_block_max 1
cat_block_std 1
shop_cat_block_units 1
shop_cat_block_mean 1
shop_cat_block_median 1
shop_cat_block_min 1
shop_cat_block_max 1
shop_cat_block_std 1
shop_item_block_units 1
shop_item_block_mean 1
shop_item_block_median 1
shop_item_block_min 1
shop_item_block_max 1
shop_item_block_std 1


In [60]:
training = add_lags(training, ['item_id','date_block_num'], 'item_share_block')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_share_block')
training = add_lags(training, ['shop_id', 'item_id', 'date_block_num'], 'comp2')

item_share_block 1
shop_share_block 1
comp2 1


In [61]:
total_sum_shops = training.groupby('shop_id')['item_cnt_block'].sum().sum()
training['shop_share'] = training.groupby('shop_id')['item_cnt_block'].transform(np.sum) *100 / total_sum_shops

total_sum_items = training.groupby('item_id')['item_cnt_block'].sum().sum()
training['item_share'] = training.groupby('item_id')['item_cnt_block'].transform(np.sum) *100 / total_sum_items

training['comp1'] = training['shop_share'] * training['item_share']

In [62]:
training['shop_block_units_lag_comp1'] = pd.to_numeric(training['shop_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

#training['shop_share_item_units_comp'] = training['item_units'] * training['shop_share_of_units']
training['item_block_units_lag_comp1'] = pd.to_numeric(training['item_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

In [63]:
training['blocks_without_sales'] = training['item_id'].map(training[training['item_cnt_block'] == 0].groupby(['item_id'])['date_block_num'].unique().apply(lambda x: len(x)))

In [158]:
sales_train['item_days_of_activity'] = pd.to_numeric(sales_train.groupby(['item_id'])['date'].transform("nunique"), downcast='unsigned') 
sales_train['item_blocks_of_activity'] = pd.to_numeric(sales_train.groupby(['item_id'])['date_block_num'].transform("nunique"), downcast='unsigned') 

def get_number_of_days_since_start(day,month, year):
    days = 0
    if year == 2015:
        days = 365
    def is_even(num):
        return num % 2 == 0
    half_of_month = int(month/2)
    even = (30*half_of_month) + (31*half_of_month)
    if is_even(month):
        days = days + even - 30 - day
    else:
        days = days + even + day
    return days

sales_train['item_days_since_start'] = pd.to_numeric(sales_train.apply(lambda row: get_number_of_days_since_start(row['day'],row['month'], row['year']),axis=1), downcast='unsigned') 

def get_average_days_between_sales(days):
    days = sorted(np.unique(days))
    if len(days) == 0:
        return 9999
    if len(days) == 1:
        return 999
    return np.mean(np.ediff1d(days)) / len(days)

average_days_between_sales = sales_train.groupby(['item_id'])['item_days_since_start'].apply(list).apply(lambda x: get_average_days_between_sales(x))

sales_train['item_mean_day_between_activity'] = pd.to_numeric(sales_train['item_id'].map(average_days_between_sales), downcast='unsigned')

training['item_mean_day_between_activity'] = training['item_id'].map(sales_train.drop_duplicates('item_id').set_index('item_id')['item_mean_day_between_activity'])

In [22]:

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_block,item_category_id,month,item_id_mean_encoding,shop_id_mean_encoding,item_category_id_mean_encoding,month_mean_encoding,date_block_num_mean_encoding,item_block_units,item_block_mean,shop_block_units,shop_block_mean,cat_block_units,cat_block_mean,shop_cat_block_units,shop_cat_block_mean,shop_item_block_units,shop_item_block_mean,item_units,cat_units,shop_units,item_share_of_total_units,category_share_of_total_units,shop_share_of_units,shop_item_units,shop_item_share_of_total_units,shop_item_share_of_shop_units,item_share_of_shop_units,shop_item_share_of_shop_units_mean,item_max_units_block,item_min_units_block,item_minmax_q0.25,item_minmax_q0.5,item_minmax_q0.75,shop_max_units_block,shop_min_units_block,shop_minmax_q0.25,shop_minmax_q0.5,shop_minmax_q0.75,cat_max_units_block,cat_min_units_block,cat_minmax_q0.25,cat_minmax_q0.5,cat_minmax_q0.75,shop_cat_units,shop_cat_max_units_block,shop_cat_min_units_block,shop_cat_minmax_q0.25,shop_cat_minmax_q0.5,shop_cat_minmax_q0.75,shop_item_max_units_block,shop_item_min_units_block,shop_item_minmax_q0.25,shop_item_minmax_q0.5,shop_item_minmax_q0.75,item_block_units_rolling_3,item_block_mean_rolling_3,shop_block_units_rolling_3,shop_block_mean_rolling_3,cat_block_units_rolling_3,cat_block_mean_rolling_3,shop_cat_block_units_rolling_3,shop_cat_block_mean_rolling_3,shop_item_block_mean_rolling_3,item_block_units_lag_1,item_block_mean_lag_1,shop_block_units_lag_1,shop_block_mean_lag_1,cat_block_units_lag_1,cat_block_mean_lag_1,shop_cat_block_units_lag_1,shop_cat_block_mean_lag_1,shop_item_block_units_lag_1,shop_item_block_mean_lag_1,shop_block_units_lag_comp1,item_block_units_lag_comp1
1738857,14627,37,15,1,55,4,0.192308,0.293685,0.313755,0.405477,0.472121,9,0.183673,413,0.242798,5821,0.327261,113,0.311295,1,1,1935990,218971298,67206510,92.832176,202.492828,1163.133545,39510,1.894534,0.058789,0.058789,0.043519,20,3,7.25,11.5,15.75,1356,413,648.75,884.5,1120.25,11613,4757,6471.0,8185.0,9899.0,4468802,252,88,129.0,170.0,211.0,1,0,0.25,0.5,0.75,9.333333,0.200181,542.0,0.344254,5858.333496,0.360054,113.333336,0.3248,0.0,9,0.1875,602,0.371376,6710,0.38831,143,0.397222,0,0.0,35.390945,0.529101
2529662,21623,16,29,0,38,6,0.136442,0.375315,0.22413,0.436372,0.37431,1,0.023256,925,0.29468,508,0.137372,11,0.127907,0,0,2177047,378130476,158924431,104.391045,1655.887817,1442.138672,50629,2.427699,0.031857,0.031857,0.046519,29,1,8.0,15.0,22.0,1823,536,857.75,1179.5,1501.25,1891,374,753.25,1132.5,1511.75,8793732,23,1,6.5,12.0,17.5,1,0,0.25,0.5,0.75,1.0,0.02176,901.666687,0.303593,963.0,0.250158,9.0,0.106647,0.0,1,0.022727,933,0.306807,739,0.195296,9,0.104651,0,0.0,29.722841,0.031857
2098437,17448,11,13,1,40,2,0.069091,0.167488,0.404183,0.513463,0.61695,2,0.043478,399,0.253978,3834,0.490281,28,0.164706,1,1,2049254,175652748,69986479,98.263275,184.804901,1296.435181,44549,2.136158,0.063654,0.063654,0.058954,6,2,3.0,4.0,5.0,734,311,416.75,522.5,628.25,9071,3834,5143.25,6452.5,7761.75,3818538,68,21,32.75,44.5,56.25,1,0,0.25,0.5,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.130435,323,0.211664,4900,0.700801,21,0.138158,0,0.0,20.560152,0.381922
176476,1725,44,28,2,55,5,0.539216,0.268486,0.314711,0.426522,0.368951,33,0.75,636,0.209142,5793,0.246553,89,0.166667,2,2,2188736,371728324,151271504,104.951538,1348.89978,1075.175049,49744,2.385262,0.032884,0.032884,0.029407,46,9,18.25,27.5,36.75,1106,539,680.75,822.5,964.25,11613,4757,6471.0,8185.0,9899.0,8448371,158,73,94.25,115.5,136.75,2,0,0.5,1.0,1.5,0.0,0.0,683.666687,0.238088,6997.0,0.297533,113.666664,0.224893,0.0,0,0.0,677,0.228331,6365,0.262962,122,0.236893,0,0.0,22.262413,0.0
583019,4679,52,15,0,55,4,0.318996,0.327244,0.313755,0.405477,0.472121,15,0.306122,587,0.345091,5821,0.327261,76,0.209366,0,0,1935990,218971298,67206510,92.832176,202.492828,1163.133545,39510,1.894534,0.058789,0.058789,0.043519,30,7,12.75,18.5,24.25,1600,587,840.25,1093.5,1346.75,11613,4757,6471.0,8185.0,9899.0,4468802,183,64,93.75,123.5,153.25,1,0,0.25,0.5,0.75,24.333334,0.521437,720.333313,0.457591,5858.333496,0.360054,91.0,0.260325,0.0,25,0.520833,789,0.486737,6710,0.38831,119,0.330556,0,0.0,46.38448,1.469724
556786,4548,27,18,0,75,7,0.159367,0.958422,0.67309,0.412353,0.489785,14,0.28,1978,1.026466,764,0.6112,24,0.96,0,0,2371450,309332150,91395683,113.712814,416.425415,263.553497,47429,2.274256,0.051894,0.051894,0.043519,14,3,5.75,8.5,11.25,4617,0,1154.25,2308.5,3462.75,1382,539,749.75,960.5,1171.25,6186643,63,0,15.75,31.5,47.25,2,0,0.5,1.0,1.5,8.666667,0.176871,1871.333374,1.062448,796.666687,0.686693,24.666666,1.043478,0.0,9,0.183673,1992,1.090312,751,0.666371,26,1.130435,0,0.0,103.373116,0.467047
2029821,16556,20,21,0,37,10,0.267677,0.484569,0.280191,0.400715,0.501584,2,0.038462,1287,0.558594,2423,0.279019,0,0.0,0,0,3112980,382022004,137928960,149.26973,1842.489502,435.390015,59865,2.870572,0.043403,0.043403,0.042499,90,2,24.0,46.0,68.0,1851,1287,1428.0,1569.0,1710.0,3796,1376,1981.0,2586.0,3191.0,7346577,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,8.0,0.158562,0.0,0.0,2232.333252,0.260857,0.0,0.0,0.0,4,0.08,0,0.0,2161,0.252749,0,0.0,0,0.0,0.0,0.173611
2016587,16362,3,19,0,57,8,0.093575,0.22094,0.111541,0.458883,0.555135,6,0.117647,615,0.30597,590,0.137722,0,0.0,0,0,2909805,378932448,114680550,139.527344,1694.342896,1380.080444,57055,2.73583,0.049751,0.049751,0.04189,9,1,3.0,5.0,7.0,933,343,490.5,638.0,785.5,656,302,390.5,479.0,567.5,7430048,3,0,0.75,1.5,2.25,0,0,0.0,0.0,0.0,5.666667,0.114422,461.333344,0.251649,541.0,0.133915,0.0,0.0,0.0,9,0.18,468,0.242865,539,0.126824,0,0.0,0,0.0,23.283583,0.447761
1582259,13315,42,31,1,47,8,0.04902,0.921651,0.363464,0.460347,0.38923,2,0.047619,3069,0.888021,1116,0.324042,145,1.768293,1,1,2373336,435573768,195291648,113.803246,291.39856,1126.504883,56508,2.709601,0.028935,0.028935,0.027142,6,1,2.25,3.5,4.75,4116,1392,2073.0,2754.0,3435.0,3049,18,775.75,1533.5,2291.25,10370804,221,0,55.25,110.5,165.75,1,0,0.25,0.5,0.75,0.0,0.0,2624.333252,0.829777,296.666656,0.213523,32.666668,0.855807,0.0,0,0.0,2702,0.815821,628,0.310737,84,1.787234,0,0.0,78.182869,0.0
1921952,16020,38,30,0,65,7,0.550777,0.41316,0.77496,0.413783,0.361932,18,0.418605,1220,0.368357,1737,0.594049,29,0.426471,0,0,2212522,407190478,170415648,106.092094,989.867615,1993.151123,51454,2.467258,0.030193,0.030193,0.043519,78,9,26.25,43.5,60.75,1979,597,942.5,1288.0,1633.5,3557,882,1550.75,2219.5,2888.25,9469546,56,6,18.5,31.0,43.5,2,0,0.5,1.0,1.5,17.0,0.382281,1001.666687,0.328369,1540.666626,0.541828,32.0,0.498428,0.0,18,0.418605,1048,0.333864,1797,0.614569,36,0.529412,0,0.0,31.642513,0.543478


In [66]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'item_id_mean_encoding',
       'shop_id_mean_encoding', 'item_category_id_mean_encoding',
       'month_mean_encoding', 'date_block_num_mean_encoding',
       'item_block_units', 'item_block_median', 'item_block_mean',
       'item_block_max', 'item_block_min', 'item_block_std',
       'shop_block_units', 'shop_block_median', 'shop_block_mean',
       'shop_block_max', 'shop_block_min', 'shop_block_std',
       'cat_block_units', 'cat_block_median', 'cat_block_mean',
       'cat_block_max', 'cat_block_min', 'cat_block_std',
       'shop_cat_block_units', 'shop_cat_block_median',
       'shop_cat_block_mean', 'shop_cat_block_max', 'shop_cat_block_min',
       'shop_cat_block_std', 'shop_item_block_units',
       'shop_item_block_median', 'shop_item_block_mean',
       'shop_item_block_max', 'shop_item_block_min',
       'shop_item_block_std', 'item_units', 'cat_units', 'shop_units',
   

In [159]:
gc.collect()

ZEROS_KEEP=0.2


#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['item_cnt_block']





x_val = training[training['date_block_num'] == 33]
y_val = x_val['item_cnt_block']

pos_val_len = len(y_val[y_val != 0])
print("pos_val_len", pos_val_len)

zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
print("zeros_keep_indices_val", len(zeros_keep_indices_val))
non_zeros_val_indices = y_val[y_val != 0].index
print("non_zeros_val_indices", len(non_zeros_val_indices))

val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

y_val = y_val.loc[val_indices]
x_val = x_val.loc[val_indices]



pos_val_len 29202
zeros_keep_indices_val 146010
non_zeros_val_indices 29202


In [25]:
features = [
    
'comp1', 'shop_block_units_lag_comp1',
       'item_block_units_lag_comp1', 'blocks_without_sales'


]

In [192]:

features = [
    
    
        'item_category_id',
       'item_block_mean_rolling_3',

       'shop_block_mean_rolling_3',

           'shop_cat_block_mean_rolling_3',
               'shop_cat_block_median_rolling_3',

      'item_block_mean_lag_1',

        'shop_block_mean_lag_1',

            'shop_cat_block_mean_lag_1',
               # 'shop_cat_block_median_lag_1',


    
    'shop_item_share_of_shop_units_mean',
    'shop_item_block_mean_rolling_3',

    'shop_item_block_mean_lag_1',

    
#'item_id_mean_encoding',
       #'shop_id_mean_encoding',
    'item_category_id_mean_encoding',  
    #'month_mean_encoding', 'date_block_num_mean_encoding'
    
    'shop_share',
    
    #'item_mean_day_between_activity',
    #'comp1'

]




In [193]:
cb_model = CatBoostRegressor(iterations=1000,
                             #learning_rate=0.05,
                             eval_metric='RMSE',
                             task_type = "GPU",
                             use_best_model=True,
                             od_type = "Iter",
                             od_wait = 2,
                             bagging_temperature = 30,
                             cat_features=[0],
                             random_seed = 42)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


cb_model.fit(x_train[features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

scores = {}
for i,score in enumerate(cb_model.get_feature_importance()):
    scores[features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

0:	learn: 1.6488486	test: 1.3259364	best: 1.3259364 (0)	total: 131ms	remaining: 2m 10s
1:	learn: 1.6348505	test: 1.3167626	best: 1.3167626 (1)	total: 224ms	remaining: 1m 51s
2:	learn: 1.6203359	test: 1.3079481	best: 1.3079481 (2)	total: 310ms	remaining: 1m 42s
3:	learn: 1.6055051	test: 1.2989143	best: 1.2989143 (3)	total: 395ms	remaining: 1m 38s
4:	learn: 1.5917240	test: 1.2898127	best: 1.2898127 (4)	total: 468ms	remaining: 1m 33s
5:	learn: 1.5797295	test: 1.2821910	best: 1.2821910 (5)	total: 560ms	remaining: 1m 32s
6:	learn: 1.5637565	test: 1.2715970	best: 1.2715970 (6)	total: 638ms	remaining: 1m 30s
7:	learn: 1.5518550	test: 1.2636233	best: 1.2636233 (7)	total: 719ms	remaining: 1m 29s
8:	learn: 1.5419327	test: 1.2578644	best: 1.2578644 (8)	total: 801ms	remaining: 1m 28s
9:	learn: 1.5293800	test: 1.2499690	best: 1.2499690 (9)	total: 889ms	remaining: 1m 28s
10:	learn: 1.5186718	test: 1.2442982	best: 1.2442982 (10)	total: 1.01s	remaining: 1m 30s
11:	learn: 1.5073096	test: 1.2357557	best

94:	learn: 1.2449306	test: 1.0838364	best: 1.0838364 (94)	total: 8.33s	remaining: 1m 19s
95:	learn: 1.2444544	test: 1.0836091	best: 1.0836091 (95)	total: 8.42s	remaining: 1m 19s
96:	learn: 1.2440656	test: 1.0835302	best: 1.0835302 (96)	total: 8.49s	remaining: 1m 19s
97:	learn: 1.2431925	test: 1.0832157	best: 1.0832157 (97)	total: 8.58s	remaining: 1m 18s
98:	learn: 1.2430495	test: 1.0831443	best: 1.0831443 (98)	total: 8.65s	remaining: 1m 18s
99:	learn: 1.2420874	test: 1.0827864	best: 1.0827864 (99)	total: 8.72s	remaining: 1m 18s
100:	learn: 1.2414657	test: 1.0826502	best: 1.0826502 (100)	total: 8.81s	remaining: 1m 18s
101:	learn: 1.2412045	test: 1.0824890	best: 1.0824890 (101)	total: 8.9s	remaining: 1m 18s
102:	learn: 1.2406852	test: 1.0821593	best: 1.0821593 (102)	total: 8.98s	remaining: 1m 18s
103:	learn: 1.2402823	test: 1.0818846	best: 1.0818846 (103)	total: 9.07s	remaining: 1m 18s
104:	learn: 1.2391301	test: 1.0813016	best: 1.0813016 (104)	total: 9.17s	remaining: 1m 18s
105:	learn: 

186:	learn: 1.1922892	test: 1.0535461	best: 1.0535461 (186)	total: 16.2s	remaining: 1m 10s
187:	learn: 1.1921191	test: 1.0535493	best: 1.0535461 (186)	total: 16.3s	remaining: 1m 10s
188:	learn: 1.1917403	test: 1.0532634	best: 1.0532634 (188)	total: 16.4s	remaining: 1m 10s
189:	learn: 1.1908273	test: 1.0528625	best: 1.0528625 (189)	total: 16.5s	remaining: 1m 10s
190:	learn: 1.1906755	test: 1.0527937	best: 1.0527937 (190)	total: 16.6s	remaining: 1m 10s
191:	learn: 1.1901603	test: 1.0525523	best: 1.0525523 (191)	total: 16.7s	remaining: 1m 10s
192:	learn: 1.1895670	test: 1.0522533	best: 1.0522533 (192)	total: 16.8s	remaining: 1m 10s
193:	learn: 1.1892668	test: 1.0521135	best: 1.0521135 (193)	total: 16.9s	remaining: 1m 10s
194:	learn: 1.1889321	test: 1.0519752	best: 1.0519752 (194)	total: 16.9s	remaining: 1m 9s
195:	learn: 1.1888502	test: 1.0519610	best: 1.0519610 (195)	total: 17s	remaining: 1m 9s
196:	learn: 1.1887666	test: 1.0519112	best: 1.0519112 (196)	total: 17.1s	remaining: 1m 9s
197:

[('shop_item_block_mean_lag_1', 25.353704635856005),
 ('item_block_mean_lag_1', 22.079856028168756),
 ('shop_item_block_mean_rolling_3', 8.939140133951609),
 ('shop_cat_block_mean_lag_1', 7.168816540882907),
 ('item_block_mean_rolling_3', 6.77270149145099),
 ('item_category_id_mean_encoding', 5.52645463218295),
 ('shop_cat_block_mean_rolling_3', 5.434471283406793),
 ('item_category_id', 4.621790508290209),
 ('shop_item_share_of_shop_units_mean', 3.8453033244605077),
 ('shop_share', 3.1918332077563094),
 ('shop_cat_block_median_rolling_3', 2.515872330575838),
 ('shop_block_mean_lag_1', 2.3666666642567655),
 ('shop_block_mean_rolling_3', 2.1833892187603747)]

In [168]:
features = [item[0] for item in scores.items() if item[1] > 2000]

In [198]:
#test            = pd.read_csv('test.csv.gz')
test = test.set_index('item_id').join(items.set_index('item_id'))
test.reset_index(inplace=True)

ValueError: columns overlap but no suffix specified: Index(['item_category_id'], dtype='object')

In [199]:
item_features = [ 
    'shop_item_share_of_shop_units_mean','item_id_mean_encoding'
                ]

merge_col = ['item_id']
cols=item_features+merge_col

test = test.merge(training.drop_duplicates('item_id')[cols], on=merge_col, how='left')

In [200]:
shop_features = [
        'shop_id_mean_encoding','shop_share'
]

merge_col = ['shop_id']
cols=shop_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [209]:
cat_features = [
        'item_category_id_mean_encoding'#,'cat_me_real'
]

merge_col = ['item_category_id']
cols=cat_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [211]:
shop_item_features = [
        'shop_item_share_of_shop_units_mean'#,'cat_me_real'
]

merge_col = ['shop_id','item_id']
cols=shop_item_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [202]:
def add_rolls_test(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name})\
            [cols+[roll_name]]
        
        print([cols[0:len(cols)-1]+[roll_name]])
        thirty_three = block_units_rolling_temp[block_units_rolling_temp['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1]+[roll_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')
    

        del block_units_rolling_temp
        gc.collect()
        

    
    return df
    

test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_rolls_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')



item_block_mean 3
[['item_id', 'item_block_mean_rolling_3']]
shop_block_mean 3
[['shop_id', 'shop_block_mean_rolling_3']]
shop_cat_block_mean 3
[['shop_id', 'item_category_id', 'shop_cat_block_mean_rolling_3']]
shop_cat_block_median 3
[['shop_id', 'item_category_id', 'shop_cat_block_median_rolling_3']]


In [203]:
test = add_rolls_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3
[['shop_id', 'item_id', 'shop_item_block_mean_rolling_3']]


In [204]:
def add_lags_test(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()
        
        thirty_three = result[result['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1] + [lag_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')

        gc.collect()
    
    return df
                                         

                                        
test = add_lags_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_lags_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_lags_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')


item_block_mean 1
shop_block_mean 1
shop_cat_block_mean 1


In [205]:
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 1


In [206]:
test.fillna(0, inplace=True)

In [207]:
test.sample(10)

Unnamed: 0,item_id,ID,shop_id,item_category_id,shop_item_share_of_shop_units_mean_x,item_id_mean_encoding_x,shop_id_mean_encoding_x,item_category_id_mean_encoding_x,shop_item_share_of_shop_units_mean_y,item_id_mean_encoding_y,shop_id_mean_encoding_y,shop_share,item_category_id_mean_encoding_y,item_block_mean_rolling_3,shop_block_mean_rolling_3,shop_cat_block_mean_rolling_3,shop_cat_block_median_rolling_3,shop_item_block_mean_rolling_3,item_block_mean_lag_1,shop_block_mean_lag_1,shop_cat_block_mean_lag_1,shop_item_block_mean_lag_1
98079,10333,47001,31,37,0.034281,0.291262,1.688803,0.281616,0.034281,0.291262,1.688803,7.735218,0.281616,0.450149,1.29107,1.266054,1.0,1.333333,0.372093,1.304982,1.171717,1.0
72357,7452,171936,37,55,0.032415,0.216216,0.29564,0.314336,0.032415,0.216216,0.29564,1.345255,0.314336,0.170442,0.252457,0.190057,0.0,0.0,0.069767,0.249656,0.166667,0.0
140858,14333,168002,36,41,0.044582,0.089903,0.08209,0.237905,0.044582,0.089903,0.08209,0.026622,0.237905,0.053735,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0
8513,1184,152473,59,76,0.0,0.0,0.323335,0.076896,0.0,0.0,0.323335,1.463167,0.076896,0.0,0.245579,0.0,0.0,0.0,0.0,0.235893,0.0,0.0
151297,15321,71158,24,63,0.043519,0.180117,0.423623,0.435298,0.043519,0.180117,0.423623,1.922239,0.435298,0.061856,0.322687,0.192069,0.0,0.0,0.069767,0.360033,0.303922,0.0
151212,15318,64660,22,63,0.042331,0.083333,0.369496,0.435298,0.042331,0.083333,0.369496,1.687157,0.435298,0.069969,0.290314,0.136889,0.0,0.0,0.071429,0.311313,0.078431,0.0
81399,8472,17657,3,43,0.033623,0.123077,0.223541,0.102843,0.033623,0.123077,0.223541,1.028662,0.102843,0.046352,0.164302,0.0,0.0,0.0,0.047619,0.165428,0.0,0.0
112542,11915,125400,52,40,0.043858,0.096855,0.325926,0.405424,0.043858,0.096855,0.325926,1.489296,0.405424,0.062225,0.244885,0.195038,0.0,0.0,0.023256,0.279383,0.233202,0.0
121174,12857,23695,2,40,0.033727,0.0,0.251428,0.405424,0.033727,0.0,0.251428,1.162267,0.405424,0.0,0.213751,0.114837,0.0,0.0,0.0,0.210845,0.083004,0.0
187303,19094,129505,47,37,0.045575,0.168044,0.548924,0.281616,0.045575,0.168044,0.548924,2.484515,0.281616,0.113813,0.46235,0.208016,0.0,0.0,0.023256,0.466006,0.191919,0.0


In [212]:
cb_preds = cb_model.predict(test[features])
cb_preds.clip(0,20,out=cb_preds)

array([0.09901975, 0.08508496, 0.22039503, ..., 0.11771626, 0.13681778,
       0.12052681])

In [213]:
print(np.mean(cb_preds))
print(np.max(cb_preds))

0.37436667237762894
13.4131583307244


In [None]:
cb_preds[0:100]

In [214]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = cb_preds

submission.to_csv('submission.csv', index=False)

In [None]:
training['shop_me_real']= training.groupby('shop_id')['shop_me'].transform(np.mean)
training['item_me_real']= training.groupby('item_id')['item_me'].transform(np.mean)
training['cat_me_real']= training.groupby('item_category_id')['item_me'].transform(np.mean)

In [None]:
training.groupby('item_id')['shop_item_share_of_shop_units'].transform(np.mean)

