In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd
from sklearn.model_selection import KFold
from itertools import product

In [3]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [4]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'] != 2013]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [5]:
# Якутск Орджоникидзе, 56
sales_train.loc[sales_train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
sales_train.loc[sales_train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
sales_train.loc[sales_train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

In [6]:
sums = sales_train.groupby('item_id')['item_cnt_day'].sum().reset_index().rename(columns={"item_cnt_day":"item_total_sales"}).sort_values(by='item_total_sales')

ids_reject = sums[(sums['item_total_sales'] > 0) & (sums['item_total_sales'] < 1000)]['item_id'].unique()

In [7]:
train_item_ids = sales_train['item_id'].unique()
train_item_ids = np.setdiff1d(train_item_ids, ids_reject)
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [8]:
combinations = []

for dbn in range(np.min(train_blocks), np.max(train_blocks)+1):
    sales = sales_train[sales_train.date_block_num==dbn]
    item_ids = np.intersect1d(sales.item_id.unique(), test_item_ids)
    dbn_combos = list(product(sales.shop_id.unique(), item_ids, [dbn]))
    for combo in dbn_combos:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])

In [9]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"item_cnt_block"})

training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)


training['item_cnt_block'] = training['item_cnt_block'].clip(0,20).astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

for col in ['item_id', 'shop_id', 'item_category_id']:
    training[col] = pd.to_numeric(training[col], downcast='unsigned')

In [10]:
dates = sales_train[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
training['month'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')


In [11]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
#columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]
columns = ["item_id", "shop_id", "item_category_id", "month",  "date_block_num"]



y_train = training["item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1

fold 1
fold 2
fold 3
fold 4
fold 5


In [12]:
cols = ['item_id','date_block_num']

training.groupby(cols,as_index=False)['item_cnt_block'].mean().sample(10)

Unnamed: 0,item_id,date_block_num,item_cnt_block
5858,2635,32,0.255814
29714,11655,26,1.23913
18564,7007,32,0.046512
18613,7019,31,0.380952
52793,20717,31,0.261905
41385,16046,33,0.295455
11450,4364,32,0.116279
41371,16045,27,0.042553
38581,15193,15,0.061224
31935,12563,28,0.113636


In [13]:
def add_block_units_stats(df, cols, name):
    print(name)
    name_units = name + '_units'
    name_mean = name + '_mean'
    name_median = name + '_median'
    name_max = name + '_max'
    name_min = name + '_min'
    name_std = name + '_std'
    
    
    try:
        df.drop(columns=[name_units, name_mean, name_median],inplace=True)
    except:
        pass

    
    block_units = df.groupby(cols,as_index=False)['item_cnt_block'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_units})
    df = df.merge(block_units, on=cols, how='left')
    df[name_units].fillna(0,inplace=True)
    df[name_units] = pd.to_numeric(df[name_units].astype(int),downcast='unsigned')
    del block_units
    
    block_units_med = df.groupby(cols,as_index=False)['item_cnt_block'].median()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_median})
    df = df.merge(block_units_med, on=cols, how='left')
    df[name_median].fillna(0,inplace=True)
    df[name_median] = pd.to_numeric(df[name_median].astype(int),downcast='unsigned')
    del block_units_med
    
    block_means = df.groupby(cols,as_index=False)['item_cnt_block'].mean()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_mean})
    df = df.merge(block_means, on=cols, how='left')
    df[name_mean].fillna(0,inplace=True)
    df[name_mean] = pd.to_numeric(df[name_mean],downcast='float')
    del block_means
    
    block_max = df.groupby(cols,as_index=False)['item_cnt_block'].max()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_max})
    df = df.merge(block_max, on=cols, how='left')
    df[name_max].fillna(0,inplace=True)
    df[name_max] = pd.to_numeric(df[name_max],downcast='float')
    del block_max
    
    block_min = df.groupby(cols,as_index=False)['item_cnt_block'].min()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_min})
    df = df.merge(block_min, on=cols, how='left')
    df[name_min].fillna(0,inplace=True)
    df[name_min] = pd.to_numeric(df[name_min],downcast='float')
    del block_min
    
    block_std = df.groupby(cols,as_index=False)['item_cnt_block'].std()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_std})
    df = df.merge(block_std, on=cols, how='left')
    df[name_std].fillna(0,inplace=True)
    df[name_std] = pd.to_numeric(df[name_std],downcast='float')
    del block_std
    
    gc.collect()
    return df


training = add_block_units_stats(training, ['item_id','date_block_num'], 'item_block')
training = add_block_units_stats(training, ['shop_id','date_block_num'], 'shop_block')
training = add_block_units_stats(training, ['item_category_id','date_block_num'], 'cat_block')
training = add_block_units_stats(training, ['shop_id', 'item_category_id','date_block_num'], 'shop_cat_block')
training = add_block_units_stats(training, ['shop_id', 'item_id','date_block_num'], 'shop_item_block')

item_block




shop_block
cat_block
shop_cat_block
shop_item_block


In [14]:
number_of_items = sales_train['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = sales_train['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = sales_train['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = sales_train['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = sales_train['item_cnt_day'].sum()
print("total_sales:", total_sales)
average_price = sales_train['item_price'].mean()
print("average_price:", average_price)

training['item_units'] = pd.to_numeric(training.groupby(['date_block_num'])['item_block_units'].transform(np.sum),downcast='unsigned')
training['cat_units'] = pd.to_numeric(training.groupby(['date_block_num'])['cat_block_units'].transform(np.sum),downcast='unsigned')
training['shop_units'] = pd.to_numeric(training.groupby(['date_block_num'])['shop_block_units'].transform(np.sum),downcast='unsigned')

training['item_share_of_total_units'] = pd.to_numeric(training['item_units'] * 100 / total_sales,downcast='float')
training['category_share_of_total_units'] = pd.to_numeric(training['cat_units'] * 100 / total_sales,downcast='float')
training['shop_share_of_units'] = pd.to_numeric(training['shop_units'] * 100 / total_sales,downcast='float')
training['shop_item_units'] = pd.to_numeric(training.groupby(['date_block_num'])\
                                            ['shop_item_block_units'].transform(np.sum),downcast='unsigned')

training['shop_item_share_of_total_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / total_sales,downcast='float')
training['shop_item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / training['shop_units'],downcast='float')


training['item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100 / training['shop_units'],downcast='float')

training['shop_item_share_of_shop_units_mean'] = training.groupby('item_id')['shop_item_share_of_shop_units'].transform(np.mean)


number_of_items: 17054
number_of_categories: 79
number_of_shops: 54
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473.0
average_price: 1015.5023073772021


In [15]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
#columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]
columns = ["item_id", "shop_id", "item_category_id", "month",  "date_block_num"]



y_train = training["item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1


training.fillna(0,inplace=True)

fold 1
fold 2
fold 3
fold 4
fold 5


In [16]:
def add_min_max_quantiles(df, cols, name):
    print(name)

    block_name = name+'_block_units'
    units_name = name+'_units'
    max_name = name+'_max_units_block'
    min_name = name+'_min_units_block'
    
    try:
        df.drop(columns=[units_name, max_name, min_name, min_max_name],inplace=True)
    except:
        pass


    df[units_name] = pd.to_numeric(df.groupby(['date_block_num'])[block_name].transform(np.sum), downcast='unsigned')
    df[max_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.max), downcast='unsigned')
    df[min_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.min), downcast='unsigned')
    


    for q in [0.25,0.50,0.75]:
        qname = name+'_minmax_q' + str(q)
        try:
            df.drop(columns=[qname],inplace=True)
        except:
            pass
        df[qname] =  pd.to_numeric(df[[min_name,max_name]].quantile(q,axis=1), downcast='unsigned')
        
    return df

training = add_min_max_quantiles(training, ['item_id'], 'item')
training = add_min_max_quantiles(training, ['shop_id'], 'shop')
training = add_min_max_quantiles(training, ['item_category_id'], 'cat')
training = add_min_max_quantiles(training, ['shop_id','item_category_id'], 'shop_cat')
training = add_min_max_quantiles(training, ['shop_id','item_id'], 'shop_item')

item
shop
cat
shop_cat
shop_item


In [17]:
def add_rolls(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name_tmp})\
            [cols+[roll_name_tmp]]
        
    
        df = df.merge(block_units_rolling_temp, on=cols, how='left')
        #print(df.columns.values)
        del block_units_rolling_temp
        gc.collect()
        

        block_units_rolling = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [roll_name_tmp].shift(1)\
            .rename(columns={roll_name_tmp:roll_name}).reset_index()

        df = df.merge(block_units_rolling, on=cols, how='left')
        df[roll_name].fillna(0,inplace=True)
        df[roll_name] = pd.to_numeric(df[roll_name], downcast='float')
        df.drop(columns=[roll_name_tmp], inplace=True)
        del block_units_rolling
        gc.collect()
    
    return df
    

training = add_rolls(training, ['item_id','date_block_num'], 'item_block_units')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_median')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_min')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_max')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_std')

training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_median')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_min')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_max')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_std')

training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_median')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_min')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_max')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_std')

training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_min')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_max')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_std')
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item')

item_block_units 3
item_block_mean 3
item_block_median 3
item_block_min 3
item_block_max 3
item_block_std 3
shop_block_units 3
shop_block_mean 3
shop_block_median 3
shop_block_min 3
shop_block_max 3
shop_block_std 3
cat_block_units 3
cat_block_mean 3
cat_block_median 3
cat_block_min 3
cat_block_max 3
cat_block_std 3
shop_cat_block_units 3
shop_cat_block_mean 3
shop_cat_block_median 3
shop_cat_block_min 3
shop_cat_block_max 3
shop_cat_block_std 3


In [18]:
training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')


shop_item_block_mean 3


In [19]:
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

training['block_total'] = training.groupby(['date_block_num'])['item_cnt_block'].transform(np.sum)

training['item_share_block'] = training['item_block_units'] * 100 / training['block_total']
training['shop_share_block'] = training['shop_block_units'] * 100 / training['block_total']
training['comp2'] = training['item_share_block'] * training['shop_share_block']

In [21]:
def add_lags(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()

        df = df.merge(result, on=cols, how='left')
        df[lag_name].fillna(0,inplace=True)
        if "mean" in name:
            df[lag_name] = pd.to_numeric(df[lag_name], downcast='float')
        else:
            df[lag_name] = pd.to_numeric(df[lag_name].astype(int), downcast='unsigned')
        del result
        gc.collect()
    
    return df
                                         

                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_block_units')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_median')                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_block_min')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_max')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_std')

training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_median')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_min')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_max')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_std')

training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_median')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_min')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_max')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_std')

training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_min')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_max')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_std')

training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_median')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_min')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_max')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_std')

item_block_units 1
item_block_mean 1
item_block_median 1
item_block_min 1
item_block_max 1
item_block_std 1
shop_block_units 1
shop_block_mean 1
shop_block_median 1
shop_block_min 1
shop_block_max 1
shop_block_std 1
cat_block_units 1
cat_block_mean 1
cat_block_median 1
cat_block_min 1
cat_block_max 1
cat_block_std 1
shop_cat_block_units 1
shop_cat_block_mean 1
shop_cat_block_median 1
shop_cat_block_min 1
shop_cat_block_max 1
shop_cat_block_std 1
shop_item_block_units 1
shop_item_block_mean 1
shop_item_block_median 1
shop_item_block_min 1
shop_item_block_max 1
shop_item_block_std 1


In [22]:
training = add_lags(training, ['item_id','date_block_num'], 'item_share_block')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_share_block')
training = add_lags(training, ['shop_id', 'item_id', 'date_block_num'], 'comp2')

item_share_block 1
shop_share_block 1
comp2 1


In [23]:
total_sum_shops = training.groupby('shop_id')['item_cnt_block'].sum().sum()
training['shop_share'] = training.groupby('shop_id')['item_cnt_block'].transform(np.sum) *100 / total_sum_shops

total_sum_items = training.groupby('item_id')['item_cnt_block'].sum().sum()
training['item_share'] = training.groupby('item_id')['item_cnt_block'].transform(np.sum) *100 / total_sum_items

training['comp1'] = training['shop_share'] * training['item_share']

In [24]:
training['shop_block_units_lag_comp1'] = pd.to_numeric(training['shop_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

#training['shop_share_item_units_comp'] = training['item_units'] * training['shop_share_of_units']
training['item_block_units_lag_comp1'] = pd.to_numeric(training['item_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

In [25]:
training['blocks_without_sales'] = training['item_id'].map(training[training['item_cnt_block'] == 0].groupby(['item_id'])['date_block_num'].unique().apply(lambda x: len(x)))

In [26]:
sales_train['item_days_of_activity'] = pd.to_numeric(sales_train.groupby(['item_id'])['date'].transform("nunique"), downcast='unsigned') 
sales_train['item_blocks_of_activity'] = pd.to_numeric(sales_train.groupby(['item_id'])['date_block_num'].transform("nunique"), downcast='unsigned') 

def get_number_of_days_since_start(day,month, year):
    days = 0
    if year == 2015:
        days = 365
    def is_even(num):
        return num % 2 == 0
    half_of_month = int(month/2)
    even = (30*half_of_month) + (31*half_of_month)
    if is_even(month):
        days = days + even - 30 - day
    else:
        days = days + even + day
    return days

sales_train['item_days_since_start'] = pd.to_numeric(sales_train.apply(lambda row: get_number_of_days_since_start(row['day'],row['month'], row['year']),axis=1), downcast='unsigned') 

def get_average_days_between_sales(days):
    days = sorted(np.unique(days))
    if len(days) == 0:
        return 9999
    if len(days) == 1:
        return 999
    return np.mean(np.ediff1d(days)) / len(days)

average_days_between_sales = sales_train.groupby(['item_id'])['item_days_since_start'].apply(list).apply(lambda x: get_average_days_between_sales(x))

sales_train['item_mean_day_between_activity'] = pd.to_numeric(sales_train['item_id'].map(average_days_between_sales), downcast='unsigned')

training['item_mean_day_between_activity'] = training['item_id'].map(sales_train.drop_duplicates('item_id').set_index('item_id')['item_mean_day_between_activity'])

In [27]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler,StandardScaler,MinMaxScaler


shop_cat_models = {}

for shop_id in all_shop_ids:
    shop_cat_models[shop_id] = {}
    for cat_id in training['item_category_id'].unique():
    
        shop_cat_data = training[(training['shop_id'] == shop_id) & (training['item_category_id'] == cat_id)].groupby(['date_block_num'],as_index=False)['item_cnt_block'].sum()
        if len(shop_cat_data) == 0:
            continue

        regr = linear_model.Ridge()

        X = shop_cat_data['date_block_num'].values.reshape(len(shop_cat_data),1)
        y = shop_cat_data['item_cnt_block'].values.reshape(len(shop_cat_data),1)
        
        #y = MinMaxScaler().fit_transform(y)

        # Train the model using the training sets
        regr.fit(X, y)
        shop_cat_models[shop_id][cat_id] = regr
            

    # Make predictions using the testing set
    #preds = regr.predict(X)


print("applying")

def predict(shop_id, cat_id, dbn):
    return shop_cat_models[shop_id][cat_id].predict([[dbn]])[0][0]

training['shop_cat_pred'] = training.apply(lambda row: predict(row['shop_id'],row['item_category_id'], row['date_block_num']), axis=1)

applying


In [28]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler,StandardScaler,MinMaxScaler


item_models = {}

for item_id in training['item_id'].unique():
    
    item_data = training[(training['item_id'] == item_id)].groupby(['date_block_num'],as_index=False)['item_cnt_block'].sum()
    if len(item_data) == 0:
        continue

    regr = linear_model.Ridge()

    X = item_data['date_block_num'].values.reshape(len(item_data),1)
    y = item_data['item_cnt_block'].values.reshape(len(item_data),1)

    #y = MinMaxScaler().fit_transform(y)

    # Train the model using the training sets
    regr.fit(X, y)
    item_models[item_id] = regr


    # Make predictions using the testing set
    #preds = regr.predict(X)


print("applying")

def predict(item_id, dbn):
    if item_id in item_models:
        return item_models[item_id].predict([[dbn]])[0][0]

training['item_pred'] = training.apply(lambda row: predict(row['item_id'], row['date_block_num']), axis=1)

applying


In [29]:

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_block,item_category_id,month,item_id_mean_encoding,shop_id_mean_encoding,item_category_id_mean_encoding,month_mean_encoding,date_block_num_mean_encoding,item_block_units,item_block_median,item_block_mean,item_block_max,item_block_min,item_block_std,shop_block_units,shop_block_median,shop_block_mean,shop_block_max,shop_block_min,shop_block_std,cat_block_units,cat_block_median,cat_block_mean,cat_block_max,cat_block_min,cat_block_std,shop_cat_block_units,shop_cat_block_median,shop_cat_block_mean,shop_cat_block_max,shop_cat_block_min,shop_cat_block_std,shop_item_block_units,shop_item_block_median,shop_item_block_mean,shop_item_block_max,shop_item_block_min,shop_item_block_std,item_units,cat_units,shop_units,item_share_of_total_units,category_share_of_total_units,shop_share_of_units,shop_item_units,shop_item_share_of_total_units,shop_item_share_of_shop_units,item_share_of_shop_units,shop_item_share_of_shop_units_mean,item_max_units_block,item_min_units_block,item_minmax_q0.25,item_minmax_q0.5,item_minmax_q0.75,shop_max_units_block,shop_min_units_block,shop_minmax_q0.25,shop_minmax_q0.5,shop_minmax_q0.75,cat_max_units_block,cat_min_units_block,cat_minmax_q0.25,cat_minmax_q0.5,cat_minmax_q0.75,shop_cat_units,shop_cat_max_units_block,shop_cat_min_units_block,shop_cat_minmax_q0.25,shop_cat_minmax_q0.5,shop_cat_minmax_q0.75,shop_item_max_units_block,shop_item_min_units_block,shop_item_minmax_q0.25,shop_item_minmax_q0.5,shop_item_minmax_q0.75,item_block_units_rolling_3,item_block_mean_rolling_3,item_block_median_rolling_3,item_block_min_rolling_3,item_block_max_rolling_3,item_block_std_rolling_3,shop_block_units_rolling_3,shop_block_mean_rolling_3,shop_block_median_rolling_3,shop_block_min_rolling_3,shop_block_max_rolling_3,shop_block_std_rolling_3,cat_block_units_rolling_3,cat_block_mean_rolling_3,cat_block_median_rolling_3,cat_block_min_rolling_3,cat_block_max_rolling_3,cat_block_std_rolling_3,shop_cat_block_units_rolling_3,shop_cat_block_mean_rolling_3,shop_cat_block_median_rolling_3,shop_cat_block_min_rolling_3,shop_cat_block_max_rolling_3,shop_cat_block_std_rolling_3,shop_item_block_mean_rolling_3,block_total,item_share_block,shop_share_block,comp2,item_block_units_lag_1,item_block_mean_lag_1,item_block_median_lag_1,item_block_min_lag_1,item_block_max_lag_1,item_block_std_lag_1,shop_block_units_lag_1,shop_block_mean_lag_1,shop_block_median_lag_1,shop_block_min_lag_1,shop_block_max_lag_1,shop_block_std_lag_1,cat_block_units_lag_1,cat_block_mean_lag_1,cat_block_median_lag_1,cat_block_min_lag_1,cat_block_max_lag_1,cat_block_std_lag_1,shop_cat_block_units_lag_1,shop_cat_block_mean_lag_1,shop_cat_block_median_lag_1,shop_cat_block_min_lag_1,shop_cat_block_max_lag_1,shop_cat_block_std_lag_1,shop_item_block_units_lag_1,shop_item_block_mean_lag_1,shop_item_block_median_lag_1,shop_item_block_min_lag_1,shop_item_block_max_lag_1,shop_item_block_std_lag_1,item_share_block_lag_1,shop_share_block_lag_1,comp2_lag_1,shop_share,item_share,comp1,shop_block_units_lag_comp1,item_block_units_lag_comp1,blocks_without_sales,item_mean_day_between_activity,shop_cat_pred,item_pred
2494448,21137,20,33,0,61,10,0.085106,0.484878,0.195796,0.401477,0.334405,1,0,0.022727,1,0,0.0,1851,0,0.463794,20,0,0.0,1365,0,0.260695,20,0,0.0,401,1,3.369748,20,0,0.0,0,0,0,0,0,0.0,2573868,530591380,233461527,123.418907,728.626282,897.310242,58497,2.804975,0.025056,0.025056,0.027142,4,1,1.75,2.5,3.25,1851,1287,1428.0,1569.0,1710.0,1365,1,342.0,683.0,1024.0,12058895,401,167,225.5,284.0,342.5,0,0,0.0,0.0,0.0,3.5,0.082226,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,541.666687,0.188718,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58497.0,0.001709,0.923603,0.001579,4,0.093023,0,0,4,0,1287,0.558594,0,0,20,0,491,0.139251,0,0,8,0,167,4.175,3,0,14,0,0,0.0,0,0,0,0,0,1,0,0.257843,0.000657,0.000169,32.247559,0.100226,3,1.833333,399.39726,2.0
1377987,11498,4,29,0,37,6,0.337821,0.290225,0.279504,0.438036,0.37348,3,0,0.069767,1,0,0.0,666,0,0.212169,20,0,0.0,2167,0,0.284719,20,0,0.0,22,0,0.124294,2,0,0.0,0,0,0,0,0,0.0,2177047,378130476,158924431,104.391045,1655.887817,1442.138672,50629,2.427699,0.031857,0.031857,0.043519,37,3,11.5,20.0,28.5,1438,522,751.0,980.0,1209.0,3796,1376,1981.0,2586.0,3191.0,8793732,36,8,15.0,22.0,29.0,2,0,0.5,1.0,1.5,8.333333,0.181265,0.0,0.0,1.666667,0.0,681.333313,0.229463,0.0,0.0,20.0,0.0,2134.0,0.279663,0.0,0.0,19.666666,0.0,17.0,0.101615,0.0,0.0,2.333333,0.0,0.0,50629.0,0.005925,0.021016,0.000125,5,0.113636,0,0,2,0,681,0.223939,0,0,20,0,1637,0.220145,0,0,19,0,8,0.047337,0,0,2,0,0,0.0,0,0,0,0,0,0,0,1.32496,0.028677,0.037995,21.694807,0.159286,22,0.015108,21.110445,13.645644
858540,6952,18,29,0,31,6,0.060606,0.377221,0.075075,0.435371,0.375346,5,0,0.116279,5,0,0.0,999,0,0.318254,20,0,0.0,472,0,0.080711,20,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0.0,2177047,378130476,158924431,104.391045,1655.887817,1442.138672,50629,2.427699,0.031857,0.031857,0.030071,5,1,2.0,3.0,4.0,1480,606,824.5,1043.0,1261.5,675,171,297.0,423.0,549.0,8793732,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,1.5,0.033366,0.0,0.0,1.5,0.0,1019.0,0.343248,0.0,0.0,20.0,0.0,453.333344,0.071856,0.0,0.0,18.666666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50629.0,0.009876,0.678741,0.006703,2,0.045455,0,0,2,0,1004,0.330155,0,0,20,0,495,0.079787,0,0,20,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,1.715012,0.001397,0.002396,31.984707,0.063715,7,0.980952,0.0,2.359606
2019374,16377,42,24,0,31,1,0.022624,0.916918,0.076239,0.565155,0.528563,1,0,0.02,1,0,0.0,2568,0,0.946554,20,0,0.0,641,0,0.094265,20,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0.0,3586650,490180900,194611629,171.982574,850.382507,1093.897461,71733,3.439651,0.03686,0.03686,0.043287,1,1,1.0,1.0,1.0,4116,1392,2073.0,2754.0,3435.0,675,171,297.0,423.0,549.0,9803618,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,1.0,0.021074,0.0,0.0,1.0,0.0,2860.666748,1.127768,0.0,0.0,20.0,0.0,484.0,0.075637,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71733.0,0.001394,0.839112,0.00117,1,0.020408,0,0,1,0,4116,1.514906,0,0,20,0,562,0.082647,0,0,20,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,4.181697,0.000493,0.002062,151.713974,0.03686,6,21.2,0.0,1.0
215317,2140,37,32,1,55,9,1.124088,0.296085,0.314365,0.39127,0.338399,10,0,0.232558,3,0,0.0,907,0,0.249656,20,0,0.0,4757,0,0.192062,10,0,0.0,96,0,0.166667,4,0,0.0,1,1,1,1,1,0.0,2293921,415853516,193809651,109.995239,1405.266724,1055.442017,53347,2.558029,0.027525,0.027525,0.036109,136,5,37.75,70.5,103.25,1356,413,648.75,884.5,1120.25,11613,4757,6471.0,8185.0,9899.0,9671012,252,88,129.0,170.0,211.0,4,0,1.0,2.0,3.0,12.333333,0.289406,0.0,0.0,2.666667,0.0,922.666687,0.277898,0.0,0.0,20.0,0.0,5802.333496,0.247697,0.0,0.0,15.666667,0.0,159.666672,0.291058,0.0,0.0,4.333333,0.0,0.333333,53347.0,0.018745,0.471704,0.008842,14,0.333333,0,0,2,0,1113,0.322049,0,0,20,0,5742,0.245448,0,0,13,0,144,0.258528,0,0,5,0,0,0.0,0,0,0,0,0,0,0,1.345255,0.061133,0.08224,30.635838,0.385356,15,0.012805,150.927703,-8.029893
1592282,13383,52,29,0,49,6,0.056581,0.327276,0.20648,0.437078,0.374134,5,0,0.116279,3,0,0.0,710,0,0.226187,18,0,0.0,590,0,0.221305,14,0,0.0,1,0,0.016129,1,0,0.0,0,0,0,0,0,0.0,2177047,378130476,158924431,104.391045,1655.887817,1442.138672,50629,2.427699,0.031857,0.031857,0.044335,9,1,3.0,5.0,7.0,1600,587,840.25,1093.5,1346.75,639,383,447.0,511.0,575.0,8793732,6,0,1.5,3.0,4.5,0,0,0.0,0.0,0.0,3.333333,0.072197,0.0,0.0,1.666667,0.0,808.0,0.272495,0.0,0.0,20.0,0.0,560.666687,0.205183,0.0,0.0,17.666666,0.0,3.666667,0.061638,0.0,0.0,2.0,0.0,0.0,50629.0,0.009876,0.107922,0.001066,2,0.045455,0,0,1,0,716,0.235449,0,0,20,0,431,0.166025,0,0,14,0,1,0.016949,0,0,1,0,0,0.0,0,0,0,0,0,0,0,1.489296,0.005998,0.008933,22.809811,0.063715,21,0.147319,2.281495,3.821197
316364,2921,41,12,3,21,1,0.693902,0.287891,0.485467,0.567823,0.640303,38,0,0.826087,4,0,0.0,862,0,0.564875,20,0,0.0,223,0,0.605978,4,0,0.0,7,0,0.875,3,0,0.0,3,3,3,3,3,0.0,2059098,183422884,68308338,98.735298,557.388733,1215.967041,44763,2.14642,0.065531,0.065531,0.043519,66,10,24.0,38.0,52.0,1175,528,689.75,851.5,1013.25,397,83,161.5,240.0,318.5,3987454,10,0,2.5,5.0,7.5,3,0,0.75,1.5,2.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44763.0,0.084892,0.461631,0.039189,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,1.295708,0.058175,0.075378,0.0,0.0,22,0.007882,3.67605,21.900938
219249,2196,28,27,1,55,4,0.819648,1.113248,0.31477,0.40523,0.363275,10,0,0.212766,1,0,0.0,2587,0,0.872513,20,0,0.0,6365,0,0.262962,20,0,0.0,369,0,0.716505,8,0,0.0,1,1,1,1,1,0.0,2367014,412107515,149323330,113.500099,1225.643188,981.758606,50362,2.414896,0.033727,0.033727,0.039331,145,4,39.25,74.5,109.75,4524,1882,2542.5,3203.0,3863.5,11613,4757,6471.0,8185.0,9899.0,8768245,548,185,275.75,366.5,457.25,7,0,1.75,3.5,5.25,30.333334,0.628628,0.333333,0.0,3.333333,0.0,3560.666748,1.281519,0.0,0.0,20.0,0.0,7522.666504,0.318376,0.0,0.0,18.333334,0.0,486.333344,0.982165,0.666667,0.0,18.333334,0.0,2.0,50362.0,0.019856,1.232914,0.024481,21,0.456522,0,0,3,0,3114,1.070839,0,0,20,0,7806,0.334705,0,0,20,0,467,0.921105,1,0,20,0,2,2.0,2,2,2,0,0,0,0,5.070755,0.0576,0.292074,105.025291,0.708263,18,0.012339,380.231195,25.491818
822989,6624,14,16,0,22,5,0.261962,0.300891,0.45561,0.430599,0.523595,4,0,0.081633,1,0,0.0,576,0,0.329897,20,0,0.0,430,0,0.626822,20,0,0.0,2,0,0.142857,1,0,0.0,0,0,0,0,0,0.0,2173738,241470089,77456052,104.232376,1281.326782,1654.606812,44362,2.127191,0.057274,0.057274,0.043519,40,1,10.75,20.5,30.25,1272,444,651.0,858.0,1065.0,793,214,358.75,503.5,648.25,4927961,11,0,2.75,5.5,8.25,1,0,0.25,0.5,0.75,5.666667,0.119122,0.0,0.0,1.666667,0.0,575.0,0.354653,0.0,0.0,20.0,0.0,283.666656,0.569136,0.0,0.0,7.666667,0.0,1.666667,0.155556,0.0,0.0,1.333333,0.0,0.0,44362.0,0.009017,1.298409,0.011707,1,0.020408,0,0,1,0,444,0.261023,0,0,20,0,237,0.403061,0,0,4,0,2,0.166667,0,0,1,0,0,0.0,0,0,0,0,0,1,0,1.368098,0.022596,0.030914,25.429552,0.057274,22,0.023147,3.19556,9.680767
971714,8072,57,12,1,41,1,0.267218,0.81871,0.233434,0.562547,0.633956,73,1,1.586957,9,0,0.0,1700,0,1.114024,20,0,0.0,332,0,0.343685,13,0,0.0,11,0,0.52381,2,0,0.0,1,1,1,1,1,0.0,2059098,183422884,68308338,98.735298,557.388733,1215.967041,44763,2.14642,0.065531,0.065531,0.045094,73,1,19.0,37.0,55.0,4070,1645,2251.25,2857.5,3463.75,700,131,273.25,415.5,557.75,3987454,33,5,12.0,19.0,26.0,3,0,0.75,1.5,2.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44763.0,0.163081,0.869647,0.141823,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,3.73536,0.022103,0.082563,0.0,0.0,20,0.042129,12.758191,31.601276


In [30]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'item_id_mean_encoding',
       'shop_id_mean_encoding', 'item_category_id_mean_encoding',
       'month_mean_encoding', 'date_block_num_mean_encoding',
       'item_block_units', 'item_block_median', 'item_block_mean',
       'item_block_max', 'item_block_min', 'item_block_std',
       'shop_block_units', 'shop_block_median', 'shop_block_mean',
       'shop_block_max', 'shop_block_min', 'shop_block_std',
       'cat_block_units', 'cat_block_median', 'cat_block_mean',
       'cat_block_max', 'cat_block_min', 'cat_block_std',
       'shop_cat_block_units', 'shop_cat_block_median',
       'shop_cat_block_mean', 'shop_cat_block_max', 'shop_cat_block_min',
       'shop_cat_block_std', 'shop_item_block_units',
       'shop_item_block_median', 'shop_item_block_mean',
       'shop_item_block_max', 'shop_item_block_min',
       'shop_item_block_std', 'item_units', 'cat_units', 'shop_units',
   

In [31]:
gc.collect()

ZEROS_KEEP=0.2


#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['item_cnt_block']





x_val = training[training['date_block_num'] == 33]
y_val = x_val['item_cnt_block']

pos_val_len = len(y_val[y_val != 0])
print("pos_val_len", pos_val_len)

zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
print("zeros_keep_indices_val", len(zeros_keep_indices_val))
non_zeros_val_indices = y_val[y_val != 0].index
print("non_zeros_val_indices", len(non_zeros_val_indices))

val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

y_val = y_val.loc[val_indices]
x_val = x_val.loc[val_indices]



pos_val_len 29202
zeros_keep_indices_val 146010
non_zeros_val_indices 29202


In [73]:

features = [
    
    
        'item_category_id',
       'item_block_mean_rolling_3',

       'shop_block_mean_rolling_3',

           'shop_cat_block_mean_rolling_3',
               'shop_cat_block_median_rolling_3',

      'item_block_mean_lag_1',

        'shop_block_mean_lag_1',

            'shop_cat_block_mean_lag_1',
               # 'shop_cat_block_median_lag_1',


    
    'shop_item_share_of_shop_units_mean',
    'shop_item_block_mean_rolling_3',

    'shop_item_block_mean_lag_1',

    
#'item_id_mean_encoding',
       #'shop_id_mean_encoding',
    'item_category_id_mean_encoding',  
    #'month_mean_encoding', 'date_block_num_mean_encoding'
    
    'shop_share',
    
    #'item_mean_day_between_activity',
    #'shop_cat_pred',
    #'item_pred'

]




In [81]:
cb_model = CatBoostRegressor(iterations=1000,
                             #learning_rate=0.05,
                             eval_metric='RMSE',
                             task_type = "GPU",
                             use_best_model=True,
                             od_type = "Iter",
                             od_wait = 30,
                             bagging_temperature = 20,
                             random_strength = 30,
                             cat_features=[0],
                             random_seed = 42)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


cb_model.fit(x_train[features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

scores = {}
for i,score in enumerate(cb_model.get_feature_importance()):
    scores[features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

0:	learn: 1.6489130	test: 1.3261997	best: 1.3261997 (0)	total: 131ms	remaining: 2m 11s
1:	learn: 1.6375571	test: 1.3175852	best: 1.3175852 (1)	total: 215ms	remaining: 1m 47s
2:	learn: 1.6269528	test: 1.3102201	best: 1.3102201 (2)	total: 308ms	remaining: 1m 42s
3:	learn: 1.6130762	test: 1.3015639	best: 1.3015639 (3)	total: 390ms	remaining: 1m 37s
4:	learn: 1.5992097	test: 1.2923679	best: 1.2923679 (4)	total: 463ms	remaining: 1m 32s
5:	learn: 1.5873579	test: 1.2851452	best: 1.2851452 (5)	total: 555ms	remaining: 1m 31s
6:	learn: 1.5719692	test: 1.2759251	best: 1.2759251 (6)	total: 642ms	remaining: 1m 31s
7:	learn: 1.5589368	test: 1.2682952	best: 1.2682952 (7)	total: 715ms	remaining: 1m 28s
8:	learn: 1.5463396	test: 1.2610204	best: 1.2610204 (8)	total: 792ms	remaining: 1m 27s
9:	learn: 1.5328381	test: 1.2529191	best: 1.2529191 (9)	total: 868ms	remaining: 1m 25s
10:	learn: 1.5238571	test: 1.2482715	best: 1.2482715 (10)	total: 944ms	remaining: 1m 24s
11:	learn: 1.5125054	test: 1.2398723	best

95:	learn: 1.2170354	test: 1.0673959	best: 1.0673959 (95)	total: 8.01s	remaining: 1m 15s
96:	learn: 1.2160230	test: 1.0669040	best: 1.0669040 (96)	total: 8.09s	remaining: 1m 15s
97:	learn: 1.2141725	test: 1.0655634	best: 1.0655634 (97)	total: 8.17s	remaining: 1m 15s
98:	learn: 1.2126361	test: 1.0646517	best: 1.0646517 (98)	total: 8.26s	remaining: 1m 15s
99:	learn: 1.2121369	test: 1.0644549	best: 1.0644549 (99)	total: 8.33s	remaining: 1m 14s
100:	learn: 1.2115187	test: 1.0642922	best: 1.0642922 (100)	total: 8.41s	remaining: 1m 14s
101:	learn: 1.2105985	test: 1.0638167	best: 1.0638167 (101)	total: 8.5s	remaining: 1m 14s
102:	learn: 1.2094013	test: 1.0631433	best: 1.0631433 (102)	total: 8.58s	remaining: 1m 14s
103:	learn: 1.2085075	test: 1.0626859	best: 1.0626859 (103)	total: 8.66s	remaining: 1m 14s
104:	learn: 1.2064357	test: 1.0616012	best: 1.0616012 (104)	total: 8.74s	remaining: 1m 14s
105:	learn: 1.2061504	test: 1.0615691	best: 1.0615691 (105)	total: 8.82s	remaining: 1m 14s
106:	learn

186:	learn: 1.1716807	test: 1.0431423	best: 1.0431423 (186)	total: 15.7s	remaining: 1m 8s
187:	learn: 1.1714338	test: 1.0429940	best: 1.0429940 (187)	total: 15.8s	remaining: 1m 8s
188:	learn: 1.1711036	test: 1.0428048	best: 1.0428048 (188)	total: 15.8s	remaining: 1m 7s
189:	learn: 1.1708982	test: 1.0426084	best: 1.0426084 (189)	total: 15.9s	remaining: 1m 7s
190:	learn: 1.1703309	test: 1.0422133	best: 1.0422133 (190)	total: 16s	remaining: 1m 7s
191:	learn: 1.1700984	test: 1.0420034	best: 1.0420034 (191)	total: 16.1s	remaining: 1m 7s
192:	learn: 1.1699501	test: 1.0420016	best: 1.0420016 (192)	total: 16.2s	remaining: 1m 7s
193:	learn: 1.1699136	test: 1.0420503	best: 1.0420016 (192)	total: 16.2s	remaining: 1m 7s
194:	learn: 1.1697016	test: 1.0418629	best: 1.0418629 (194)	total: 16.3s	remaining: 1m 7s
195:	learn: 1.1694771	test: 1.0418460	best: 1.0418460 (195)	total: 16.4s	remaining: 1m 7s
196:	learn: 1.1692857	test: 1.0417423	best: 1.0417423 (196)	total: 16.5s	remaining: 1m 7s
197:	learn: 

280:	learn: 1.1542697	test: 1.0350202	best: 1.0350202 (280)	total: 23.6s	remaining: 1m
281:	learn: 1.1541283	test: 1.0349875	best: 1.0349875 (281)	total: 23.7s	remaining: 1m
282:	learn: 1.1540902	test: 1.0349910	best: 1.0349875 (281)	total: 23.7s	remaining: 1m
283:	learn: 1.1540529	test: 1.0349627	best: 1.0349627 (283)	total: 23.8s	remaining: 1m
284:	learn: 1.1539605	test: 1.0349049	best: 1.0349049 (284)	total: 23.9s	remaining: 60s
285:	learn: 1.1538720	test: 1.0348507	best: 1.0348507 (285)	total: 24s	remaining: 59.9s
286:	learn: 1.1535094	test: 1.0348119	best: 1.0348119 (286)	total: 24.1s	remaining: 59.8s
287:	learn: 1.1534811	test: 1.0347840	best: 1.0347840 (287)	total: 24.2s	remaining: 59.7s
288:	learn: 1.1532322	test: 1.0346763	best: 1.0346763 (288)	total: 24.2s	remaining: 59.7s
289:	learn: 1.1529666	test: 1.0346124	best: 1.0346124 (289)	total: 24.3s	remaining: 59.6s
290:	learn: 1.1528997	test: 1.0345801	best: 1.0345801 (290)	total: 24.4s	remaining: 59.4s
291:	learn: 1.1528138	test

374:	learn: 1.1452432	test: 1.0320016	best: 1.0319611 (369)	total: 31.6s	remaining: 52.7s
375:	learn: 1.1451142	test: 1.0320519	best: 1.0319611 (369)	total: 31.7s	remaining: 52.6s
376:	learn: 1.1450596	test: 1.0320006	best: 1.0319611 (369)	total: 31.8s	remaining: 52.5s
377:	learn: 1.1450189	test: 1.0318950	best: 1.0318950 (377)	total: 31.9s	remaining: 52.5s
378:	learn: 1.1448048	test: 1.0318031	best: 1.0318031 (378)	total: 32s	remaining: 52.4s
379:	learn: 1.1447887	test: 1.0317934	best: 1.0317934 (379)	total: 32s	remaining: 52.3s
380:	learn: 1.1446324	test: 1.0317635	best: 1.0317635 (380)	total: 32.1s	remaining: 52.2s
381:	learn: 1.1445776	test: 1.0318444	best: 1.0317635 (380)	total: 32.3s	remaining: 52.3s
382:	learn: 1.1445310	test: 1.0319093	best: 1.0317635 (380)	total: 32.4s	remaining: 52.2s
383:	learn: 1.1445128	test: 1.0319436	best: 1.0317635 (380)	total: 32.5s	remaining: 52.1s
384:	learn: 1.1444493	test: 1.0318628	best: 1.0317635 (380)	total: 32.6s	remaining: 52s
385:	learn: 1.14

467:	learn: 1.1398514	test: 1.0303507	best: 1.0303064 (463)	total: 39.4s	remaining: 44.8s
468:	learn: 1.1398421	test: 1.0303429	best: 1.0303064 (463)	total: 39.5s	remaining: 44.7s
469:	learn: 1.1397852	test: 1.0303079	best: 1.0303064 (463)	total: 39.6s	remaining: 44.6s
470:	learn: 1.1397419	test: 1.0302577	best: 1.0302577 (470)	total: 39.6s	remaining: 44.5s
471:	learn: 1.1396962	test: 1.0302180	best: 1.0302180 (471)	total: 39.7s	remaining: 44.5s
472:	learn: 1.1396329	test: 1.0302084	best: 1.0302084 (472)	total: 39.8s	remaining: 44.4s
473:	learn: 1.1395927	test: 1.0301689	best: 1.0301689 (473)	total: 39.9s	remaining: 44.3s
474:	learn: 1.1395400	test: 1.0301596	best: 1.0301596 (474)	total: 40s	remaining: 44.2s
475:	learn: 1.1395062	test: 1.0300954	best: 1.0300954 (475)	total: 40.1s	remaining: 44.2s
476:	learn: 1.1394321	test: 1.0301012	best: 1.0300954 (475)	total: 40.2s	remaining: 44.1s
477:	learn: 1.1394137	test: 1.0301089	best: 1.0300954 (475)	total: 40.3s	remaining: 44s
478:	learn: 1.

[('shop_item_block_mean_lag_1', 23.380706715125925),
 ('item_block_mean_lag_1', 20.921305217958974),
 ('shop_item_block_mean_rolling_3', 9.807794253938711),
 ('shop_cat_block_mean_lag_1', 7.9618903386755795),
 ('item_block_mean_rolling_3', 7.795561500799492),
 ('item_category_id_mean_encoding', 6.424601943558458),
 ('item_category_id', 4.90508953283266),
 ('shop_share', 3.8512566375667836),
 ('shop_item_share_of_shop_units_mean', 3.8139328513966593),
 ('shop_cat_block_mean_rolling_3', 3.7108971879214594),
 ('shop_cat_block_median_rolling_3', 2.9219683251331574),
 ('shop_block_mean_lag_1', 2.7293899004699123),
 ('shop_block_mean_rolling_3', 1.775605594622237)]

In [None]:
features = [item[0] for item in scores.items() if item[1] > 2000]

In [38]:
#test            = pd.read_csv('test.csv.gz')
test = test.set_index('item_id').join(items.set_index('item_id'))
test.reset_index(inplace=True)

In [52]:
item_features = [ 
    'shop_item_share_of_shop_units_mean','item_id_mean_encoding'
                ]

merge_col = ['item_id']
cols=item_features+merge_col

test = test.merge(training.drop_duplicates('item_id')[cols], on=merge_col, how='left')

In [40]:
shop_features = [
        'shop_id_mean_encoding','shop_share'
]

merge_col = ['shop_id']
cols=shop_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [41]:
cat_features = [
        'item_category_id_mean_encoding'#,'cat_me_real'
]

merge_col = ['item_category_id']
cols=cat_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [42]:
shop_item_features = [
        'shop_item_share_of_shop_units_mean'#,'cat_me_real'
]

merge_col = ['shop_id','item_id']
cols=shop_item_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [43]:
def add_rolls_test(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name})\
            [cols+[roll_name]]
        
        print([cols[0:len(cols)-1]+[roll_name]])
        thirty_three = block_units_rolling_temp[block_units_rolling_temp['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1]+[roll_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')
    

        del block_units_rolling_temp
        gc.collect()
        

    
    return df
    

test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_rolls_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')



item_block_mean 3
[['item_id', 'item_block_mean_rolling_3']]
shop_block_mean 3
[['shop_id', 'shop_block_mean_rolling_3']]
shop_cat_block_mean 3
[['shop_id', 'item_category_id', 'shop_cat_block_mean_rolling_3']]
shop_cat_block_median 3
[['shop_id', 'item_category_id', 'shop_cat_block_median_rolling_3']]


In [44]:
test = add_rolls_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3
[['shop_id', 'item_id', 'shop_item_block_mean_rolling_3']]


In [45]:
def add_lags_test(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()
        
        thirty_three = result[result['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1] + [lag_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')

        gc.collect()
    
    return df
                                         

                                        
test = add_lags_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_lags_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_lags_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')


item_block_mean 1
shop_block_mean 1
shop_cat_block_mean 1


In [46]:
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 1


In [49]:
def predict(shop_id, cat_id):
    if shop_id in shop_cat_models and cat_id in shop_cat_models[shop_id]:
        return shop_cat_models[shop_id][cat_id].predict([[34]])[0][0]

test['shop_cat_pred'] = test.apply(lambda row: predict(row['shop_id'],row['item_category_id']), axis=1)

In [67]:
def predict(item_id):
    if item_id in item_models:
        return item_models[item_id].predict([[34]])[0][0]

test['item_pred'] = test.apply(lambda row: predict(row['item_id']), axis=1)

In [68]:
test.fillna(0, inplace=True)

In [None]:
test.sample(10)

In [82]:
cb_preds = cb_model.predict(test[features])
cb_preds.clip(0,20,out=cb_preds)

array([0.10410043, 0.03894061, 0.26731994, ..., 0.14508408, 0.18311304,
       0.14675301])

In [83]:
print(np.mean(cb_preds))
print(np.max(cb_preds))

0.38870223108779917
17.942064465265958


In [None]:
cb_preds[0:100]

In [84]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = cb_preds

submission.to_csv('submission.csv', index=False)

In [None]:
training['shop_me_real']= training.groupby('shop_id')['shop_me'].transform(np.mean)
training['item_me_real']= training.groupby('item_id')['item_me'].transform(np.mean)
training['cat_me_real']= training.groupby('item_category_id')['item_me'].transform(np.mean)

In [None]:
training.groupby('item_id')['shop_item_share_of_shop_units'].transform(np.mean)

