In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd
from sklearn.model_selection import KFold
from itertools import product

In [3]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [4]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'] != 2013]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [5]:
# Якутск Орджоникидзе, 56
sales_train.loc[sales_train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
sales_train.loc[sales_train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
sales_train.loc[sales_train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

In [6]:
sums = sales_train.groupby('item_id')['item_cnt_day'].sum().reset_index().rename(columns={"item_cnt_day":"item_total_sales"}).sort_values(by='item_total_sales')

ids_reject = sums[(sums['item_total_sales'] > 0) & (sums['item_total_sales'] < 1000)]['item_id'].unique()

In [7]:
train_item_ids = sales_train['item_id'].unique()
train_item_ids = np.setdiff1d(train_item_ids, ids_reject)
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [8]:
combinations = []

for dbn in range(np.min(train_blocks), np.max(train_blocks)+1):
    sales = sales_train[sales_train.date_block_num==dbn]
    item_ids = np.intersect1d(sales.item_id.unique(), test_item_ids)
    dbn_combos = list(product(sales.shop_id.unique(), item_ids, [dbn]))
    for combo in dbn_combos:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])

In [9]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"shop_item_cnt_block"})

training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)


training['shop_item_cnt_block'] = training['shop_item_cnt_block'].clip(0,20).astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

for col in ['item_id', 'shop_id', 'item_category_id']:
    training[col] = pd.to_numeric(training[col], downcast='unsigned')

In [10]:
ys = sales_train.groupby(['item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"item_cnt_block"})

training = training.merge(ys, on=['item_id', 'date_block_num'], how='left').fillna(0)

ys = sales_train.groupby(['shop_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"shop_cnt_block"})

training = training.merge(ys, on=['shop_id', 'date_block_num'], how='left').fillna(0)


ys = sales_train.groupby(['item_category_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"category_cnt_block"})


training = training.merge(ys, on=['item_category_id', 'date_block_num'], how='left').fillna(0)


ys = sales_train.groupby(['shop_id', 'item_category_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"shop_category_cnt_block"})

training = training.merge(ys, on=['shop_id', 'item_category_id', 'date_block_num'], how='left').fillna(0)

In [16]:
a = training.groupby(['shop_id','date_block_num'])['shop_cnt_block'].sum()\
                        .reset_index()
a = a[a['shop_cnt_block'] == 0]
closed_shop_ids = a["shop_id"].unique()
print(closed_shop_ids)


#with pd.option_context('display.max_rows', 999):
    #print(a[a['item_cnt_block'] == 0])

training = training[~training['shop_id'].isin(closed_shop_ids)]

[]


In [12]:
dates = sales_train[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
training['month'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')


In [13]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
#columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]
columns = ["item_id", "shop_id", "item_category_id"]




y_train = training["shop_item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['shop_item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1


fold 1
fold 2
fold 3
fold 4
fold 5


In [14]:
def add_block_units_stats(df, cols, name):
    print(name)
    name_units = name + '_units'
    name_mean = name + '_mean'
    name_median = name + '_median'
    name_max = name + '_max'
    name_min = name + '_min'
    name_std = name + '_std'
    
    
    try:
        df.drop(columns=[name_units, name_mean, name_median],inplace=True)
    except:
        pass

    
    block_units = df.groupby(cols)[name].sum()\
                        .reset_index().rename(columns={name:name_units})
    df = df.merge(block_units, on=cols, how='left')
    df[name_units].fillna(0,inplace=True)
    df[name_units] = pd.to_numeric(df[name_units].astype(int),downcast='unsigned')
    del block_units
    
    block_units_med = df.groupby(cols)[name].median()\
                        .reset_index().rename(columns={name:name_median})
    print(block_units_med.head())
    df = df.merge(block_units_med, on=cols, how='left')
    df[name_median].fillna(0,inplace=True)
    #df[name_median] = pd.to_numeric(df[name_median].astype(int),downcast='unsigned')
    del block_units_med
    
    block_means = df.groupby(cols)[name].mean()\
                        .reset_index().rename(columns={name:name_mean})
    df = df.merge(block_means, on=cols, how='left')
    df[name_mean].fillna(0,inplace=True)
    #df[name_mean] = pd.to_numeric(df[name_mean],downcast='float')
    del block_means
    
    block_max = df.groupby(cols)[name].max()\
                        .reset_index().rename(columns={name:name_max})
    df = df.merge(block_max, on=cols, how='left')
    df[name_max].fillna(0,inplace=True)
    #df[name_max] = pd.to_numeric(df[name_max],downcast='float')
    del block_max
    
    block_min = df.groupby(cols)[name].min()\
                        .reset_index().rename(columns={name:name_min})
    df = df.merge(block_min, on=cols, how='left')
    df[name_min].fillna(0,inplace=True)
    #df[name_min] = pd.to_numeric(df[name_min],downcast='float')
    del block_min
    
    block_std = df.groupby(cols)[name].std()\
                        .reset_index().rename(columns={name:name_std})
    df = df.merge(block_std, on=cols, how='left')
    df[name_std].fillna(0,inplace=True)
    #df[name_std] = pd.to_numeric(df[name_std],downcast='float')
    del block_std
    
    gc.collect()
    return df


training = add_block_units_stats(training, ['item_id'], 'item_cnt_block')
training = add_block_units_stats(training, ['shop_id'], 'shop_cnt_block')
training = add_block_units_stats(training, ['item_category_id'], 'category_cnt_block')
training = add_block_units_stats(training, ['shop_id', 'item_category_id'], 'shop_category_cnt_block')

item_cnt_block
   item_id  item_cnt_block_median
0       30                   13.0
1       31                   13.0
2       32                   30.0
3       33                   17.0
4       38                    4.0
shop_cnt_block
   shop_id  shop_cnt_block_median
0        2                  890.0
1        3                  745.0
2        4                  980.0
3        5                 1187.0
4        6                 2328.0
category_cnt_block
   item_category_id  category_cnt_block_median
0                 2                      355.0
1                 3                     1059.0
2                 5                      156.0
3                 6                      598.0
4                 7                      244.0
shop_category_cnt_block
   shop_id  item_category_id  shop_category_cnt_block_median
0        2                 2                             5.0
1        2                 3                            15.0
2        2                 5                          

In [18]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'shop_item_cnt_block',
       'item_category_id', 'item_cnt_block', 'shop_cnt_block',
       'category_cnt_block', 'shop_category_cnt_block', 'month',
       'item_id_mean_encoding', 'shop_id_mean_encoding',
       'item_category_id_mean_encoding', 'item_cnt_block_units',
       'item_cnt_block_median', 'item_cnt_block_mean',
       'item_cnt_block_max', 'item_cnt_block_min', 'item_cnt_block_std',
       'shop_cnt_block_units', 'shop_cnt_block_median',
       'shop_cnt_block_mean', 'shop_cnt_block_max', 'shop_cnt_block_min',
       'shop_cnt_block_std', 'category_cnt_block_units',
       'category_cnt_block_median', 'category_cnt_block_mean',
       'category_cnt_block_max', 'category_cnt_block_min',
       'category_cnt_block_std', 'shop_category_cnt_block_units',
       'shop_category_cnt_block_median', 'shop_category_cnt_block_mean',
       'shop_category_cnt_block_max', 'shop_category_cnt_block_min',
       'shop_category_cnt_block_std

In [19]:
def add_rolls(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name_tmp})\
            [cols+[roll_name_tmp]]
        
    
        df = df.merge(block_units_rolling_temp, on=cols, how='left')
        #print(df.columns.values)
        del block_units_rolling_temp
        gc.collect()
        

        block_units_rolling = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [roll_name_tmp].shift(1)\
            .rename(columns={roll_name_tmp:roll_name}).reset_index()

        df = df.merge(block_units_rolling, on=cols, how='left')
        df[roll_name].fillna(0,inplace=True)
        df[roll_name] = pd.to_numeric(df[roll_name], downcast='float')
        df.drop(columns=[roll_name_tmp], inplace=True)
        del block_units_rolling
        gc.collect()
    
    return df
    

training = add_rolls(training, ['item_id','date_block_num'], 'item_cnt_block')
training = add_rolls(training, ['item_id','date_block_num'], 'item_cnt_block_mean')
training = add_rolls(training, ['item_id','date_block_num'], 'item_cnt_block_median')
training = add_rolls(training, ['item_id','date_block_num'], 'item_cnt_block_min')
training = add_rolls(training, ['item_id','date_block_num'], 'item_cnt_block_max')
training = add_rolls(training, ['item_id','date_block_num'], 'item_cnt_block_std')

training = add_rolls(training, ['shop_id','date_block_num'], 'shop_cnt_block')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_cnt_block_mean')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_cnt_block_median')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_cnt_block_min')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_cnt_block_max')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_cnt_block_std')

training = add_rolls(training, ['item_category_id','date_block_num'], 'category_cnt_block')
training = add_rolls(training, ['item_category_id','date_block_num'], 'category_cnt_block_mean')
training = add_rolls(training, ['item_category_id','date_block_num'], 'category_cnt_block_median')
training = add_rolls(training, ['item_category_id','date_block_num'], 'category_cnt_block_min')
training = add_rolls(training, ['item_category_id','date_block_num'], 'category_cnt_block_max')
training = add_rolls(training, ['item_category_id','date_block_num'], 'category_cnt_block_std')

training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block_mean')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block_median')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block_min')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block_max')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block_std')
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item')

item_cnt_block 3
item_cnt_block_mean 3
item_cnt_block_median 3
item_cnt_block_min 3
item_cnt_block_max 3
item_cnt_block_std 3
shop_cnt_block 3
shop_cnt_block_mean 3
shop_cnt_block_median 3
shop_cnt_block_min 3
shop_cnt_block_max 3
shop_cnt_block_std 3
category_cnt_block 3
category_cnt_block_mean 3
category_cnt_block_median 3
category_cnt_block_min 3
category_cnt_block_max 3
category_cnt_block_std 3
shop_category_cnt_block 3
shop_category_cnt_block_mean 3
shop_category_cnt_block_median 3
shop_category_cnt_block_min 3
shop_category_cnt_block_max 3
shop_category_cnt_block_std 3


In [20]:
def add_lags(df, cols, name, lags = [1,3,6,12]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()

        df = df.merge(result, on=cols, how='left')
        df[lag_name].fillna(0,inplace=True)
        if "units" in name:
            df[lag_name] = pd.to_numeric(df[lag_name].astype(int), downcast='unsigned')
        else:
            df[lag_name] = pd.to_numeric(df[lag_name], downcast='float')
        del result
        gc.collect()
    
    return df
                                         

                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_cnt_block')
training = add_lags(training, ['item_id','date_block_num'], 'item_cnt_block_mean')
training = add_lags(training, ['item_id','date_block_num'], 'item_cnt_block_median')
training = add_lags(training, ['item_id','date_block_num'], 'item_cnt_block_min')
training = add_lags(training, ['item_id','date_block_num'], 'item_cnt_block_max')
training = add_lags(training, ['item_id','date_block_num'], 'item_cnt_block_std')

training = add_lags(training, ['shop_id','date_block_num'], 'shop_cnt_block')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_cnt_block_mean')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_cnt_block_median')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_cnt_block_min')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_cnt_block_max')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_cnt_block_std')

training = add_lags(training, ['item_category_id','date_block_num'], 'category_cnt_block')
training = add_lags(training, ['item_category_id','date_block_num'], 'category_cnt_block_mean')
training = add_lags(training, ['item_category_id','date_block_num'], 'category_cnt_block_median')
training = add_lags(training, ['item_category_id','date_block_num'], 'category_cnt_block_min')
training = add_lags(training, ['item_category_id','date_block_num'], 'category_cnt_block_max')
training = add_lags(training, ['item_category_id','date_block_num'], 'category_cnt_block_std')

training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block_mean')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block_median')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block_min')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block_max')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block_std')


item_cnt_block 1
item_cnt_block 3
item_cnt_block 6
item_cnt_block 12
item_cnt_block_mean 1
item_cnt_block_mean 3
item_cnt_block_mean 6
item_cnt_block_mean 12
item_cnt_block_median 1
item_cnt_block_median 3
item_cnt_block_median 6
item_cnt_block_median 12
item_cnt_block_min 1
item_cnt_block_min 3
item_cnt_block_min 6
item_cnt_block_min 12
item_cnt_block_max 1
item_cnt_block_max 3
item_cnt_block_max 6
item_cnt_block_max 12
item_cnt_block_std 1
item_cnt_block_std 3
item_cnt_block_std 6
item_cnt_block_std 12
shop_cnt_block 1
shop_cnt_block 3
shop_cnt_block 6
shop_cnt_block 12
shop_cnt_block_mean 1
shop_cnt_block_mean 3
shop_cnt_block_mean 6
shop_cnt_block_mean 12
shop_cnt_block_median 1
shop_cnt_block_median 3
shop_cnt_block_median 6
shop_cnt_block_median 12
shop_cnt_block_min 1
shop_cnt_block_min 3
shop_cnt_block_min 6
shop_cnt_block_min 12
shop_cnt_block_max 1
shop_cnt_block_max 3
shop_cnt_block_max 6
shop_cnt_block_max 12
shop_cnt_block_std 1
shop_cnt_block_std 3
shop_cnt_block_std 6
sh

In [21]:
training['blocks_without_sales'] = training['item_id'].map(training[training['item_cnt_block'] == 0].groupby(['item_id'])['date_block_num'].unique().apply(lambda x: len(x)))

In [22]:
sales_train['item_days_of_activity'] = pd.to_numeric(sales_train.groupby(['item_id'])['date'].transform("nunique"), downcast='unsigned') 
sales_train['item_blocks_of_activity'] = pd.to_numeric(sales_train.groupby(['item_id'])['date_block_num'].transform("nunique"), downcast='unsigned') 

def get_number_of_days_since_start(day,month, year):
    days = 0
    if year == 2015:
        days = 365
    def is_even(num):
        return num % 2 == 0
    half_of_month = int(month/2)
    even = (30*half_of_month) + (31*half_of_month)
    if is_even(month):
        days = days + even - 30 - day
    else:
        days = days + even + day
    return days

sales_train['item_days_since_start'] = pd.to_numeric(sales_train.apply(lambda row: get_number_of_days_since_start(row['day'],row['month'], row['year']),axis=1), downcast='unsigned') 

def get_average_days_between_sales(days):
    days = sorted(np.unique(days))
    if len(days) == 0:
        return 9999
    if len(days) == 1:
        return 999
    return np.mean(np.ediff1d(days)) / len(days)

average_days_between_sales = sales_train.groupby(['item_id'])['item_days_since_start'].apply(list).apply(lambda x: get_average_days_between_sales(x))

sales_train['item_mean_day_between_activity'] = pd.to_numeric(sales_train['item_id'].map(average_days_between_sales), downcast='unsigned')

training['item_mean_day_between_activity'] = training['item_id'].map(sales_train.drop_duplicates('item_id').set_index('item_id')['item_mean_day_between_activity'])

In [24]:
training = training.fillna(0)

In [25]:

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,shop_item_cnt_block,item_category_id,item_cnt_block,shop_cnt_block,category_cnt_block,shop_category_cnt_block,month,item_id_mean_encoding,shop_id_mean_encoding,item_category_id_mean_encoding,item_cnt_block_units,item_cnt_block_median,item_cnt_block_mean,item_cnt_block_max,item_cnt_block_min,item_cnt_block_std,shop_cnt_block_units,shop_cnt_block_median,shop_cnt_block_mean,shop_cnt_block_max,shop_cnt_block_min,shop_cnt_block_std,category_cnt_block_units,category_cnt_block_median,category_cnt_block_mean,category_cnt_block_max,category_cnt_block_min,category_cnt_block_std,shop_category_cnt_block_units,shop_category_cnt_block_median,shop_category_cnt_block_mean,shop_category_cnt_block_max,shop_category_cnt_block_min,shop_category_cnt_block_std,item_cnt_block_rolling_3,item_cnt_block_mean_rolling_3,item_cnt_block_median_rolling_3,item_cnt_block_min_rolling_3,item_cnt_block_max_rolling_3,item_cnt_block_std_rolling_3,shop_cnt_block_rolling_3,shop_cnt_block_mean_rolling_3,shop_cnt_block_median_rolling_3,shop_cnt_block_min_rolling_3,shop_cnt_block_max_rolling_3,shop_cnt_block_std_rolling_3,category_cnt_block_rolling_3,category_cnt_block_mean_rolling_3,category_cnt_block_median_rolling_3,category_cnt_block_min_rolling_3,category_cnt_block_max_rolling_3,category_cnt_block_std_rolling_3,shop_category_cnt_block_rolling_3,shop_category_cnt_block_mean_rolling_3,shop_category_cnt_block_median_rolling_3,shop_category_cnt_block_min_rolling_3,shop_category_cnt_block_max_rolling_3,shop_category_cnt_block_std_rolling_3,item_cnt_block_lag_1,item_cnt_block_lag_3,item_cnt_block_lag_6,item_cnt_block_lag_12,item_cnt_block_mean_lag_1,item_cnt_block_mean_lag_3,item_cnt_block_mean_lag_6,item_cnt_block_mean_lag_12,item_cnt_block_median_lag_1,item_cnt_block_median_lag_3,item_cnt_block_median_lag_6,item_cnt_block_median_lag_12,item_cnt_block_min_lag_1,item_cnt_block_min_lag_3,item_cnt_block_min_lag_6,item_cnt_block_min_lag_12,item_cnt_block_max_lag_1,item_cnt_block_max_lag_3,item_cnt_block_max_lag_6,item_cnt_block_max_lag_12,item_cnt_block_std_lag_1,item_cnt_block_std_lag_3,item_cnt_block_std_lag_6,item_cnt_block_std_lag_12,shop_cnt_block_lag_1,shop_cnt_block_lag_3,shop_cnt_block_lag_6,shop_cnt_block_lag_12,shop_cnt_block_mean_lag_1,shop_cnt_block_mean_lag_3,shop_cnt_block_mean_lag_6,shop_cnt_block_mean_lag_12,shop_cnt_block_median_lag_1,shop_cnt_block_median_lag_3,shop_cnt_block_median_lag_6,shop_cnt_block_median_lag_12,shop_cnt_block_min_lag_1,shop_cnt_block_min_lag_3,shop_cnt_block_min_lag_6,shop_cnt_block_min_lag_12,shop_cnt_block_max_lag_1,shop_cnt_block_max_lag_3,shop_cnt_block_max_lag_6,shop_cnt_block_max_lag_12,shop_cnt_block_std_lag_1,shop_cnt_block_std_lag_3,shop_cnt_block_std_lag_6,shop_cnt_block_std_lag_12,category_cnt_block_lag_1,category_cnt_block_lag_3,category_cnt_block_lag_6,category_cnt_block_lag_12,category_cnt_block_mean_lag_1,category_cnt_block_mean_lag_3,category_cnt_block_mean_lag_6,category_cnt_block_mean_lag_12,category_cnt_block_median_lag_1,category_cnt_block_median_lag_3,category_cnt_block_median_lag_6,category_cnt_block_median_lag_12,category_cnt_block_min_lag_1,category_cnt_block_min_lag_3,category_cnt_block_min_lag_6,category_cnt_block_min_lag_12,category_cnt_block_max_lag_1,category_cnt_block_max_lag_3,category_cnt_block_max_lag_6,category_cnt_block_max_lag_12,category_cnt_block_std_lag_1,category_cnt_block_std_lag_3,category_cnt_block_std_lag_6,category_cnt_block_std_lag_12,shop_category_cnt_block_lag_1,shop_category_cnt_block_lag_3,shop_category_cnt_block_lag_6,shop_category_cnt_block_lag_12,shop_category_cnt_block_mean_lag_1,shop_category_cnt_block_mean_lag_3,shop_category_cnt_block_mean_lag_6,shop_category_cnt_block_mean_lag_12,shop_category_cnt_block_median_lag_1,shop_category_cnt_block_median_lag_3,shop_category_cnt_block_median_lag_6,shop_category_cnt_block_median_lag_12,shop_category_cnt_block_min_lag_1,shop_category_cnt_block_min_lag_3,shop_category_cnt_block_min_lag_6,shop_category_cnt_block_min_lag_12,shop_category_cnt_block_max_lag_1,shop_category_cnt_block_max_lag_3,shop_category_cnt_block_max_lag_6,shop_category_cnt_block_max_lag_12,shop_category_cnt_block_std_lag_1,shop_category_cnt_block_std_lag_3,shop_category_cnt_block_std_lag_6,shop_category_cnt_block_std_lag_12,blocks_without_sales,item_mean_day_between_activity
1711089,14312,56,30,0,55,3.0,1491.0,6474.0,146.0,7,0.146429,0.45125,0.313486,7085,7.0,6.819057,15.0,1.0,3.96565,103511639,1655.0,1866.655348,3012.0,1194.0,570.897114,4124651413,9179.0,8647.99258,13786.0,4913.0,2273.897447,1986419,174.0,195.417511,334.0,93.0,63.912795,3.0,6.819057,7.0,1.0,15.0,3.96565,1372.666626,1866.655396,1655.0,1194.0,3012.0,570.897095,6510.666504,8647.992188,9179.0,4913.0,13786.0,2273.897461,140.0,195.417511,174.0,93.0,334.0,63.912796,1.0,5.0,7.0,11.0,6.819057,6.819057,6.819057,6.819057,7.0,7.0,7.0,7.0,1.0,1.0,1.0,1.0,15.0,15.0,15.0,15.0,3.96565,3.96565,3.96565,3.96565,1566.0,1237.0,2444.0,2361.0,1866.655396,1866.655396,1866.655396,1866.655396,1655.0,1655.0,1655.0,1655.0,1194.0,1194.0,1194.0,1194.0,3012.0,3012.0,3012.0,3012.0,570.897095,570.897095,570.897095,570.897095,6017.0,7162.0,9291.0,11371.0,8647.992188,8647.992188,8647.992188,8647.992188,9179.0,9179.0,9179.0,9179.0,4913.0,4913.0,4913.0,4913.0,13786.0,13786.0,13786.0,13786.0,2273.897461,2273.897461,2273.897461,2273.897461,147.0,138.0,236.0,276.0,195.417511,195.417511,195.417511,195.417511,174.0,174.0,174.0,174.0,93.0,93.0,93.0,93.0,334.0,334.0,334.0,334.0,63.912796,63.912796,63.912796,63.912796,0.0,0.045977
487704,4067,44,22,0,55,19.0,1199.0,9809.0,118.0,11,0.55178,0.269051,0.31459,18757,13.0,24.942819,89.0,5.0,26.601176,56372722,917.0,1016.585613,1788.0,619.0,308.505048,4124651413,9179.0,8647.99258,13786.0,4913.0,2273.897447,1405320,125.0,138.250861,300.0,77.0,48.787811,54.333332,24.94282,13.0,5.0,89.0,26.601175,1092.666626,1016.585632,917.0,619.0,1788.0,308.505035,9574.333008,8647.992188,9179.0,4913.0,13786.0,2273.897461,124.666664,138.250854,125.0,77.0,300.0,48.787811,30.0,89.0,0.0,0.0,24.94282,24.94282,0.0,0.0,13.0,13.0,0.0,0.0,5.0,5.0,0.0,0.0,89.0,89.0,0.0,0.0,26.601175,26.601175,0.0,0.0,1157.0,1082.0,1135.0,0.0,1016.585632,1016.585632,1016.585632,0.0,917.0,917.0,917.0,0.0,619.0,619.0,619.0,0.0,1788.0,1788.0,1788.0,0.0,308.505035,308.505035,308.505035,0.0,9179.0,11180.0,9714.0,0.0,8647.992188,8647.992188,8647.992188,0.0,9179.0,9179.0,9179.0,0.0,4913.0,4913.0,4913.0,0.0,13786.0,13786.0,13786.0,0.0,2273.897461,2273.897461,2273.897461,0.0,125.0,137.0,133.0,0.0,138.250854,138.250854,138.250854,0.0,125.0,125.0,125.0,0.0,77.0,77.0,77.0,0.0,300.0,300.0,300.0,0.0,48.787811,48.787811,48.787811,0.0,0.0,0.02096
2230089,18589,34,23,0,37,4.0,658.0,7582.0,36.0,12,0.090741,0.114094,0.279467,3089,4.0,4.522694,12.0,1.0,3.33259,18782975,420.0,413.16678,658.0,2.0,110.937749,935709260,5098.0,5241.158454,7704.0,2989.0,1403.828461,74694,29.0,25.685695,41.0,0.0,11.51151,4.666667,4.522694,4.0,1.0,12.0,3.33259,388.0,413.166779,420.0,2.0,658.0,110.937752,5076.666504,5241.158691,5098.0,2989.0,7704.0,1403.828491,25.333334,25.685694,29.0,0.0,41.0,11.51151,6.0,4.0,12.0,0.0,4.522694,4.522694,4.522694,0.0,4.0,4.0,4.0,0.0,1.0,1.0,1.0,0.0,12.0,12.0,12.0,0.0,3.33259,3.33259,3.33259,0.0,441.0,406.0,0.0,0.0,413.166779,413.166779,0.0,0.0,420.0,420.0,0.0,0.0,2.0,2.0,0.0,0.0,658.0,658.0,0.0,0.0,110.937752,110.937752,0.0,0.0,5185.0,4971.0,6189.0,0.0,5241.158691,5241.158691,5241.158691,0.0,5098.0,5098.0,5098.0,0.0,2989.0,2989.0,2989.0,0.0,7704.0,7704.0,7704.0,0.0,1403.828491,1403.828491,1403.828491,0.0,31.0,32.0,0.0,0.0,25.685694,25.685694,0.0,0.0,29.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,41.0,41.0,0.0,0.0,11.51151,11.51151,0.0,0.0,0.0,0.1875
1312686,10945,49,17,0,57,6.0,757.0,1025.0,0.0,6,0.101818,0.219615,0.11187,5565,5.0,5.356112,12.0,1.0,2.89878,42815457,753.0,772.103529,1270.0,542.0,179.222451,65978957,697.0,739.135798,1071.0,364.0,225.644211,0,0.0,0.0,0.0,0.0,0.0,8.333333,5.356112,5.0,1.0,12.0,2.89878,675.333313,772.103516,753.0,542.0,1270.0,179.222458,995.333313,739.135803,697.0,364.0,1071.0,225.644211,0.0,0.0,0.0,0.0,0.0,0.0,6.0,9.0,0.0,0.0,5.356112,5.356112,0.0,0.0,5.0,5.0,0.0,0.0,1.0,1.0,0.0,0.0,12.0,12.0,0.0,0.0,2.89878,2.89878,0.0,0.0,689.0,754.0,0.0,0.0,772.103516,772.103516,0.0,0.0,753.0,753.0,0.0,0.0,542.0,542.0,0.0,0.0,1270.0,1270.0,0.0,0.0,179.222458,179.222458,0.0,0.0,978.0,1071.0,0.0,0.0,739.135803,739.135803,0.0,0.0,697.0,697.0,0.0,0.0,364.0,364.0,0.0,0.0,1071.0,1071.0,0.0,0.0,225.644211,225.644211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070773
849719,6857,21,32,0,55,1.0,1900.0,4913.0,217.0,9,0.284856,0.483453,0.31459,13813,15.0,13.294514,27.0,1.0,8.440879,109162334,1897.0,1968.555966,3959.0,1603.0,477.128145,4124651413,9179.0,8647.99258,13786.0,4913.0,2273.897447,2014634,185.0,198.193212,358.0,131.0,47.993396,4.333333,13.294514,15.0,1.0,27.0,8.440879,1735.666626,1968.555908,1897.0,1603.0,3959.0,477.128143,6171.0,8647.992188,9179.0,4913.0,13786.0,2273.897461,159.666672,198.193207,185.0,131.0,358.0,47.993397,3.0,5.0,5.0,19.0,13.294514,13.294514,13.294514,13.294514,15.0,15.0,15.0,15.0,1.0,1.0,1.0,1.0,27.0,27.0,27.0,27.0,8.440879,8.440879,8.440879,8.440879,1622.0,1844.0,1687.0,1897.0,1968.555908,1968.555908,1968.555908,1968.555908,1897.0,1897.0,1897.0,1897.0,1603.0,1603.0,1603.0,1603.0,3959.0,3959.0,3959.0,3959.0,477.128143,477.128143,477.128143,477.128143,6022.0,6017.0,8881.0,8364.0,8647.992188,8647.992188,8647.992188,8647.992188,9179.0,9179.0,9179.0,9179.0,4913.0,4913.0,4913.0,4913.0,13786.0,13786.0,13786.0,13786.0,2273.897461,2273.897461,2273.897461,2273.897461,140.0,160.0,218.0,164.0,198.193207,198.193207,198.193207,198.193207,185.0,185.0,185.0,185.0,131.0,131.0,131.0,131.0,358.0,358.0,358.0,358.0,47.993397,47.993397,47.993397,47.993397,0.0,0.019996
2358849,19744,31,33,1,40,9.0,6112.0,7680.0,1349.0,10,0.261111,1.701176,0.404658,2190,9.0,10.186047,13.0,8.0,1.941382,434463379,7341.0,7834.80387,14610.0,5714.0,2010.687513,4011203932,12748.0,12613.253878,22065.0,6779.0,4076.87594,11653185,1689.0,1704.429574,2561.0,1079.0,387.378639,11.0,10.186047,9.0,8.0,13.0,1.941382,6068.666504,7834.803711,7341.0,5714.0,14610.0,2010.6875,8191.666504,12613.253906,12748.0,6779.0,22065.0,4076.875977,1247.666626,1704.429565,1689.0,1079.0,2561.0,387.378632,8.0,13.0,0.0,0.0,10.186047,10.186047,0.0,0.0,9.0,9.0,0.0,0.0,8.0,8.0,0.0,0.0,13.0,13.0,0.0,0.0,1.941382,1.941382,0.0,0.0,6505.0,5987.0,7341.0,8457.0,7834.803711,7834.803711,7834.803711,7834.803711,7341.0,7341.0,7341.0,7341.0,5714.0,5714.0,5714.0,5714.0,14610.0,14610.0,14610.0,14610.0,2010.6875,2010.6875,2010.6875,2010.6875,6779.0,9283.0,10683.0,13639.0,12613.253906,12613.253906,12613.253906,12613.253906,12748.0,12748.0,12748.0,12748.0,6779.0,6779.0,6779.0,6779.0,22065.0,22065.0,22065.0,22065.0,4076.875977,4076.875977,4076.875977,4076.875977,1079.0,1475.0,1767.0,1812.0,1704.429565,1704.429565,1704.429565,1704.429565,1689.0,1689.0,1689.0,1689.0,1079.0,1079.0,1079.0,1079.0,2561.0,2561.0,2561.0,2561.0,387.378632,387.378632,387.378632,387.378632,0.0,0.098516
1892203,15760,25,32,0,40,3.0,5893.0,6779.0,628.0,9,0.099502,1.35179,0.4056,2306,4.0,4.621242,12.0,1.0,3.102472,347432059,6135.0,6265.342885,12257.0,4675.0,1531.723253,4011203932,12748.0,12613.253878,22065.0,6779.0,4076.87594,6713951,953.0,982.002486,1669.0,628.0,254.619886,5.666667,4.621243,4.0,1.0,12.0,3.102472,4814.666504,6265.342773,6135.0,4675.0,12257.0,1531.723267,9033.333008,12613.253906,12748.0,6779.0,22065.0,4076.875977,800.666687,982.002502,953.0,628.0,1669.0,254.619888,4.0,5.0,2.0,0.0,4.621243,4.621243,4.621243,0.0,4.0,4.0,4.0,0.0,1.0,1.0,1.0,0.0,12.0,12.0,12.0,0.0,3.102472,3.102472,3.102472,0.0,4675.0,5093.0,6007.0,6158.0,6265.342773,6265.342773,6265.342773,6265.342773,6135.0,6135.0,6135.0,6135.0,4675.0,4675.0,4675.0,4675.0,12257.0,12257.0,12257.0,12257.0,1531.723267,1531.723267,1531.723267,1531.723267,8513.0,9304.0,13084.0,12748.0,12613.253906,12613.253906,12613.253906,12613.253906,12748.0,12748.0,12748.0,12748.0,6779.0,6779.0,6779.0,6779.0,22065.0,22065.0,22065.0,22065.0,4076.875977,4076.875977,4076.875977,4076.875977,665.0,953.0,1140.0,839.0,982.002502,982.002502,982.002502,982.002502,953.0,953.0,953.0,953.0,628.0,628.0,628.0,628.0,1669.0,1669.0,1669.0,1669.0,254.619888,254.619888,254.619888,254.619888,0.0,0.195122
859860,6955,11,31,0,31,3.0,442.0,724.0,0.0,8,0.078014,0.168475,0.0784,1376,2.0,3.698925,13.0,1.0,3.894941,33636521,572.0,606.577119,1062.0,428.0,161.877838,176494444,1056.0,1497.327157,4824.0,506.0,1076.301981,0,0.0,0.0,0.0,0.0,0.0,3.0,3.698925,2.0,1.0,13.0,3.894941,452.0,606.577148,572.0,428.0,1062.0,161.877838,906.0,1497.327148,1056.0,506.0,4824.0,1076.302002,0.0,0.0,0.0,0.0,0.0,0.0,3.0,5.0,1.0,0.0,3.698925,3.698925,3.698925,0.0,2.0,2.0,2.0,0.0,1.0,1.0,1.0,0.0,13.0,13.0,13.0,0.0,3.894941,3.894941,3.894941,0.0,449.0,466.0,572.0,646.0,606.577148,606.577148,606.577148,606.577148,572.0,572.0,572.0,572.0,428.0,428.0,428.0,428.0,1062.0,1062.0,1062.0,1062.0,161.877838,161.877838,161.877838,161.877838,688.0,1138.0,1394.0,967.0,1497.327148,1497.327148,1497.327148,1497.327148,1056.0,1056.0,1056.0,1056.0,506.0,506.0,506.0,506.0,4824.0,4824.0,4824.0,4824.0,1076.302002,1076.302002,1076.302002,1076.302002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.869281
890391,7220,30,14,1,22,35.0,2361.0,655.0,13.0,3,0.631331,0.479278,0.458572,30122,28.0,28.991338,55.0,7.0,11.304462,55750220,1968.0,1921.891202,2700.0,745.0,479.665024,11619480,527.0,596.023596,1376.0,302.0,260.273803,2110,11.0,9.336283,17.0,2.0,4.565787,52.0,28.991339,28.0,7.0,55.0,11.304462,2321.5,1921.891235,1968.0,745.0,2700.0,479.665039,727.5,596.023621,527.0,302.0,1376.0,260.273804,12.5,9.336283,11.0,2.0,17.0,4.565787,49.0,0.0,0.0,0.0,28.991339,0.0,0.0,0.0,28.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,55.0,0.0,0.0,0.0,11.304462,0.0,0.0,0.0,2260.0,0.0,0.0,0.0,1921.891235,0.0,0.0,0.0,1968.0,0.0,0.0,0.0,745.0,0.0,0.0,0.0,2700.0,0.0,0.0,0.0,479.665039,0.0,0.0,0.0,577.0,0.0,0.0,0.0,596.023621,0.0,0.0,0.0,527.0,0.0,0.0,0.0,302.0,0.0,0.0,0.0,1376.0,0.0,0.0,0.0,260.273804,0.0,0.0,0.0,11.0,0.0,0.0,0.0,9.336283,0.0,0.0,0.0,11.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,4.565787,0.0,0.0,0.0,0.0,0.008109
1684481,14203,43,21,0,31,1.0,1833.0,3035.0,0.0,10,0.0301,0.506958,0.077035,1219,1.0,1.610304,4.0,1.0,0.846749,50337211,1754.0,1917.096812,3139.0,1340.0,546.880959,176494444,1056.0,1497.327157,4824.0,506.0,1076.301981,0,0.0,0.0,0.0,0.0,0.0,1.333333,1.610304,1.0,1.0,4.0,0.846749,1562.333374,1917.096802,1754.0,1340.0,3139.0,546.880981,1001.0,1497.327148,1056.0,506.0,4824.0,1076.302002,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,1.610304,1.610304,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0,4.0,0.0,0.0,0.846749,0.846749,0.0,0.0,1873.0,1360.0,1754.0,0.0,1917.096802,1917.096802,1917.096802,0.0,1754.0,1754.0,1754.0,0.0,1340.0,1340.0,1340.0,0.0,3139.0,3139.0,3139.0,0.0,546.880981,546.880981,546.880981,0.0,1202.0,834.0,621.0,0.0,1497.327148,1497.327148,1497.327148,0.0,1056.0,1056.0,1056.0,0.0,506.0,506.0,506.0,0.0,4824.0,4824.0,4824.0,0.0,1076.302002,1076.302002,1076.302002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.956923


In [26]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'shop_item_cnt_block',
       'item_category_id', 'item_cnt_block', 'shop_cnt_block',
       'category_cnt_block', 'shop_category_cnt_block', 'month',
       'item_id_mean_encoding', 'shop_id_mean_encoding',
       'item_category_id_mean_encoding', 'item_cnt_block_units',
       'item_cnt_block_median', 'item_cnt_block_mean',
       'item_cnt_block_max', 'item_cnt_block_min', 'item_cnt_block_std',
       'shop_cnt_block_units', 'shop_cnt_block_median',
       'shop_cnt_block_mean', 'shop_cnt_block_max', 'shop_cnt_block_min',
       'shop_cnt_block_std', 'category_cnt_block_units',
       'category_cnt_block_median', 'category_cnt_block_mean',
       'category_cnt_block_max', 'category_cnt_block_min',
       'category_cnt_block_std', 'shop_category_cnt_block_units',
       'shop_category_cnt_block_median', 'shop_category_cnt_block_mean',
       'shop_category_cnt_block_max', 'shop_category_cnt_block_min',
       'shop_category_cnt_block_std

In [28]:
gc.collect()

ZEROS_KEEP=0.2


#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['shop_item_cnt_block']





x_val = training[training['date_block_num'] == 33]
y_val = x_val['shop_item_cnt_block']

pos_val_len = len(y_val[y_val != 0])
print("pos_val_len", pos_val_len)

zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
print("zeros_keep_indices_val", len(zeros_keep_indices_val))
non_zeros_val_indices = y_val[y_val != 0].index
print("non_zeros_val_indices", len(non_zeros_val_indices))

val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

y_val = y_val.loc[val_indices]
x_val = x_val.loc[val_indices]



pos_val_len 29202
zeros_keep_indices_val 146010
non_zeros_val_indices 29202


In [29]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'shop_item_cnt_block',
       'item_category_id', 'item_cnt_block', 'shop_cnt_block',
       'category_cnt_block', 'shop_category_cnt_block', 'month',
       'item_id_mean_encoding', 'shop_id_mean_encoding',
       'item_category_id_mean_encoding', 'item_cnt_block_units',
       'item_cnt_block_median', 'item_cnt_block_mean',
       'item_cnt_block_max', 'item_cnt_block_min', 'item_cnt_block_std',
       'shop_cnt_block_units', 'shop_cnt_block_median',
       'shop_cnt_block_mean', 'shop_cnt_block_max', 'shop_cnt_block_min',
       'shop_cnt_block_std', 'category_cnt_block_units',
       'category_cnt_block_median', 'category_cnt_block_mean',
       'category_cnt_block_max', 'category_cnt_block_min',
       'category_cnt_block_std', 'shop_category_cnt_block_units',
       'shop_category_cnt_block_median', 'shop_category_cnt_block_mean',
       'shop_category_cnt_block_max', 'shop_category_cnt_block_min',
       'shop_category_cnt_block_std

In [46]:
features = [
    

       #'item_id_mean_encoding', 'shop_id_mean_encoding','item_category_id_mean_encoding',
 'item_cnt_block_lag_1',
       'item_cnt_block_lag_3', 'item_cnt_block_lag_6',
       'item_cnt_block_lag_12', 'item_cnt_block_mean_lag_1',
       'item_cnt_block_mean_lag_3', 'item_cnt_block_mean_lag_6',
       'item_cnt_block_mean_lag_12', 'item_cnt_block_median_lag_1',
       'item_cnt_block_median_lag_3', 'item_cnt_block_median_lag_6',
       'item_cnt_block_median_lag_12', 'item_cnt_block_min_lag_1',
       'item_cnt_block_min_lag_3', 'item_cnt_block_min_lag_6',
       'item_cnt_block_min_lag_12', 'item_cnt_block_max_lag_1',
       'item_cnt_block_max_lag_3', 'item_cnt_block_max_lag_6',
       'item_cnt_block_max_lag_12', 'item_cnt_block_std_lag_1',
       'item_cnt_block_std_lag_3', 'item_cnt_block_std_lag_6',
       'item_cnt_block_std_lag_12', 'shop_cnt_block_lag_1',
       'shop_cnt_block_lag_3', 'shop_cnt_block_lag_6',
       'shop_cnt_block_lag_12', 'shop_cnt_block_mean_lag_1',
       'shop_cnt_block_mean_lag_3', 'shop_cnt_block_mean_lag_6',
       'shop_cnt_block_mean_lag_12', 'shop_cnt_block_median_lag_1',
       'shop_cnt_block_median_lag_3', 'shop_cnt_block_median_lag_6',
       'shop_cnt_block_median_lag_12', 'shop_cnt_block_min_lag_1',
       'shop_cnt_block_min_lag_3', 'shop_cnt_block_min_lag_6',
       'shop_cnt_block_min_lag_12', 'shop_cnt_block_max_lag_1',
       'shop_cnt_block_max_lag_3', 'shop_cnt_block_max_lag_6',
       'shop_cnt_block_max_lag_12', 'shop_cnt_block_std_lag_1',
       'shop_cnt_block_std_lag_3', 'shop_cnt_block_std_lag_6',
       'shop_cnt_block_std_lag_12', 'category_cnt_block_lag_1',
       'category_cnt_block_lag_3', 'category_cnt_block_lag_6',
       'category_cnt_block_lag_12', 'category_cnt_block_mean_lag_1',
       'category_cnt_block_mean_lag_3', 'category_cnt_block_mean_lag_6',
       'category_cnt_block_mean_lag_12',
       'category_cnt_block_median_lag_1',
       'category_cnt_block_median_lag_3',
       'category_cnt_block_median_lag_6',
       'category_cnt_block_median_lag_12', 'category_cnt_block_min_lag_1',
       'category_cnt_block_min_lag_3', 'category_cnt_block_min_lag_6',
       'category_cnt_block_min_lag_12', 'category_cnt_block_max_lag_1',
       'category_cnt_block_max_lag_3', 'category_cnt_block_max_lag_6',
       'category_cnt_block_max_lag_12', 'category_cnt_block_std_lag_1',
       'category_cnt_block_std_lag_3', 'category_cnt_block_std_lag_6',
       'category_cnt_block_std_lag_12', 'shop_category_cnt_block_lag_1',
       'shop_category_cnt_block_lag_3', 'shop_category_cnt_block_lag_6',
       'shop_category_cnt_block_lag_12',
       'shop_category_cnt_block_mean_lag_1',
       'shop_category_cnt_block_mean_lag_3',
       'shop_category_cnt_block_mean_lag_6',
       'shop_category_cnt_block_mean_lag_12',
       'shop_category_cnt_block_median_lag_1',
       'shop_category_cnt_block_median_lag_3',
       'shop_category_cnt_block_median_lag_6',
       'shop_category_cnt_block_median_lag_12',
       'shop_category_cnt_block_min_lag_1',
       'shop_category_cnt_block_min_lag_3',
       'shop_category_cnt_block_min_lag_6',
       'shop_category_cnt_block_min_lag_12',
       'shop_category_cnt_block_max_lag_1',
       'shop_category_cnt_block_max_lag_3',
       'shop_category_cnt_block_max_lag_6',
       'shop_category_cnt_block_max_lag_12',
       'shop_category_cnt_block_std_lag_1',
       'shop_category_cnt_block_std_lag_3',
       'shop_category_cnt_block_std_lag_6',
       'shop_category_cnt_block_std_lag_12', 'blocks_without_sales',
       'item_mean_day_between_activity'

]

In [54]:
features

['item_cnt_block_lag_1',
 'item_cnt_block_mean_lag_1',
 'item_cnt_block_median_lag_1',
 'shop_cnt_block_median_lag_1',
 'category_cnt_block_min_lag_1',
 'shop_category_cnt_block_lag_1',
 'shop_category_cnt_block_std_lag_1',
 'item_mean_day_between_activity']

In [55]:

features = ['item_cnt_block_lag_1',
 'item_cnt_block_mean_lag_1',
 'item_cnt_block_median_lag_1',
 'shop_cnt_block_median_lag_1',
 'category_cnt_block_min_lag_1',
 'shop_category_cnt_block_lag_1',
 'shop_category_cnt_block_std_lag_1',
 'item_mean_day_between_activity']




In [60]:
cb_model = CatBoostRegressor(iterations=6000,
                             #learning_rate=0.05, #default is 0.03
                             objective='RMSE',
                             eval_metric='RMSE',
                             task_type = "GPU",
                             use_best_model=True,
                             early_stopping_rounds = 30,
                             #border_count=32, #number of splits for num features (default 128 on GPU)
                             #bagging_temperature = 20, #default 1 intensity of bootstrap
                             #l2_leaf_reg = 300, #default 3 seems useless
                             #random_strength = 100,#default 1  adds randomness to the split score
                             depth=4,  #default 6
                             #rsm=0.7, #feature %age default 1 no GPU
                             #cat_features=[0],
                            #learning_rate=0.001, #default is 0.03
                             random_seed = 42)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


cb_model.fit(x_train[features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

scores = {}
for i,score in enumerate(cb_model.get_feature_importance()):
    scores[features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

0:	learn: 1.6516421	test: 1.3296847	best: 1.3296847 (0)	total: 12.9ms	remaining: 1m 17s
1:	learn: 1.6360048	test: 1.3198429	best: 1.3198429 (1)	total: 25.4ms	remaining: 1m 16s
2:	learn: 1.6210050	test: 1.3129787	best: 1.3129787 (2)	total: 38.1ms	remaining: 1m 16s
3:	learn: 1.6065856	test: 1.3045729	best: 1.3045729 (3)	total: 50.7ms	remaining: 1m 15s
4:	learn: 1.5930362	test: 1.2951511	best: 1.2951511 (4)	total: 63ms	remaining: 1m 15s
5:	learn: 1.5798681	test: 1.2878570	best: 1.2878570 (5)	total: 75.6ms	remaining: 1m 15s
6:	learn: 1.5673109	test: 1.2824001	best: 1.2824001 (6)	total: 87.8ms	remaining: 1m 15s
7:	learn: 1.5554706	test: 1.2747052	best: 1.2747052 (7)	total: 100ms	remaining: 1m 15s
8:	learn: 1.5441226	test: 1.2670734	best: 1.2670734 (8)	total: 112ms	remaining: 1m 14s
9:	learn: 1.5330120	test: 1.2591572	best: 1.2591572 (9)	total: 124ms	remaining: 1m 14s
10:	learn: 1.5226075	test: 1.2532604	best: 1.2532604 (10)	total: 136ms	remaining: 1m 14s
11:	learn: 1.5125448	test: 1.2481510

102:	learn: 1.2243914	test: 1.1022605	best: 1.1022605 (102)	total: 1.24s	remaining: 1m 11s
103:	learn: 1.2236691	test: 1.1020202	best: 1.1020202 (103)	total: 1.25s	remaining: 1m 11s
104:	learn: 1.2225181	test: 1.1016226	best: 1.1016226 (104)	total: 1.27s	remaining: 1m 11s
105:	learn: 1.2218843	test: 1.1014861	best: 1.1014861 (105)	total: 1.28s	remaining: 1m 11s
106:	learn: 1.2209144	test: 1.1007159	best: 1.1007159 (106)	total: 1.29s	remaining: 1m 11s
107:	learn: 1.2199834	test: 1.1005216	best: 1.1005216 (107)	total: 1.3s	remaining: 1m 11s
108:	learn: 1.2191372	test: 1.0996963	best: 1.0996963 (108)	total: 1.32s	remaining: 1m 11s
109:	learn: 1.2184379	test: 1.0994369	best: 1.0994369 (109)	total: 1.33s	remaining: 1m 11s
110:	learn: 1.2176875	test: 1.0985748	best: 1.0985748 (110)	total: 1.34s	remaining: 1m 11s
111:	learn: 1.2164434	test: 1.0980203	best: 1.0980203 (111)	total: 1.35s	remaining: 1m 11s
112:	learn: 1.2157910	test: 1.0976891	best: 1.0976891 (112)	total: 1.36s	remaining: 1m 11s


201:	learn: 1.1707002	test: 1.0688584	best: 1.0688584 (201)	total: 2.46s	remaining: 1m 10s
202:	learn: 1.1704916	test: 1.0688494	best: 1.0688494 (202)	total: 2.47s	remaining: 1m 10s
203:	learn: 1.1703014	test: 1.0688785	best: 1.0688494 (202)	total: 2.48s	remaining: 1m 10s
204:	learn: 1.1700594	test: 1.0687165	best: 1.0687165 (204)	total: 2.5s	remaining: 1m 10s
205:	learn: 1.1695868	test: 1.0686854	best: 1.0686854 (205)	total: 2.51s	remaining: 1m 10s
206:	learn: 1.1692638	test: 1.0684722	best: 1.0684722 (206)	total: 2.52s	remaining: 1m 10s
207:	learn: 1.1689934	test: 1.0684155	best: 1.0684155 (207)	total: 2.53s	remaining: 1m 10s
208:	learn: 1.1687869	test: 1.0681045	best: 1.0681045 (208)	total: 2.54s	remaining: 1m 10s
209:	learn: 1.1685941	test: 1.0679645	best: 1.0679645 (209)	total: 2.56s	remaining: 1m 10s
210:	learn: 1.1681609	test: 1.0679069	best: 1.0679069 (210)	total: 2.57s	remaining: 1m 10s
211:	learn: 1.1679321	test: 1.0677096	best: 1.0677096 (211)	total: 2.58s	remaining: 1m 10s


296:	learn: 1.1477711	test: 1.0509560	best: 1.0509509 (295)	total: 3.68s	remaining: 1m 10s
297:	learn: 1.1475501	test: 1.0506732	best: 1.0506732 (297)	total: 3.7s	remaining: 1m 10s
298:	learn: 1.1469000	test: 1.0494460	best: 1.0494460 (298)	total: 3.71s	remaining: 1m 10s
299:	learn: 1.1466904	test: 1.0492291	best: 1.0492291 (299)	total: 3.72s	remaining: 1m 10s
300:	learn: 1.1465734	test: 1.0491092	best: 1.0491092 (300)	total: 3.73s	remaining: 1m 10s
301:	learn: 1.1464365	test: 1.0490122	best: 1.0490122 (301)	total: 3.75s	remaining: 1m 10s
302:	learn: 1.1463127	test: 1.0489511	best: 1.0489511 (302)	total: 3.76s	remaining: 1m 10s
303:	learn: 1.1461119	test: 1.0487250	best: 1.0487250 (303)	total: 3.77s	remaining: 1m 10s
304:	learn: 1.1460303	test: 1.0487413	best: 1.0487250 (303)	total: 3.79s	remaining: 1m 10s
305:	learn: 1.1459244	test: 1.0486423	best: 1.0486423 (305)	total: 3.8s	remaining: 1m 10s
306:	learn: 1.1458469	test: 1.0486301	best: 1.0486301 (306)	total: 3.81s	remaining: 1m 10s
3

391:	learn: 1.1343272	test: 1.0396680	best: 1.0395823 (390)	total: 4.92s	remaining: 1m 10s
392:	learn: 1.1342720	test: 1.0396605	best: 1.0395823 (390)	total: 4.93s	remaining: 1m 10s
393:	learn: 1.1341750	test: 1.0395617	best: 1.0395617 (393)	total: 4.94s	remaining: 1m 10s
394:	learn: 1.1341158	test: 1.0394710	best: 1.0394710 (394)	total: 4.95s	remaining: 1m 10s
395:	learn: 1.1340329	test: 1.0394843	best: 1.0394710 (394)	total: 4.96s	remaining: 1m 10s
396:	learn: 1.1339076	test: 1.0393499	best: 1.0393499 (396)	total: 4.98s	remaining: 1m 10s
397:	learn: 1.1338244	test: 1.0392914	best: 1.0392914 (397)	total: 4.99s	remaining: 1m 10s
398:	learn: 1.1337012	test: 1.0392697	best: 1.0392697 (398)	total: 5s	remaining: 1m 10s
399:	learn: 1.1336286	test: 1.0392310	best: 1.0392310 (399)	total: 5.01s	remaining: 1m 10s
400:	learn: 1.1335627	test: 1.0392584	best: 1.0392310 (399)	total: 5.02s	remaining: 1m 10s
401:	learn: 1.1334812	test: 1.0393358	best: 1.0392310 (399)	total: 5.04s	remaining: 1m 10s
40

493:	learn: 1.1248718	test: 1.0358083	best: 1.0357170 (481)	total: 6.13s	remaining: 1m 8s
494:	learn: 1.1248132	test: 1.0357850	best: 1.0357170 (481)	total: 6.15s	remaining: 1m 8s
495:	learn: 1.1247658	test: 1.0358805	best: 1.0357170 (481)	total: 6.16s	remaining: 1m 8s
496:	learn: 1.1246917	test: 1.0357976	best: 1.0357170 (481)	total: 6.17s	remaining: 1m 8s
497:	learn: 1.1246411	test: 1.0357477	best: 1.0357170 (481)	total: 6.18s	remaining: 1m 8s
498:	learn: 1.1245713	test: 1.0357345	best: 1.0357170 (481)	total: 6.2s	remaining: 1m 8s
499:	learn: 1.1244988	test: 1.0356861	best: 1.0356861 (499)	total: 6.21s	remaining: 1m 8s
500:	learn: 1.1244158	test: 1.0358026	best: 1.0356861 (499)	total: 6.22s	remaining: 1m 8s
501:	learn: 1.1243547	test: 1.0357690	best: 1.0356861 (499)	total: 6.23s	remaining: 1m 8s
502:	learn: 1.1243052	test: 1.0357545	best: 1.0356861 (499)	total: 6.24s	remaining: 1m 8s
503:	learn: 1.1242249	test: 1.0357243	best: 1.0356861 (499)	total: 6.25s	remaining: 1m 8s
504:	learn:

595:	learn: 1.1179596	test: 1.0314384	best: 1.0314384 (595)	total: 7.35s	remaining: 1m 6s
596:	learn: 1.1179182	test: 1.0314336	best: 1.0314336 (596)	total: 7.36s	remaining: 1m 6s
597:	learn: 1.1178344	test: 1.0313767	best: 1.0313767 (597)	total: 7.37s	remaining: 1m 6s
598:	learn: 1.1177524	test: 1.0313212	best: 1.0313212 (598)	total: 7.38s	remaining: 1m 6s
599:	learn: 1.1176401	test: 1.0312263	best: 1.0312263 (599)	total: 7.4s	remaining: 1m 6s
600:	learn: 1.1176040	test: 1.0313237	best: 1.0312263 (599)	total: 7.41s	remaining: 1m 6s
601:	learn: 1.1175311	test: 1.0312335	best: 1.0312263 (599)	total: 7.42s	remaining: 1m 6s
602:	learn: 1.1174379	test: 1.0311407	best: 1.0311407 (602)	total: 7.43s	remaining: 1m 6s
603:	learn: 1.1173653	test: 1.0310513	best: 1.0310513 (603)	total: 7.45s	remaining: 1m 6s
604:	learn: 1.1173323	test: 1.0310580	best: 1.0310513 (603)	total: 7.46s	remaining: 1m 6s
605:	learn: 1.1172959	test: 1.0310650	best: 1.0310513 (603)	total: 7.47s	remaining: 1m 6s
606:	learn:

697:	learn: 1.1122979	test: 1.0284972	best: 1.0284710 (690)	total: 8.56s	remaining: 1m 5s
698:	learn: 1.1122704	test: 1.0285021	best: 1.0284710 (690)	total: 8.58s	remaining: 1m 5s
699:	learn: 1.1122229	test: 1.0284538	best: 1.0284538 (699)	total: 8.59s	remaining: 1m 5s
700:	learn: 1.1121920	test: 1.0285227	best: 1.0284538 (699)	total: 8.6s	remaining: 1m 5s
701:	learn: 1.1121634	test: 1.0286111	best: 1.0284538 (699)	total: 8.61s	remaining: 1m 5s
702:	learn: 1.1121085	test: 1.0285954	best: 1.0284538 (699)	total: 8.62s	remaining: 1m 4s
703:	learn: 1.1120808	test: 1.0286506	best: 1.0284538 (699)	total: 8.64s	remaining: 1m 4s
704:	learn: 1.1120310	test: 1.0286252	best: 1.0284538 (699)	total: 8.65s	remaining: 1m 4s
705:	learn: 1.1119225	test: 1.0285829	best: 1.0284538 (699)	total: 8.66s	remaining: 1m 4s
706:	learn: 1.1118846	test: 1.0285850	best: 1.0284538 (699)	total: 8.67s	remaining: 1m 4s
707:	learn: 1.1118587	test: 1.0285964	best: 1.0284538 (699)	total: 8.68s	remaining: 1m 4s
708:	learn:

799:	learn: 1.1081208	test: 1.0271942	best: 1.0269807 (786)	total: 9.78s	remaining: 1m 3s
800:	learn: 1.1080896	test: 1.0271864	best: 1.0269807 (786)	total: 9.79s	remaining: 1m 3s
801:	learn: 1.1080553	test: 1.0271764	best: 1.0269807 (786)	total: 9.8s	remaining: 1m 3s
802:	learn: 1.1080346	test: 1.0272001	best: 1.0269807 (786)	total: 9.81s	remaining: 1m 3s
803:	learn: 1.1079513	test: 1.0271111	best: 1.0269807 (786)	total: 9.83s	remaining: 1m 3s
804:	learn: 1.1078637	test: 1.0269289	best: 1.0269289 (804)	total: 9.84s	remaining: 1m 3s
805:	learn: 1.1078387	test: 1.0269431	best: 1.0269289 (804)	total: 9.85s	remaining: 1m 3s
806:	learn: 1.1077914	test: 1.0269896	best: 1.0269289 (804)	total: 9.86s	remaining: 1m 3s
807:	learn: 1.1077523	test: 1.0269829	best: 1.0269289 (804)	total: 9.88s	remaining: 1m 3s
808:	learn: 1.1077167	test: 1.0270027	best: 1.0269289 (804)	total: 9.89s	remaining: 1m 3s
809:	learn: 1.1076897	test: 1.0270454	best: 1.0269289 (804)	total: 9.9s	remaining: 1m 3s
810:	learn: 

893:	learn: 1.1048041	test: 1.0265749	best: 1.0265481 (889)	total: 11s	remaining: 1m 2s
894:	learn: 1.1047250	test: 1.0265504	best: 1.0265481 (889)	total: 11s	remaining: 1m 2s
895:	learn: 1.1047045	test: 1.0265490	best: 1.0265481 (889)	total: 11s	remaining: 1m 2s
896:	learn: 1.1046739	test: 1.0265080	best: 1.0265080 (896)	total: 11s	remaining: 1m 2s
897:	learn: 1.1046543	test: 1.0263950	best: 1.0263950 (897)	total: 11s	remaining: 1m 2s
898:	learn: 1.1046223	test: 1.0263854	best: 1.0263854 (898)	total: 11.1s	remaining: 1m 2s
899:	learn: 1.1045851	test: 1.0263427	best: 1.0263427 (899)	total: 11.1s	remaining: 1m 2s
900:	learn: 1.1045523	test: 1.0263740	best: 1.0263427 (899)	total: 11.1s	remaining: 1m 2s
901:	learn: 1.1045182	test: 1.0263758	best: 1.0263427 (899)	total: 11.1s	remaining: 1m 2s
902:	learn: 1.1045009	test: 1.0263421	best: 1.0263421 (902)	total: 11.1s	remaining: 1m 2s
903:	learn: 1.1044829	test: 1.0263517	best: 1.0263421 (902)	total: 11.1s	remaining: 1m 2s
904:	learn: 1.104443

990:	learn: 1.1015236	test: 1.0253729	best: 1.0253729 (990)	total: 12.2s	remaining: 1m 1s
991:	learn: 1.1014882	test: 1.0253589	best: 1.0253589 (991)	total: 12.2s	remaining: 1m 1s
992:	learn: 1.1014603	test: 1.0253305	best: 1.0253305 (992)	total: 12.2s	remaining: 1m 1s
993:	learn: 1.1014316	test: 1.0252870	best: 1.0252870 (993)	total: 12.3s	remaining: 1m 1s
994:	learn: 1.1014112	test: 1.0252942	best: 1.0252870 (993)	total: 12.3s	remaining: 1m 1s
995:	learn: 1.1013864	test: 1.0252819	best: 1.0252819 (995)	total: 12.3s	remaining: 1m 1s
996:	learn: 1.1013679	test: 1.0252633	best: 1.0252633 (996)	total: 12.3s	remaining: 1m 1s
997:	learn: 1.1013578	test: 1.0252845	best: 1.0252633 (996)	total: 12.3s	remaining: 1m 1s
998:	learn: 1.1013423	test: 1.0252351	best: 1.0252351 (998)	total: 12.3s	remaining: 1m 1s
999:	learn: 1.1013112	test: 1.0252257	best: 1.0252257 (999)	total: 12.3s	remaining: 1m 1s
1000:	learn: 1.1012931	test: 1.0252376	best: 1.0252257 (999)	total: 12.3s	remaining: 1m 1s
1001:	lea

[('item_cnt_block_lag_1', 26.22875224299283),
 ('item_mean_day_between_activity', 13.585862516290966),
 ('item_cnt_block_median_lag_1', 13.22265372614334),
 ('shop_cnt_block_median_lag_1', 13.028196995760414),
 ('shop_category_cnt_block_std_lag_1', 9.813052874991147),
 ('category_cnt_block_min_lag_1', 8.425777862073499),
 ('item_cnt_block_mean_lag_1', 8.222984676024875),
 ('shop_category_cnt_block_lag_1', 7.472719105722922)]

In [53]:
features = [item[0] for item in scores.items() if item[1] > 4]

In [61]:
#test            = pd.read_csv('test.csv.gz')
test = test.set_index('item_id').join(items.set_index('item_id'))
test.reset_index(inplace=True)

In [158]:
test['month'] = 11

In [62]:
item_features = [ 
    'item_mean_day_between_activity'
                ]

merge_col = ['item_id']
cols=item_features+merge_col

test = test.merge(training.drop_duplicates('item_id')[cols], on=merge_col, how='left')


In [160]:
shop_features = [
        'shop_id_mean_encoding','shop_share'
]

merge_col = ['shop_id']
cols=shop_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [161]:
cat_features = [
        'item_category_id_mean_encoding'#,'cat_me_real'
]

merge_col = ['item_category_id']
cols=cat_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [172]:
shop_item_features = [
        'shop_item_share_of_shop_units_mean','shop_item_share_of_shop_units_mean'
]

merge_col = ['shop_id','item_id']
cols=shop_item_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')



In [163]:
month_features = [
        'month_mean_encoding'
]

merge_col = ['month']
cols=month_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [164]:
def add_rolls_test(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name})\
            [cols+[roll_name]]
        
        print([cols[0:len(cols)-1]+[roll_name]])
        thirty_three = block_units_rolling_temp[block_units_rolling_temp['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1]+[roll_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')
    

        del block_units_rolling_temp
        gc.collect()
        

    
    return df
    

test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_units')
test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_max')

test = add_rolls_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')






item_block_mean 3
[['item_id', 'item_block_mean_rolling_3']]
item_block_units 3
[['item_id', 'item_block_units_rolling_3']]
item_block_max 3
[['item_id', 'item_block_max_rolling_3']]
shop_block_mean 3
[['shop_id', 'shop_block_mean_rolling_3']]
shop_cat_block_mean 3
[['shop_id', 'item_category_id', 'shop_cat_block_mean_rolling_3']]
shop_cat_block_median 3
[['shop_id', 'item_category_id', 'shop_cat_block_median_rolling_3']]


In [165]:
test = add_rolls_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3
[['shop_id', 'item_id', 'shop_item_block_mean_rolling_3']]


In [63]:
def add_lags_test(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()
        
        thirty_three = result[result['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1] + [lag_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')

        gc.collect()
    
    return df
                                         

                                        
test = add_lags_test(test, ['item_id','date_block_num'], 'item_cnt_block')
test = add_lags_test(test, ['item_id','date_block_num'], 'item_cnt_block_mean')
test = add_lags_test(test, ['item_id','date_block_num'], 'item_cnt_block_median')
test = add_lags_test(test, ['shop_id','date_block_num'], 'shop_cnt_block_median')
test = add_lags_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block')
test = add_lags_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block_std')
test = add_lags_test(test, ['item_category_id','date_block_num'], 'category_cnt_block_min')

item_cnt_block 1
item_cnt_block_mean 1
item_cnt_block_median 1
shop_cnt_block_median 1
shop_category_cnt_block 1
shop_category_cnt_block_std 1
category_cnt_block_min 1


In [167]:
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_min')
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_max')
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_median')
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')

shop_item_block_min 1
shop_item_block_max 1
shop_item_block_median 1
shop_item_block_units 1


In [168]:
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 1


In [169]:
test['date_block_num'] = 34

In [64]:
test.fillna(0, inplace=True)

In [65]:
test.sample(10)

Unnamed: 0,item_id,ID,shop_id,item_category_id,item_mean_day_between_activity,item_cnt_block_lag_1,item_cnt_block_mean_lag_1,item_cnt_block_median_lag_1,shop_cnt_block_median_lag_1,shop_category_cnt_block_lag_1,shop_category_cnt_block_std_lag_1,category_cnt_block_min_lag_1
70,31,143394,58,37,0.014129,9.0,15.420597,13.0,2089.0,37.0,27.512695,2989.0
196040,20206,137433,48,40,999.0,0.0,0.0,0.0,1171.0,59.0,29.571269,6779.0
78363,8138,173296,37,40,38.833333,1.0,1.0,1.0,1041.0,29.0,22.53146,6779.0
173022,17440,125118,52,40,13.833333,2.0,1.333333,1.0,1174.0,128.0,44.601248,6779.0
40184,4103,163862,36,58,0.526316,3.0,3.33871,3.0,0.0,0.0,0.0,284.0
202536,20736,63957,22,72,0.022725,6.0,20.750712,14.0,1357.0,9.0,5.17502,1122.0
79490,8257,136223,48,40,6.15,0.0,0.0,0.0,1171.0,59.0,29.571269,6779.0
4225,616,132050,47,78,18.833333,2.0,1.486486,1.0,1926.0,0.0,0.0,30.0
8726,1201,163776,36,55,0.008393,17.0,27.526468,26.0,0.0,0.0,0.0,4913.0
131579,13464,178860,38,23,0.027953,29.0,40.67907,38.0,1403.0,122.0,46.707909,2203.0


In [66]:
cb_preds = cb_model.predict(test[features])
cb_preds.clip(0,20,out=cb_preds)

array([0.1587636 , 0.04395435, 0.20497198, ..., 0.19120975, 0.24543261,
       0.14818668])

In [67]:
print(np.mean(cb_preds))
print(np.max(cb_preds))

0.9544127214376881
20.0


In [None]:
cb_preds[0:100]

In [68]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = cb_preds

submission.to_csv('submission.csv', index=False)

In [None]:
training['shop_me_real']= training.groupby('shop_id')['shop_me'].transform(np.mean)
training['item_me_real']= training.groupby('item_id')['item_me'].transform(np.mean)
training['cat_me_real']= training.groupby('item_category_id')['item_me'].transform(np.mean)

In [None]:
training.groupby('item_id')['shop_item_share_of_shop_units'].transform(np.mean)



In [182]:
predictions = []


for i in range(10):

    cb_model = CatBoostRegressor(iterations=6000,
                             #learning_rate=0.05, #default is 0.03
                             objective='RMSE',
                             eval_metric='RMSE',
                             task_type = "GPU",
                             use_best_model=True,
                             early_stopping_rounds = 3,
                             #border_count=32, #number of splits for num features (default 128 on GPU)
                             bagging_temperature = 20, #default 1 intensity of bootstrap
                             #l2_leaf_reg = 300, #default 3 seems useless
                             #random_strength = 100,#default 1  adds randomness to the split score
                             #depth=8,  #default 6
                             #rsm=0.7, #feature %age default 1 no GPU
                             #cat_features=[0],
                            #learning_rate=0.001, #default is 0.03
                             random_seed = i)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


    cb_model.fit(x_train[features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=False)

    cb_preds = cb_model.predict(test[features])
    cb_preds.clip(0,20,out=cb_preds)
    predictions.append(cb_preds)

In [183]:
prediction = np.mean(predictions, axis=0)

In [184]:
print(np.mean(prediction))
print(np.max(prediction))

0.38936379395844056
16.943752498637696


In [185]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = prediction

submission.to_csv('submission.csv', index=False)