In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import gc
import pickle as pickle
import lightgbm as lgbm
import catboost as cb


from sklearn.model_selection import KFold
from itertools import product
from catboost import CatBoostRegressor


In [3]:
gc.collect()

items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)


In [4]:
sales_train = sales_train[sales_train['year'].isin([2013]) == False]

sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [5]:
train_item_ids = sales_train['item_id'].unique()
#train_item_ids = np.setdiff1d(train_item_ids, ids_reject)
#train_item_ids = ids_keep
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

#all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_item_ids = test_item_ids

#all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))
all_shop_ids = test_shop_ids



In [6]:
combinations = []

for dbn in range(np.min(train_blocks), np.max(train_blocks)+1):
    sales = sales_train[sales_train.date_block_num==dbn]
    #item_ids = np.intersect1d(sales.item_id.unique(), test_item_ids)
    item_ids = all_item_ids
    #dbn_combos = list(product(sales.shop_id.unique(), item_ids, [dbn]))
    dbn_combos = list(product(all_shop_ids, item_ids, [dbn]))
    for combo in dbn_combos:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])

In [7]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"shop_item_cnt_block"})

training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)


training['shop_item_cnt_block'] = training['shop_item_cnt_block'].clip(0,20).astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

for col in ['item_id', 'shop_id', 'item_category_id']:
    training[col] = pd.to_numeric(training[col], downcast='unsigned')

In [8]:
dates = sales_train[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
training['month'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')
training['year'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['year']), downcast='unsigned')



In [9]:
ys = sales_train.groupby(['item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"item_cnt_block"})

training = training.merge(ys, on=['item_id', 'date_block_num'], how='left').fillna(0)

ys = sales_train.groupby(['shop_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"shop_cnt_block"})

training = training.merge(ys, on=['shop_id', 'date_block_num'], how='left').fillna(0)


ys = sales_train.groupby(['item_category_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"category_cnt_block"})


training = training.merge(ys, on=['item_category_id', 'date_block_num'], how='left').fillna(0)


ys = sales_train.groupby(['shop_id', 'item_category_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"shop_category_cnt_block"})

training = training.merge(ys, on=['shop_id', 'item_category_id', 'date_block_num'], how='left').fillna(0)


In [10]:
mean_prices = sales_train.groupby(['item_id','date_block_num'])['item_price'].mean().reset_index()
training = training.merge(mean_prices, on=['item_id','date_block_num'], how='left')

In [11]:
training["shop_cat"] = training["shop_id"].astype(str) + "_" + training["item_category_id"].astype(str)

In [12]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
columns = ["item_id", "shop_id", "item_category_id", "shop_cat"]



y_train = training["shop_item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['shop_item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1

fold 1
fold 2
fold 3
fold 4
fold 5


In [13]:
training['item_cnt_block_mean'] = training.groupby(['date_block_num'])['item_cnt_block'].transform(np.mean)
#training['item_cnt_block_min'] = training.groupby(['date_block_num'])['item_cnt_block'].transform(np.min)
#training['item_cnt_block_max'] = training.groupby(['date_block_num'])['item_cnt_block'].transform(np.max)
#training['item_cnt_block_std'] = training.groupby(['date_block_num'])['item_cnt_block'].transform(np.std)
#training['item_cnt_block_med'] = training.groupby(['date_block_num'])['item_cnt_block'].transform(np.median)

training['shop_cnt_block_mean'] = training.groupby(['date_block_num'])['shop_cnt_block'].transform(np.mean)
#training['shop_cnt_block_min'] = training.groupby(['date_block_num'])['shop_cnt_block'].transform(np.min)
#training['shop_cnt_block_max'] = training.groupby(['date_block_num'])['shop_cnt_block'].transform(np.max)
#training['shop_cnt_block_std'] = training.groupby(['date_block_num'])['shop_cnt_block'].transform(np.std)
#training['shop_cnt_block_med'] = training.groupby(['date_block_num'])['shop_cnt_block'].transform(np.median)

training['category_cnt_block_mean'] = training.groupby(['date_block_num'])['category_cnt_block'].transform(np.mean)
#training['category_cnt_block_min'] = training.groupby(['date_block_num'])['category_cnt_block'].transform(np.min)
#training['category_cnt_block_max'] = training.groupby(['date_block_num'])['category_cnt_block'].transform(np.max)
#training['category_cnt_block_std'] = training.groupby(['date_block_num'])['category_cnt_block'].transform(np.std)
#training['category_cnt_block_med'] = training.groupby(['date_block_num'])['category_cnt_block'].transform(np.median)

training['shop_category_cnt_block_mean'] = training.groupby(['date_block_num'])['shop_category_cnt_block'].transform(np.mean)
#training['shop_category_cnt_block_min'] = training.groupby(['date_block_num'])['shop_category_cnt_block'].transform(np.min)
#training['shop_category_cnt_block_max'] = training.groupby(['date_block_num'])['shop_category_cnt_block'].transform(np.max)
#training['shop_category_cnt_block_std'] = training.groupby(['date_block_num'])['shop_category_cnt_block'].transform(np.std)
#training['shop_category_cnt_block_med'] = training.groupby(['date_block_num'])['shop_category_cnt_block'].transform(np.median)

#training['shop_item_cnt_block_mean'] = training.groupby(['date_block_num'])['shop_item_cnt_block'].transform(np.mean)
#training['shop_item_cnt_block_min'] = training.groupby(['date_block_num'])['shop_item_cnt_block'].transform(np.min)
#training['shop_item_cnt_block_max'] = training.groupby(['date_block_num'])['shop_item_cnt_block'].transform(np.max)
#training['shop_item_cnt_block_std'] = training.groupby(['date_block_num'])['shop_item_cnt_block'].transform(np.std)
#training['shop_item_cnt_block_med'] = training.groupby(['date_block_num'])['shop_item_cnt_block'].transform(np.median)

In [19]:
training['comp1'] = training['item_cnt_block'] * training['shop_cnt_block']
training['comp2'] = training['item_cnt_block'] * training['category_cnt_block']
training['comp3'] = training['item_cnt_block'] * training['shop_category_cnt_block']

training['comp4'] = training['item_cnt_block'] * training['item_id_mean_encoding']
training['comp5'] = training['item_cnt_block'] * training['shop_id_mean_encoding']
training['comp6'] = training['item_cnt_block'] * training['shop_cat_mean_encoding']

training['comp7'] = training['shop_cnt_block'] * training['shop_id_mean_encoding']
training['comp8'] = training['shop_cnt_block'] * training['shop_cat_mean_encoding']

training['comp9'] = training['shop_category_cnt_block'] * training['shop_id_mean_encoding']
training['comp10'] = training['shop_category_cnt_block'] * training['shop_cat_mean_encoding']

training['comp11'] = training['item_cnt_block_mean'] * training['shop_cnt_block_mean']
training['comp12'] = training['item_cnt_block_mean'] * training['category_cnt_block_mean']

training['comp13'] = training['item_cnt_block_mean'] * training['shop_category_cnt_block_mean']
training['comp14'] = training['item_cnt_block'] * training['shop_category_cnt_block_mean']

In [14]:
training.fillna(0,inplace=True)

In [15]:
def add_lags(df, cols, name, lags = [1,2,3,5,6]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()

        df = df.merge(result, on=cols, how='left')
        df[lag_name].fillna(0,inplace=True)
        if "mean" in name:
            df[lag_name] = pd.to_numeric(df[lag_name], downcast='float')
        else:
            df[lag_name] = pd.to_numeric(df[lag_name].astype(int), downcast='unsigned')
        del result
        gc.collect()
    
    return df
                                         

                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_cnt_block')
training = add_lags(training, ['item_id','date_block_num'], 'item_cnt_block_mean')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_cnt_block')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_cnt_block_mean')
training = add_lags(training, ['item_category_id','date_block_num'], 'category_cnt_block')
training = add_lags(training, ['item_category_id','date_block_num'], 'category_cnt_block_mean')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block_mean')

item_cnt_block 1
item_cnt_block 2
item_cnt_block 3
item_cnt_block 5
item_cnt_block 6
item_cnt_block_mean 1
item_cnt_block_mean 2
item_cnt_block_mean 3
item_cnt_block_mean 5
item_cnt_block_mean 6
shop_cnt_block 1
shop_cnt_block 2
shop_cnt_block 3
shop_cnt_block 5
shop_cnt_block 6
shop_cnt_block_mean 1
shop_cnt_block_mean 2
shop_cnt_block_mean 3
shop_cnt_block_mean 5
shop_cnt_block_mean 6
category_cnt_block 1
category_cnt_block 2
category_cnt_block 3
category_cnt_block 5
category_cnt_block 6
category_cnt_block_mean 1
category_cnt_block_mean 2
category_cnt_block_mean 3
category_cnt_block_mean 5
category_cnt_block_mean 6
shop_category_cnt_block 1
shop_category_cnt_block 2
shop_category_cnt_block 3
shop_category_cnt_block 5
shop_category_cnt_block 6
shop_category_cnt_block_mean 1
shop_category_cnt_block_mean 2
shop_category_cnt_block_mean 3
shop_category_cnt_block_mean 5
shop_category_cnt_block_mean 6


In [22]:
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'comp1')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'comp2')

training = add_lags(training, ['shop_id','item_id','date_block_num'], 'comp3')
training = add_lags(training, ['item_id','date_block_num'], 'comp4')

training = add_lags(training, ['shop_id','item_id','date_block_num'], 'comp5')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'comp6')

training = add_lags(training, ['shop_id','date_block_num'], 'comp7')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'comp8')

comp1 1
comp2 1
comp3 1
comp4 1
comp5 1
comp6 1
comp7 1
comp8 1


In [23]:
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'comp9')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'comp10')


training = add_lags(training, ['shop_id','item_id','date_block_num'], 'comp11')
training = add_lags(training, ['item_id','date_block_num'], 'comp12')

training = add_lags(training, ['item_id','shop_id','date_block_num'], 'comp13')
training = add_lags(training, ['item_id','shop_id','date_block_num'], 'comp14')

comp9 1
comp10 1
comp11 1
comp12 1
comp13 1
comp14 1


In [17]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'shop_item_cnt_block',
       'item_category_id', 'month', 'year', 'item_cnt_block',
       'shop_cnt_block', 'category_cnt_block', 'shop_category_cnt_block',
       'item_price', 'shop_cat', 'item_id_mean_encoding',
       'shop_id_mean_encoding', 'item_category_id_mean_encoding',
       'shop_cat_mean_encoding', 'item_cnt_block_mean',
       'shop_cnt_block_mean', 'category_cnt_block_mean',
       'shop_category_cnt_block_mean', 'item_cnt_block_lag_1',
       'item_cnt_block_lag_2', 'item_cnt_block_lag_3',
       'item_cnt_block_lag_5', 'item_cnt_block_lag_6',
       'item_cnt_block_mean_lag_1', 'item_cnt_block_mean_lag_2',
       'item_cnt_block_mean_lag_3', 'item_cnt_block_mean_lag_5',
       'item_cnt_block_mean_lag_6', 'shop_cnt_block_lag_1',
       'shop_cnt_block_lag_2', 'shop_cnt_block_lag_3',
       'shop_cnt_block_lag_5', 'shop_cnt_block_lag_6',
       'shop_cnt_block_mean_lag_1', 'shop_cnt_block_mean_lag_2',
       'shop_cnt_bl

In [18]:
training.fillna(0,inplace=True)

In [69]:
features = [
  
  # 'item_id_mean_encoding',
   #    'shop_id_mean_encoding', 'item_category_id_mean_encoding',
    #   'shop_cat_mean_encoding',
  'item_cnt_block_lag_1',
       'item_cnt_block_lag_2', 'item_cnt_block_lag_3',
       'item_cnt_block_mean_lag_1', 'item_cnt_block_mean_lag_2',
       'item_cnt_block_mean_lag_3', 'shop_cnt_block_lag_1',
       'shop_cnt_block_lag_2', 'shop_cnt_block_lag_3',
       'shop_cnt_block_mean_lag_1', 'shop_cnt_block_mean_lag_2',
       'shop_cnt_block_mean_lag_3', 'category_cnt_block_lag_1',
       'category_cnt_block_lag_2', 'category_cnt_block_lag_3',
       'category_cnt_block_mean_lag_1', 'category_cnt_block_mean_lag_2',
       'category_cnt_block_mean_lag_3', 'shop_category_cnt_block_lag_1',
       'shop_category_cnt_block_lag_2', 'shop_category_cnt_block_lag_3',
       'shop_category_cnt_block_mean_lag_1',
       'shop_category_cnt_block_mean_lag_2',
       'shop_category_cnt_block_mean_lag_3', 'comp1_lag_1', 'comp1_lag_2',
       'comp1_lag_3', 'comp2_lag_1', 'comp2_lag_2', 'comp2_lag_3',
       'comp3_lag_1', 'comp3_lag_2', 'comp3_lag_3', 'comp4_lag_1',
       'comp4_lag_2', 'comp4_lag_3', 'comp5_lag_1', 'comp5_lag_2',
       'comp5_lag_3', 'comp6_lag_1', 'comp6_lag_2', 'comp6_lag_3',
       'comp7_lag_1', 'comp7_lag_2', 'comp7_lag_3', 'comp8_lag_1',
       'comp8_lag_2', 'comp8_lag_3', 'comp9_lag_1', 'comp9_lag_2',
       'comp9_lag_3', 'comp10_lag_1', 'comp10_lag_2', 'comp10_lag_3',
       'comp11_lag_1', 'comp11_lag_2', 'comp11_lag_3', 'comp12_lag_1',
       'comp12_lag_2', 'comp12_lag_3', 'comp13_lag_1', 'comp13_lag_2',
       'comp13_lag_3', 'comp14_lag_1', 'comp14_lag_2', 'comp14_lag_3'
     
]

In [22]:

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,shop_item_cnt_block,item_category_id,month,year,item_cnt_block,shop_cnt_block,category_cnt_block,shop_category_cnt_block,item_price,shop_cat,item_id_mean_encoding,shop_id_mean_encoding,item_category_id_mean_encoding,shop_cat_mean_encoding,item_cnt_block_mean,shop_cnt_block_mean,category_cnt_block_mean,shop_category_cnt_block_mean,comp1,comp2,comp3,comp4,comp5,comp6,comp7,comp8,comp9,comp10,comp11,comp12,comp13,comp14,item_cnt_block_lag_1,item_cnt_block_lag_2,item_cnt_block_lag_3,item_cnt_block_mean_lag_1,item_cnt_block_mean_lag_2,item_cnt_block_mean_lag_3,shop_cnt_block_lag_1,shop_cnt_block_lag_2,shop_cnt_block_lag_3,shop_cnt_block_mean_lag_1,shop_cnt_block_mean_lag_2,shop_cnt_block_mean_lag_3,category_cnt_block_lag_1,category_cnt_block_lag_2,category_cnt_block_lag_3,category_cnt_block_mean_lag_1,category_cnt_block_mean_lag_2,category_cnt_block_mean_lag_3,shop_category_cnt_block_lag_1,shop_category_cnt_block_lag_2,shop_category_cnt_block_lag_3,shop_category_cnt_block_mean_lag_1,shop_category_cnt_block_mean_lag_2,shop_category_cnt_block_mean_lag_3,comp1_lag_1,comp1_lag_2,comp1_lag_3,comp2_lag_1,comp2_lag_2,comp2_lag_3,comp3_lag_1,comp3_lag_2,comp3_lag_3,comp4_lag_1,comp4_lag_2,comp4_lag_3,comp5_lag_1,comp5_lag_2,comp5_lag_3,comp6_lag_1,comp6_lag_2,comp6_lag_3,comp7_lag_1,comp7_lag_2,comp7_lag_3,comp8_lag_1,comp8_lag_2,comp8_lag_3,comp9_lag_1,comp9_lag_2,comp9_lag_3,comp10_lag_1,comp10_lag_2,comp10_lag_3,comp11_lag_1,comp11_lag_2,comp11_lag_3,comp12_lag_1,comp12_lag_2,comp12_lag_3,comp13_lag_1,comp13_lag_2,comp13_lag_3,comp14_lag_1,comp14_lag_2,comp14_lag_3
1135879,5184,19,31,0,67,8,2015,1.0,1533.0,409.0,10.0,1099.0,19_67,0.022576,0.233433,0.141483,0.116264,11.840196,1551.5,3253.890392,75.921218,1533.0,409.0,10.0,0.022576,0.233433,0.116264,357.852731,178.232528,2.33433,1.162639,18370.064216,38526.700261,898.922113,75.921218,1,1,3,10.808824,10.833333,11.949804,1440,1300,1459,1427.642822,1430.904785,1592.166626,474,417,512,3335.517822,3318.272217,3428.631348,7,12,8,75.583504,74.266708,75.539597,1440,1300,4377,1422,3753,2048,7,12,24,0,0,0,0,0,0,0,0,0,338,307,342,165,162,167,1,2,1,0,1,0,15431,15501,19026,36053,35947,40971,816,804,902,75,74,226
4396263,20565,53,15,0,72,4,2014,0.0,1641.0,1850.0,37.0,0.0,53_72,0.012146,0.212639,0.140252,0.102229,9.003333,1842.166667,5550.332157,106.891709,0.0,0.0,0.0,0.0,0.0,0.0,348.940191,167.757504,7.867634,3.782467,16585.640556,49971.490519,962.381684,0.0,0,0,0,11.849607,10.279804,10.266078,2059,1866,2144,2216.380859,2127.285645,2220.047607,1866,1860,1785,6547.391113,5804.443359,6563.625,35,19,19,125.359169,112.041161,123.881836,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,437,386,445,212,189,221,7,3,3,3,1,1,26263,21868,22791,77584,59668,67382,1485,1151,1271,0,0,0
2771330,13249,18,24,0,47,1,2015,0.0,1337.0,0.0,0.0,0.0,18_47,0.0,0.186742,0.041803,0.017341,16.166078,2176.380952,5477.754314,107.258562,0.0,0.0,0.0,0.0,0.0,0.0,249.674621,23.184971,0.0,0.0,35183.545173,88553.805863,1733.950327,0.0,0,0,0,24.733137,16.610001,13.808432,1951,1498,1030,3318.5,2298.428467,1926.547607,0,0,0,7494.178711,5516.103027,5176.736328,0,0,0,148.356445,106.721191,96.386688,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,361,279,191,35,27,18,0,0,0,0,0,0,82076,38176,26602,185354,91622,71482,3669,1772,1330,0,0,0
1746268,8249,55,30,0,38,7,2015,3.0,1658.0,702.0,0.0,199.0,55_38,0.110505,0.155304,0.14276,0.0,10.808824,1427.642857,3335.517843,75.583501,4974.0,2106.0,0.0,0.331514,0.465911,0.0,257.493405,0.0,0.0,0.0,15431.139706,36053.023746,816.968728,226.750504,1,4,5,10.833333,11.949804,12.53549,1909,2117,3422,1430.904785,1592.166626,1711.166626,640,926,1148,3318.272217,3428.631348,4010.790771,0,0,0,74.266708,75.539597,88.694931,1909,8468,17110,7040,19446,19516,0,0,0,0,0,0,0,0,0,0,0,0,291,326,521,0,0,0,0,0,0,0,0,0,15501,19026,21450,35947,40971,50277,804,902,1111,74,302,443
1080106,4964,57,28,1,20,5,2015,16.0,2408.0,7130.0,133.0,1593.0,57_20,0.491781,0.403268,0.599045,0.765671,11.949804,1592.166667,3428.631373,75.539594,38528.0,114080.0,2128.0,7.868493,6.452283,12.250736,971.068597,1843.735801,53.634603,101.834245,19026.079477,40971.472621,902.683335,1208.633501,12,23,25,12.53549,12.332941,12.527647,2860,3113,3540,1711.166626,1701.571411,1716.571411,4768,5399,4915,4010.790771,4584.216309,4371.776367,94,120,157,88.694931,94.878365,88.835052,34320,71599,88500,0,0,0,1128,2760,3925,6,13,14,4,9,10,9,17,19,1161,1269,1427,2111,2404,2734,37,48,63,69,92,121,21450,20985,21504,50277,56536,54768,1111,1170,1112,1064,2182,2220
1456867,6694,45,17,0,23,6,2014,0.0,1305.0,4655.0,62.0,0.0,45_23,0.060027,0.126859,0.465287,0.232998,10.760588,1878.785714,5413.62451,104.024024,0.0,0.0,0.0,0.0,0.0,0.0,165.551117,304.062983,7.865264,14.445904,20216.839454,58253.78421,1119.359692,0.0,0,0,0,10.428627,9.003333,11.849607,1174,1015,1401,1873.214233,1842.166626,2216.380859,4773,4249,5679,5214.182129,5550.332031,6547.391113,53,47,80,99.610313,106.891708,125.359169,0,0,0,114552,84980,0,0,0,0,0,0,0,0,0,0,0,0,0,150,129,177,269,239,340,6,6,10,12,11,19,19535,16585,26263,54376,49971,77584,1038,962,1485,0,0,0
706813,3440,57,31,0,31,8,2015,6.0,2780.0,724.0,0.0,499.0,57_31,0.150345,0.403268,0.035023,0.0,11.840196,1551.5,3253.890392,75.921218,16680.0,4344.0,0.0,0.902069,2.419606,0.0,1121.084178,0.0,0.0,0.0,18370.064216,38526.700261,898.922113,455.527311,8,3,5,10.808824,10.833333,11.949804,2352,2440,2408,1427.642822,1430.904785,1592.166626,688,892,1138,3335.517822,3318.272217,3428.631348,0,0,0,75.583504,74.266708,75.539597,18816,7320,12040,0,0,0,0,0,0,0,0,0,3,1,2,0,0,0,955,991,978,0,0,0,0,0,0,0,0,0,15431,15501,19026,36053,35947,40971,816,804,902,604,222,377
2697421,13048,19,13,6,40,2,2014,177.0,2176.0,18467.0,394.0,391.121528,19_40,0.4,0.231251,0.157989,0.157693,10.279804,2127.285714,5804.443137,112.041158,385152.0,3268659.0,69738.0,70.8,40.931472,27.911739,503.202725,343.140924,91.112993,62.131215,21868.080028,59668.537325,1151.761133,19831.28493,166,0,0,10.266078,0.0,0.0,2596,0,0,2220.047607,0.0,0.0,22065,0,0,6563.625,0.0,0.0,454,0,0,123.881836,0.0,0.0,430936,0,0,1279770,0,0,75364,0,0,71,0,0,39,0,0,26,0,0,610,0,0,408,0,0,106,0,0,71,0,0,22791,0,0,67382,0,0,1271,0,0,20564,0,0
3621940,16221,52,26,0,64,3,2015,0.0,1182.0,1328.0,24.0,0.0,52_64,0.160274,0.16194,0.168714,0.220324,12.332941,1701.571429,4584.216078,94.878361,0.0,0.0,0.0,0.0,0.0,0.0,191.413168,260.422662,3.886562,5.28777,20985.380336,56536.867236,1170.129249,0.0,0,0,0,12.527647,16.166079,24.733137,1117,1247,2243,1716.571411,2176.380859,3318.5,1655,2211,3163,4371.776367,5477.754395,7494.178711,38,53,49,88.835052,107.25856,148.356445,0,0,0,0,2211,3163,0,0,0,0,0,0,0,0,0,0,0,0,177,205,363,220,274,476,6,8,7,7,11,10,21504,35183,82076,54768,88553,185354,1112,1733,3669,0,0,0
976671,4479,2,15,0,20,4,2014,0.0,791.0,2093.0,47.0,0.0,2_20,1.033512,0.126389,0.601056,0.645543,9.003333,1842.166667,5550.332157,106.891709,0.0,0.0,0.0,0.0,0.0,0.0,99.973599,510.624419,5.940277,30.340515,16585.640556,49971.490519,962.381684,0.0,0,0,0,11.849607,10.279804,10.266078,990,911,890,2216.380859,2127.285645,2220.047607,5815,3631,1880,6547.391113,5804.443359,6563.625,131,72,21,125.359169,112.041161,123.881836,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,125,115,110,588,543,574,16,8,2,77,42,13,26263,21868,22791,77584,59668,67382,1485,1151,1271,0,0,0


In [19]:
gc.collect()

ZEROS_KEEP=0.2


#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['shop_item_cnt_block']





x_val = training[training['date_block_num'] == 33]
y_val = x_val['shop_item_cnt_block']

#pos_val_len = len(y_val[y_val != 0])
#print("pos_val_len", pos_val_len)

#zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
#print("zeros_keep_indices_val", len(zeros_keep_indices_val))
#non_zeros_val_indices = y_val[y_val != 0].index
#print("non_zeros_val_indices", len(non_zeros_val_indices))

#val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

#y_val = y_val.loc[val_indices]
#x_val = x_val.loc[val_indices]

In [82]:
lgtrain = lgbm.Dataset(x_train[features], label=y_train)
lgval = lgbm.Dataset(x_val[features], label=y_val)



#[0.00542047893814942, 29, 24, 0.39949465609514856, 1, 0.67943500, 10]
params = {
        "num_threads": 8,
        "verbosity": -1,
        #"zero_as_missing": "true",
        "boosting":'gbdt',
        "objective" : "regression",
        "metric" : "rmse",
        "seed": 42,
        "learning_rate" : 0.05,
        "min_data_in_leaf": 10000,
        "num_leaves": 10,
        "max_depth" : 4,
        #"bagging_fraction": 0.7,
        #"bagging_freq": 1,
        #"feature_fraction": 0.7,
        #"lambda_l1": 10,
}

evals_result = {}
model_lgb = lgbm.train(params, lgtrain, 10000, 
                      valid_sets=[lgval], 
                      early_stopping_rounds=10, 
                      verbose_eval=10, 
                      evals_result=evals_result)


scores = {}
for i,score in enumerate(model_lgb.feature_importance()):
    scores[features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

Training until validation scores don't improve for 10 rounds.
[10]	valid_0's rmse: 0.995522
[20]	valid_0's rmse: 0.957153
[30]	valid_0's rmse: 0.940218
[40]	valid_0's rmse: 0.930111
[50]	valid_0's rmse: 0.92395
[60]	valid_0's rmse: 0.919965
[70]	valid_0's rmse: 0.917632
[80]	valid_0's rmse: 0.91653
[90]	valid_0's rmse: 0.915139
[100]	valid_0's rmse: 0.914468
[110]	valid_0's rmse: 0.913343
[120]	valid_0's rmse: 0.912682
[130]	valid_0's rmse: 0.911556
[140]	valid_0's rmse: 0.910813
[150]	valid_0's rmse: 0.909541
[160]	valid_0's rmse: 0.908949
[170]	valid_0's rmse: 0.908262
[180]	valid_0's rmse: 0.907476
[190]	valid_0's rmse: 0.907028
[200]	valid_0's rmse: 0.906551
[210]	valid_0's rmse: 0.906068
[220]	valid_0's rmse: 0.905847
[230]	valid_0's rmse: 0.905359
[240]	valid_0's rmse: 0.905108
[250]	valid_0's rmse: 0.904915
[260]	valid_0's rmse: 0.904569
[270]	valid_0's rmse: 0.904217
[280]	valid_0's rmse: 0.903612
[290]	valid_0's rmse: 0.903149
[300]	valid_0's rmse: 0.902973
[310]	valid_0's rms

[('comp6_lag_1', 539),
 ('comp4_lag_1', 459),
 ('comp8_lag_1', 446),
 ('category_cnt_block_lag_1', 428),
 ('comp10_lag_1', 373),
 ('comp3_lag_1', 372),
 ('item_cnt_block_lag_1', 345),
 ('comp5_lag_1', 329),
 ('item_cnt_block_mean_lag_1', 295),
 ('comp14_lag_1', 202)]

In [48]:
features = [
  #'item_id', 'shop_id','item_category_id',
    
    'item_cnt_block_lag_1',
       'item_cnt_block_lag_2', 'item_cnt_block_lag_3',
        'item_cnt_block_lag_6',
       'item_cnt_block_mean_lag_1', 'item_cnt_block_mean_lag_2',
       'item_cnt_block_mean_lag_3', 
       'item_cnt_block_mean_lag_6', 'shop_cnt_block_lag_1',
       'shop_cnt_block_lag_2', 'shop_cnt_block_lag_3',
   'shop_cnt_block_lag_6',
       'shop_cnt_block_mean_lag_1', 'shop_cnt_block_mean_lag_2',
       'shop_cnt_block_mean_lag_3', 
       'shop_cnt_block_mean_lag_6', 'category_cnt_block_lag_1',
       'category_cnt_block_lag_2', 'category_cnt_block_lag_3',
 'category_cnt_block_lag_6',
       'category_cnt_block_mean_lag_1', 'category_cnt_block_mean_lag_2',
       'category_cnt_block_mean_lag_3', 
       'category_cnt_block_mean_lag_6', 'shop_category_cnt_block_lag_1',
       'shop_category_cnt_block_lag_2', 'shop_category_cnt_block_lag_3',
  'shop_category_cnt_block_lag_6',
       'shop_category_cnt_block_mean_lag_1',
       'shop_category_cnt_block_mean_lag_2',
       'shop_category_cnt_block_mean_lag_3',
    
       'shop_category_cnt_block_mean_lag_6'
    
]

In [52]:
cb_model = CatBoostRegressor(iterations=6000,
                             #learning_rate=0.05, #default is 0.03
                             objective='RMSE',
                             eval_metric='RMSE',
                             task_type = "GPU",
                             use_best_model=True,
                             early_stopping_rounds = 10,
                             #border_count=32, #number of splits for num features (default 128 on GPU)
                             #bagging_temperature = 30, #default 1 intensity of bootstrap
                             #l2_leaf_reg = 300, #default 3 seems useless
                             random_strength = 20,#default 1  adds randomness to the split score
                             depth=4,  #default 6
                             #rsm=0.7, #feature %age default 1 no GPU
                             #cat_features=[0,1],
                            #learning_rate=0.001, #default is 0.03
                             random_seed = 42)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


cb_model.fit(x_train[features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

scores = {}
for i,score in enumerate(cb_model.feature_importances_):
    scores[features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

0:	learn: 1.1121502	test: 1.1118709	best: 1.1118709 (0)	total: 23.3ms	remaining: 2m 19s
1:	learn: 1.1028474	test: 1.1045458	best: 1.1045458 (1)	total: 46.6ms	remaining: 2m 19s
2:	learn: 1.0939784	test: 1.0976920	best: 1.0976920 (2)	total: 70.7ms	remaining: 2m 21s
3:	learn: 1.0853914	test: 1.0910732	best: 1.0910732 (3)	total: 94.1ms	remaining: 2m 21s
4:	learn: 1.0772810	test: 1.0848343	best: 1.0848343 (4)	total: 117ms	remaining: 2m 20s
5:	learn: 1.0695833	test: 1.0791387	best: 1.0791387 (5)	total: 140ms	remaining: 2m 19s
6:	learn: 1.0621558	test: 1.0735413	best: 1.0735413 (6)	total: 163ms	remaining: 2m 19s
7:	learn: 1.0548866	test: 1.0683935	best: 1.0683935 (7)	total: 187ms	remaining: 2m 19s
8:	learn: 1.0480310	test: 1.0635750	best: 1.0635750 (8)	total: 209ms	remaining: 2m 19s
9:	learn: 1.0421231	test: 1.0589797	best: 1.0589797 (9)	total: 231ms	remaining: 2m 18s
10:	learn: 1.0358110	test: 1.0543702	best: 1.0543702 (10)	total: 256ms	remaining: 2m 19s
11:	learn: 1.0298580	test: 1.0496371	

99:	learn: 0.8819699	test: 0.9366251	best: 0.9366251 (99)	total: 2.3s	remaining: 2m 15s
100:	learn: 0.8816819	test: 0.9363314	best: 0.9363314 (100)	total: 2.33s	remaining: 2m 15s
101:	learn: 0.8813400	test: 0.9358792	best: 0.9358792 (101)	total: 2.35s	remaining: 2m 16s
102:	learn: 0.8810169	test: 0.9357267	best: 0.9357267 (102)	total: 2.38s	remaining: 2m 16s
103:	learn: 0.8806721	test: 0.9354495	best: 0.9354495 (103)	total: 2.4s	remaining: 2m 16s
104:	learn: 0.8802794	test: 0.9349241	best: 0.9349241 (104)	total: 2.42s	remaining: 2m 16s
105:	learn: 0.8799630	test: 0.9346748	best: 0.9346748 (105)	total: 2.45s	remaining: 2m 16s
106:	learn: 0.8796553	test: 0.9344625	best: 0.9344625 (106)	total: 2.48s	remaining: 2m 16s
107:	learn: 0.8792789	test: 0.9340111	best: 0.9340111 (107)	total: 2.5s	remaining: 2m 16s
108:	learn: 0.8790157	test: 0.9335987	best: 0.9335987 (108)	total: 2.52s	remaining: 2m 16s
109:	learn: 0.8788425	test: 0.9334704	best: 0.9334704 (109)	total: 2.55s	remaining: 2m 16s
110:

197:	learn: 0.8601984	test: 0.9158501	best: 0.9158501 (197)	total: 4.62s	remaining: 2m 15s
198:	learn: 0.8601098	test: 0.9157819	best: 0.9157819 (198)	total: 4.64s	remaining: 2m 15s
199:	learn: 0.8600203	test: 0.9157219	best: 0.9157219 (199)	total: 4.67s	remaining: 2m 15s
200:	learn: 0.8599131	test: 0.9156783	best: 0.9156783 (200)	total: 4.69s	remaining: 2m 15s
201:	learn: 0.8597621	test: 0.9155454	best: 0.9155454 (201)	total: 4.71s	remaining: 2m 15s
202:	learn: 0.8596293	test: 0.9153892	best: 0.9153892 (202)	total: 4.74s	remaining: 2m 15s
203:	learn: 0.8594639	test: 0.9153220	best: 0.9153220 (203)	total: 4.76s	remaining: 2m 15s
204:	learn: 0.8592524	test: 0.9151974	best: 0.9151974 (204)	total: 4.78s	remaining: 2m 15s
205:	learn: 0.8591885	test: 0.9151730	best: 0.9151730 (205)	total: 4.8s	remaining: 2m 15s
206:	learn: 0.8590100	test: 0.9149820	best: 0.9149820 (206)	total: 4.83s	remaining: 2m 15s
207:	learn: 0.8589014	test: 0.9148663	best: 0.9148663 (207)	total: 4.86s	remaining: 2m 15s


291:	learn: 0.8488081	test: 0.9061080	best: 0.9061080 (291)	total: 6.92s	remaining: 2m 15s
292:	learn: 0.8487300	test: 0.9061694	best: 0.9061080 (291)	total: 6.95s	remaining: 2m 15s
293:	learn: 0.8486793	test: 0.9061351	best: 0.9061080 (291)	total: 6.97s	remaining: 2m 15s
294:	learn: 0.8486127	test: 0.9061502	best: 0.9061080 (291)	total: 7s	remaining: 2m 15s
295:	learn: 0.8485167	test: 0.9060024	best: 0.9060024 (295)	total: 7.02s	remaining: 2m 15s
296:	learn: 0.8484622	test: 0.9059621	best: 0.9059621 (296)	total: 7.05s	remaining: 2m 15s
297:	learn: 0.8484221	test: 0.9059516	best: 0.9059516 (297)	total: 7.07s	remaining: 2m 15s
298:	learn: 0.8483466	test: 0.9059627	best: 0.9059516 (297)	total: 7.09s	remaining: 2m 15s
299:	learn: 0.8483317	test: 0.9059638	best: 0.9059516 (297)	total: 7.12s	remaining: 2m 15s
300:	learn: 0.8482778	test: 0.9059365	best: 0.9059365 (300)	total: 7.15s	remaining: 2m 15s
301:	learn: 0.8482523	test: 0.9059143	best: 0.9059143 (301)	total: 7.17s	remaining: 2m 15s
30

388:	learn: 0.8422640	test: 0.9009373	best: 0.9009373 (388)	total: 9.21s	remaining: 2m 12s
389:	learn: 0.8421597	test: 0.9008097	best: 0.9008097 (389)	total: 9.23s	remaining: 2m 12s
390:	learn: 0.8421047	test: 0.9007242	best: 0.9007242 (390)	total: 9.26s	remaining: 2m 12s
391:	learn: 0.8420748	test: 0.9007250	best: 0.9007242 (390)	total: 9.28s	remaining: 2m 12s
392:	learn: 0.8419803	test: 0.9004920	best: 0.9004920 (392)	total: 9.3s	remaining: 2m 12s
393:	learn: 0.8418949	test: 0.9003417	best: 0.9003417 (393)	total: 9.32s	remaining: 2m 12s
394:	learn: 0.8418649	test: 0.9003304	best: 0.9003304 (394)	total: 9.35s	remaining: 2m 12s
395:	learn: 0.8418041	test: 0.9003372	best: 0.9003304 (394)	total: 9.37s	remaining: 2m 12s
396:	learn: 0.8417146	test: 0.9002327	best: 0.9002327 (396)	total: 9.39s	remaining: 2m 12s
397:	learn: 0.8416554	test: 0.9002083	best: 0.9002083 (397)	total: 9.42s	remaining: 2m 12s
398:	learn: 0.8416137	test: 0.9002036	best: 0.9002036 (398)	total: 9.44s	remaining: 2m 12s


487:	learn: 0.8368676	test: 0.8958241	best: 0.8958241 (487)	total: 11.5s	remaining: 2m 9s
488:	learn: 0.8368257	test: 0.8958035	best: 0.8958035 (488)	total: 11.5s	remaining: 2m 9s
489:	learn: 0.8367995	test: 0.8957911	best: 0.8957911 (489)	total: 11.5s	remaining: 2m 9s
490:	learn: 0.8367300	test: 0.8957089	best: 0.8957089 (490)	total: 11.6s	remaining: 2m 9s
491:	learn: 0.8366420	test: 0.8955846	best: 0.8955846 (491)	total: 11.6s	remaining: 2m 9s
492:	learn: 0.8366072	test: 0.8955763	best: 0.8955763 (492)	total: 11.6s	remaining: 2m 9s
493:	learn: 0.8365746	test: 0.8955664	best: 0.8955664 (493)	total: 11.6s	remaining: 2m 9s
494:	learn: 0.8365135	test: 0.8954535	best: 0.8954535 (494)	total: 11.6s	remaining: 2m 9s
495:	learn: 0.8364530	test: 0.8953643	best: 0.8953643 (495)	total: 11.7s	remaining: 2m 9s
496:	learn: 0.8363272	test: 0.8952082	best: 0.8952082 (496)	total: 11.7s	remaining: 2m 9s
497:	learn: 0.8362608	test: 0.8951430	best: 0.8951430 (497)	total: 11.7s	remaining: 2m 9s
498:	learn

585:	learn: 0.8324083	test: 0.8925925	best: 0.8925679 (584)	total: 13.8s	remaining: 2m 7s
586:	learn: 0.8323727	test: 0.8925916	best: 0.8925679 (584)	total: 13.8s	remaining: 2m 7s
587:	learn: 0.8323142	test: 0.8925338	best: 0.8925338 (587)	total: 13.8s	remaining: 2m 7s
588:	learn: 0.8322915	test: 0.8925760	best: 0.8925338 (587)	total: 13.9s	remaining: 2m 7s
589:	learn: 0.8322137	test: 0.8924616	best: 0.8924616 (589)	total: 13.9s	remaining: 2m 7s
590:	learn: 0.8321789	test: 0.8924569	best: 0.8924569 (590)	total: 13.9s	remaining: 2m 7s
591:	learn: 0.8321521	test: 0.8924553	best: 0.8924553 (591)	total: 13.9s	remaining: 2m 7s
592:	learn: 0.8321249	test: 0.8924382	best: 0.8924382 (592)	total: 14s	remaining: 2m 7s
593:	learn: 0.8320872	test: 0.8924251	best: 0.8924251 (593)	total: 14s	remaining: 2m 7s
594:	learn: 0.8320797	test: 0.8924291	best: 0.8924251 (593)	total: 14s	remaining: 2m 7s
595:	learn: 0.8320457	test: 0.8924257	best: 0.8924251 (593)	total: 14s	remaining: 2m 7s
596:	learn: 0.8320

[('item_cnt_block_lag_1', 52.10013201223478),
 ('shop_category_cnt_block_lag_1', 12.4610963718602),
 ('category_cnt_block_lag_1', 8.630020685808482),
 ('shop_cnt_block_lag_1', 7.405411005458768),
 ('item_cnt_block_lag_3', 3.575632532950103),
 ('item_cnt_block_lag_2', 3.33958363763778),
 ('shop_cnt_block_lag_2', 2.705588270975913),
 ('shop_category_cnt_block_lag_2', 2.426807482613368),
 ('shop_cnt_block_lag_3', 1.9550510431047778),
 ('shop_cnt_block_lag_6', 1.5463854511130513),
 ('item_cnt_block_lag_6', 1.5210970624431304),
 ('shop_category_cnt_block_lag_3', 1.431290377872729),
 ('shop_category_cnt_block_lag_6', 0.9019040659269432)]

In [50]:
features = [item[0] for item in scores.items() if item[1] > 1]

In [53]:
features

['item_cnt_block_lag_1',
 'item_cnt_block_lag_2',
 'item_cnt_block_lag_3',
 'item_cnt_block_lag_6',
 'shop_cnt_block_lag_1',
 'shop_cnt_block_lag_2',
 'shop_cnt_block_lag_3',
 'shop_cnt_block_lag_6',
 'category_cnt_block_lag_1',
 'shop_category_cnt_block_lag_1',
 'shop_category_cnt_block_lag_2',
 'shop_category_cnt_block_lag_3',
 'shop_category_cnt_block_lag_6']

In [73]:
test            = pd.read_csv('test.csv.gz')
test = test.set_index('item_id').join(items.set_index('item_id'))
test.reset_index(inplace=True)

In [74]:
train = training[training['date_block_num'] == 33]

In [75]:
cols = ['shop_id','item_id']

test = test.merge(train, on=cols,how='left')

In [57]:
len(test)

214200

In [106]:
features = ['item_cnt_block_lag_1',
 'item_cnt_block_mean_lag_1',
 'category_cnt_block_lag_1',
 'comp3_lag_1',
 'comp4_lag_1',
 'comp5_lag_1',
 'comp6_lag_1',
 'comp8_lag_1',
 'comp10_lag_1',
 'comp14_lag_1']

In [34]:
features = [
    
'item_cnt_block_lag_1',
 'item_cnt_block_lag_2',
 'item_cnt_block_lag_3',
 'item_cnt_block_lag_6',
 'shop_cnt_block_lag_1',
 'shop_cnt_block_lag_2',
 'shop_cnt_block_lag_3',
 'shop_cnt_block_lag_6',
 'category_cnt_block_lag_1',
 'shop_category_cnt_block_lag_1',
 'shop_category_cnt_block_lag_2',
 'shop_category_cnt_block_lag_3',
 'shop_category_cnt_block_lag_6'
    
]

In [79]:

test.drop(columns=['item_cnt_block_lag_1'],inplace=True)
test.rename(columns={"item_cnt_block":"item_cnt_block_lag_1"},inplace=True)

test.drop(columns=['item_cnt_block_mean_lag_1'],inplace=True)
test.rename(columns={"item_cnt_block_mean":"item_cnt_block_mean_lag_1"},inplace=True)

test.drop(columns=['category_cnt_block_lag_1'],inplace=True)
test.rename(columns={"category_cnt_block":"category_cnt_block_lag_1"},inplace=True)

test.drop(columns=['comp3_lag_1'],inplace=True)
test.rename(columns={"comp3":"comp3_lag_1"},inplace=True)

test.drop(columns=['comp4_lag_1'],inplace=True)
test.rename(columns={"comp4":"comp4_lag_1"},inplace=True)

test.drop(columns=['comp5_lag_1'],inplace=True)
test.rename(columns={"comp5":"comp5_lag_1"},inplace=True)

test.drop(columns=['comp6_lag_1'],inplace=True)
test.rename(columns={"comp6":"comp6_lag_1"},inplace=True)

test.drop(columns=['comp8_lag_1'],inplace=True)
test.rename(columns={"comp8":"comp8_lag_1"},inplace=True)

test.drop(columns=['comp10_lag_1'],inplace=True)
test.rename(columns={"comp10":"comp10_lag_1"},inplace=True)

test.drop(columns=['comp14_lag_1'],inplace=True)
test.rename(columns={"comp14":"comp14_lag_1"},inplace=True)

In [None]:
'item_cnt_block_lag_1',
 'item_cnt_block_lag_2',
 'item_cnt_block_lag_3',
 'item_cnt_block_lag_6',
 'shop_cnt_block_lag_1',
 'shop_cnt_block_lag_2',
 'shop_cnt_block_lag_3',
 'shop_cnt_block_lag_6',
 'category_cnt_block_lag_1',
 'shop_category_cnt_block_lag_1',
 'shop_category_cnt_block_lag_2',
 'shop_category_cnt_block_lag_3',
 'shop_category_cnt_block_lag_6'

In [76]:
test['item_cnt_block_lag_1_'] = test['item_cnt_block_lag_1']
test.drop(columns=['item_cnt_block_lag_1'],inplace=True)
test.rename(columns={"item_cnt_block":"item_cnt_block_lag_1"},inplace=True)

test['item_cnt_block_lag_2_'] = test['item_cnt_block_lag_2']
test.drop(columns=['item_cnt_block_lag_2'],inplace=True)
test.rename(columns={"item_cnt_block_lag_1_":"item_cnt_block_lag_2"},inplace=True)

test.drop(columns=['item_cnt_block_lag_3'],inplace=True)
test.rename(columns={"item_cnt_block_lag_2_":"item_cnt_block_lag_3"},inplace=True)

test.drop(columns=['item_cnt_block_lag_6'],inplace=True)
test.rename(columns={"item_cnt_block_lag_5":"item_cnt_block_lag_6"},inplace=True)
#
test['shop_cnt_block_lag_1_'] = test['shop_cnt_block_lag_1']
test.drop(columns=['shop_cnt_block_lag_1'],inplace=True)
test.rename(columns={"shop_cnt_block":"shop_cnt_block_lag_1"},inplace=True)

test.drop(columns=['shop_cnt_block_lag_3'],inplace=True)
test.rename(columns={"shop_cnt_block_lag_2":"shop_cnt_block_lag_3"},inplace=True)

test.rename(columns={"shop_cnt_block_lag_1_":"shop_cnt_block_lag_2"},inplace=True)

test.drop(columns=['shop_cnt_block_lag_6'],inplace=True)
test.rename(columns={"shop_cnt_block_lag_5":"shop_cnt_block_lag_6"},inplace=True)
#
test['category_cnt_block_lag_1_'] = test['category_cnt_block_lag_1']
test.drop(columns=['category_cnt_block_lag_1'],inplace=True)
test.rename(columns={"category_cnt_block":"category_cnt_block_lag_1"},inplace=True)
#
test.drop(columns=['category_cnt_block_lag_2'],inplace=True)
test.rename(columns={"category_cnt_block_lag_1_":"category_cnt_block_lag_2"},inplace=True)
#
test['shop_category_cnt_block_lag_1_'] = test['shop_category_cnt_block_lag_1']
test.drop(columns=['shop_category_cnt_block_lag_1'],inplace=True)
test.rename(columns={"shop_category_cnt_block":"shop_category_cnt_block_lag_1"},inplace=True)

test['shop_category_cnt_block_lag_2_'] = test['shop_category_cnt_block_lag_2']
test.drop(columns=['shop_category_cnt_block_lag_2'],inplace=True)
test.rename(columns={"shop_category_cnt_block_lag_1_":"shop_category_cnt_block_lag_2"},inplace=True)

test.drop(columns=['shop_category_cnt_block_lag_6'],inplace=True)
test.rename(columns={"shop_category_cnt_block_lag_5":"shop_category_cnt_block_lag_6"},inplace=True)
#

In [59]:
preds = model_lgb.predict(test[features])
preds.clip(0,20,out=preds)

NameError: name 'model_lgb' is not defined

In [67]:
test.columns.values

array(['item_id', 'ID', 'shop_id', 'item_category_id_x', 'date_block_num',
       'shop_item_cnt_block', 'item_category_id_y', 'month', 'year',
       'item_cnt_block_lag_1', 'shop_cnt_block_lag_1',
       'category_cnt_block_lag_1', 'shop_category_cnt_block_lag_1',
       'item_price', 'shop_cat', 'item_id_mean_encoding',
       'shop_id_mean_encoding', 'item_category_id_mean_encoding',
       'shop_cat_mean_encoding', 'item_cnt_block_mean',
       'shop_cnt_block_mean', 'category_cnt_block_mean',
       'shop_category_cnt_block_mean', 'item_cnt_block_lag_6',
       'item_cnt_block_mean_lag_1', 'item_cnt_block_mean_lag_2',
       'item_cnt_block_mean_lag_3', 'item_cnt_block_mean_lag_5',
       'item_cnt_block_mean_lag_6', 'shop_cnt_block_lag_6',
       'shop_cnt_block_mean_lag_1', 'shop_cnt_block_mean_lag_2',
       'shop_cnt_block_mean_lag_3', 'shop_cnt_block_mean_lag_5',
       'shop_cnt_block_mean_lag_6', 'category_cnt_block_lag_3',
       'category_cnt_block_lag_5', 'category_cnt_

In [77]:
preds = cb_model.predict(test[features])
preds.clip(0,20,out=preds)

array([0.05644767, 0.03440845, 0.10098581, ..., 0.15809136, 0.25325275,
       0.21655037])

In [78]:

print(np.mean(preds))
print(np.max(preds))

submission = test.loc[:,['ID']]
submission['item_cnt_month'] = preds

submission.to_csv('submission.csv', index=False)

0.22843946230750115
20.0


In [None]:
bestpreds = pd.read_csv('submissionbest.csv')['item_cnt_month']
print(np.mean(bestpreds))
print(np.max(bestpreds))

In [104]:
lgbm_preds = pd.read_csv('lgbm102.csv')['item_cnt_month']
lstm_preds = pd.read_csv('lstm104.csv')['item_cnt_month']
lr_preds = pd.read_csv('lr111.csv')['item_cnt_month']
cb_preds = pd.read_csv('cb102.csv')['item_cnt_month']



#preds = np.mean(np.array([lr_preds, lg_preds]),axis=0)

#preds = (lstm_preds * 0.2) + (lgbm_preds * 0.6) + (lr_preds*0.2) #this gives 1.08
#preds = (lgbm_preds * 0.8) + (lr_preds*0.2) # this gives 1.04
preds = (cb_preds * 0.5) + (lgbm_preds*0.5) # this gave 0.98 :)))))))