In [4]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [5]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import gc
import pickle as pickle
import lightgbm as lgbm


from sklearn.model_selection import KFold
from itertools import product

In [6]:
gc.collect()

items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)


In [7]:
sales_train = sales_train[sales_train['year'].isin([2013]) == False]

sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [8]:
train_item_ids = sales_train['item_id'].unique()
#train_item_ids = np.setdiff1d(train_item_ids, ids_reject)
#train_item_ids = ids_keep
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

#all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_item_ids = test_item_ids

#all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))
all_shop_ids = test_shop_ids



In [9]:
combinations = []

for dbn in range(np.min(train_blocks), np.max(train_blocks)+1):
    sales = sales_train[sales_train.date_block_num==dbn]
    #item_ids = np.intersect1d(sales.item_id.unique(), test_item_ids)
    item_ids = all_item_ids
    #dbn_combos = list(product(sales.shop_id.unique(), item_ids, [dbn]))
    dbn_combos = list(product(all_shop_ids, item_ids, [dbn]))
    for combo in dbn_combos:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])

In [10]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"shop_item_cnt_block"})

training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)


training['shop_item_cnt_block'] = training['shop_item_cnt_block'].clip(0,20).astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

for col in ['item_id', 'shop_id', 'item_category_id']:
    training[col] = pd.to_numeric(training[col], downcast='unsigned')

In [11]:
dates = sales_train[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
training['month'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')
training['year'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['year']), downcast='unsigned')



In [12]:
ys = sales_train.groupby(['item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"item_cnt_block"})

training = training.merge(ys, on=['item_id', 'date_block_num'], how='left').fillna(0)

ys = sales_train.groupby(['shop_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"shop_cnt_block"})

training = training.merge(ys, on=['shop_id', 'date_block_num'], how='left').fillna(0)


ys = sales_train.groupby(['item_category_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"category_cnt_block"})


training = training.merge(ys, on=['item_category_id', 'date_block_num'], how='left').fillna(0)


ys = sales_train.groupby(['shop_id', 'item_category_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"shop_category_cnt_block"})

training = training.merge(ys, on=['shop_id', 'item_category_id', 'date_block_num'], how='left').fillna(0)


In [13]:
mean_prices = sales_train.groupby(['item_id','date_block_num'])['item_price'].mean().reset_index()
training = training.merge(mean_prices, on=['item_id','date_block_num'], how='left')

In [14]:
training["shop_cat"] = training["shop_id"].astype(str) + "_" + training["item_category_id"].astype(str)

In [15]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
columns = ["item_id", "shop_id", "item_category_id", "shop_cat"]



y_train = training["shop_item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['shop_item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1

fold 1
fold 2
fold 3
fold 4
fold 5


In [16]:
training['item_cnt_block_mean'] = training.groupby(['date_block_num'])['item_cnt_block'].transform(np.mean)
#training['item_cnt_block_min'] = training.groupby(['date_block_num'])['item_cnt_block'].transform(np.min)
#training['item_cnt_block_max'] = training.groupby(['date_block_num'])['item_cnt_block'].transform(np.max)
#training['item_cnt_block_std'] = training.groupby(['date_block_num'])['item_cnt_block'].transform(np.std)
#training['item_cnt_block_med'] = training.groupby(['date_block_num'])['item_cnt_block'].transform(np.median)

training['shop_cnt_block_mean'] = training.groupby(['date_block_num'])['shop_cnt_block'].transform(np.mean)
#training['shop_cnt_block_min'] = training.groupby(['date_block_num'])['shop_cnt_block'].transform(np.min)
#training['shop_cnt_block_max'] = training.groupby(['date_block_num'])['shop_cnt_block'].transform(np.max)
#training['shop_cnt_block_std'] = training.groupby(['date_block_num'])['shop_cnt_block'].transform(np.std)
#training['shop_cnt_block_med'] = training.groupby(['date_block_num'])['shop_cnt_block'].transform(np.median)

training['category_cnt_block_mean'] = training.groupby(['date_block_num'])['category_cnt_block'].transform(np.mean)
#training['category_cnt_block_min'] = training.groupby(['date_block_num'])['category_cnt_block'].transform(np.min)
#training['category_cnt_block_max'] = training.groupby(['date_block_num'])['category_cnt_block'].transform(np.max)
#training['category_cnt_block_std'] = training.groupby(['date_block_num'])['category_cnt_block'].transform(np.std)
#training['category_cnt_block_med'] = training.groupby(['date_block_num'])['category_cnt_block'].transform(np.median)

training['shop_category_cnt_block_mean'] = training.groupby(['date_block_num'])['shop_category_cnt_block'].transform(np.mean)
#training['shop_category_cnt_block_min'] = training.groupby(['date_block_num'])['shop_category_cnt_block'].transform(np.min)
#training['shop_category_cnt_block_max'] = training.groupby(['date_block_num'])['shop_category_cnt_block'].transform(np.max)
#training['shop_category_cnt_block_std'] = training.groupby(['date_block_num'])['shop_category_cnt_block'].transform(np.std)
#training['shop_category_cnt_block_med'] = training.groupby(['date_block_num'])['shop_category_cnt_block'].transform(np.median)

#training['shop_item_cnt_block_mean'] = training.groupby(['date_block_num'])['shop_item_cnt_block'].transform(np.mean)
#training['shop_item_cnt_block_min'] = training.groupby(['date_block_num'])['shop_item_cnt_block'].transform(np.min)
#training['shop_item_cnt_block_max'] = training.groupby(['date_block_num'])['shop_item_cnt_block'].transform(np.max)
#training['shop_item_cnt_block_std'] = training.groupby(['date_block_num'])['shop_item_cnt_block'].transform(np.std)
#training['shop_item_cnt_block_med'] = training.groupby(['date_block_num'])['shop_item_cnt_block'].transform(np.median)

In [62]:
training['comp1'] = training['item_cnt_block'] * training['shop_cnt_block']
training['comp2'] = training['item_cnt_block'] / training['shop_cnt_block']

training['comp3'] = training['item_cnt_block'] * training['item_id_mean_encoding']
training['comp4'] = training['item_cnt_block'] / training['item_id_mean_encoding']

training['comp5'] =  training['shop_cnt_block'] / training['item_cnt_block']
training['comp6'] = training['item_id_mean_encoding'] / training['item_cnt_block']

training['comp7'] = training['item_cnt_block'] * training['shop_category_cnt_block']
training['comp8'] = training['item_cnt_block'] / training['shop_category_cnt_block']

training['comp9'] = training['item_cnt_block'] * training['shop_cat_mean_encoding']
training['comp10'] = training['item_cnt_block'] / training['shop_cat_mean_encoding']

training['comp11'] = training['shop_cnt_block'] * training['shop_id_mean_encoding']
training['comp12'] = training['shop_cnt_block'] / training['shop_id_mean_encoding']


In [None]:
def add_lags(df, cols, name, lags = [1,2,3,5,6]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()

        df = df.merge(result, on=cols, how='left')
        df[lag_name].fillna(0,inplace=True)
        if "mean" in name:
            df[lag_name] = pd.to_numeric(df[lag_name], downcast='float')
        else:
            df[lag_name] = pd.to_numeric(df[lag_name].astype(int), downcast='unsigned')
        del result
        gc.collect()
    
    return df
                                         

                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_cnt_block')
training = add_lags(training, ['item_id','date_block_num'], 'item_cnt_block_mean')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_cnt_block')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_cnt_block_mean')
training = add_lags(training, ['item_category_id','date_block_num'], 'category_cnt_block')
training = add_lags(training, ['item_category_id','date_block_num'], 'category_cnt_block_mean')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_category_cnt_block_mean')

training = add_lags(training, ['shop_id','item_id','date_block_num'], 'comp1')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'comp2')
training = add_lags(training, ['item_id','date_block_num'], 'comp3')
training = add_lags(training, ['item_id','date_block_num'], 'comp4')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'comp5')
training = add_lags(training, ['item_id','date_block_num'], 'comp6')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'comp7')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'comp8')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'comp9')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'comp10')
training = add_lags(training, ['shop_id','date_block_num'], 'comp11')
training = add_lags(training, ['shop_id','date_block_num'], 'comp12')


item_cnt_block 1
item_cnt_block 2
item_cnt_block 3
item_cnt_block 5
item_cnt_block 6
item_cnt_block_mean 1
item_cnt_block_mean 2
item_cnt_block_mean 3
item_cnt_block_mean 5
item_cnt_block_mean 6
shop_cnt_block 1
shop_cnt_block 2
shop_cnt_block 3
shop_cnt_block 5
shop_cnt_block 6
shop_cnt_block_mean 1
shop_cnt_block_mean 2
shop_cnt_block_mean 3
shop_cnt_block_mean 5
shop_cnt_block_mean 6
category_cnt_block 1
category_cnt_block 2
category_cnt_block 3
category_cnt_block 5
category_cnt_block 6
category_cnt_block_mean 1
category_cnt_block_mean 2
category_cnt_block_mean 3
category_cnt_block_mean 5
category_cnt_block_mean 6
shop_category_cnt_block 1
shop_category_cnt_block 2
shop_category_cnt_block 3
shop_category_cnt_block 5
shop_category_cnt_block 6
shop_category_cnt_block_mean 1
shop_category_cnt_block_mean 2
shop_category_cnt_block_mean 3
shop_category_cnt_block_mean 5
shop_category_cnt_block_mean 6
comp1 1
comp1 2
comp1 3
comp1 5
comp1 6


In [19]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'shop_item_cnt_block',
       'item_category_id', 'month', 'year', 'item_cnt_block',
       'shop_cnt_block', 'category_cnt_block', 'shop_category_cnt_block',
       'item_price', 'shop_cat', 'item_id_mean_encoding',
       'shop_id_mean_encoding', 'item_category_id_mean_encoding',
       'shop_cat_mean_encoding', 'item_cnt_block_mean',
       'shop_cnt_block_mean', 'category_cnt_block_mean',
       'shop_category_cnt_block_mean', 'item_cnt_block_lag_1',
       'item_cnt_block_lag_3', 'item_cnt_block_lag_6',
       'item_cnt_block_mean_lag_1', 'item_cnt_block_mean_lag_3',
       'item_cnt_block_mean_lag_6', 'shop_cnt_block_lag_1',
       'shop_cnt_block_lag_3', 'shop_cnt_block_lag_6',
       'shop_cnt_block_mean_lag_1', 'shop_cnt_block_mean_lag_3',
       'shop_cnt_block_mean_lag_6', 'category_cnt_block_lag_1',
       'category_cnt_block_lag_3', 'category_cnt_block_lag_6',
       'category_cnt_block_mean_lag_1', 'category_cnt_block_mean_la

In [20]:
training.fillna(0,inplace=True)

In [51]:
features = [
     'item_id_mean_encoding',
       'shop_id_mean_encoding', 'item_category_id_mean_encoding',
       'shop_cat_mean_encoding', 
    'item_cnt_block_lag_1',
       'item_cnt_block_lag_3', 'item_cnt_block_lag_6',
       'item_cnt_block_mean_lag_1', 'item_cnt_block_mean_lag_3',
       'item_cnt_block_mean_lag_6', 'shop_cnt_block_lag_1',
       'shop_cnt_block_lag_3', 'shop_cnt_block_lag_6',
       'shop_cnt_block_mean_lag_1', 'shop_cnt_block_mean_lag_3',
       'shop_cnt_block_mean_lag_6', 'category_cnt_block_lag_1',
       'category_cnt_block_lag_3', 'category_cnt_block_lag_6',
       'category_cnt_block_mean_lag_1', 'category_cnt_block_mean_lag_3',
       'category_cnt_block_mean_lag_6', 'shop_category_cnt_block_lag_1',
       'shop_category_cnt_block_lag_3', 'shop_category_cnt_block_lag_6',
       'shop_category_cnt_block_mean_lag_1',
       'shop_category_cnt_block_mean_lag_3',
       'shop_category_cnt_block_mean_lag_6', 
    #'comp1', 'comp2', 'comp3',
     #  'comp4', 'comp5', 'comp6', 'comp7', 'comp8', 'comp9', 'comp10',
      # 'comp11', 'comp12'
    
]

In [23]:

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,shop_item_cnt_block,item_category_id,month,year,item_cnt_block,shop_cnt_block,category_cnt_block,shop_category_cnt_block,item_price,shop_cat,item_id_mean_encoding,shop_id_mean_encoding,item_category_id_mean_encoding,shop_cat_mean_encoding,item_cnt_block_mean,shop_cnt_block_mean,category_cnt_block_mean,shop_category_cnt_block_mean,item_cnt_block_lag_1,item_cnt_block_lag_3,item_cnt_block_lag_6,item_cnt_block_mean_lag_1,item_cnt_block_mean_lag_3,item_cnt_block_mean_lag_6,shop_cnt_block_lag_1,shop_cnt_block_lag_3,shop_cnt_block_lag_6,shop_cnt_block_mean_lag_1,shop_cnt_block_mean_lag_3,shop_cnt_block_mean_lag_6,category_cnt_block_lag_1,category_cnt_block_lag_3,category_cnt_block_lag_6,category_cnt_block_mean_lag_1,category_cnt_block_mean_lag_3,category_cnt_block_mean_lag_6,shop_category_cnt_block_lag_1,shop_category_cnt_block_lag_3,shop_category_cnt_block_lag_6,shop_category_cnt_block_mean_lag_1,shop_category_cnt_block_mean_lag_3,shop_category_cnt_block_mean_lag_6,comp1,comp2,comp3,comp4,comp5,comp6,comp7,comp8,comp9,comp10,comp11,comp12
1977808,9400,34,20,0,70,9,2014,14.0,406.0,587.0,8.0,299.0,34_70,0.218792,0.044632,0.207842,0.041716,11.554706,1935.5,4584.067843,89.16923,44,13,0,12.728039,10.760589,11.849607,368,0,0,1996.285767,1878.785767,2216.380859,731,1013,615,5577.77002,5413.624512,6547.391113,5,0,0,108.448822,104.024025,125.359169,16192,0.119565,9.626846,201.104294,8.363636,0.004973,220,8.8,1.835518,1054.742857,16.424613,8245.186703
774513,3692,15,15,0,22,4,2014,0.0,1671.0,545.0,6.0,0.0,15_22,0.569083,0.222175,0.325172,0.423423,9.003333,1842.166667,5550.332157,106.891709,0,0,0,11.849607,10.266078,0.0,2144,2460,0,2216.380859,2220.047607,0.0,655,878,0,6547.391113,6563.625,0.0,11,22,0,125.359169,123.881836,0.0,0,0.0,0.0,0.0,inf,inf,0,0.0,0.0,0.0,476.343611,9650.042314
2571549,12473,4,25,0,55,2,2015,19.0,980.0,8036.0,108.0,199.0,4_55,0.31016,0.143118,0.206754,0.158536,12.527647,1716.571429,4371.776275,88.835056,17,18,27,16.166079,16.610001,12.728039,1188,1371,1387,2176.380859,2298.428467,1996.285767,9291,9809,11180,5477.754395,5516.103027,5577.77002,135,145,217,107.25856,106.721191,108.448822,20196,0.01431,5.272727,54.810345,69.882353,0.018245,2295,0.125926,2.695104,107.231467,170.023846,8300.859186
3821259,17544,37,25,0,37,2,2015,0.0,988.0,4847.0,67.0,0.0,37_37,0.028571,0.145393,0.14027,0.095902,12.527647,1716.571429,4371.776275,88.835056,0,0,0,16.166079,16.610001,12.728039,1144,1496,1240,2176.380859,2298.428467,1996.285767,5959,5185,5992,5477.754395,5516.103027,5577.77002,51,44,112,107.25856,106.721191,108.448822,0,0.0,0.0,0.0,inf,inf,0,0.0,0.0,0.0,166.329627,7868.327618
4706230,22145,21,24,0,37,1,2015,0.0,1923.0,5959.0,171.0,0.0,21_37,0.286479,0.234929,0.14027,0.242715,16.166078,2176.380952,5477.754314,107.258562,1,2,4,24.733137,13.808432,10.62549,3959,2031,1603,3318.5,1926.547607,1769.357178,7582,5074,5647,7494.178711,5176.736328,5166.635254,279,169,160,148.356445,96.386688,99.136688,3959,0.000253,0.286479,3.490654,3959.0,0.286479,279,0.003584,0.242715,4.120056,930.08461,16851.887269
158361,979,25,17,0,37,6,2014,0.0,6405.0,6189.0,483.0,0.0,25_37,0.096306,0.665948,0.14095,0.591002,10.760588,1878.785714,5413.62451,104.024024,0,0,0,10.428627,11.849607,0.0,5746,6826,0,1873.214233,2216.380859,0.0,7177,6322,0,5214.182129,6547.391113,0.0,454,466,0,99.610313,125.359169,0.0,0,0.0,0.0,0.0,inf,inf,0,0.0,0.0,0.0,3826.539761,8628.295552
4029669,18622,6,29,0,40,6,2015,13.0,1539.0,9304.0,198.0,169.0,6_40,0.809524,0.282582,0.157103,0.163332,10.833333,1430.904762,3318.272157,74.266709,23,32,39,11.949804,12.332941,24.733137,1748,1981,4115,1592.166626,1701.571411,3318.5,9208,13084,18876,3428.631348,4584.216309,7494.178711,172,302,383,75.539597,94.878365,148.356445,40204,0.013158,18.619048,28.411765,76.0,0.035197,3956,0.133721,3.756633,140.817604,493.953239,6185.81631
2548948,12361,39,18,1,40,7,2014,17.0,699.0,14452.0,59.0,144.0,39_40,0.673414,0.097317,0.157499,0.037149,10.62549,1769.357143,5166.635294,99.13669,137,29,49,10.760589,9.003333,10.266078,734,691,0,1878.785767,1842.166626,2220.047607,16064,18320,22065,5413.624512,5550.332031,6563.625,68,89,0,104.024025,106.891708,123.881836,100558,0.186649,92.25776,203.440882,5.357664,0.004915,9316,2.014706,5.089408,3687.855159,71.430985,7542.329134
2981527,13719,47,33,0,69,10,2015,11.0,1776.0,888.0,20.0,229.0,47_69,0.26387,0.269166,0.099091,0.150152,13.289608,1553.785714,3228.865882,73.363525,16,36,31,11.648627,10.808824,12.53549,1926,1700,1813,1719.523804,1427.642822,1711.166626,665,1124,704,2829.166992,3335.517822,4010.790771,27,37,24,66.414055,75.583504,88.694931,30816,0.008307,4.221922,60.635897,120.375,0.016492,432,0.592593,2.402432,106.558704,518.413548,7155.437995
1518521,6996,26,27,0,20,4,2015,0.0,1527.0,4768.0,123.0,0.0,26_20,0.0,0.195545,0.608127,0.663236,12.53549,1711.166667,4010.790784,88.694935,0,0,0,12.332941,16.166079,13.808432,1486,1715,1900,1701.571411,2176.380859,1926.547607,5399,4969,6063,4584.216309,5477.754395,5176.736328,197,104,111,94.878365,107.25856,96.386688,0,0.0,0.0,0.0,inf,0.0,0,0.0,0.0,0.0,290.580603,7599.254663


In [32]:
gc.collect()

ZEROS_KEEP=0.2


#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['shop_item_cnt_block']





x_val = training[training['date_block_num'] == 33]
y_val = x_val['shop_item_cnt_block']

#pos_val_len = len(y_val[y_val != 0])
#print("pos_val_len", pos_val_len)

#zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
#print("zeros_keep_indices_val", len(zeros_keep_indices_val))
#non_zeros_val_indices = y_val[y_val != 0].index
#print("non_zeros_val_indices", len(non_zeros_val_indices))

#val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

#y_val = y_val.loc[val_indices]
#x_val = x_val.loc[val_indices]

In [60]:
lgtrain = lgbm.Dataset(x_train[features], label=y_train)
lgval = lgbm.Dataset(x_val[features], label=y_val)



#[0.00542047893814942, 29, 24, 0.39949465609514856, 1, 0.67943500, 10]
params = {
        "num_threads": 8,
        "verbosity": -1,
        #"zero_as_missing": "true",
        "boosting":'gbdt',
        "objective" : "regression",
        "metric" : "rmse",
        "seed": 42,
        "learning_rate" : 0.1,
        "min_data_in_leaf": 5000,
        #"num_leaves": 29,
        "max_depth" : 3,
        #"bagging_fraction": 0.4,
        #"bagging_freq": 1,
        #"feature_fraction": 0.68,
        #"lambda_l1": 10,
}

evals_result = {}
model_lgb = lgbm.train(params, lgtrain, 10000, 
                      valid_sets=[lgval], 
                      early_stopping_rounds=10, 
                      verbose_eval=10, 
                      evals_result=evals_result)


scores = {}
for i,score in enumerate(model_lgb.feature_importance()):
    scores[features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

Training until validation scores don't improve for 10 rounds.
[10]	valid_0's rmse: 0.971402
[20]	valid_0's rmse: 0.944848
[30]	valid_0's rmse: 0.935816
[40]	valid_0's rmse: 0.929409
[50]	valid_0's rmse: 0.925598
[60]	valid_0's rmse: 0.922172
[70]	valid_0's rmse: 0.919549
[80]	valid_0's rmse: 0.916408
[90]	valid_0's rmse: 0.915306
[100]	valid_0's rmse: 0.91285
[110]	valid_0's rmse: 0.910817
[120]	valid_0's rmse: 0.908783
[130]	valid_0's rmse: 0.906624
[140]	valid_0's rmse: 0.904706
[150]	valid_0's rmse: 0.904508
[160]	valid_0's rmse: 0.902772
[170]	valid_0's rmse: 0.901179
[180]	valid_0's rmse: 0.900762
[190]	valid_0's rmse: 0.900322
[200]	valid_0's rmse: 0.899047
[210]	valid_0's rmse: 0.898536
[220]	valid_0's rmse: 0.897899
[230]	valid_0's rmse: 0.89713
[240]	valid_0's rmse: 0.895594
[250]	valid_0's rmse: 0.894776
[260]	valid_0's rmse: 0.894367
[270]	valid_0's rmse: 0.893427
[280]	valid_0's rmse: 0.893369
[290]	valid_0's rmse: 0.892949
[300]	valid_0's rmse: 0.892677
[310]	valid_0's rms

[('item_cnt_block_lag_1', 707),
 ('item_id_mean_encoding', 350),
 ('shop_cat_mean_encoding', 314),
 ('item_category_id_mean_encoding', 290),
 ('item_cnt_block_lag_3', 204),
 ('shop_category_cnt_block_lag_1', 199),
 ('item_cnt_block_mean_lag_1', 160),
 ('category_cnt_block_lag_1', 153),
 ('item_cnt_block_lag_6', 113),
 ('shop_id_mean_encoding', 94),
 ('shop_cnt_block_lag_1', 85),
 ('shop_cnt_block_mean_lag_1', 79)]

In [57]:
features = [item[0] for item in scores.items() if item[1] > 100]

In [45]:
test.head()

Unnamed: 0,ID,shop_id,item_id,item_id_mean_encoding,shop_id_mean_encoding,item_category_id_mean_encoding,item_cnt_block_lag_1,shop_cnt_block_lag_1,category_cnt_block_lag_1,shop_category_cnt_block_lag_1,dbn_scaled
0,0,5,5037,0.547856,-0.278282,0.380675,0.15536,-0.375477,-0.295035,0.012049,3.526361
1,1,5,5320,-0.348508,-0.278282,-0.146701,-0.176311,-0.375477,0.968274,0.240304,3.526361
2,2,5,5233,0.895168,-0.278282,0.380675,0.380896,-0.375477,-0.295035,0.012049,3.526361
3,3,5,5232,0.636626,-0.324261,0.236785,0.19516,-0.375477,-0.385137,-0.135211,3.526361
4,4,5,5268,-0.348508,-0.319524,1.959103,-0.176311,-0.375477,1.26411,0.122495,3.526361


In [38]:
feats  = [
    'item_id_mean_encoding', 'shop_id_mean_encoding', 'item_category_id_mean_encoding',
    'item_cnt_block',
       'shop_cnt_block', 'category_cnt_block',
       'shop_category_cnt_block'
    ]

In [39]:
cols = ['item_id', 'shop_id']

In [40]:
test = test.merge(training[training['date_block_num'] == 33][cols + feats], on=cols, how='left')

In [42]:
test.rename(columns={"item_cnt_block":"item_cnt_block_lag_1", "shop_cnt_block":"shop_cnt_block_lag_1",\
                     "category_cnt_block":"category_cnt_block_lag_1", "shop_category_cnt_block":"shop_category_cnt_block_lag_1"},inplace=True)

In [None]:
len(test)

In [44]:
test['dbn_scaled'] = np.log(34)

In [46]:
preds = lr_model.predict(test[features])
preds.clip(0,20,out=preds)

array([0.85794372, 0.        , 1.17073343, ..., 0.        , 0.        ,
       0.        ])

In [47]:

print(np.mean(preds))
print(np.max(preds))

0.33547950813769933
20.0


In [48]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = preds

submission.to_csv('submission.csv', index=False)

In [None]:
bestpreds = pd.read_csv('submissionbest.csv')['item_cnt_month']
print(np.mean(bestpreds))
print(np.max(bestpreds))

In [None]:
lr_preds = pd.read_csv('lr111.csv')['item_cnt_month']
lstm_preds = pd.read_csv('lstm104.csv')['item_cnt_month']
#cb_preds = pd.read_csv('cb102.csv')['item_cnt_month']


#preds = np.mean(np.array([lr_preds, lg_preds]),axis=0)

preds = (lstm_preds * 0.65) + (lr_preds * 0.35)