In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd
from sklearn.model_selection import KFold
from itertools import product

In [3]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [4]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'].isin([2013]) == False]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [5]:
# Якутск Орджоникидзе, 56
sales_train.loc[sales_train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
sales_train.loc[sales_train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
sales_train.loc[sales_train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

In [6]:
sums = sales_train.groupby('item_id')['item_cnt_day'].sum().reset_index().rename(columns={"item_cnt_day":"item_total_sales"}).sort_values(by='item_total_sales')

ids_reject = sums[(sums['item_total_sales'] > 0) & (sums['item_total_sales'] < 1000)]['item_id'].unique()

In [7]:
train_item_ids = sales_train['item_id'].unique()
train_item_ids = np.setdiff1d(train_item_ids, ids_reject)
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [8]:
combinations = []

for dbn in range(np.min(train_blocks), np.max(train_blocks)+1):
    sales = sales_train[sales_train.date_block_num==dbn]
    item_ids = np.intersect1d(sales.item_id.unique(), test_item_ids)
    dbn_combos = list(product(sales.shop_id.unique(), item_ids, [dbn]))
    for combo in dbn_combos:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])

In [9]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"item_cnt_block"})

training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)


training['item_cnt_block'] = training['item_cnt_block'].clip(0,20).astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

for col in ['item_id', 'shop_id', 'item_category_id']:
    training[col] = pd.to_numeric(training[col], downcast='unsigned')

In [10]:
dates = sales_train[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
training['month'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')


In [11]:
training["shop_cat"] = training["shop_id"].astype(str) + "_" + training["item_category_id"].astype(str)
training["shop_item"] = training["shop_id"].astype(str) + "_" + training["item_id"].astype(str)

In [35]:
unique_shop_cats = training['shop_cat'].unique()
shop_cats = dict(list(zip(unique_shop_cats, range(1,len(unique_shop_cats)))))

def get_shop_cat_int(x):
    if x in shop_cats:
        return shop_cats[x]

training['shop_cat_int'] = training['shop_cat'].apply(lambda x: get_shop_cat_int(x))

In [36]:
unique_shop_items = training['shop_item'].unique()
shop_items = dict(list(zip(unique_shop_items, range(1,len(unique_shop_items)))))

def get_shop_item_int(x):
    if x in shop_items:
        return shop_items[x]

training['shop_item_int'] = training['shop_item'].apply(lambda x: get_shop_item_int(x))

In [12]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]



y_train = training["item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1

fold 1
fold 2
fold 3
fold 4
fold 5


In [13]:
def add_block_units_mean(df, cols, name):
    print(name)
    name_units = name + '_units'
    name_mean = name + '_mean'
    
    try:
        df.drop(columns=[name_units, name_mean],inplace=True)
    except:
        pass

    
    block_units = df.groupby(cols,as_index=False)['item_cnt_block'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_units})
    df = df.merge(block_units, on=cols, how='left')
    df[name_units].fillna(0,inplace=True)
    df[name_units] = pd.to_numeric(df[name_units].astype(int),downcast='unsigned')
    del block_units
    
    block_means = df.groupby(cols,as_index=False)['item_cnt_block'].mean()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_mean})
    df = df.merge(block_means, on=cols, how='left')
    df[name_mean].fillna(0,inplace=True)
    df[name_mean] = pd.to_numeric(df[name_mean],downcast='float')
    del block_means
    
    gc.collect()
    return df


training = add_block_units_mean(training, ['item_id','date_block_num'], 'item_block')
training = add_block_units_mean(training, ['shop_id','date_block_num'], 'shop_block')
training = add_block_units_mean(training, ['item_category_id','date_block_num'], 'cat_block')
training = add_block_units_mean(training, ['shop_id', 'item_category_id','date_block_num'], 'shop_cat_block')
training = add_block_units_mean(training, ['shop_id', 'item_id','date_block_num'], 'shop_item_block')

item_block
shop_block
cat_block
shop_cat_block
shop_item_block


In [14]:
def add_rolls(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name_tmp})\
            [cols+[roll_name_tmp]]
        
    
        df = df.merge(block_units_rolling_temp, on=cols, how='left')
        #print(df.columns.values)
        del block_units_rolling_temp
        gc.collect()
        

        block_units_rolling = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [roll_name_tmp].shift(1)\
            .rename(columns={roll_name_tmp:roll_name}).reset_index()

        df = df.merge(block_units_rolling, on=cols, how='left')
        df[roll_name].fillna(0,inplace=True)
        df[roll_name] = pd.to_numeric(df[roll_name], downcast='float')
        df.drop(columns=[roll_name_tmp], inplace=True)
        del block_units_rolling
        gc.collect()
    
    return df
    

training = add_rolls(training, ['item_id','date_block_num'], 'item_block_units')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item')

item_block_units 3
item_block_mean 3
shop_block_units 3
shop_block_mean 3
cat_block_units 3
cat_block_mean 3
shop_cat_block_units 3
shop_cat_block_mean 3


In [64]:
training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3


In [15]:
def add_lags(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()

        df = df.merge(result, on=cols, how='left')
        df[lag_name].fillna(0,inplace=True)
        if "mean" in name:
            df[lag_name] = pd.to_numeric(df[lag_name], downcast='float')
        else:
            df[lag_name] = pd.to_numeric(df[lag_name].astype(int), downcast='unsigned')
        del result
        gc.collect()
    
    return df
                                         

                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_block_units')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

item_block_units 1
item_block_mean 1
shop_block_units 1
shop_block_mean 1
cat_block_units 1
cat_block_mean 1
shop_cat_block_units 1
shop_cat_block_mean 1
shop_item_block_units 1
shop_item_block_mean 1


In [68]:

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_block,item_category_id,month,shop_cat,shop_item,item_id_mean_encoding,shop_id_mean_encoding,item_category_id_mean_encoding,month_mean_encoding,shop_cat_mean_encoding,shop_item_mean_encoding,date_block_num_mean_encoding,item_block_units,item_block_mean,shop_block_units,shop_block_mean,cat_block_units,cat_block_mean,shop_cat_block_units,shop_cat_block_mean,shop_item_block_units,shop_item_block_mean,item_units,item_max_units_block,item_min_units_block,item_minmax_q0.25,item_minmax_q0.5,item_minmax_q0.75,shop_units,shop_max_units_block,shop_min_units_block,shop_minmax_q0.25,shop_minmax_q0.5,shop_minmax_q0.75,cat_units,cat_max_units_block,cat_min_units_block,cat_minmax_q0.25,cat_minmax_q0.5,cat_minmax_q0.75,shop_cat_units,shop_cat_max_units_block,shop_cat_min_units_block,shop_cat_minmax_q0.25,shop_cat_minmax_q0.5,shop_cat_minmax_q0.75,shop_item_units,shop_item_max_units_block,shop_item_min_units_block,shop_item_minmax_q0.25,shop_item_minmax_q0.5,shop_item_minmax_q0.75,item_block_units_rolling_3,item_block_mean_rolling_3,shop_block_units_rolling_3,shop_block_mean_rolling_3,cat_block_units_rolling_3,cat_block_mean_rolling_3,shop_cat_block_units_rolling_3,shop_cat_block_mean_rolling_3,shop_item_block_mean_rolling_3,item_block_units_lag_1,item_block_mean_lag_1,shop_block_units_lag_1,shop_block_mean_lag_1,cat_block_units_lag_1,cat_block_mean_lag_1,shop_cat_block_units_lag_1,shop_cat_block_mean_lag_1,shop_item_block_units_lag_1,shop_item_block_mean_lag_1,item_share_of_total_units,category_share_of_total_units,shop_share_of_units,shop_item_share_of_total_units,shop_item_share_of_shop_units,item_share_of_shop_units,shop_item_share_of_shop_units_mean
1799840,15165,53,26,0,69,3,53_69,53_15165,0.140575,0.426174,0.32641,0.509234,0.186667,0.142857,0.43619,14,0.304348,1072,0.368638,774,0.420652,12,0.3,0,0,2698590,14,3,5.75,8.5,11.25,170597820,1792,771,1026.25,1281.5,1536.75,444091268,1237,104,387.25,670.5,953.75,9654158,23,0,5.75,11.5,17.25,58665,1,0,0.25,0.5,0.75,0.0,0.0,1381.666626,0.507151,975.333313,0.637568,11.666667,0.322092,0.0,10,0.212766,1061,0.385678,1237,0.598162,23,0.522727,0,0.0,129.399414,699.819092,2001.886475,2.813031,0.034388,0.034388,0.031308
242845,2354,50,26,0,30,3,50_30,50_2354,0.488665,0.382974,1.520293,0.511849,1.140796,0.3125,0.440278,4,0.086957,773,0.265818,6558,1.218506,61,0.521368,0,0,2698590,260,1,65.75,130.5,195.25,170597820,1554,665,887.25,1109.5,1331.75,444091268,10108,3862,5423.5,6985.0,8546.5,9654158,200,49,86.75,124.5,162.25,58665,4,0,1.0,2.0,3.0,11.0,0.223404,1159.333374,0.425639,8483.666992,1.565987,106.666664,0.969368,0.333333,8,0.170213,828,0.300981,7122,1.317669,80,0.695652,1,1.0,129.399414,699.819092,2001.886475,2.813031,0.034388,0.034388,0.042499
447813,3818,18,19,0,55,8,18_55,18_3818,0.09125,0.372742,0.314535,0.457326,0.243711,0.0,0.555919,7,0.137255,948,0.471642,8409,0.396352,128,0.307692,0,0,2909805,10,1,3.25,5.5,7.75,114680550,1480,606,824.5,1043.0,1261.5,378932448,11613,4757,6471.0,8185.0,9899.0,7430048,178,57,87.25,117.5,147.75,57055,0,0,0.0,0.0,0.0,7.333333,0.148844,714.666687,0.390451,7339.666504,0.383909,80.333336,0.207571,0.0,6,0.12,724,0.375714,8081,0.403042,86,0.214464,0,0.0,139.527344,1694.342896,1380.080444,2.73583,0.049751,0.049751,0.044209
1085979,9171,57,31,0,43,8,57_43,57_9171,0.15783,0.818979,0.102133,0.457967,0.054545,0.3125,0.389078,1,0.02381,2382,0.689236,122,0.058095,0,0.0,0,0,2373336,13,1,4.0,7.0,10.0,195291648,4070,1645,2251.25,2857.5,3463.75,435573768,426,122,198.0,274.0,350.0,10370804,13,0,3.25,6.5,9.75,56508,2,0,0.5,1.0,1.5,2.666667,0.061311,1880.666626,0.594929,176.0,0.065878,0.0,0.0,0.0,1,0.023256,1874,0.565821,163,0.063178,0,0.0,0,0.0,113.803246,291.39856,1126.504883,2.709601,0.028935,0.028935,0.043519
414867,3605,5,25,0,55,2,5_55,5_3605,0.225806,0.332942,0.313347,0.514203,0.245208,0.222222,0.457663,8,0.170213,806,0.292984,6820,0.294932,103,0.20935,0,0,2779674,26,3,8.75,14.5,20.25,162699642,1418,530,752.0,974.0,1196.0,396919512,11613,4757,6471.0,8185.0,9899.0,8445096,167,75,98.0,121.0,144.0,59142,1,0,0.25,0.5,0.75,11.0,0.22,1091.666626,0.412823,9203.666992,0.383218,132.666672,0.275767,0.333333,7,0.14,960,0.353852,7942,0.325492,127,0.260246,0,0.0,133.28746,497.367035,1623.162842,2.835903,0.03635,0.03635,0.043519
2276759,19034,19,14,0,37,3,19_37,19_19034,0.168077,0.474167,0.280862,0.511359,0.250489,0.1875,0.636405,5,0.104167,1029,0.634793,1920,0.285714,36,0.257143,0,0,2364096,20,2,6.5,11.0,15.5,79837492,2312,808,1184.0,1560.0,1936.0,236452752,3796,1376,1981.0,2586.0,3191.0,4926099,87,25,40.5,56.0,71.5,49252,1,0,0.25,0.5,0.75,9.5,0.206522,1008.5,0.651709,1411.5,0.234634,29.5,0.225284,1.0,13,0.282609,977,0.621897,1376,0.221578,30,0.222222,1,1.0,113.360184,1040.741699,1768.798706,2.36167,0.06169,0.06169,0.043519
597759,4840,7,33,2,20,10,7_20,7_4840,6.445946,0.452889,1.778045,0.40129,1.706358,5.0,0.331046,80,1.818182,1149,0.287898,5664,1.214408,130,1.226415,2,2,2573868,461,80,175.25,270.5,365.75,233461527,2261,770,1142.75,1515.5,1888.25,530591380,8130,1407,3087.75,4768.5,6449.25,12058895,173,15,54.5,94.0,133.5,58497,5,2,2.75,3.5,4.25,0.0,0.0,1223.0,0.3531,4612.333496,1.182831,106.0,1.154896,0.0,461,10.72093,1198,0.329755,5733,1.320055,136,1.346535,5,5.0,123.418907,728.626282,897.310242,2.804975,0.025056,0.025056,0.026277
1001675,8416,56,21,0,37,10,56_37,56_8416,0.515203,0.453587,0.27779,0.40129,0.20165,0.0,0.504295,9,0.173077,1101,0.477865,2423,0.279019,34,0.203593,0,0,3112980,126,4,34.5,65.0,95.5,137928960,1800,691,968.25,1245.5,1522.75,382022004,3796,1376,1981.0,2586.0,3191.0,7346577,71,12,26.75,41.5,56.25,59865,1,0,0.25,0.5,0.75,13.333333,0.264706,1194.333374,0.594082,2232.333252,0.260857,43.0,0.252571,0.0,8,0.16,944,0.445073,2161,0.252749,31,0.181287,0,0.0,149.26973,1842.489502,435.390015,2.870572,0.043403,0.043403,0.042919
2310151,19310,53,31,0,54,8,53_54,53_19310,0.037448,0.426174,0.17555,0.457967,0.0,0.0,0.389078,5,0.119048,1185,0.342882,227,0.168899,0,0.0,0,0,2373336,5,1,2.0,3.0,4.0,195291648,1792,771,1026.25,1281.5,1536.75,435573768,304,128,172.0,216.0,260.0,10370804,0,0,0.0,0.0,0.0,56508,0,0,0.0,0.0,0.0,2.666667,0.058321,964.333313,0.304407,219.333328,0.195076,0.0,0.0,0.0,2,0.045455,1065,0.321558,228,0.189369,0,0.0,0,0.0,113.803246,291.39856,1126.504883,2.709601,0.028935,0.028935,0.044358
1735383,14543,56,21,0,37,10,56_37,56_14543,0.027972,0.446969,0.280035,0.398059,0.212801,0.0,0.493408,1,0.019231,1101,0.477865,2423,0.279019,34,0.203593,0,0,3112980,4,1,1.75,2.5,3.25,137928960,1800,691,968.25,1245.5,1522.75,382022004,3796,1376,1981.0,2586.0,3191.0,7346577,71,12,26.75,41.5,56.25,59865,0,0,0.0,0.0,0.0,2.666667,0.053344,1194.333374,0.594082,2232.333252,0.260857,43.0,0.252571,0.0,4,0.08,944,0.445073,2161,0.252749,31,0.181287,0,0.0,149.26973,1842.489502,435.390015,2.870572,0.043403,0.043403,0.041015


In [16]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'shop_cat', 'shop_item',
       'item_id_mean_encoding', 'shop_id_mean_encoding',
       'item_category_id_mean_encoding', 'month_mean_encoding',
       'shop_cat_mean_encoding', 'shop_item_mean_encoding',
       'date_block_num_mean_encoding', 'item_block_units',
       'item_block_mean', 'shop_block_units', 'shop_block_mean',
       'cat_block_units', 'cat_block_mean', 'shop_cat_block_units',
       'shop_cat_block_mean', 'shop_item_block_units',
       'shop_item_block_mean', 'item_block_units_rolling_3',
       'item_block_mean_rolling_3', 'shop_block_units_rolling_3',
       'shop_block_mean_rolling_3', 'cat_block_units_rolling_3',
       'cat_block_mean_rolling_3', 'shop_cat_block_units_rolling_3',
       'shop_cat_block_mean_rolling_3', 'item_block_units_lag_1',
       'item_block_mean_lag_1', 'shop_block_units_lag_1',
       'shop_block_mean_lag_1', 'cat_block_units_lag_1',
    

In [19]:
training.fillna(0,inplace=True)
training = training.sample(frac=1).reset_index(drop=True)


In [47]:
gc.collect()

ZEROS_KEEP=0.2


#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['item_cnt_block']





x_val = training[training['date_block_num'] == 33]
y_val = x_val['item_cnt_block']

pos_val_len = len(y_val[y_val != 0])
print("pos_val_len", pos_val_len)

zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
print("zeros_keep_indices_val", len(zeros_keep_indices_val))
non_zeros_val_indices = y_val[y_val != 0].index
print("non_zeros_val_indices", len(non_zeros_val_indices))

val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

#y_val = y_val.loc[val_indices]
#x_val = x_val.loc[val_indices]



pos_val_len 29202
zeros_keep_indices_val 146010
non_zeros_val_indices 29202


In [91]:
features = [
    
'item_id',
#    'shop_id', 
 #      'item_category_id', 'month', 'shop_cat_int', 
   # 'shop_item_int',
       'item_id_mean_encoding', 
   #'shop_id_mean_encoding',
       #'item_category_id_mean_encoding', 
    #'month_mean_encoding',
     #  'shop_cat_mean_encoding', 
    'shop_item_mean_encoding',
    #   'date_block_num_mean_encoding', 
    'item_block_units_rolling_3',
       'item_block_mean_rolling_3', 
#    'shop_block_units_rolling_3',
  #     'shop_block_mean_rolling_3', 'cat_block_units_rolling_3',
   #    'cat_block_mean_rolling_3', 'shop_cat_block_units_rolling_3',
    #   'shop_cat_block_mean_rolling_3', 
    'item_block_units_lag_1',
       #'item_block_mean_lag_1', 
  #  'shop_block_units_lag_1',
      # 'shop_block_mean_lag_1', 'cat_block_units_lag_1',
       #'cat_block_mean_lag_1', 'shop_cat_block_units_lag_1',
   #    'shop_cat_block_mean_lag_1', 
    'shop_item_block_units_lag_1',
       #'shop_item_block_mean_lag_1'
]


gc.collect()
lgtrain = lgbm.Dataset(x_train[features], label=y_train)
lgval = lgbm.Dataset(x_val[features], label=y_val)



#[0.00542047893814942, 29, 24, 0.39949465609514856, 1, 0.67943500, 10]
params = {
        "num_threads": 16,
        #"device": "gpu",
        "verbosity": -1,
        #"zero_as_missing": "true",
        "boosting":'gbdt',
        "objective" : "regression",
        "metric" : "rmse",
        "seed": 42,
        #"max_bin": 10,#default 255
        #"num_leaves": 10, #default 31
        #"bagging_fraction": 0.5,
        #"bagging_freq": 1,
        "min_data_in_leaf": 1000,
        #"feature_fraction": 0.5,
        #"lambda_l2": 10000,
        #"max_depth": 3, #default no limit
        #"min_gain_to_split": 10000,
        "learning_rate" : 0.01,
        #"histogram_pool_size": 1000,
        
}

evals_result = {}
lg_model = lgbm.train(params, lgtrain, 20000, 
                      valid_sets=[lgval], 
                      early_stopping_rounds=1, 
                      verbose_eval=10, 
                      evals_result=evals_result,
                      #categorical_feature= [0,1,2,3,4,5]
                    categorical_feature= [0]
                     )

scores = {}
for i,score in enumerate(lg_model.feature_importance()):
    scores[features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]




Training until validation scores don't improve for 1 rounds.
[10]	valid_0's rmse: 1.25948
[20]	valid_0's rmse: 1.22606
[30]	valid_0's rmse: 1.19945
[40]	valid_0's rmse: 1.17791
[50]	valid_0's rmse: 1.16121
[60]	valid_0's rmse: 1.14828
[70]	valid_0's rmse: 1.13797
[80]	valid_0's rmse: 1.12986
[90]	valid_0's rmse: 1.12339
[100]	valid_0's rmse: 1.1179
[110]	valid_0's rmse: 1.11381
[120]	valid_0's rmse: 1.11014
[130]	valid_0's rmse: 1.10755
Early stopping, best iteration is:
[134]	valid_0's rmse: 1.1068


[('item_id', 1363),
 ('shop_item_mean_encoding', 777),
 ('shop_item_block_units_lag_1', 615),
 ('item_id_mean_encoding', 426),
 ('item_block_units_lag_1', 416),
 ('item_block_mean_rolling_3', 245),
 ('item_block_units_rolling_3', 178)]

In [None]:
features = [item[0] for item in scores.items() if item[1] > 2000]

In [119]:
from sklearn.linear_model import ElasticNet, Lasso, Ridge, LinearRegression


lr_model = Ridge(alpha=0.5)
lr_model.fit(x_train[features], y_train)

from sklearn.metrics import mean_squared_error
from math import sqrt

lr_val_preds = lr_model.predict(x_val[features])
lr_val_preds.clip(0,20,out=lr_val_preds)
rms = sqrt(mean_squared_error(y_val, lr_val_preds))
print("rmse: ", rms)

rmse:  1.1986273086884232


In [99]:
test            = pd.read_csv('test.csv.gz')
test = test.set_index('item_id').join(items.set_index('item_id'))
test.reset_index(inplace=True)

In [100]:
item_features = [ 
    'item_id_mean_encoding'
                ]

merge_col = ['item_id']
cols=item_features+merge_col

test = test.merge(training.drop_duplicates('item_id')[cols], on=merge_col, how='left')

In [101]:
shop_features = [
        'shop_id_mean_encoding'
]

merge_col = ['shop_id']
cols=shop_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [102]:
cat_features = [
        'item_category_id_mean_encoding'#,'cat_me_real'
]

merge_col = ['item_category_id']
cols=cat_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [103]:
shop_item_features = [
        'shop_item_mean_encoding'
]

merge_col = ['shop_id', 'item_id']
cols=shop_item_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [114]:
def add_rolls_test(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name})\
            [cols+[roll_name]]
        
        print([cols[0:len(cols)-1]+[roll_name]])
        thirty_three = block_units_rolling_temp[block_units_rolling_temp['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1]+[roll_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')
    

        del block_units_rolling_temp
        gc.collect()
        

    
    return df
    

test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_units')
test = add_rolls_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')


item_block_mean 3
[['item_id', 'item_block_mean_rolling_3']]
item_block_units 3
[['item_id', 'item_block_units_rolling_3']]
shop_block_mean 3
[['shop_id', 'shop_block_mean_rolling_3']]
shop_cat_block_mean 3
[['shop_id', 'item_category_id', 'shop_cat_block_mean_rolling_3']]


In [112]:
test = add_rolls_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3
[['shop_id', 'item_id', 'shop_item_block_mean_rolling_3']]


In [113]:
def add_lags_test(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()
        
        thirty_three = result[result['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1] + [lag_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')

        gc.collect()
    
    return df
                                         

                                        
test = add_lags_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_lags_test(test, ['item_id','date_block_num'], 'item_block_units')
test = add_lags_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_lags_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')


item_block_mean 1
item_block_units 1
shop_block_mean 1
shop_cat_block_mean 1


In [109]:
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')

shop_item_block_mean 1
shop_item_block_units 1


In [115]:
test.fillna(0, inplace=True)

In [111]:
test.sample(10)

Unnamed: 0,item_id,ID,shop_id,item_category_id,item_id_mean_encoding,shop_id_mean_encoding,item_category_id_mean_encoding,shop_item_mean_encoding,item_block_mean_rolling_3,shop_block_mean_rolling_3,shop_cat_block_mean_rolling_3,item_block_mean_lag_1,shop_block_mean_lag_1,shop_cat_block_mean_lag_1,shop_item_block_mean_lag_1,shop_item_block_units_lag_1
154986,15571,30857,10,55,0.648305,0.0,0.314389,0.0,0.482978,0.0,0.0,0.372093,0.0,0.0,0.0,0.0
36279,3777,170811,37,55,0.196049,0.293737,0.314389,0.153846,0.062208,0.252457,0.190057,0.116279,0.249656,0.166667,0.0,0.0
128832,13318,96003,14,47,0.147059,0.303452,0.35911,0.0,0.114429,0.249964,0.258981,0.069767,0.252684,0.106796,0.0,0.0
118348,12549,174130,35,55,0.187424,0.47437,0.314389,0.071429,0.101346,0.385637,0.276858,0.139535,0.385081,0.255208,0.0,0.0
138971,14197,182959,38,31,0.043357,0.418532,0.075776,0.0,0.061663,0.347569,0.0,0.069767,0.339389,0.0,0.0,0.0
201711,20653,141850,57,72,0.239362,0.817211,0.428157,0.272727,0.093057,0.599016,0.192694,0.093023,0.570603,0.208955,0.0,0.0
98493,10348,16462,3,40,0.329588,0.227226,0.40485,0.0,0.380164,0.164302,0.108786,0.372093,0.165428,0.071146,0.0,0.0
198180,20400,122880,52,72,0.0,0.324313,0.428157,0.0,0.0,0.244885,0.121229,0.0,0.279383,0.186567,0.0,0.0
88212,9210,64433,22,61,0.163265,0.370099,0.198886,0.0,0.152413,0.290314,0.043503,0.023256,0.311313,0.02439,0.0,0.0
135160,13706,20738,2,69,0.296919,0.252488,0.332432,0.285714,0.23265,0.213751,0.149972,0.186047,0.210845,0.072464,0.0,0.0


In [116]:
lg_preds = lg_model.predict(test[features])
lg_preds.clip(0,20,out=lg_preds)

array([ 0.2762435 ,  0.18748113,  0.30796519, ...,  0.31227997,
        0.30550796,  0.29385306])

In [120]:
lr_preds = lr_model.predict(test[features])
lr_preds.clip(0,20,out=lr_preds)

array([ 0.21195681,  0.08785841,  0.52085958, ...,  0.24051506,
        0.2303724 ,  0.20173431])

In [117]:
print(np.mean(lg_preds))
print(np.max(lg_preds))

0.363807105527
13.8968167913


In [125]:
np.mean(lg_preds.conc lr_preds, axis=1)

TypeError: mean() got multiple values for argument 'axis'

In [133]:
preds = np.mean(np.array([lg_preds, lr_preds]),axis=0)

In [134]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = preds

submission.to_csv('submission.csv', index=False)