In [11]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [12]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd
from sklearn.model_selection import KFold
from itertools import product

In [13]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [14]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'] != 2013]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [15]:
train_item_ids = sales_train['item_id'].unique()
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [16]:
combinations = []

for dbn in range(np.min(train_blocks), np.max(train_blocks)+1):
    sales = sales_train[sales_train.date_block_num==dbn]
    dbn_combos = list(product(sales.shop_id.unique(), sales.item_id.unique(), [dbn]))
    for combo in dbn_combos:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])

In [17]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"item_cnt_block"})

training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)


training['item_cnt_block'] = training['item_cnt_block'].clip(0,20).astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

for col in ['item_id', 'shop_id', 'item_category_id']:
    training[col] = pd.to_numeric(training[col], downcast='unsigned')

In [18]:
dates = sales_train[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
training['month'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')


In [19]:
cols = ['item_id','date_block_num']

training.groupby(cols,as_index=False)['item_cnt_block'].mean().sample(10)

Unnamed: 0,item_id,date_block_num,item_cnt_block
127311,20644,22,0.08
121392,19668,31,0.047619
91719,15226,18,1.1
61619,10539,18,0.02
47145,8095,15,0.489796
16174,2920,28,0.022727
127156,20613,32,1.837209
39549,6709,19,0.078431
80707,13553,30,0.046512
104007,16960,24,0.06


In [20]:
def add_block_units_mean(df, cols, name):
    print(name)
    name_units = name + '_units'
    name_mean = name + '_mean'
    
    try:
        df.drop(columns=[name_units, name_mean],inplace=True)
    except:
        pass

    
    block_units = df.groupby(cols,as_index=False)['item_cnt_block'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_units})
    df = df.merge(block_units, on=cols, how='left')
    df[name_units].fillna(0,inplace=True)
    df[name_units] = pd.to_numeric(df[name_units].astype(int),downcast='unsigned')
    del block_units
    
    block_means = df.groupby(cols,as_index=False)['item_cnt_block'].mean()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_mean})
    df = df.merge(block_means, on=cols, how='left')
    df[name_mean].fillna(0,inplace=True)
    df[name_mean] = pd.to_numeric(df[name_mean],downcast='float')
    del block_means
    
    gc.collect()
    return df


training = add_block_units_mean(training, ['item_id','date_block_num'], 'item_block')
training = add_block_units_mean(training, ['shop_id','date_block_num'], 'shop_block')
training = add_block_units_mean(training, ['item_category_id','date_block_num'], 'cat_block')
training = add_block_units_mean(training, ['shop_id', 'item_category_id','date_block_num'], 'shop_cat_block')
training = add_block_units_mean(training, ['shop_id', 'item_id','date_block_num'], 'shop_item_block')

item_block
shop_block
cat_block
shop_cat_block
shop_item_block


In [21]:
number_of_items = sales_train['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = sales_train['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = sales_train['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = sales_train['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = sales_train['item_cnt_day'].sum()
print("total_sales:", total_sales)
average_price = sales_train['item_price'].mean()
print("average_price:", average_price)

training['item_units'] = pd.to_numeric(training.groupby(['date_block_num'])['item_block_units'].transform(np.sum),downcast='unsigned')
training['cat_units'] = pd.to_numeric(training.groupby(['date_block_num'])['cat_block_units'].transform(np.sum),downcast='unsigned')
training['shop_units'] = pd.to_numeric(training.groupby(['date_block_num'])['shop_block_units'].transform(np.sum),downcast='unsigned')

training['item_share_of_total_units'] = pd.to_numeric(training['item_units'] * 100 / total_sales,downcast='float')
training['category_share_of_total_units'] = pd.to_numeric(training['cat_units'] * 100 / total_sales,downcast='float')
training['shop_share_of_units'] = pd.to_numeric(training['shop_units'] * 100 / total_sales,downcast='float')
training['shop_item_units'] = pd.to_numeric(training.groupby(['date_block_num'])\
                                            ['shop_item_block_units'].transform(np.sum),downcast='unsigned')

training['shop_item_share_of_total_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / total_sales,downcast='float')
training['shop_item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / training['shop_units'],downcast='float')


training['item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100 / training['shop_units'],downcast='float')

number_of_items: 17054
number_of_categories: 79
number_of_shops: 55
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473.0
average_price: 1015.5023073772021


In [23]:
def get_mean_encoding(df, group_cols, target):
    cumsum = df.groupby(group_cols)[target].cumsum() - df[target]
    cumcnt = df.groupby(group_cols).cumcount()
    return pd.to_numeric(cumsum/cumcnt, downcast='float')

training['item_me'] = get_mean_encoding(training, ['item_id'], 'item_cnt_block')
training['shop_me'] = get_mean_encoding(training, ['shop_id'], 'item_cnt_block')
training['category_me'] = get_mean_encoding(training, ['item_category_id'], 'item_cnt_block')
training['shop_category_me'] = get_mean_encoding(training, ['shop_id', 'item_category_id'], 'item_cnt_block')
training['shop_item_me'] = get_mean_encoding(training, ['shop_id', 'item_id'], 'item_cnt_block')
training['month_me'] = get_mean_encoding(training, ['month'], 'item_cnt_block')
training['block_me'] = get_mean_encoding(training, ['date_block_num'], 'item_cnt_block')



training.fillna(0,inplace=True)

In [24]:
def add_min_max_quantiles(df, cols, name):
    print(name)

    block_name = name+'_block_units'
    units_name = name+'_units'
    max_name = name+'_max_units_block'
    min_name = name+'_min_units_block'
    
    try:
        df.drop(columns=[units_name, max_name, min_name, min_max_name],inplace=True)
    except:
        pass


    df[units_name] = pd.to_numeric(df.groupby(['date_block_num'])[block_name].transform(np.sum), downcast='unsigned')
    df[max_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.max), downcast='unsigned')
    df[min_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.min), downcast='unsigned')
    


    for q in [0.25,0.50,0.75]:
        qname = name+'_minmax_q' + str(q)
        try:
            df.drop(columns=[qname],inplace=True)
        except:
            pass
        df[qname] =  pd.to_numeric(df[[min_name,max_name]].quantile(q,axis=1), downcast='unsigned')
        
    return df

training = add_min_max_quantiles(training, ['item_id'], 'item')
training = add_min_max_quantiles(training, ['shop_id'], 'shop')
training = add_min_max_quantiles(training, ['item_category_id'], 'cat')
training = add_min_max_quantiles(training, ['shop_id','item_category_id'], 'shop_cat')
training = add_min_max_quantiles(training, ['shop_id','item_id'], 'shop_item')

item
shop
cat
shop_cat
shop_item


In [25]:
def add_rolls(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name_tmp})\
            [cols+[roll_name_tmp]]
        
    
        df = df.merge(block_units_rolling_temp, on=cols, how='left')
        #print(df.columns.values)
        del block_units_rolling_temp
        gc.collect()
        

        block_units_rolling = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [roll_name_tmp].shift(1)\
            .rename(columns={roll_name_tmp:roll_name}).reset_index()

        df = df.merge(block_units_rolling, on=cols, how='left')
        df[roll_name].fillna(0,inplace=True)
        df[roll_name] = pd.to_numeric(df[roll_name], downcast='float')
        df.drop(columns=[roll_name_tmp], inplace=True)
        del block_units_rolling
        gc.collect()
    
    return df
    

training = add_rolls(training, ['item_id','date_block_num'], 'item_block_units')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item')

item_block_units 3
item_block_mean 3
shop_block_units 3
shop_block_mean 3
cat_block_units 3
cat_block_mean 3
shop_cat_block_units 3
shop_cat_block_mean 3


In [26]:
training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3


In [27]:
def add_lags(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()

        df = df.merge(result, on=cols, how='left')
        df[lag_name].fillna(0,inplace=True)
        if "mean" in name:
            df[lag_name] = pd.to_numeric(df[lag_name], downcast='float')
        else:
            df[lag_name] = pd.to_numeric(df[lag_name].astype(int), downcast='unsigned')
        del result
        gc.collect()
    
    return df
                                         

                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_block_units')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

item_block_units 1
item_block_mean 1
shop_block_units 1
shop_block_mean 1
cat_block_units 1
cat_block_mean 1
shop_cat_block_units 1
shop_cat_block_mean 1
shop_item_block_units 1
shop_item_block_mean 1


In [28]:
training['shop_block_units_lag_comp1'] = pd.to_numeric(training['shop_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

#training['shop_share_item_units_comp'] = training['item_units'] * training['shop_share_of_units']
training['item_block_units_lag_comp1'] = pd.to_numeric(training['item_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

In [29]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'item_block_units', 'item_block_mean',
       'shop_block_units', 'shop_block_mean', 'cat_block_units',
       'cat_block_mean', 'shop_cat_block_units', 'shop_cat_block_mean',
       'shop_item_block_units', 'shop_item_block_mean', 'item_units',
       'cat_units', 'shop_units', 'item_share_of_total_units',
       'category_share_of_total_units', 'shop_share_of_units',
       'shop_item_units', 'shop_item_share_of_total_units',
       'shop_item_share_of_shop_units', 'item_share_of_shop_units',
       'item_me', 'shop_me', 'category_me', 'shop_category_me',
       'shop_item_me', 'month_me', 'block_me', 'item_max_units_block',
       'item_min_units_block', 'item_minmax_q0.25', 'item_minmax_q0.5',
       'item_minmax_q0.75', 'shop_max_units_block',
       'shop_min_units_block', 'shop_minmax_q0.25', 'shop_minmax_q0.5',
       'shop_minmax_q0.75', 'cat_max_units_block', 'cat_min_units_bl

In [30]:

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_block,item_category_id,month,item_block_units,item_block_mean,shop_block_units,shop_block_mean,cat_block_units,cat_block_mean,shop_cat_block_units,shop_cat_block_mean,shop_item_block_units,shop_item_block_mean,item_units,cat_units,shop_units,item_share_of_total_units,category_share_of_total_units,shop_share_of_units,shop_item_units,shop_item_share_of_total_units,shop_item_share_of_shop_units,item_share_of_shop_units,item_me,shop_me,category_me,shop_category_me,shop_item_me,month_me,block_me,item_max_units_block,item_min_units_block,item_minmax_q0.25,item_minmax_q0.5,item_minmax_q0.75,shop_max_units_block,shop_min_units_block,shop_minmax_q0.25,shop_minmax_q0.5,shop_minmax_q0.75,cat_max_units_block,cat_min_units_block,cat_minmax_q0.25,cat_minmax_q0.5,cat_minmax_q0.75,shop_cat_units,shop_cat_max_units_block,shop_cat_min_units_block,shop_cat_minmax_q0.25,shop_cat_minmax_q0.5,shop_cat_minmax_q0.75,shop_item_max_units_block,shop_item_min_units_block,shop_item_minmax_q0.25,shop_item_minmax_q0.5,shop_item_minmax_q0.75,item_block_units_rolling_3,item_block_mean_rolling_3,shop_block_units_rolling_3,shop_block_mean_rolling_3,cat_block_units_rolling_3,cat_block_mean_rolling_3,shop_cat_block_units_rolling_3,shop_cat_block_mean_rolling_3,shop_item_block_mean_rolling_3,item_block_units_lag_1,item_block_mean_lag_1,shop_block_units_lag_1,shop_block_mean_lag_1,cat_block_units_lag_1,cat_block_mean_lag_1,shop_cat_block_units_lag_1,shop_cat_block_mean_lag_1,shop_item_block_units_lag_1,shop_item_block_mean_lag_1,shop_block_units_lag_comp1,item_block_units_lag_comp1
3453549,12206,22,18,0,62,7,6,0.12,1054,0.159431,616,0.133913,6,0.065217,0,0,4209850,2140570050,556626367,201.865479,1727.956909,1977.02356,84197,4.03731,0.015126,0.015126,0.135294,0.253504,0.094121,0.060194,0.0,0.285945,0.290972,18,2,6.0,10.0,14.0,2197,944,1257.25,1570.5,1883.75,1324,73,385.75,698.5,1011.25,42811401,18,0,4.5,9.0,13.5,1,0,0.25,0.5,0.75,11.0,0.22449,1327.666626,0.19763,644.666687,0.151037,3.333333,0.038619,0.0,12,0.244898,1163,0.174076,642,0.154142,0,0.0,0,0.0,17.591892,0.181516
5981223,20391,5,18,0,72,7,4,0.08,1290,0.195129,1649,0.227448,20,0.137931,0,0,4209850,2140570050,556626367,201.865479,1727.956909,1977.02356,84197,4.03731,0.015126,0.015126,0.1,0.194829,0.183988,0.12759,0.333333,0.256405,0.254291,21,1,6.0,11.0,16.0,1953,875,1144.5,1414.0,1683.5,3970,1122,1834.0,2546.0,3258.0,42811401,55,11,22.0,33.0,44.0,1,0,0.25,0.5,0.75,3.0,0.061224,1153.333374,0.171895,1878.0,0.25774,28.333334,0.190273,0.0,4,0.081633,1286,0.192486,1790,0.228316,24,0.15,0,0.0,19.452427,0.060505
988960,3676,41,24,1,23,1,86,1.72,1309,0.213227,5537,0.651412,88,0.517647,1,1,4871650,1709982800,598141187,233.599289,1675.665649,1908.221191,97433,4.671986,0.016289,0.016289,1.380368,0.251869,0.622957,0.490285,1.25,0.399613,0.372121,166,12,50.5,89.0,127.5,1936,634,959.5,1285.0,1610.5,9520,2182,4016.5,5851.0,7685.5,34199656,166,31,64.75,98.5,132.25,4,0,1.0,2.0,3.0,100.666664,2.003333,1209.0,0.187588,6221.666504,0.76272,78.333336,0.481344,1.0,166,3.32,1577,0.238759,9520,1.12,134,0.788235,2,2.0,25.688223,2.704023
3922400,13888,31,30,2,55,7,20,0.465116,5392,1.012963,6475,0.216664,599,0.861871,2,2,2551405,795165761,315840205,122.341789,1058.352173,728.491516,59335,2.845158,0.018786,0.018786,1.74946,1.121119,0.232753,1.006835,8.4375,0.277544,0.272633,229,13,67.0,121.0,175.0,11884,5124,6814.0,8504.0,10194.0,13694,4917,7111.25,9305.5,11499.75,18492227,1413,522,744.75,967.5,1190.25,20,2,6.5,11.0,15.5,16.0,0.357091,5563.333496,1.044428,6509.333496,0.209156,710.333313,1.018909,2.333333,15,0.348837,5507,1.05579,6017,0.207304,647,0.958519,3,3.0,103.456696,0.281796
1342100,4859,24,21,0,76,10,3,0.057692,1407,0.222135,226,0.062988,0,0.0,0,0,4801004,1766935664,584799218,230.211761,287.660034,1268.463745,92327,4.427149,0.015788,0.015788,0.0,0.409206,0.063201,0.0,0.0,0.313799,0.349945,3,1,1.5,2.0,2.5,2536,878,1292.5,1707.0,2121.5,265,81,127.0,173.0,219.0,33979532,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,1489.666626,0.234606,210.666672,0.061174,0.0,0.0,0.0,0,0.0,1355,0.223192,221,0.062254,0,0.0,0,0.0,21.392485,0.0
742990,2876,45,31,0,19,8,17,0.404762,710,0.138998,4665,0.67316,32,0.193939,0,0,2594928,723681714,315592672,124.428749,1749.576538,716.622131,61784,2.962589,0.019577,0.019577,0.859599,0.199464,0.510643,0.278323,0.125,0.347891,0.338286,180,10,52.5,95.0,137.5,1487,623,839.0,1055.0,1271.0,9795,2435,4275.0,6115.0,7955.0,17230517,125,18,44.75,71.5,98.25,1,0,0.25,0.5,0.75,16.666666,0.384426,675.0,0.127953,3349.0,0.463932,28.333334,0.170046,0.0,19,0.44186,675,0.126808,3178,0.447921,18,0.109091,0,0.0,13.214565,0.371966
1566423,5602,40,20,0,5,9,4,0.08,345,0.056828,156,0.222857,0,0.0,0,0,4133050,1558991550,501834931,198.182861,613.929016,1409.202026,82661,3.963657,0.016472,0.016472,0.138983,0.054043,0.241003,0.0,0.0,0.351679,0.347947,13,1,4.0,7.0,10.0,874,291,436.75,582.5,728.25,279,32,93.75,155.5,217.25,31179831,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,5.666667,0.115238,319.666656,0.048832,191.333328,0.267078,0.0,0.0,0.0,3,0.06,325,0.051077,201,0.262745,0,0.0,0,0.0,5.353319,0.049415
3637170,12826,37,27,0,40,4,1,0.021277,858,0.156684,10371,0.22655,70,0.071869,0,0,2939850,1040870037,342523800,140.968018,483.242218,2007.98999,62550,2.99932,0.018262,0.018262,0.329588,0.201899,0.240009,0.051277,0.714286,0.275261,0.266979,37,1,10.0,19.0,28.0,1760,758,1008.5,1259.0,1509.5,21552,6766,10462.5,14159.0,17855.5,22146171,131,18,46.25,74.5,102.75,2,0,0.5,1.0,1.5,15.333333,0.320432,977.333313,0.160388,13217.666992,0.263589,64.666664,0.061707,0.666667,12,0.26087,929,0.152746,12861,0.249631,62,0.055357,1,1.0,16.964939,0.219138
1514298,5433,37,12,0,55,1,16,0.347826,1059,0.140246,10216,0.204124,175,0.160846,0,0,4974762,3086913306,816617997,238.543579,1797.50708,27.533838,108147,5.18573,0.013243,0.013243,0.204276,0.25849,0.252088,0.232687,0.0,0.42462,0.453498,26,1,7.25,13.5,19.75,1760,758,1008.5,1259.0,1509.5,13694,4917,7111.25,9305.5,11499.75,67106811,291,89,139.5,190.0,240.5,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0
4876424,16752,18,17,0,40,6,2,0.040816,1214,0.181709,15850,0.217825,80,0.053872,0,0,4386235,2257954349,598049715,210.323273,1178.214966,1903.835083,89515,4.292312,0.014968,0.014968,0.018519,0.241969,0.238475,0.083509,0.0,0.286797,0.297224,2,1,1.25,1.5,1.75,1902,977,1208.25,1439.5,1670.75,21552,6766,10462.5,14159.0,17855.5,46080701,168,50,79.5,109.0,138.5,0,0,0.0,0.0,0.0,1.5,0.031943,1423.0,0.206674,17413.666016,0.224149,126.333336,0.078896,0.0,1,0.020408,1395,0.208676,14765,0.195033,119,0.077023,0,0.0,20.880108,0.014968


In [109]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'item_block_units', 'item_block_mean',
       'shop_block_units', 'shop_block_mean', 'cat_block_units',
       'cat_block_mean', 'shop_cat_block_units', 'shop_cat_block_mean',
       'shop_item_block_units', 'shop_item_block_mean', 'item_units',
       'cat_units', 'shop_units', 'item_share_of_total_units',
       'category_share_of_total_units', 'shop_share_of_units',
       'shop_item_units', 'shop_item_share_of_total_units',
       'shop_item_share_of_shop_units', 'item_share_of_shop_units',
       'item_me', 'shop_me', 'category_me', 'shop_category_me',
       'shop_item_me', 'month_me', 'block_me', 'item_max_units_block',
       'item_min_units_block', 'item_minmax_q0.25', 'item_minmax_q0.5',
       'item_minmax_q0.75', 'shop_max_units_block',
       'shop_min_units_block', 'shop_minmax_q0.25', 'shop_minmax_q0.5',
       'shop_minmax_q0.75', 'cat_max_units_block', 'cat_min_units_bl

In [176]:
gc.collect()

ZEROS_KEEP=0.2


#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['item_cnt_block']





x_val = training[training['date_block_num'] == 33]
y_val = x_val['item_cnt_block']

pos_val_len = len(y_val[y_val != 0])
print("pos_val_len", pos_val_len)

zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
print("zeros_keep_indices_val", len(zeros_keep_indices_val))
non_zeros_val_indices = y_val[y_val != 0].index
print("non_zeros_val_indices", len(non_zeros_val_indices))

val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

y_val = y_val.loc[val_indices]
x_val = x_val.loc[val_indices]



pos_val_len 31471
zeros_keep_indices_val 157355
non_zeros_val_indices 31471


In [110]:
features = [
    
    'item_units',
       'cat_units', 'shop_units', 'item_share_of_total_units',
       'category_share_of_total_units', 'shop_share_of_units',
       'shop_item_units', 'shop_item_share_of_total_units',
       'shop_item_share_of_shop_units', 'item_share_of_shop_units',
       'item_me', 'shop_me', 'category_me', 'shop_category_me',
       'shop_item_me', 'month_me', 'block_me', 'item_max_units_block',
       'item_min_units_block', 'item_minmax_q0.25', 'item_minmax_q0.5',
       'item_minmax_q0.75', 'shop_max_units_block',
       'shop_min_units_block', 'shop_minmax_q0.25', 'shop_minmax_q0.5',
       'shop_minmax_q0.75', 'cat_max_units_block', 'cat_min_units_block',
       'cat_minmax_q0.25', 'cat_minmax_q0.5', 'cat_minmax_q0.75',
       'shop_cat_units', 'shop_cat_max_units_block',
       'shop_cat_min_units_block', 'shop_cat_minmax_q0.25',
       'shop_cat_minmax_q0.5', 'shop_cat_minmax_q0.75',
       'shop_item_max_units_block', 'shop_item_min_units_block',
       'shop_item_minmax_q0.25', 'shop_item_minmax_q0.5',
       'shop_item_minmax_q0.75', 'item_block_units_rolling_3',
       'item_block_mean_rolling_3', 'shop_block_units_rolling_3',
       'shop_block_mean_rolling_3', 'cat_block_units_rolling_3',
       'cat_block_mean_rolling_3', 'shop_cat_block_units_rolling_3',
       'shop_cat_block_mean_rolling_3', 'shop_item_block_mean_rolling_3',
       'item_block_units_lag_1', 'item_block_mean_lag_1',
       'shop_block_units_lag_1', 'shop_block_mean_lag_1',
       'cat_block_units_lag_1', 'cat_block_mean_lag_1',
       'shop_cat_block_units_lag_1', 'shop_cat_block_mean_lag_1',
       'shop_item_block_units_lag_1', 'shop_item_block_mean_lag_1',
       'shop_block_units_lag_comp1', 'item_block_units_lag_comp1'


]

In [184]:

features = [
    
    
        #'item_id', 'shop_id',
       'item_block_mean_rolling_3',
       'shop_block_mean_rolling_3',
           'shop_cat_block_mean_rolling_3',



      'item_block_mean_lag_1',
        'shop_block_mean_lag_1',
            'shop_cat_block_mean_lag_1',

    
    
]




In [185]:
cb_model = CatBoostRegressor(iterations=1000,
                             #learning_rate=0.05,
                             eval_metric='RMSE',
                             task_type = "GPU",
                             use_best_model=True,
                             od_type = "Iter",
                             od_wait = 1,
                             bagging_temperature = 30,
                             #cat_features=[0,1],
                             random_seed = 42)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


cb_model.fit(x_train[features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

scores = {}
for i,score in enumerate(cb_model.get_feature_importance()):
    scores[features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

0:	learn: 1.2136985	test: 1.3004631	best: 1.3004631 (0)	total: 43.5ms	remaining: 43.4s
1:	learn: 1.2044845	test: 1.2921561	best: 1.2921561 (1)	total: 83.6ms	remaining: 41.7s
2:	learn: 1.1984748	test: 1.2863015	best: 1.2863015 (2)	total: 129ms	remaining: 42.7s
3:	learn: 1.1925722	test: 1.2804494	best: 1.2804494 (3)	total: 168ms	remaining: 42s
4:	learn: 1.1845213	test: 1.2735550	best: 1.2735550 (4)	total: 209ms	remaining: 41.6s
5:	learn: 1.1783082	test: 1.2676347	best: 1.2676347 (5)	total: 250ms	remaining: 41.4s
6:	learn: 1.1717676	test: 1.2613744	best: 1.2613744 (6)	total: 292ms	remaining: 41.4s
7:	learn: 1.1640610	test: 1.2544275	best: 1.2544275 (7)	total: 334ms	remaining: 41.4s
8:	learn: 1.1571789	test: 1.2486443	best: 1.2486443 (8)	total: 374ms	remaining: 41.2s
9:	learn: 1.1518092	test: 1.2430622	best: 1.2430622 (9)	total: 412ms	remaining: 40.7s
10:	learn: 1.1442453	test: 1.2368471	best: 1.2368471 (10)	total: 452ms	remaining: 40.6s
11:	learn: 1.1388405	test: 1.2317271	best: 1.2317271

94:	learn: 0.9931994	test: 1.1015240	best: 1.1015240 (94)	total: 3.9s	remaining: 37.2s
95:	learn: 0.9931201	test: 1.1014138	best: 1.1014138 (95)	total: 3.95s	remaining: 37.2s
96:	learn: 0.9924847	test: 1.1006185	best: 1.1006185 (96)	total: 3.99s	remaining: 37.1s
97:	learn: 0.9918066	test: 1.0999907	best: 1.0999907 (97)	total: 4.03s	remaining: 37.1s
98:	learn: 0.9916690	test: 1.0997054	best: 1.0997054 (98)	total: 4.07s	remaining: 37.1s
99:	learn: 0.9909840	test: 1.0985376	best: 1.0985376 (99)	total: 4.12s	remaining: 37s
100:	learn: 0.9905314	test: 1.0981088	best: 1.0981088 (100)	total: 4.16s	remaining: 37s
101:	learn: 0.9901392	test: 1.0977632	best: 1.0977632 (101)	total: 4.21s	remaining: 37s
102:	learn: 0.9893021	test: 1.0971508	best: 1.0971508 (102)	total: 4.25s	remaining: 37s
103:	learn: 0.9891081	test: 1.0969946	best: 1.0969946 (103)	total: 4.29s	remaining: 37s
104:	learn: 0.9885168	test: 1.0966544	best: 1.0966544 (104)	total: 4.33s	remaining: 36.9s
105:	learn: 0.9880221	test: 1.096

[('item_block_mean_lag_1', 51.78539761290427),
 ('shop_cat_block_mean_rolling_3', 13.10754419308898),
 ('item_block_mean_rolling_3', 12.89459451844027),
 ('shop_cat_block_mean_lag_1', 12.81663564840732),
 ('shop_block_mean_lag_1', 5.901300923977381),
 ('shop_block_mean_rolling_3', 3.494527103181757)]

In [48]:
features = [item[0] for item in scores.items() if item[1] > 2000]

In [189]:
test            = pd.read_csv('test.csv.gz')
test = test.set_index('item_id').join(items.set_index('item_id'))
test.reset_index(inplace=True)

In [235]:
item_features = [ 
    'item_slope'
                ]

merge_col = ['item_id']
cols=item_features+merge_col

test = test.merge(training.drop_duplicates('item_id')[cols], on=merge_col, how='left')

In [106]:
shop_features = [
        'shop_me'
]

merge_col = ['shop_id']
cols=shop_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [190]:
def add_rolls_test(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name})\
            [cols+[roll_name]]
        
        print([cols[0:len(cols)-1]+[roll_name]])
        thirty_three = block_units_rolling_temp[block_units_rolling_temp['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1]+[roll_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')
    

        del block_units_rolling_temp
        gc.collect()
        

    
    return df
    

test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_rolls_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')

item_block_mean 3
[['item_id', 'item_block_mean_rolling_3']]
shop_block_mean 3
[['shop_id', 'shop_block_mean_rolling_3']]
shop_cat_block_mean 3
[['shop_id', 'item_category_id', 'shop_cat_block_mean_rolling_3']]


In [191]:
def add_lags_test(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()
        
        thirty_three = result[result['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1] + [lag_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')

        gc.collect()
    
    return df
                                         

                                        
test = add_lags_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_lags_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_lags_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')

item_block_mean 1
shop_block_mean 1
shop_cat_block_mean 1


In [192]:
test.fillna(0, inplace=True)

In [193]:
test.sample(10)

Unnamed: 0,item_id,ID,shop_id,item_category_id,item_block_mean_rolling_3,shop_block_mean_rolling_3,shop_cat_block_mean_rolling_3,item_block_mean_lag_1,shop_block_mean_lag_1,shop_cat_block_mean_lag_1
136840,13888,21908,2,55,0.342117,0.158264,0.006601,0.302326,0.157915,0.006211
174167,17637,182616,38,61,0.037571,0.254023,0.101308,0.023256,0.250344,0.048
65936,6674,194786,41,12,1.158915,0.146656,0.333333,0.976744,0.12999,0.166667
2728,454,208983,39,45,0.0,0.156321,0.0,0.0,0.155359,0.0
140436,14284,156442,55,55,0.046713,0.28841,0.0,0.023256,0.298722,0.0
179893,18277,39766,12,55,0.047074,0.273826,0.066372,0.023256,0.328614,0.051242
200770,20612,51488,26,72,0.480461,0.231153,0.095498,0.325581,0.238348,0.132979
117844,12524,176644,35,55,0.124811,0.284192,0.250751,0.046512,0.291642,0.237578
79160,8226,165108,36,55,0.078274,0.0,0.0,0.046512,0.0,0.0
66475,6727,159088,56,19,0.076295,0.247908,0.405853,0.023256,0.227139,0.297468


In [194]:
cb_preds = cb_model.predict(test[features])
cb_preds.clip(0,20,out=cb_preds)

array([0.04637926, 0.02966609, 0.1001989 , ..., 0.16857195, 0.16967997,
       0.16301134])

In [196]:
np.max(cb_preds)

9.282098270610277

In [195]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = cb_preds.astype(int)
#submission['item_cnt_month'] = ensemble_preds.astype(int)


submission.to_csv('submission.csv', index=False)

In [205]:
training['shop_me_real']= training.groupby('shop_id')['shop_me'].transform(np.mean)
#training['item_me_real2']= training.groupby('item_id')['item_me'].transform('last')

In [178]:
training.groupby('item_id')['item_me'].transform('last')

0          0.020408
1          0.020408
2          0.020408
3          0.020408
4          0.020408
5          0.020408
6          0.020408
7          0.020408
8          0.020408
9          0.020408
10         0.020408
11         0.020408
12         0.020408
13         0.020408
14         0.020408
15         0.020408
16         0.020408
17         0.020408
18         0.020408
19         0.020408
20         0.020408
21         0.020408
22         0.020408
23         0.020408
24         0.020408
25         0.020408
26         0.020408
27         0.020408
28         0.020408
29         0.020408
             ...   
6425064    0.021277
6425065    0.021277
6425066    0.021277
6425067    0.021277
6425068    0.021277
6425069    0.021277
6425070    0.021277
6425071    0.021277
6425072    0.021277
6425073    0.021277
6425074    0.021277
6425075    0.021277
6425076    0.021277
6425077    0.021277
6425078    0.021277
6425079    0.021277
6425080    0.021277
6425081    0.021277
6425082    0.021277


In [73]:
best_scores = []
for dbn in [31,32,33]:
    
    gc.collect()

    ZEROS_KEEP=0.3


    #x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
    x_train = training[(training['date_block_num'] < dbn)]
    y_train = x_train['item_cnt_block']





    x_val = training[training['date_block_num'] == dbn]
    y_val = x_val['item_cnt_block']

    pos_val_len = len(y_val[y_val != 0])
    print("pos_val_len", pos_val_len)

    zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
    print("zeros_keep_indices_val", len(zeros_keep_indices_val))
    non_zeros_val_indices = y_val[y_val != 0].index
    print("non_zeros_val_indices", len(non_zeros_val_indices))

    val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

    y_val = y_val.loc[val_indices]
    x_val = x_val.loc[val_indices]

    tr_data = xgb.DMatrix(x_train[features], y_train)
    va_data = xgb.DMatrix(x_val[features], y_val)


    watchlist = [(tr_data, 'train'), (va_data, 'valid')]

    xg_model = xgb.train(params, tr_data, 2000, watchlist, maximize=False, early_stopping_rounds = 1, verbose_eval=100)
    best_scores.append(xg_model.best_score)
    
print("scores mean", np.mean(best_scores))

pos_val_len 33426
zeros_keep_indices_val 111420
non_zeros_val_indices 33426
[0]	train-rmse:1.20852	valid-rmse:1.08032
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 1 rounds.
[100]	train-rmse:1.03015	valid-rmse:0.921817
[200]	train-rmse:0.99933	valid-rmse:0.890698
[300]	train-rmse:0.992418	valid-rmse:0.88269
[400]	train-rmse:0.989436	valid-rmse:0.879433
Stopping. Best iteration:
[453]	train-rmse:0.988453	valid-rmse:0.8782

pos_val_len 29603
zeros_keep_indices_val 98676
non_zeros_val_indices 29603
[0]	train-rmse:1.20414	valid-rmse:1.17493
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 1 rounds.
[100]	train-rmse:1.02636	valid-rmse:1.02754
[200]	train-rmse:0.99592	valid-rmse:1.00653
Stopping. Best iteration:
[209]	train-rmse:0.994906	valid-rmse:1.0061

pos_val_len 31471
zeros_keep_indices_val 104903
non_zeros_val_indi

In [120]:


gc.collect()
params =   {
    'objective' : 'reg:linear',
    'tree_method':'gpu_hist',
    #'gpu_id': 0,
    'learning_rate': 0.1, 
    'early_stopping_rounds': 10,
    #'maximize': False,
    'verbose_eval': 10,
    'gamma' : 100, 
    #'min_child_weight' : 300,
    #'nthread' : 16,
    #'max_depth' : 1,
    #'subsample' : 0.1, 
    'colsample_bytree' : 0.1, 
    'seed':42, 
    'eval_metric' : "rmse",
    #'n_estimators':999,
    #'max_leaves': 300,
}


tr_data = xgb.DMatrix(x_train[features], y_train)
va_data = xgb.DMatrix(x_val[features], y_val)


watchlist = [(tr_data, 'train'), (va_data, 'valid')]


xg_model = xgb.train(params=params, dtrain=tr_data, evals=watchlist,num_boost_round=10)

scores = xg_model.get_score(importance_type='gain')


sorted(scores.items(), key=lambda x: x[1])[::-1]

[11:24:04] Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[11:24:09] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 98 extra nodes, 2 pruned nodes, max_depth=6
[0]	train-rmse:1.15232	valid-rmse:1.12377
[11:24:12] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 96 extra nodes, 8 pruned nodes, max_depth=6
[1]	train-rmse:1.13379	valid-rmse:1.10341
[11:24:13] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 108 extra nodes, 16 pruned nodes, max_depth=6
[2]	train-rmse:1.09021	valid-rmse:1.02311
[11:24:15] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 120 extra nodes, 6 pruned nodes, max_depth=6
[3]	train-rmse:1.03789	valid-rmse:0.955366
[11:24:17] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 118 extra nodes, 4 pruned nodes, max_depth=6
[4]	train-rmse:1.01415	valid-rmse:0.93

[('shop_item_minmax_q0.25', 243752.84536090912),
 ('shop_item_max_units_block', 224456.7840839285),
 ('shop_item_me', 103687.72130070001),
 ('item_block_units_lag_comp1', 81892.4914217143),
 ('shop_item_minmax_q0.5', 81399.59552060712),
 ('shop_item_block_units_lag_1', 64842.992719600006),
 ('shop_item_block_mean_lag_1', 47216.49438),
 ('shop_cat_block_mean_rolling_3', 45725.194193652176),
 ('item_minmax_q0.5', 37874.82671552632),
 ('item_block_units_lag_1', 33958.31436168181),
 ('item_share_of_total_units', 31573.05780166667),
 ('shop_cat_block_mean_lag_1', 24447.053048681817),
 ('item_me', 22817.90706673334),
 ('shop_minmax_q0.5', 22375.611236642853),
 ('shop_block_units_lag_comp1', 20514.27399871429),
 ('shop_me', 19124.499382857142),
 ('item_max_units_block', 18777.234678700002),
 ('item_min_units_block', 16156.658968275862),
 ('cat_minmax_q0.75', 12404.744194),
 ('shop_cat_minmax_q0.25', 10525.536580225806),
 ('cat_block_mean_rolling_3', 9184.862551333332),
 ('item_minmax_q0.75', 