In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd
from sklearn.model_selection import KFold
from itertools import product

In [99]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')


In [4]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'] != 2013]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [5]:
train_item_ids = sales_train['item_id'].unique()
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [6]:
combinations = []

for dbn in range(np.min(train_blocks), np.max(train_blocks)+1):
    sales = sales_train[sales_train.date_block_num==dbn]
    dbn_combos = list(product(sales.shop_id.unique(), sales.item_id.unique(), [dbn]))
    for combo in dbn_combos:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])

In [7]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"item_cnt_block"})

training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)


training['item_cnt_block'] = training['item_cnt_block'].clip(0,20).astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

for col in ['item_id', 'shop_id', 'item_category_id']:
    training[col] = pd.to_numeric(training[col], downcast='unsigned')

In [42]:
dates = sales_train[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
training['month'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')


In [8]:
cols = ['item_id','date_block_num']

training.groupby(cols,as_index=False)['item_cnt_block'].mean().sample(10)

Unnamed: 0,item_id,date_block_num,item_cnt_block
60239,10342,14,0.020833
70090,11776,15,0.387755
84154,14124,31,0.238095
30374,5164,22,0.08
33659,5666,29,1.348837
50160,8555,17,0.163265
57783,9986,20,1.06
128324,20812,29,0.069767
64286,10908,21,0.211538
21391,3745,12,1.086957


In [9]:
def add_block_units_mean(df, cols, name):
    print(name)
    name_units = name + '_units'
    name_mean = name + '_mean'
    
    try:
        df.drop(columns=[name_units, name_mean],inplace=True)
    except:
        pass

    
    block_units = df.groupby(cols,as_index=False)['item_cnt_block'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_units})
    df = df.merge(block_units, on=cols, how='left')
    df[name_units].fillna(0,inplace=True)
    df[name_units] = pd.to_numeric(df[name_units].astype(int),downcast='unsigned')
    del block_units
    
    block_means = df.groupby(cols,as_index=False)['item_cnt_block'].mean()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_mean})
    df = df.merge(block_means, on=cols, how='left')
    df[name_mean].fillna(0,inplace=True)
    df[name_mean] = pd.to_numeric(df[name_mean],downcast='float')
    del block_means
    
    gc.collect()
    return df


training = add_block_units_mean(training, ['item_id','date_block_num'], 'item_block')
training = add_block_units_mean(training, ['shop_id','date_block_num'], 'shop_block')
training = add_block_units_mean(training, ['item_category_id','date_block_num'], 'cat_block')
training = add_block_units_mean(training, ['shop_id', 'item_category_id','date_block_num'], 'shop_cat_block')
training = add_block_units_mean(training, ['shop_id', 'item_id','date_block_num'], 'shop_item_block')

item_block
shop_block
cat_block
shop_cat_block
shop_item_block


In [21]:
number_of_items = sales_train['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = sales_train['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = sales_train['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = sales_train['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = sales_train['item_cnt_day'].sum()
print("total_sales:", total_sales)
average_price = sales_train['item_price'].mean()
print("average_price:", average_price)

training['item_units'] = pd.to_numeric(training.groupby(['date_block_num'])['item_block_units'].transform(np.sum),downcast='unsigned')
training['cat_units'] = pd.to_numeric(training.groupby(['date_block_num'])['cat_block_units'].transform(np.sum),downcast='unsigned')
training['shop_units'] = pd.to_numeric(training.groupby(['date_block_num'])['shop_block_units'].transform(np.sum),downcast='unsigned')

training['item_share_of_total_units'] = pd.to_numeric(training['item_units'] * 100 / total_sales,downcast='float')
training['category_share_of_total_units'] = pd.to_numeric(training['cat_units'] * 100 / total_sales,downcast='float')
training['shop_share_of_units'] = pd.to_numeric(training['shop_units'] * 100 / total_sales,downcast='float')
training['shop_item_units'] = pd.to_numeric(training.groupby(['date_block_num'])\
                                            ['shop_item_block_units'].transform(np.sum),downcast='unsigned')

training['shop_item_share_of_total_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / total_sales,downcast='float')
training['shop_item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / training['shop_units'],downcast='float')


training['item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100 / training['shop_units'],downcast='float')

number_of_items: 17054
number_of_categories: 79
number_of_shops: 55
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473.0
average_price: 1015.5023073772021


In [22]:
training['shop_block_units_lag_comp1'] = pd.to_numeric(training['shop_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

#training['shop_share_item_units_comp'] = training['item_units'] * training['shop_share_of_units']
training['item_block_units_lag_comp1'] = pd.to_numeric(training['item_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

In [56]:
def get_mean_encoding(df, group_cols, target):
    cumsum = df.groupby(group_cols)[target].cumsum() - df[target]
    cumcnt = df.groupby(group_cols).cumcount()
    return pd.to_numeric(cumsum/cumcnt, downcast='float')

training['item_me'] = get_mean_encoding(training, ['item_id'], 'item_cnt_block')
training['shop_me'] = get_mean_encoding(training, ['shop_id'], 'item_cnt_block')
training['category_me'] = get_mean_encoding(training, ['item_category_id'], 'item_cnt_block')
training['shop_category_me'] = get_mean_encoding(training, ['shop_id', 'item_category_id'], 'item_cnt_block')
training['shop_item_me'] = get_mean_encoding(training, ['shop_id', 'item_id'], 'item_cnt_block')
training['month_me'] = get_mean_encoding(training, ['month'], 'item_cnt_block')
training['block_me'] = get_mean_encoding(training, ['date_block_num'], 'item_cnt_block')



training.fillna(0,inplace=True)

In [24]:
def add_min_max_quantiles(df, cols, name):
    print(name)

    block_name = name+'_block_units'
    units_name = name+'_units'
    max_name = name+'_max_units_block'
    min_name = name+'_min_units_block'
    
    try:
        df.drop(columns=[units_name, max_name, min_name, min_max_name],inplace=True)
    except:
        pass


    df[units_name] = pd.to_numeric(df.groupby(['date_block_num'])[block_name].transform(np.sum), downcast='unsigned')
    df[max_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.max), downcast='unsigned')
    df[min_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.min), downcast='unsigned')
    


    for q in [0.25,0.50,0.75]:
        qname = name+'_minmax_q' + str(q)
        try:
            df.drop(columns=[qname],inplace=True)
        except:
            pass
        df[qname] =  pd.to_numeric(df[[min_name,max_name]].quantile(q,axis=1), downcast='unsigned')
        
    return df

training = add_min_max_quantiles(training, ['item_id'], 'item')
training = add_min_max_quantiles(training, ['shop_id'], 'shop')
training = add_min_max_quantiles(training, ['item_category_id'], 'cat')
training = add_min_max_quantiles(training, ['shop_id','item_category_id'], 'shop_cat')
training = add_min_max_quantiles(training, ['shop_id','item_id'], 'shop_item')

item
shop
cat
shop_cat
shop_item


In [11]:
def add_rolls(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name_tmp})\
            [cols+[roll_name_tmp]]
        
    
        df = df.merge(block_units_rolling_temp, on=cols, how='left')
        #print(df.columns.values)
        del block_units_rolling_temp
        gc.collect()
        

        block_units_rolling = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [roll_name_tmp].shift(1)\
            .rename(columns={roll_name_tmp:roll_name}).reset_index()

        df = df.merge(block_units_rolling, on=cols, how='left')
        df[roll_name].fillna(0,inplace=True)
        df[roll_name] = pd.to_numeric(df[roll_name], downcast='float')
        df.drop(columns=[roll_name_tmp], inplace=True)
        del block_units_rolling
        gc.collect()
    
    return df
    

training = add_rolls(training, ['item_id','date_block_num'], 'item_block_units')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item')

item_block_units 3
item_block_mean 3
shop_block_units 3
shop_block_mean 3
cat_block_units 3
cat_block_mean 3
shop_cat_block_units 3
shop_cat_block_mean 3


In [55]:
training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3


In [12]:
def add_lags(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()

        df = df.merge(result, on=cols, how='left')
        df[lag_name].fillna(0,inplace=True)
        if "mean" in name:
            df[lag_name] = pd.to_numeric(df[lag_name], downcast='float')
        else:
            df[lag_name] = pd.to_numeric(df[lag_name].astype(int), downcast='unsigned')
        del result
        gc.collect()
    
    return df
                                         

                                        
training = add_lags(training, ['item_id','date_block_num'], 'item_block_units')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_mean')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_mean')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

item_block_units 1
item_block_mean 1
shop_block_units 1
shop_block_mean 1
cat_block_units 1
cat_block_mean 1
shop_cat_block_units 1
shop_cat_block_mean 1
shop_item_block_units 1
shop_item_block_mean 1


In [25]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'item_block_units', 'item_block_mean',
       'shop_block_units', 'shop_block_mean', 'cat_block_units',
       'cat_block_mean', 'shop_cat_block_units', 'shop_cat_block_mean',
       'shop_item_block_units', 'shop_item_block_mean',
       'item_block_units_rolling_3', 'item_block_mean_rolling_3',
       'shop_block_units_rolling_3', 'shop_block_mean_rolling_3',
       'cat_block_units_rolling_3', 'cat_block_mean_rolling_3',
       'shop_cat_block_units_rolling_3', 'shop_cat_block_mean_rolling_3',
       'item_block_units_lag_1', 'item_block_mean_lag_1',
       'shop_block_units_lag_1', 'shop_block_mean_lag_1',
       'cat_block_units_lag_1', 'cat_block_mean_lag_1',
       'shop_cat_block_units_lag_1', 'shop_cat_block_mean_lag_1',
       'shop_item_block_units_lag_1', 'shop_item_block_mean_lag_1',
       'item_units', 'item_share_of_total_units', 'cat_units',
       'category_share_of_total_units

In [14]:
training[(training['item_id'].isin([30,31])) & (training['shop_id'] == 30)]\
        .sort_values(['item_id','date_block_num'])[['item_id','shop_id',\
                                                    'date_block_num','item_block_units', 'item_block_units_rolling_3',\
                                                   ]]

Unnamed: 0,item_id,shop_id,date_block_num,item_block_units,item_block_units_rolling_3
2371,30,30,12,58,0.0
2372,30,30,13,24,0.0
2373,30,30,14,31,41.0
2374,30,30,15,21,37.666668
2375,30,30,16,16,25.333334
2376,30,30,17,13,22.666666
2377,30,30,18,13,16.666666
2378,30,30,19,12,14.0
2379,30,30,20,11,12.666667
2380,30,30,21,13,12.0


In [15]:

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_block,item_category_id,item_block_units,item_block_mean,shop_block_units,shop_block_mean,cat_block_units,cat_block_mean,shop_cat_block_units,shop_cat_block_mean,shop_item_block_units,shop_item_block_mean,item_block_units_rolling_3,item_block_mean_rolling_3,shop_block_units_rolling_3,shop_block_mean_rolling_3,cat_block_units_rolling_3,cat_block_mean_rolling_3,shop_cat_block_units_rolling_3,shop_cat_block_mean_rolling_3,item_block_units_lag_1,item_block_mean_lag_1,shop_block_units_lag_1,shop_block_mean_lag_1,cat_block_units_lag_1,cat_block_mean_lag_1,shop_cat_block_units_lag_1,shop_cat_block_mean_lag_1,shop_item_block_units_lag_1,shop_item_block_mean_lag_1
754761,2899,25,14,0,25,7,0.145833,6151,0.861243,731,0.245632,36,0.580645,0,0,18.5,0.402174,6343.0,0.864282,939.5,0.362162,59.0,1.045896,13,0.282609,6268,0.878609,749,0.28566,49,0.859649,0,0.0
3925357,13893,46,24,0,55,14,0.28,1969,0.320736,9288,0.248342,171,0.22861,0,0,12.0,0.236923,2243.0,0.347911,10889.0,0.268899,174.666672,0.217906,14,0.28,2969,0.449508,13694,0.345808,200,0.252525,1,1.0
1370127,4954,3,27,0,76,2,0.042553,679,0.123996,143,0.049878,0,0.0,0,0,1.666667,0.033913,806.0,0.132229,171.666672,0.058346,0.0,0.0,1,0.021739,724,0.11904,155,0.059115,0,0.0,0,0.0
1431943,5142,52,13,0,67,3,0.065217,1495,0.20956,1882,0.210892,44,0.226804,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.043478,1573,0.208317,1387,0.154627,15,0.076923,0,0.0
3775788,13370,33,22,0,49,23,0.46,777,0.122904,760,0.174713,2,0.022989,0,0,15.666667,0.307536,424.666656,0.06832,804.0,0.195132,1.0,0.012459,17,0.326923,577,0.091096,888,0.221778,1,0.012987,0,0.0
969774,3606,42,13,0,55,5,0.108696,3253,0.455985,9446,0.195569,251,0.239048,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,0.065217,3605,0.47742,10216,0.204124,300,0.275735,0,0.0
1204699,4396,2,31,0,23,3,0.071429,942,0.184417,3642,0.555861,130,0.833333,0,0,3.333333,0.071076,750.0,0.14223,2769.333252,0.423546,63.333332,0.418649,3,0.06383,763,0.14334,2759,0.419365,82,0.535948,0,0.0
4386318,15296,49,26,0,63,15,0.326087,675,0.110983,2076,0.347157,28,0.215385,0,0,19.0,0.385106,1022.666687,0.162469,3124.333252,0.476623,51.0,0.383152,12,0.255319,738,0.121923,2130,0.321412,40,0.283688,1,1.0
3707554,13059,53,17,0,40,1,0.020408,1737,0.259991,15850,0.217825,193,0.129966,0,0,1.0,0.020994,1652.333374,0.240147,17413.666016,0.224149,221.666672,0.138054,1,0.020408,1605,0.24009,14765,0.195033,163,0.105502,0,0.0
1245859,4553,35,28,0,55,6,0.136364,1322,0.250237,6355,0.206922,184,0.26361,0,0,2.666667,0.056892,1465.666626,0.249539,8017.666504,0.230966,187.0,0.250637,2,0.042553,1346,0.2458,7156,0.213243,150,0.210084,0,0.0


In [65]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'item_block_units', 'item_block_mean',
       'shop_block_units', 'shop_block_mean', 'cat_block_units',
       'cat_block_mean', 'shop_cat_block_units', 'shop_cat_block_mean',
       'shop_item_block_units', 'shop_item_block_mean',
       'item_block_units_rolling_3', 'item_block_mean_rolling_3',
       'shop_block_units_rolling_3', 'shop_block_mean_rolling_3',
       'cat_block_units_rolling_3', 'cat_block_mean_rolling_3',
       'shop_cat_block_units_rolling_3', 'shop_cat_block_mean_rolling_3',
       'item_block_units_lag_1', 'item_block_mean_lag_1',
       'shop_block_units_lag_1', 'shop_block_mean_lag_1',
       'cat_block_units_lag_1', 'cat_block_mean_lag_1',
       'shop_cat_block_units_lag_1', 'shop_cat_block_mean_lag_1',
       'shop_item_block_units_lag_1', 'shop_item_block_mean_lag_1',
       'item_units', 'item_share_of_total_units', 'cat_units',
       'category_share_of_total_units

In [62]:
gc.collect()

ZEROS_KEEP=1.75


#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['item_cnt_block']







x_val = training[training['date_block_num'] == 33]
y_val = x_val['item_cnt_block']



In [136]:

features = [
    
    

       'item_block_mean_rolling_3',
       'shop_block_mean_rolling_3',
      
      'item_block_mean_lag_1',
        'shop_block_mean_lag_1',
    
      
       #'item_me', 'shop_me',
       #'shop_item_block_mean_rolling_3', 

   
    
    
]




In [137]:


gc.collect()
params =   {
    'objective' : 'reg:linear',
    'tree_method':'gpu_hist',
    #'gpu_id': 0,
    'learning_rate': 0.01, 
    #'gamma' : 0.3, 
    #'min_child_weight' : 3,
    #'nthread' : 16,
    #'max_depth' : 7,
    #'subsample' : 0.7, 
    #'colsample_bytree' : 0.7, 
    'seed':42, 
    'eval_metric' : "rmse",
    'num_boost_round' : 70000,
    #'n_estimators':999,
    #'max_leaves': 300
}


tr_data = xgb.DMatrix(x_train[features], y_train)
va_data = xgb.DMatrix(x_val[features], y_val)


watchlist = [(tr_data, 'train'), (va_data, 'valid')]

xg_model = xgb.train(params, tr_data, 700, watchlist, maximize=False, early_stopping_rounds = 10, verbose_eval=10)

[0]	train-rmse:1.20313	valid-rmse:1.15867
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 10 rounds.
[10]	train-rmse:1.16903	valid-rmse:1.13286
[20]	train-rmse:1.14026	valid-rmse:1.11139
[30]	train-rmse:1.11605	valid-rmse:1.09364
[40]	train-rmse:1.09571	valid-rmse:1.07898
[50]	train-rmse:1.07865	valid-rmse:1.0669
[60]	train-rmse:1.06435	valid-rmse:1.05704
[70]	train-rmse:1.05241	valid-rmse:1.04899
[80]	train-rmse:1.0424	valid-rmse:1.04238
[90]	train-rmse:1.03407	valid-rmse:1.03699
[100]	train-rmse:1.02711	valid-rmse:1.03263
[110]	train-rmse:1.02128	valid-rmse:1.02905
[120]	train-rmse:1.01638	valid-rmse:1.02615
[130]	train-rmse:1.01228	valid-rmse:1.02381
[140]	train-rmse:1.00882	valid-rmse:1.02188
[150]	train-rmse:1.00593	valid-rmse:1.02031
[160]	train-rmse:1.0035	valid-rmse:1.01902
[170]	train-rmse:1.00144	valid-rmse:1.01798
[180]	train-rmse:0.999718	valid-rmse:1.01713
[190]	train-rmse:0.998231	valid-

In [138]:
scores = xg_model.get_score(importance_type='gain')


sorted(scores.items(), key=lambda x: x[1])[::-1]

[('item_block_mean_lag_1', 19504.882229792027),
 ('shop_block_mean_lag_1', 3929.239545098096),
 ('shop_block_mean_rolling_3', 2095.1481993457382),
 ('item_block_mean_rolling_3', 840.9126235229342)]

In [48]:
features = [item[0] for item in scores.items() if item[1] > 2000]

In [57]:
features = [
    
    

       'item_block_mean_rolling_3',
       'shop_block_mean_rolling_3',
      
      'item_block_mean_lag_1',
        'shop_block_mean_lag_1',
    
      
       'item_me', 'shop_me',
       #'shop_item_block_mean_rolling_3', 

   
    
    
]


Unnamed: 0,item_cnt_block,shop_item_block_mean,shop_item_block_mean_lag_1,shop_item_block_mean_rolling_3
1184700,0,0,1.0,0.666667
471946,3,3,0.0,0.0
4716799,0,0,0.0,0.0
4986340,0,0,0.0,0.0
972458,0,0,0.0,0.333333
2749164,0,0,0.0,0.0
5728019,0,0,0.0,0.0
4382448,0,0,0.0,0.0
3153700,0,0,0.0,0.0
4655254,0,0,0.0,0.0


In [140]:
test            = pd.read_csv('test.csv.gz')

In [132]:
item_features = [ 
    'item_me'
                ]

merge_col = ['item_id']
cols=item_features+merge_col

test.merge(training.drop_duplicates('item_id')[cols], on=merge_col, how='left')['item_me'].describe()

count    198324.0
mean     0.0     
std      0.0     
min      0.0     
25%      0.0     
50%      0.0     
75%      0.0     
max      0.0     
Name: item_me, dtype: float64

In [106]:
shop_features = [
        'shop_me'
]

merge_col = ['shop_id']
cols=shop_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [141]:
def add_rolls_test(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name})\
            [cols+[roll_name]]
        
        print([cols[0:len(cols)-1]+[roll_name]])
        thirty_three = block_units_rolling_temp[block_units_rolling_temp['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1]+[roll_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')
    

        del block_units_rolling_temp
        gc.collect()
        

    
    return df
    

test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_rolls_test(test, ['shop_id','date_block_num'], 'shop_block_mean')

item_block_mean 3
[['item_id', 'item_block_mean_rolling_3']]
shop_block_mean 3
[['shop_id', 'shop_block_mean_rolling_3']]


In [142]:
def add_lags_test(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()
        
        thirty_three = result[result['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1] + [lag_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')

        gc.collect()
    
    return df
                                         

                                        
test = add_lags_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_lags_test(test, ['shop_id','date_block_num'], 'shop_block_mean')

item_block_mean 1
shop_block_mean 1


In [143]:
test.fillna(0, inplace=True)

In [144]:
test.sample(10)

Unnamed: 0,ID,shop_id,item_id,item_block_mean_rolling_3,shop_block_mean_rolling_3,item_block_mean_lag_1,shop_block_mean_lag_1
187042,187042,34,14284,0.046713,0.081856,0.023256,0.082203
156393,156393,55,17598,0.0,0.28841,0.0,0.298722
97405,97405,19,1847,2.374207,0.261862,2.395349,0.234218
103925,103925,42,8110,0.0,0.664324,0.0,0.677483
4754,4754,5,18339,0.061663,0.213701,0.046512,0.2059
26685,26685,7,11252,0.257005,0.243393,0.162791,0.244444
181931,181931,38,14550,0.163034,0.254023,0.139535,0.250344
184261,184261,34,4102,0.334734,0.081856,0.232558,0.082203
65901,65901,22,1789,0.141037,0.222462,0.093023,0.245821
49313,49313,31,16262,0.0,1.005386,0.0,1.025959


In [145]:
xg_preds = xg_model.predict(xgb.DMatrix(test[features]))
xg_preds.clip(0,20,out=xg_preds)

array([1.6024166 , 0.39750373, 1.1375508 , ..., 0.04789776, 0.04789776,
       0.06536511], dtype=float32)

In [146]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = xg_preds.astype(int)
#submission['item_cnt_month'] = ensemble_preds.astype(int)


submission.to_csv('submission.csv', index=False)