In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd
from sklearn.model_selection import KFold
from itertools import product

In [3]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [4]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'] != 2013]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [5]:
# Якутск Орджоникидзе, 56
sales_train.loc[sales_train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
sales_train.loc[sales_train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
sales_train.loc[sales_train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

In [6]:
sums = sales_train.groupby('item_id')['item_cnt_day'].sum().reset_index().rename(columns={"item_cnt_day":"item_total_sales"}).sort_values(by='item_total_sales')

ids_reject = sums[(sums['item_total_sales'] > 0) & (sums['item_total_sales'] < 1000)]['item_id'].unique()

#sums.groupby(pd.cut(sums["item_total_sales"], np.arange(0, 16000, 1000))).count()#.cumsum()


In [7]:
def get_number_of_days_since_start(day,month, year):
    days = 0
    if year == 2015:
        days = 365
    def is_even(num):
        return num % 2 == 0
    half_of_month = int(month/2)
    even = (30*half_of_month) + (31*half_of_month)
    if is_even(month):
        days = days + even - 30 - day
    else:
        days = days + even + day
    return days

sales_train['item_days_since_start'] = pd.to_numeric(sales_train.apply(lambda row: get_number_of_days_since_start(row['day'],row['month'], row['year']),axis=1), downcast='unsigned') 

In [8]:
train_item_ids = sales_train['item_id'].unique()
train_item_ids = np.setdiff1d(train_item_ids, ids_reject)
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [9]:
combinations = []

for dbn in range(np.min(train_blocks), np.max(train_blocks)+1):
    sales = sales_train[sales_train.date_block_num==dbn]
    dbn_combos = list(product(sales.shop_id.unique(), sales.item_id.unique(), [dbn]))
    for combo in dbn_combos:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])

In [10]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"item_cnt_block"})

training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)


training['item_cnt_block'] = training['item_cnt_block'].clip(0,20).astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

for col in ['item_id', 'shop_id', 'item_category_id']:
    training[col] = pd.to_numeric(training[col], downcast='unsigned')

In [11]:
dates = sales_train[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
training['month'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')


In [205]:
training["shop_cat"] = training["shop_id"].astype(str) + "_" + training["item_category_id"].astype(str)

In [208]:
training["shop_item"] = training["shop_id"].astype(str) + "_" + training["item_id"].astype(str)

In [12]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
#columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]
columns = ["item_id", "shop_id", "item_category_id", "month",  "date_block_num"]



y_train = training["item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1

fold 1
fold 2
fold 3
fold 4
fold 5


In [102]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
#columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]



y_train = training["item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    #print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    total = training.iloc[in_fold_index].drop_duplicates(["item_id", "date_block_num"])['item_cnt_block'].sum()
    #print(total)
    fold_sum = training.iloc[in_fold_index].drop_duplicates(["item_id", "date_block_num"]).groupby("item_id")['item_cnt_block'].sum() *100 / total
    #print(fold_sum)
        #x_validation[column + "_mean_target"] = means\
    name = 'item_share_mean_encoding'
    training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index]["item_id"].map(fold_sum)
    i+=1

In [129]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
#columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]



y_train = training["item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    total = training.iloc[in_fold_index]['item_cnt_block'].sum()
    print(total)
    #print(total)
    fold_sum = training.iloc[in_fold_index].groupby("shop_id")['item_cnt_block'].sum() *100 / total
    #print(fold_sum)
        #x_validation[column + "_mean_target"] = means\
    name = 'shop_share_mean_encoding'
    training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index]["shop_id"].map(fold_sum)
    i+=1

fold 1
1479313
fold 2
1477524
fold 3
1478328
fold 4
1479973
fold 5
1479606


In [168]:
training['shares_comp_1'] = training['shop_share_mean_encoding'] * training['item_share_mean_encoding']

In [137]:
training['shop_share_mean_encoding'].sample(10)

3084838    1.775284
3650657    4.145494
1147000    1.678950
2130356    1.368201
5712745    1.797438
3441891    0.800799
1453019    2.157353
2955746    1.590093
3940641    6.478683
166357     2.184702
Name: shop_share_mean_encoding, dtype: float64

In [128]:
training.drop(columns=["shop_share_mean_encoding"],inplace=True)

In [25]:
sales_train["item_price_block_mean"] = sales_train.groupby(["item_id","date_block_num"])["item_price"].transform(np.mean)
sales_train["item_price_block_mean_max"] = sales_train.groupby("item_id")["item_price_block_mean"].transform(np.max)

training = training.merge(sales_train.drop_duplicates(["item_id", "date_block_num"])[["item_id", "date_block_num", "item_price_block_mean"]], how="left", on=["item_id", "date_block_num"])
training = training.merge(sales_train.drop_duplicates(["item_id"])[["item_id", "item_price_block_mean_max"]], how="left", on=["item_id"])

In [36]:
training["price_block_var_max"] = (training["item_price_block_mean_max"] - training["item_price_block_mean"]) / training["item_price_block_mean_max"]

In [41]:
def add_block_units_mean(df, cols, name):
    print(name)
    name_units = name + '_units'
    name_mean = name + '_mean'
    
    try:
        df.drop(columns=[name_units, name_mean],inplace=True)
    except:
        pass

    
    block_units = df.groupby(cols,as_index=False)['item_cnt_block'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_units})
    df = df.merge(block_units, on=cols, how='left')
    df[name_units].fillna(0,inplace=True)
    df[name_units] = pd.to_numeric(df[name_units].astype(int),downcast='unsigned')
    del block_units
    
    block_means = df.groupby(cols,as_index=False)['item_cnt_block'].mean()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_mean})
    df = df.merge(block_means, on=cols, how='left')
    df[name_mean].fillna(0,inplace=True)
    df[name_mean] = pd.to_numeric(df[name_mean],downcast='float')
    del block_means
    
    gc.collect()
    return df


training = add_block_units_mean(training, ['item_id','date_block_num'], 'item_block')
training = add_block_units_mean(training, ['shop_id','date_block_num'], 'shop_block')
training = add_block_units_mean(training, ['item_category_id','date_block_num'], 'cat_block')
training = add_block_units_mean(training, ['shop_id', 'item_category_id','date_block_num'], 'shop_cat_block')
training = add_block_units_mean(training, ['shop_id', 'item_id','date_block_num'], 'shop_item_block')

item_block
shop_block
cat_block
shop_cat_block
shop_item_block


In [218]:
number_of_items = sales_train['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = sales_train['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = sales_train['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = sales_train['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = sales_train['item_cnt_day'].sum()
print("total_sales:", total_sales)
average_price = sales_train['item_price'].mean()
print("average_price:", average_price)

training['item_units'] = pd.to_numeric(training.groupby(['date_block_num'])['item_block_units'].transform(np.sum),downcast='unsigned')
training['cat_units'] = pd.to_numeric(training.groupby(['date_block_num'])['cat_block_units'].transform(np.sum),downcast='unsigned')
training['shop_units'] = pd.to_numeric(training.groupby(['date_block_num'])['shop_block_units'].transform(np.sum),downcast='unsigned')

training['item_share_of_total_units'] = pd.to_numeric(training['item_units'] * 100 / total_sales,downcast='float')
training['category_share_of_total_units'] = pd.to_numeric(training['cat_units'] * 100 / total_sales,downcast='float')
training['shop_share_of_units'] = pd.to_numeric(training['shop_units'] * 100 / total_sales,downcast='float')
training['shop_item_units'] = pd.to_numeric(training.groupby(['date_block_num'])\
                                            ['shop_item_block_units'].transform(np.sum),downcast='unsigned')

training['shop_item_share_of_total_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / total_sales,downcast='float')
training['shop_item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100\
                        / training['shop_units'],downcast='float')


training['item_share_of_shop_units'] = pd.to_numeric(training['shop_item_units'] * 100 / training['shop_units'],downcast='float')

training['shop_item_share_of_shop_units_mean'] = training.groupby('item_id')['shop_item_share_of_shop_units'].transform(np.mean)


number_of_items: 17054
number_of_categories: 79
number_of_shops: 55
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473.0
average_price: 1015.5023073772021


In [24]:
def add_min_max_quantiles(df, cols, name):
    print(name)

    block_name = name+'_block_units'
    units_name = name+'_units'
    max_name = name+'_max_units_block'
    min_name = name+'_min_units_block'
    
    try:
        df.drop(columns=[units_name, max_name, min_name, min_max_name],inplace=True)
    except:
        pass


    df[units_name] = pd.to_numeric(df.groupby(['date_block_num'])[block_name].transform(np.sum), downcast='unsigned')
    df[max_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.max), downcast='unsigned')
    df[min_name] = pd.to_numeric(df.groupby(cols)[block_name].transform(np.min), downcast='unsigned')
    


    for q in [0.25,0.50,0.75]:
        qname = name+'_minmax_q' + str(q)
        try:
            df.drop(columns=[qname],inplace=True)
        except:
            pass
        df[qname] =  pd.to_numeric(df[[min_name,max_name]].quantile(q,axis=1), downcast='unsigned')
        
    return df

training = add_min_max_quantiles(training, ['item_id'], 'item')
training = add_min_max_quantiles(training, ['shop_id'], 'shop')
training = add_min_max_quantiles(training, ['item_category_id'], 'cat')
training = add_min_max_quantiles(training, ['shop_id','item_category_id'], 'shop_cat')
training = add_min_max_quantiles(training, ['shop_id','item_id'], 'shop_item')

item
shop
cat
shop_cat
shop_item


In [42]:
def add_rolls(df, cols, name, rolls = [2]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name_tmp})\
            [cols+[roll_name_tmp]]
        
    
        df = df.merge(block_units_rolling_temp, on=cols, how='left')
        #print(df.columns.values)
        del block_units_rolling_temp
        gc.collect()
        

        block_units_rolling = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [roll_name_tmp].shift(1)\
            .rename(columns={roll_name_tmp:roll_name}).reset_index()

        df = df.merge(block_units_rolling, on=cols, how='left')
        df[roll_name].fillna(0,inplace=True)
        df[roll_name] = pd.to_numeric(df[roll_name], downcast='float')
        df.drop(columns=[roll_name_tmp], inplace=True)
        del block_units_rolling
        gc.collect()
    
    return df
    

#training = add_rolls(training, ['item_id','date_block_num'], 'item_block_units')
training = add_rolls(training, ['item_id','date_block_num'], 'item_block_mean')
#training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_mean')
#training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_units')
training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_mean')
#training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
#training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item')

item_block_mean 2
shop_block_mean 2
cat_block_mean 2


In [26]:
training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3


In [56]:
def add_lags(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()

        df = df.merge(result, on=cols, how='left')
        df[lag_name].fillna(0,inplace=True)
        if "mean" in name or "var" in name:
            df[lag_name] = pd.to_numeric(df[lag_name], downcast='float')
        else:
            df[lag_name] = pd.to_numeric(df[lag_name].astype(int), downcast='unsigned')
        del result
        gc.collect()
    
    return df
                                         

                                        
#training = add_lags(training, ['item_id','date_block_num'], 'item_block_units')
training = add_lags(training, ['item_id','date_block_num'], 'item_block_mean')
#training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_mean')
#training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_units')
#training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_mean')
#training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
#training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
#training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')
#training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')
training = add_lags(training, ['item_id','date_block_num'], 'price_block_var_max')

In [28]:
training['shop_block_units_lag_comp1'] = pd.to_numeric(training['shop_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

#training['shop_share_item_units_comp'] = training['item_units'] * training['shop_share_of_units']
training['item_block_units_lag_comp1'] = pd.to_numeric(training['item_block_units_lag_1'] * training['item_share_of_shop_units'],downcast='unsigned')

In [44]:
first_day = sales_train.groupby('item_id')['item_days_since_start'].min()
training['first_day'] = training['item_id'].map(first_day)

In [78]:
training["price_block_var_max_lag_1"].describe()

count    6.425094e+06
mean     1.245043e-01
std      1.991237e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      1.934114e-01
max      9.997499e-01
Name: price_block_var_max_lag_1, dtype: float64

In [138]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'item_id_mean_encoding',
       'shop_id_mean_encoding', 'item_category_id_mean_encoding',
       'month_mean_encoding', 'date_block_num_mean_encoding',
       'item_price_block_mean', 'item_price_block_mean_max',
       'price_block_var_max', 'item_block_units', 'item_block_mean',
       'shop_block_units', 'shop_block_mean', 'cat_block_units',
       'cat_block_mean', 'shop_cat_block_units', 'shop_cat_block_mean',
       'shop_item_block_units', 'shop_item_block_mean',
       'item_block_mean_rolling_2', 'shop_block_mean_rolling_2',
       'cat_block_mean_rolling_2', 'item_block_mean_lag_1',
       'shop_block_mean_lag_1', 'first_day', 'price_block_var_max_lag_1',
       'item_share_mean_encoding', 'shop_share_mean_encoding'], dtype=object)

In [77]:

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)

training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_block,item_category_id,month,item_id_mean_encoding,shop_id_mean_encoding,item_category_id_mean_encoding,month_mean_encoding,date_block_num_mean_encoding,item_price_block_mean,item_price_block_mean_max,price_block_var_max,item_block_units,item_block_mean,shop_block_units,shop_block_mean,cat_block_units,cat_block_mean,shop_cat_block_units,shop_cat_block_mean,shop_item_block_units,shop_item_block_mean,item_block_mean_rolling_2,shop_block_mean_rolling_2,cat_block_mean_rolling_2,item_block_mean_lag_1,shop_block_mean_lag_1,first_day,price_block_var_max_lag_1
5658884,19339,33,26,0,49,3,0.20341,0.096296,0.191326,0.288238,0.274482,349.0,349.0,0.0,16,0.347826,483,0.079415,818,0.211698,1,0.011905,0,0,0.10383,0.141382,0.173803,0.12766,0.126549,3,0.0
3928224,13900,14,22,0,58,11,0.048062,0.175222,0.044658,0.310501,0.310501,1199.0,1349.0,0.111193,1,0.02,1327,0.209902,583,0.049617,6,0.025532,0,0,0.038846,0.178879,0.040489,0.057692,0.171298,4,0.111193
2399669,8648,27,13,0,40,2,0.025316,0.57031,0.239021,0.293999,0.308179,98.0,98.0,0.0,1,0.021739,3804,0.533221,18061,0.238972,502,0.305539,0,0,0.0,0.0,0.0,0.0,0.63581,16,0.0
5461740,18714,46,21,0,40,10,0.163695,0.296664,0.23891,0.270875,0.27966,149.0,149.0,0.0,8,0.153846,1742,0.275024,13334,0.22977,207,0.185484,0,0,0.238431,0.316886,0.228816,0.32,0.272937,187,0.030201
5964436,20355,16,29,0,72,6,0.334511,0.214053,0.236904,0.269761,0.263904,1249.0,1249.0,0.0,3,0.069767,1012,0.194018,1323,0.194731,14,0.088608,0,0,0.14265,0.186526,0.166604,0.136364,0.201211,1,0.0
1712310,6162,50,31,0,30,8,1.6341,0.245634,0.995954,0.289646,0.287298,699.0,699.0,0.0,11,0.261905,1061,0.207713,6138,0.902116,102,0.62963,0,0,0.255814,0.187593,0.786941,0.27907,0.205899,183,0.028404
527241,2135,16,16,0,40,5,0.059524,0.214527,0.238922,0.26409,0.268676,149.0,149.0,0.0,2,0.040816,1212,0.181301,14765,0.195033,195,0.126214,0,0,0.072491,0.188853,0.238706,0.040816,0.1646,2,0.0
5952771,20327,44,27,0,43,4,0.110159,0.168798,0.08117,0.251095,0.243817,399.0,399.0,0.0,1,0.021277,827,0.151023,400,0.068085,0,0.0,0,0,0.096207,0.149816,0.081886,0.043478,0.149129,3,0.0
4839535,16620,57,14,1,37,3,0.025723,0.558303,0.163651,0.289522,0.299465,299.0,499.0,0.400802,2,0.041667,4532,0.634556,6319,0.152368,377,0.436343,1,1,0.0,0.624228,0.172559,0.043478,0.654892,16,0.400802
254576,1114,48,25,0,37,2,0.166223,0.205743,0.164367,0.294767,0.280182,251.136154,299.0,0.16008,13,0.276596,1103,0.182224,4830,0.152926,55,0.081845,0,0,0.19,0.276224,0.209181,0.18,0.223,4,0.199383


In [260]:
from sklearn.preprocessing import StandardScaler

cols =  ['item_id_mean_encoding', 'shop_id_mean_encoding',
       'item_category_id_mean_encoding', 'month_mean_encoding',
       'date_block_num_mean_encoding', 'first_day']


training[cols] = StandardScaler().fit_transform(training[cols])

In [139]:
training.fillna(0, inplace=True)

In [170]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'item_id_mean_encoding',
       'shop_id_mean_encoding', 'item_category_id_mean_encoding',
       'month_mean_encoding', 'date_block_num_mean_encoding',
       'item_price_block_mean', 'item_price_block_mean_max',
       'price_block_var_max', 'item_block_units', 'item_block_mean',
       'shop_block_units', 'shop_block_mean', 'cat_block_units',
       'cat_block_mean', 'shop_cat_block_units', 'shop_cat_block_mean',
       'shop_item_block_units', 'shop_item_block_mean',
       'item_block_mean_rolling_2', 'shop_block_mean_rolling_2',
       'cat_block_mean_rolling_2', 'item_block_mean_lag_1',
       'shop_block_mean_lag_1', 'first_day', 'price_block_var_max_lag_1',
       'item_share_mean_encoding', 'shop_share_mean_encoding',
       'shares_comp_1'], dtype=object)

In [169]:
gc.collect()

ZEROS_KEEP=0.25


#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['item_cnt_block']





x_val = training[training['date_block_num'] == 33]
y_val = x_val['item_cnt_block']

pos_val_len = len(y_val[y_val != 0])
print("pos_val_len", pos_val_len)

zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
print("zeros_keep_indices_val", len(zeros_keep_indices_val))
non_zeros_val_indices = y_val[y_val != 0].index
print("non_zeros_val_indices", len(non_zeros_val_indices))

val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

y_val = y_val.loc[val_indices]
x_val = x_val.loc[val_indices]



pos_val_len 31471
zeros_keep_indices_val 125884
non_zeros_val_indices 31471


In [264]:

features = [
    
    
        'item_category_id',
       'item_block_mean_rolling_3',
       'shop_block_mean_rolling_3',
           'shop_cat_block_mean_rolling_3',



      'item_block_mean_lag_1',
        'shop_block_mean_lag_1',
            'shop_cat_block_mean_lag_1',
    
    'shop_item_share_of_shop_units_mean',
    'shop_item_block_mean_rolling_3',
    'shop_item_block_mean_lag_1',
    
    'cat_me_real'

]




In [167]:
training.sample(20)

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_block,item_category_id,month,item_id_mean_encoding,shop_id_mean_encoding,item_category_id_mean_encoding,month_mean_encoding,date_block_num_mean_encoding,item_price_block_mean,item_price_block_mean_max,price_block_var_max,item_block_units,item_block_mean,shop_block_units,shop_block_mean,cat_block_units,cat_block_mean,shop_cat_block_units,shop_cat_block_mean,shop_item_block_units,shop_item_block_mean,item_block_mean_rolling_2,shop_block_mean_rolling_2,cat_block_mean_rolling_2,item_block_mean_lag_1,shop_block_mean_lag_1,first_day,price_block_var_max_lag_1,item_share_mean_encoding,shop_share_mean_encoding
4028410,14221,16,23,0,57,12,0.093284,0.216361,0.094046,0.445594,0.445594,272.6,299.0,0.088294,5,0.1,2332,0.353066,1003,0.118,0,0.0,0,0,0.118077,0.226394,0.086922,0.14,0.247074,4,0.0,0.0,1.582596
4572436,15895,7,15,0,69,4,0.028571,0.266643,0.262677,0.251095,0.25674,1399.0,1399.0,0.0,1,0.020408,1489,0.219811,524,0.157263,5,0.073529,0,0,0.0,0.287627,0.253145,0.0,0.266732,86,0.0,0.0,1.952873
2807412,10192,14,22,0,43,11,0.087312,0.172315,0.081178,0.313143,0.313143,249.0,249.0,0.0,6,0.12,1327,0.209902,614,0.081325,2,0.013245,0,0,0.107692,0.178879,0.0758,0.115385,0.171298,9,0.0,0.0,1.284356
3273021,11575,48,33,0,37,10,0.385496,0.204581,0.163651,0.269703,0.257867,149.0,199.0,0.251256,22,0.5,1055,0.194901,3967,0.256864,58,0.165242,0,0,0.342193,0.235212,0.196947,0.255814,0.214159,2,0.274098,0.0104,1.251278
2883512,10413,4,16,0,40,5,0.05102,0.16945,0.23939,0.266559,0.270215,58.0,98.0,0.408163,1,0.020408,1142,0.17083,14765,0.195033,119,0.077023,0,0,0.061862,0.157864,0.238706,0.061224,0.149247,11,0.0,0.0,1.254999
1025991,3814,18,30,0,55,7,0.11639,0.21808,0.225109,0.25591,0.257942,499.0,499.0,0.0,1,0.023256,977,0.183543,6475,0.216664,149,0.214388,0,0,0.080338,0.220587,0.207113,0.069767,0.219709,1,0.0,0.0,1.584654
4160158,14588,3,13,0,37,2,0.032328,0.127999,0.164367,0.294767,0.307412,244.5,299.0,0.182274,2,0.043478,871,0.122091,6026,0.15127,25,0.028868,0,0,0.0,0.0,0.0,0.0,0.127665,22,0.0,0.005217,0.939477
6277561,21685,18,23,0,58,12,0.0,0.216117,0.044431,0.445455,0.445455,1099.0,1099.0,0.0,1,0.02,1902,0.287964,808,0.060524,3,0.011236,0,0,0.0,0.185943,0.043664,0.0,0.210851,320,0.0,0.0,1.58799
4839905,16621,28,12,0,37,1,0.091185,0.687664,0.163438,0.31397,0.312757,299.0,299.0,0.0,6,0.130435,6134,0.812343,7517,0.193847,571,0.677343,0,0,0.0,0.0,0.0,0.0,0.0,2,0.0,0.0,4.993467
3644293,12838,54,19,2,40,8,0.094949,0.757787,0.239021,0.288944,0.291233,198.914286,199.0,0.000431,7,0.137255,5445,0.855728,15631,0.233072,1223,0.930038,2,2,0.0,0.757244,0.207663,0.0,0.704129,184,0.0,0.005225,4.265494


In [135]:
training.fillna(0,inplace=True)

In [179]:
features = [
    
   'item_id_mean_encoding',
       'shop_id_mean_encoding',
    'item_category_id_mean_encoding', 
     # 'month_mean_encoding',
     #'shop_cat_mean_encoding',
       #'shop_item_mean_encoding',
    #'date_block_num_mean_encoding', 
    #'first_day',
      'item_block_mean_rolling_2',# 'item_block_mean_rolling_3',
       #'item_block_mean_rolling_6',#, 'item_block_mean_rolling_6',
       #'shop_block_mean_rolling_2', #'shop_block_mean_rolling_3',
       #'shop_block_mean_rolling_6',# 'shop_block_mean_rolling_6',
       #'cat_block_mean_rolling_2', 'cat_block_mean_rolling_3',
       #'cat_block_mean_rolling_4', 'cat_block_mean_rolling_6',
    'item_block_mean_lag_1', 
    'shop_block_mean_lag_1',
    'price_block_var_max_lag_1',
     'item_share_mean_encoding', 
    'shop_share_mean_encoding',
'shares_comp_1'
]


gc.collect()
lgtrain = lgbm.Dataset(x_train[features], label=y_train)
lgval = lgbm.Dataset(x_val[features], label=y_val)



#[0.00542047893814942, 29, 24, 0.39949465609514856, 1, 0.67943500, 10]
params = {
        "num_threads": 16,
        #"device": "gpu",
        "verbosity": -1,
        #"zero_as_missing": "true",
        "boosting":'gbdt',
        "objective" : "regression",
        "metric" : "rmse",
        "seed": 42,
        #"max_bin": 10,#default 255
        #"num_leaves": 10, #default 31
        #"bagging_fraction": 0.3,
        #"bagging_freq": 1,
        #"min_data_in_leaf": 50000,
        "feature_fraction": 0.5,
        #"lambda_l2": 3,
        #"max_depth": 2,
        #"min_gain_to_split": 10,
        "learning_rate" : 0.01,
        #"histogram_pool_size": 1000,
        #"categorical_column": [0,1,2,3,4]
}

evals_result = {}
lg_model = lgbm.train(params, lgtrain, 20000, 
                      valid_sets=[lgval], 
                      early_stopping_rounds=1, 
                      verbose_eval=10, 
                      evals_result=evals_result)

scores = {}
for i,score in enumerate(lg_model.feature_importance()):
    scores[features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

Training until validation scores don't improve for 1 rounds.
[10]	valid_0's rmse: 1.34513
[20]	valid_0's rmse: 1.31126
[30]	valid_0's rmse: 1.28056
[40]	valid_0's rmse: 1.25846
[50]	valid_0's rmse: 1.23245
[60]	valid_0's rmse: 1.21283
[70]	valid_0's rmse: 1.19461
[80]	valid_0's rmse: 1.1797
[90]	valid_0's rmse: 1.16478
[100]	valid_0's rmse: 1.15322
[110]	valid_0's rmse: 1.14394
[120]	valid_0's rmse: 1.13591
[130]	valid_0's rmse: 1.1288
[140]	valid_0's rmse: 1.11935
[150]	valid_0's rmse: 1.1111
[160]	valid_0's rmse: 1.10496
[170]	valid_0's rmse: 1.10105
[180]	valid_0's rmse: 1.09614
[190]	valid_0's rmse: 1.09301
[200]	valid_0's rmse: 1.09032
[210]	valid_0's rmse: 1.08762
[220]	valid_0's rmse: 1.08452
[230]	valid_0's rmse: 1.08195
[240]	valid_0's rmse: 1.08006
[250]	valid_0's rmse: 1.07822
[260]	valid_0's rmse: 1.07648
[270]	valid_0's rmse: 1.07524
[280]	valid_0's rmse: 1.0734
[290]	valid_0's rmse: 1.07221
[300]	valid_0's rmse: 1.07088
[310]	valid_0's rmse: 1.07014
[320]	valid_0's rmse: 

[('item_category_id_mean_encoding', 1755),
 ('item_block_mean_lag_1', 1585),
 ('shop_id_mean_encoding', 1569),
 ('item_id_mean_encoding', 1354),
 ('shop_block_mean_lag_1', 1200),
 ('shop_share_mean_encoding', 1181),
 ('item_share_mean_encoding', 1083),
 ('item_block_mean_rolling_2', 1029),
 ('shares_comp_1', 821),
 ('price_block_var_max_lag_1', 753)]

In [265]:
cb_model = CatBoostRegressor(iterations=1000,
                             #learning_rate=0.05,
                             eval_metric='RMSE',
                             task_type = "GPU",
                             use_best_model=True,
                             od_type = "Iter",
                             od_wait = 1,
                             bagging_temperature = 30,
                             cat_features=[0],
                             random_seed = 42)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


cb_model.fit(x_train[features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

scores = {}
for i,score in enumerate(cb_model.get_feature_importance()):
    scores[features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

0:	learn: 1.2142134	test: 1.3007728	best: 1.3007728 (0)	total: 208ms	remaining: 3m 27s
1:	learn: 1.2048024	test: 1.2913237	best: 1.2913237 (1)	total: 422ms	remaining: 3m 30s
2:	learn: 1.1935868	test: 1.2809283	best: 1.2809283 (2)	total: 612ms	remaining: 3m 23s
3:	learn: 1.1839645	test: 1.2724598	best: 1.2724598 (3)	total: 836ms	remaining: 3m 28s
4:	learn: 1.1767478	test: 1.2645890	best: 1.2645890 (4)	total: 1.01s	remaining: 3m 21s
5:	learn: 1.1705086	test: 1.2577772	best: 1.2577772 (5)	total: 1.22s	remaining: 3m 21s
6:	learn: 1.1622471	test: 1.2508170	best: 1.2508170 (6)	total: 1.4s	remaining: 3m 18s
7:	learn: 1.1528430	test: 1.2422691	best: 1.2422691 (7)	total: 1.62s	remaining: 3m 21s
8:	learn: 1.1469227	test: 1.2371112	best: 1.2371112 (8)	total: 1.83s	remaining: 3m 21s
9:	learn: 1.1379664	test: 1.2292195	best: 1.2292195 (9)	total: 2.03s	remaining: 3m 20s
10:	learn: 1.1309602	test: 1.2228338	best: 1.2228338 (10)	total: 2.22s	remaining: 3m 19s
11:	learn: 1.1239430	test: 1.2168717	best:

93:	learn: 0.9618758	test: 1.0653203	best: 1.0653203 (93)	total: 20.4s	remaining: 3m 16s
94:	learn: 0.9603815	test: 1.0641604	best: 1.0641604 (94)	total: 20.6s	remaining: 3m 15s
95:	learn: 0.9599992	test: 1.0639171	best: 1.0639171 (95)	total: 20.8s	remaining: 3m 15s
96:	learn: 0.9598076	test: 1.0637045	best: 1.0637045 (96)	total: 21s	remaining: 3m 15s
97:	learn: 0.9596032	test: 1.0636136	best: 1.0636136 (97)	total: 21.3s	remaining: 3m 16s
98:	learn: 0.9588431	test: 1.0626051	best: 1.0626051 (98)	total: 21.5s	remaining: 3m 16s
99:	learn: 0.9584068	test: 1.0621874	best: 1.0621874 (99)	total: 21.7s	remaining: 3m 15s
100:	learn: 0.9581137	test: 1.0617128	best: 1.0617128 (100)	total: 21.9s	remaining: 3m 15s
101:	learn: 0.9575106	test: 1.0612601	best: 1.0612601 (101)	total: 22.2s	remaining: 3m 15s
102:	learn: 0.9569574	test: 1.0608835	best: 1.0608835 (102)	total: 22.5s	remaining: 3m 15s
103:	learn: 0.9559245	test: 1.0600643	best: 1.0600643 (103)	total: 22.7s	remaining: 3m 15s
104:	learn: 0.9

185:	learn: 0.9294323	test: 1.0330034	best: 1.0330034 (185)	total: 40.8s	remaining: 2m 58s
186:	learn: 0.9286348	test: 1.0321979	best: 1.0321979 (186)	total: 41s	remaining: 2m 58s
187:	learn: 0.9282590	test: 1.0316546	best: 1.0316546 (187)	total: 41.2s	remaining: 2m 57s
188:	learn: 0.9281398	test: 1.0316287	best: 1.0316287 (188)	total: 41.4s	remaining: 2m 57s
189:	learn: 0.9281088	test: 1.0315856	best: 1.0315856 (189)	total: 41.6s	remaining: 2m 57s
190:	learn: 0.9279867	test: 1.0315170	best: 1.0315170 (190)	total: 41.9s	remaining: 2m 57s
191:	learn: 0.9279078	test: 1.0314832	best: 1.0314832 (191)	total: 42.1s	remaining: 2m 57s
192:	learn: 0.9271230	test: 1.0306199	best: 1.0306199 (192)	total: 42.3s	remaining: 2m 56s
193:	learn: 0.9269463	test: 1.0304205	best: 1.0304205 (193)	total: 42.5s	remaining: 2m 56s
194:	learn: 0.9268561	test: 1.0302560	best: 1.0302560 (194)	total: 42.9s	remaining: 2m 56s
195:	learn: 0.9266286	test: 1.0300495	best: 1.0300495 (195)	total: 43.1s	remaining: 2m 56s
1

[('item_block_mean_lag_1', 24.26178950360488),
 ('shop_item_block_mean_lag_1', 20.302278737207473),
 ('shop_item_block_mean_rolling_3', 11.060282556272488),
 ('shop_cat_block_mean_lag_1', 9.487496404150535),
 ('item_block_mean_rolling_3', 7.479725163588699),
 ('shop_cat_block_mean_rolling_3', 6.068712510620433),
 ('shop_item_share_of_shop_units_mean', 5.536159204862697),
 ('item_category_id', 5.138615825079335),
 ('cat_me_real', 4.480760033674159),
 ('shop_block_mean_lag_1', 3.542334724861135),
 ('shop_block_mean_rolling_3', 2.641845336078146)]

In [48]:
features = [item[0] for item in scores.items() if item[1] > 2000]

In [144]:
test            = pd.read_csv('test.csv.gz')
test = test.set_index('item_id').join(items.set_index('item_id'))
test.reset_index(inplace=True)
test['month'] = 11

In [145]:
item_features = [ 
    #'shop_item_share_of_shop_units_mean'
       'item_id_mean_encoding','first_day','price_block_var_max_lag_1','item_share_mean_encoding'
                ]

merge_col = ['item_id']
cols=item_features+merge_col

test = test.merge(training.drop_duplicates('item_id')[cols], on=merge_col, how='left')

In [146]:
shop_features = [
        #'shop_me'
           'shop_id_mean_encoding','shop_share_mean_encoding'

]

merge_col = ['shop_id']
cols=shop_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [147]:
cat_features = [
        'item_category_id_mean_encoding'
]

merge_col = ['item_category_id']
cols=cat_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [148]:
month_features = [
        'month_mean_encoding'
]

merge_col = ['month']
cols=month_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [312]:
shop_cat_features = [
        'shop_cat_mean_encoding'
]

merge_col = ['shop_id', 'item_category_id']
cols=shop_cat_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

KeyError: "['shop_cat_mean_encoding'] not in index"

In [173]:
shop_item_features = [
        'shares_comp_1'
]

merge_col = ['shop_id', 'item_id']
cols=shop_item_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [315]:
test["date_block_num_mean_encoding"] = training[training["date_block_num"] == 33]["date_block_num_mean_encoding"].mean()

In [149]:
def add_rolls_test(df, cols, name, rolls = [2]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name})\
            [cols+[roll_name]]
        
        print([cols[0:len(cols)-1]+[roll_name]])
        thirty_three = block_units_rolling_temp[block_units_rolling_temp['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1]+[roll_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')
    

        del block_units_rolling_temp
        gc.collect()
        

    
    return df
    

test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_mean')
#test = add_rolls_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
#test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')


item_block_mean 2
[['item_id', 'item_block_mean_rolling_2']]


In [225]:
test = add_rolls_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3
[['shop_id', 'item_id', 'shop_item_block_mean_rolling_3']]


In [150]:
def add_lags_test(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()
        
        thirty_three = result[result['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1] + [lag_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')

        gc.collect()
    
    return df
                                         

                                        
test = add_lags_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_lags_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_lags_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')


item_block_mean 1
shop_block_mean 1
shop_cat_block_mean 1


In [236]:
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 1


In [151]:
test.fillna(0, inplace=True)

In [152]:
test.sample(10)

Unnamed: 0,item_id,ID,shop_id,item_category_id,month,item_id_mean_encoding,first_day,price_block_var_max_lag_1,item_share_mean_encoding,shop_id_mean_encoding,shop_share_mean_encoding,item_category_id_mean_encoding,month_mean_encoding,item_block_mean_rolling_2,item_block_mean_lag_1,shop_block_mean_lag_1,shop_cat_block_mean_lag_1
181328,18430,74025,21,55,11,0.131313,127.0,0.0,0.0052,0.294226,2.152059,0.224442,0.312264,0.115486,0.162791,0.339626,0.336957
165505,16272,132020,47,76,11,0.0125,74.0,0.0,0.0,0.303571,2.203959,0.058174,0.312264,0.0,0.0,0.340216,0.0
27983,3106,60049,25,58,11,0.036066,68.0,0.0,0.0,0.892821,6.539454,0.04475,0.312264,0.0,0.0,0.912291,0.38835
111813,11772,47072,31,40,11,0.169811,548.0,0.0,0.0,1.115029,8.219083,0.23891,0.312264,0.184197,0.209302,1.025959,1.363985
130173,13379,79759,15,49,11,0.034884,492.0,0.0,0.0,0.262132,1.947673,0.193016,0.312264,0.022992,0.023256,0.234808,0.0
2551,428,159831,56,45,11,0.0,0.0,0.0,0.0,0.301832,2.226299,0.06062,0.312264,0.0,0.0,0.227139,0.0
121621,12917,159943,56,41,11,0.270758,2.0,0.0,0.0052,0.301832,2.226299,0.144415,0.312264,0.437104,0.465116,0.227139,0.123288
165017,16237,209376,45,64,11,0.165829,491.0,0.0,0.0,0.155119,1.126881,0.296234,0.312264,0.206395,0.162791,0.12881,0.117117
18545,2244,120395,53,55,11,0.499391,1.0,0.0,0.020867,0.245937,1.801114,0.224442,0.312264,0.116015,0.209302,0.230678,0.267081
77987,8078,180886,38,37,11,0.323245,3.0,0.0,0.0,0.230296,1.695113,0.163675,0.312264,0.114165,0.046512,0.250344,0.099707


In [116]:
cb_preds = cb_model.predict(test[features])
cb_preds.clip(0,20,out=cb_preds)

NameError: name 'cb_model' is not defined

In [271]:
print(np.mean(cb_preds))
print(np.max(cb_preds))

0.31713491408940697
11.488168775656959


In [180]:
lg_preds = lg_model.predict(test[features])
lg_preds.clip(0,20,out=lg_preds)

array([ 0.12275374,  0.10933448,  0.19876841, ...,  0.37588463,
        0.30116922,  0.32713306])

In [181]:
print(np.mean(lg_preds))
print(np.max(lg_preds))

0.32337496645
18.6660770782


In [294]:
print(np.mean(cb_preds))
print(np.max(cb_preds))

NameError: name 'cb_preds' is not defined

In [176]:
lg_preds[0:100]

array([ 0.07908165,  0.0634963 ,  0.15082108,  0.0459418 ,  0.04858499,
        0.09954873,  0.04873046,  0.08181469,  0.32883591,  0.76770589,
        0.09274791,  0.39107541,  0.07643415,  0.09745814,  0.11856599,
        0.09997729,  0.07519468,  0.08242333,  0.07538267,  0.10087557,
        0.23566746,  0.07527709,  0.0459418 ,  0.07733403,  0.07926696,
        0.12275062,  0.07827361,  0.29510159,  0.18239668,  0.07761547,
        0.05432266,  0.11267665,  0.04765718,  0.06559056,  0.10917819,
        0.07154058,  0.01490054,  0.11041081,  0.07310727,  0.07310426,
        0.04728614,  0.05917015,  0.15211987,  0.12699759,  0.32310179,
        0.10038549,  0.11522876,  0.21386266,  0.07838511,  0.1995819 ,
        0.5633854 ,  0.89312017,  0.1948569 ,  0.73396145,  0.18101508,
        0.21122307,  0.24597508,  0.19918643,  0.16885307,  0.19355932,
        0.14929483,  0.21600514,  0.48393435,  0.17279041,  0.10097897,
        0.19097236,  0.15812682,  0.25368054,  0.16294443,  0.46

In [182]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = lg_preds

submission.to_csv('submission.csv', index=False)

In [351]:
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_block,item_category_id,month,item_id_mean_encoding,shop_id_mean_encoding,item_category_id_mean_encoding,month_mean_encoding,date_block_num_mean_encoding,first_day,item_block_units,item_block_mean,shop_block_units,shop_block_mean,cat_block_units,cat_block_mean,shop_cat_block_units,shop_cat_block_mean,shop_item_block_units,shop_item_block_mean,item_block_mean_rolling_2,item_block_mean_rolling_3,item_block_mean_rolling_4,item_block_mean_rolling_6,shop_block_mean_rolling_2,shop_block_mean_rolling_3,shop_block_mean_rolling_4,shop_block_mean_rolling_6,cat_block_mean_rolling_2,cat_block_mean_rolling_3,cat_block_mean_rolling_4,cat_block_mean_rolling_6
1664544,5983,58,27,0,30,4,0.039655,0.347876,0.991726,0.250437,0.242989,14,1,0.021277,1629,0.29748,7826,0.867243,209,1.088542,0,0,0.061739,0.061159,0.05087,0.043782,0.377824,0.382034,0.457419,0.414623,0.862122,0.904061,0.977607,1.024919
2885211,10424,5,13,0,37,2,0.084084,0.19446,0.163523,0.293575,0.307729,8,1,0.021739,1191,0.166947,6026,0.15127,59,0.068129,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5083670,17482,48,30,0,37,7,0.304575,0.206393,0.163523,0.256729,0.260998,1,5,0.116279,1094,0.205523,3291,0.156834,40,0.081967,0,0,0.264006,0.296572,0.30395,0.329158,0.191532,0.186673,0.187645,0.192634,0.157556,0.16818,0.167541,0.167832
2419986,8708,22,25,0,37,2,0.04,0.214979,0.163523,0.293575,0.277288,68,1,0.021277,1221,0.201718,4830,0.152926,72,0.107143,0,0,0.059412,0.046411,0.045012,0.043755,0.260791,0.255428,0.247499,0.236772,0.209181,0.191768,0.177641,0.163198
3798403,13436,16,22,1,11,11,0.061404,0.218308,0.260647,0.311852,0.311852,1,2,0.04,1562,0.247074,92,0.184,3,0.3,1,1,0.069231,0.072298,0.059223,0.056489,0.211654,0.218686,0.213326,0.208407,0.14014,0.159974,0.162481,0.191473
2276070,8248,31,15,1,40,4,0.054913,1.119358,0.239173,0.250437,0.256209,6,2,0.040816,7464,1.10186,17333,0.224309,1990,1.26189,1,1,0.063859,0.078804,0.078804,0.078804,1.145991,1.147343,1.147343,1.147343,0.246038,0.249882,0.249882,0.249882
1073356,3970,39,30,0,55,7,0.253191,0.122355,0.224271,0.257048,0.258174,244,7,0.162791,598,0.112343,6475,0.216664,63,0.090647,0,0,0.206395,0.222703,0.270288,0.266504,0.123662,0.120304,0.123071,0.12301,0.207113,0.209156,0.221755,0.225911
4027367,14220,10,22,0,57,11,0.085906,0.098287,0.09437,0.311019,0.311019,7,4,0.08,717,0.113413,697,0.085521,0,0.0,0,0,0.07902,0.079346,0.090122,0.1077,0.10184,0.100006,0.095387,0.094615,0.084507,0.092603,0.097591,0.100657
4113557,14434,7,14,0,40,3,0.103718,0.265655,0.238764,0.288169,0.299465,2,11,0.229167,1905,0.266732,20143,0.253104,289,0.174306,0,0,0.26087,0.26087,0.26087,0.26087,0.301063,0.301063,0.301063,0.301063,0.248271,0.248271,0.248271,0.248271
2458455,8820,55,15,0,65,4,0.178462,0.244115,0.653119,0.250581,0.255012,2,7,0.142857,998,0.147328,1838,0.72135,0,0.0,0,0,0.338768,0.349034,0.349034,0.349034,0.13302,0.13331,0.13331,0.13331,0.849364,0.914532,0.914532,0.914532


In [169]:
sales_train.sample(10)

Unnamed: 0,item_id,date,date_block_num,shop_id,item_price,item_cnt_day,day,month,year,item_category_id
635476,6501,16.02.2015,25,42,390.64,1.0,16,2,2015,28
1271389,16056,21.01.2014,12,25,699.0,1.0,21,1,2014,64
1123093,14124,03.07.2015,30,41,599.0,1.0,3,7,2015,41
982533,11797,28.12.2014,23,31,149.0,1.0,28,12,2014,41
307649,3566,03.02.2014,13,27,407.36,1.0,3,2,2014,23
1363132,17164,14.08.2014,19,49,399.0,1.0,14,8,2014,40
1081679,13491,19.08.2014,19,52,14990.0,1.0,19,8,2014,11
1537875,20243,13.06.2015,29,16,199.0,1.0,13,6,2015,40
884686,10207,06.08.2015,31,49,1199.0,1.0,6,8,2015,30
542943,5673,03.09.2014,20,28,799.0,1.0,3,9,2014,3
