In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
import gc
import xgboost as xgb

In [3]:
items           = pd.read_csv('items.csv')
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train_v2.csv')
test            = pd.read_csv('test.csv')
sample_submission            = pd.read_csv('sample_submission.csv')

In [4]:
transactions = sales_train.copy()
transactions[['day','month', 'year']] = transactions['date'].str.split('.', expand=True).astype(int)



In [5]:
w_cat_ids = transactions.set_index('item_id').join(items.set_index('item_id'))
w_cat_ids = w_cat_ids.reset_index()
w_cat_ids['row_id'] = w_cat_ids.index

w_cat_ids['item_shop_count_month'] = w_cat_ids.groupby(['month', 'year', 'shop_id', 'item_id'])['item_cnt_day'].transform('sum')
#w_cat_ids['item_shop_count_month'] = w_cat_ids['item_shop_count_month'].clip(0 ,20)

In [6]:
test_w_cat_ids = test.set_index('item_id').join(items.set_index('item_id'))
test_w_cat_ids['item_id'] = test_w_cat_ids.index

In [7]:
# categories test
cat_counts_test = pd.DataFrame(test_w_cat_ids.groupby('item_category_id').size().reset_index(name="counts"))
cat_counts_test['percentage'] = cat_counts_test['counts'] * 100 / cat_counts_test.counts.sum()

In [8]:
# categories train
cat_counts = pd.DataFrame(w_cat_ids.groupby('item_category_id').size().reset_index(name="counts"))
cat_counts['percentage'] = cat_counts['counts'] * 100 / cat_counts.counts.sum()

In [9]:
combined  = pd.merge(cat_counts, cat_counts_test,  how='left', left_on=['item_category_id'], right_on = ['item_category_id'])
combined = combined.rename(index= str, columns={"counts_x": "total_train", "percentage_x": "%train", "counts_y": "total_test", "percentage_y": "%test"})

In [10]:
combined.sort_values('%test', ascending=False).head(15)

Unnamed: 0,item_category_id,total_train,%train,total_test,%test
40,40,564652,19.233006,32340.0,15.098039
55,55,339585,11.566841,28224.0,13.176471
37,37,192674,6.562803,13902.0,6.490196
31,31,20649,0.70334,11634.0,5.431373
58,58,13702,0.466713,9366.0,4.372549
72,72,47177,1.606929,8064.0,3.764706
61,61,12237,0.416813,6342.0,2.960784
47,47,5657,0.192687,6258.0,2.921569
23,23,146789,4.999882,6216.0,2.901961
19,19,208219,7.092293,5880.0,2.745098


In [11]:
shares = {}
for _, row in combined.iterrows():
    shares[row['item_category_id']] = row['%test']

no_nans = [v for v in shares.values() if not np.isnan(v)] 
#sorted(no_nans, reverse=True)

In [12]:
w_cat_ids[w_cat_ids.month.isin([9,10,11])].date_block_num.unique()

array([20, 21, 22,  8,  9, 10, 32, 33])

In [13]:
w_cat_ids = w_cat_ids.reset_index()

In [14]:
val_len = 25000

    
def get_share_of_group(group):
    #print(type(group))
    cat_id = np.unique(group[['item_category_id']])[0]
    share = shares[cat_id]

    #print(share)
    needed = val_len * share / 100
    #print(needed)

    if len(group) <= needed:
        print("Needed %d but only %d available for cat_id %d with share %f" % (needed, len(group), cat_id, share))
    #else:
        #print("OK")
    if np.isnan(needed):
        return group.head(0)
    return group.head(int(needed))


#val_indices = w_cat_ids[w_cat_ids['date_block_num'].isin([32,33])].groupby("item_category_id").apply(get_share_of_group).index
val_indices = w_cat_ids[w_cat_ids['date_block_num'].isin([32,33])].groupby("item_category_id").apply(get_share_of_group)['row_id'].values



Needed 4 but only 1 available for cat_id 27 with share 0.019608
Needed 171 but only 145 available for cat_id 45 with share 0.686275
Needed 1093 but only 820 available for cat_id 58 with share 4.372549
Needed 39 but only 33 available for cat_id 60 with share 0.156863
Needed 299 but only 241 available for cat_id 76 with share 1.196078
Needed 78 but only 66 available for cat_id 77 with share 0.313725
Needed 240 but only 134 available for cat_id 78 with share 0.960784


In [15]:
#test_w_cat_ids['item_shop_tuple'] = list(zip(test_w_cat_ids.item_id, test_w_cat_ids.shop_id))
#w_cat_ids['item_shop_tuple'] = list(zip(w_cat_ids.item_id, w_cat_ids.shop_id))
#w_cat_ids['cat_shop_tuple'] = list(zip(w_cat_ids.item_category_id, w_cat_ids.shop_id))

In [16]:
# item ids both in train/test
shared_ids = np.intersect1d(test_w_cat_ids.item_id, w_cat_ids.item_id)
perc = len(shared_ids) * 100 / len(w_cat_ids.item_id.unique())
print("percentage of shared item ids in train: ", perc)

perc = len(shared_ids) * 100 / len(test_w_cat_ids.item_id.unique())
print("percentage of shared item ids in test: ", perc)

percentage of shared item ids in train:  21.722382721144587
percentage of shared item ids in test:  92.88235294117646


In [17]:
#unique_test_tuples = test_w_cat_ids['item_shop_tuple'].unique()
#unique_train_tuples = w_cat_ids['item_shop_tuple'].unique()

In [18]:
bins = [0, 50, 100, 250, 500, 1000, 5000, 10000, 500000]

#w_cat_ids['price_binned'] = pd.cut(w_cat_ids['item_price'], bins)


In [19]:
w_cat_ids['total_number_of_items_in_shop'] = w_cat_ids.groupby('shop_id')['item_id'].transform('nunique')
w_cat_ids['total_number_of_items_in_cat'] = w_cat_ids.groupby('item_category_id')['item_id'].transform('nunique')
w_cat_ids['shop_share_of_all_sold_items'] = w_cat_ids.groupby('shop_id')['item_cnt_day'].transform('sum') / w_cat_ids.item_cnt_day.sum()
w_cat_ids['item_share_of_all_sold_items'] = w_cat_ids.groupby('item_id')['item_cnt_day'].transform('sum') / w_cat_ids.item_cnt_day.sum()
w_cat_ids['total_sell_of_item'] =  w_cat_ids.groupby('item_id')['item_cnt_day'].transform('sum') 
w_cat_ids['shop_sell_share_of_item'] = w_cat_ids.groupby(['item_id', 'shop_id'])['item_cnt_day'].transform('sum') / w_cat_ids['total_sell_of_item'] 
w_cat_ids['total_sell_of_cat'] =  w_cat_ids.groupby('item_category_id')['item_cnt_day'].transform('sum') 
w_cat_ids['shop_sell_share_of_cat'] = w_cat_ids.groupby(['item_category_id', 'shop_id'])['item_cnt_day'].transform('sum') / w_cat_ids['total_sell_of_cat']

In [20]:
w_cat_ids['shop_mean_month'] = w_cat_ids.groupby(['shop_id', 'month'])['item_cnt_day'].transform('sum') / w_cat_ids['total_number_of_items_in_shop']
w_cat_ids['cat_mean_month'] = w_cat_ids.groupby(['item_category_id', 'month'])['item_cnt_day'].transform('sum') / w_cat_ids['total_number_of_items_in_cat']  / w_cat_ids.shop_id.nunique()
w_cat_ids['item_mean_month'] = w_cat_ids.groupby(['item_id', 'month'])['item_cnt_day'].transform('sum') / w_cat_ids.shop_id.nunique()

In [21]:
lag = 1
lags = w_cat_ids.groupby(['item_id', 'date_block_num'])['item_mean_month'].first().groupby(level=[0,1]).shift(lag)
w_cat_ids.set_index(['item_id', 'date_block_num'], inplace=True)
w_cat_ids["item_mean_month_minus_1"]  = lags
w_cat_ids.reset_index(inplace=True)

In [22]:
lag = 1
lags = w_cat_ids.groupby(['item_category_id', 'date_block_num'])['cat_mean_month'].first().groupby(level=[0,1]).shift(lag)
w_cat_ids.set_index(['item_category_id', 'date_block_num'], inplace=True)
w_cat_ids["cat_mean_month_minus_1"]  = lags
w_cat_ids.reset_index(inplace=True)

In [27]:
lag = 1
lags = w_cat_ids.groupby(['shop_id', 'date_block_num'])['shop_mean_month'].first().groupby(level=[0,1]).shift(lag)
w_cat_ids.set_index(['shop_id', 'date_block_num'], inplace=True)
w_cat_ids["shop_mean_month_minus_1"]  = lags
w_cat_ids.reset_index(inplace=True)

In [28]:
import datetime
months = []
for i in range(1,13):
    months.append((i, str.lower(datetime.date(2008, i, 1).strftime('%B'))))
    
for i, month in months:
    w_cat_ids[month] = w_cat_ids.month == i
    
years = w_cat_ids.year.unique()
for year in years:
    w_cat_ids[year] = w_cat_ids.year == year

In [29]:
def get_coeffs_for_shop(shop_id, poly_degree=1, print_chart=False):

    shop = w_cat_ids[w_cat_ids.shop_id == shop_id].groupby('date_block_num')['item_cnt_day'].sum().reset_index(name ='item_cnt')

    datax = shop.date_block_num.values
    datay = shop.item_cnt.values

    z = np.polyfit(datax,datay, poly_degree) 
    p = np.poly1d(z)

    shop['poly'] = shop['date_block_num'].apply(lambda x: p(x))

    if print_chart:
        sns.set()
        fig, ax = plt.subplots()
        shop.plot(x='date_block_num', y='item_cnt', ax=ax, legend=False, figsize=(15,7))
        shop.plot(x='date_block_num', y='poly', ax=ax, legend=False, figsize=(15,7))

    return p

def get_coeffs_for_category(category_id, poly_degree=1, print_chart=False):

    cat = w_cat_ids[w_cat_ids.item_category_id == category_id].groupby('date_block_num')['item_cnt_day'].sum().reset_index(name ='item_cnt')

    datax = cat.date_block_num.values
    datay = cat.item_cnt.values

    z = np.polyfit(datax,datay, poly_degree) 
    p = np.poly1d(z)

    cat['poly'] = cat['date_block_num'].apply(lambda x: p(x))

    if print_chart:
        sns.set()
        fig, ax = plt.subplots()
        cat.plot(x='date_block_num', y='item_cnt', ax=ax, legend=False, figsize=(15,7))
        cat.plot(x='date_block_num', y='poly', ax=ax, legend=False, figsize=(15,7))

    return p

In [30]:
shop_ids = w_cat_ids.shop_id.unique()
slopes = {}

for shop_id in shop_ids:
    slope = get_coeffs_for_shop(shop_id)[0]
    slopes[shop_id] = slope

w_cat_ids['shop_slope'] = w_cat_ids.apply(lambda row: slopes[row['shop_id']], axis=1)

  
  


In [31]:
cat_ids = w_cat_ids.item_category_id.unique()
slopes = {}

for cat_id in cat_ids:
    slope = get_coeffs_for_category(cat_id)[0]
    slopes[cat_id] = slope

w_cat_ids['cat_slope'] = w_cat_ids.apply(lambda row: slopes[row['item_category_id']], axis=1)



In [32]:
w_cat_ids['item_shop_feature_1'] = w_cat_ids[['shop_mean_month', 'item_mean_month']].mean(axis=1)
w_cat_ids['item_shop_feature_2'] = w_cat_ids[['item_share_of_all_sold_items', 'shop_share_of_all_sold_items']].mean(axis=1)

In [33]:
w_cat_ids['shop_positive_trend'] = w_cat_ids.shop_slope > 0
w_cat_ids['cat_positive_trend'] = w_cat_ids.cat_slope > 0

In [34]:
global_mean = w_cat_ids.groupby('item_id')['item_cnt_day'].sum().mean()

w_cat_ids['tmp'] = w_cat_ids.groupby('item_id')['item_cnt_day'].transform('sum')

gc.collect()
# YOUR CODE GOES HERE
from sklearn.model_selection import KFold
fold = KFold(n_splits=5, shuffle=False)

split = list(fold.split(w_cat_ids))
folds = [te for tr, te in split]


for idx, fold in enumerate(folds):
    other_folds = w_cat_ids[~w_cat_ids.index.isin(fold)]
    mean = other_folds.groupby('item_id')['tmp'].mean()
    w_cat_ids.loc[fold,'mean_enc_kfold'] = w_cat_ids.loc[fold,:].index.map(mean)


# Fill NaNs
w_cat_ids['mean_enc_kfold'].fillna(global_mean, inplace=True)

In [35]:
w_cat_ids['mean_enc_kfold'].nunique()

1247

In [36]:
w_cat_ids.columns

Index([                      'shop_id',                'date_block_num',
                    'item_category_id',                       'item_id',
                               'index',                          'date',
                          'item_price',                  'item_cnt_day',
                                 'day',                         'month',
                                'year',                     'item_name',
                              'row_id',         'item_shop_count_month',
       'total_number_of_items_in_shop',  'total_number_of_items_in_cat',
        'shop_share_of_all_sold_items',  'item_share_of_all_sold_items',
                  'total_sell_of_item',       'shop_sell_share_of_item',
                   'total_sell_of_cat',        'shop_sell_share_of_cat',
                     'shop_mean_month',                'cat_mean_month',
                     'item_mean_month',       'item_mean_month_minus_1',
              'cat_mean_month_minus_1',       'shop

In [97]:
features = [                      'shop_id',                'date_block_num',
                    'item_category_id',                       'item_id',
                            
                          'item_price',                 
        'shop_share_of_all_sold_items',  'item_share_of_all_sold_items',
                     'shop_mean_month',                'cat_mean_month',
                     'item_mean_month',       'item_mean_month_minus_1',
             'shop_mean_month_minus_1',        'cat_mean_month_minus_1',
             'shop_sell_share_of_item',     
              'shop_sell_share_of_cat',                       'january',
                            'february',                         'march',
                               'april',                           'may',
                                'june',                          'july',
                              'august',                     'september',
                             'october',                      'november',
                            'december',                            2014,
                                  2015,                            2013,
                 'shop_slope',                     'cat_slope',
                 'item_shop_feature_1',           'item_shop_feature_2',
                             'shop_positive_trend',            'cat_positive_trend',
            'mean_enc_kfold'
           ]

In [98]:
x = w_cat_ids[features] 
y = w_cat_ids['item_shop_count_month']

In [99]:
test_w_cat_ids = test.set_index('item_id').join(items.set_index('item_id'))
test_w_cat_ids['item_id'] = test_w_cat_ids.index

In [100]:
x_train = x[~x.index.isin(val_indices)]
y_train = y[~y.index.isin(val_indices)]

x_val = x[x.index.isin(val_indices)]
y_val = y[y.index.isin(val_indices)]


In [101]:
#eval_set = [(x_val, y_val)]

#model_xgb = xgb.XGBClassifier()
#model_xgb.fit(x_train, y_train, early_stopping_rounds=10, eval_metric="logloss", eval_set=eval_set, verbose=True)

In [102]:
lgtrain = lgbm.Dataset(x_train, label=y_train, categorical_feature = [0,1,2,3])
lgval = lgbm.Dataset(x_val, label=y_val, categorical_feature = [0,1,2,3])



#[0.00542047893814942, 29, 24, 0.39949465609514856, 1, 0.67943500, 10]
params = {
        "num_threads": 8,
        "verbosity": -1,
        #"zero_as_missing": "true",
        "boosting":'gbdt',
        "objective" : "regression",
        "metric" : "rmse",
        "seed": 42,
        "learning_rate" : 0.1,
      
        #"max_bin": 200,
        "num_leaves": 100
        #"num_leaves": 29,
        #"max_depth" : 24,
        #"bagging_fraction": 0.4,
        #"bagging_freq": 1,
        #"feature_fraction": 0.68,
        #"lambda_l1": 10,
}

In [103]:
evals_result = {}
model_lgb = lgbm.train(params, lgtrain, 10000, 
                      valid_sets=[lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=100, 
                      evals_result=evals_result)



Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.09925
[200]	valid_0's rmse: 3.03105
[300]	valid_0's rmse: 3.01563
[400]	valid_0's rmse: 2.99734
[500]	valid_0's rmse: 2.99212
Early stopping, best iteration is:
[489]	valid_0's rmse: 2.98974


In [None]:
test_w_cat_ids['date_block_num'] = 34
test_w_cat_ids['month'] = 11
test_w_cat_ids['year'] = 2015

In [None]:
item_level_data = w_cat_ids.drop_duplicates(['item_id'])[['item_id','item_price', 'item_mean_month', 'item_share_of_all_sold_items']]

test_w_cat_ids = pd.merge(test_w_cat_ids, item_level_data,  how='left', left_on=['item_id'], right_on = ['item_id'])

In [None]:
shop_level_data = w_cat_ids.drop_duplicates(['shop_id'])[['shop_id','shop_mean_month', 'total_number_of_items_in_shop', 'shop_share_of_all_sold_items']]

test_w_cat_ids = pd.merge(test_w_cat_ids, shop_level_data,  how='left', left_on=['shop_id'], right_on = ['shop_id'])

In [None]:
cat_level_data = w_cat_ids.drop_duplicates(['item_category_id'])[['item_category_id','cat_mean_month']]

test_w_cat_ids = pd.merge(test_w_cat_ids, cat_level_data,  how='left', left_on=['item_category_id'], right_on = ['item_category_id'])

In [None]:
test_w_cat_ids[features].head()

In [None]:
preds = np.clip(model_lgb.predict(test_w_cat_ids[features]), 0,20)

In [None]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = preds.astype(int)

submission.to_csv('submission.csv', index=False)

3.45 val > 3.70 submit
3.06 val > 3.65 submit