In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
import gc

In [3]:
items           = pd.read_csv('items.csv')
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train_v2.csv')
test            = pd.read_csv('test.csv')
sample_submission            = pd.read_csv('sample_submission.csv')

In [4]:
transactions = sales_train.copy()
transactions[['day','month', 'year']] = transactions['date'].str.split('.', expand=True).astype(int)



In [5]:
w_cat_ids = transactions.set_index('item_id').join(items.set_index('item_id'))
w_cat_ids['item_id'] = w_cat_ids.index

w_cat_ids['item_shop_count_month'] = w_cat_ids.groupby(['month', 'year', 'shop_id', 'item_id'])['item_cnt_day'].transform('sum')
w_cat_ids['item_shop_count_month'] = w_cat_ids['item_shop_count_month'].clip(0 ,20)

Defaulting to column, but this will raise an ambiguity error in a future version
  after removing the cwd from sys.path.


In [6]:
total_articles = len(w_cat_ids.item_id.unique())
total_shops = len(w_cat_ids.shop_id.unique())
total_categories = len(w_cat_ids.item_category_id.unique())
avg_item_per_cat = w_cat_ids.groupby('item_category_id')['item_id'].nunique().mean()
total_sold_items = w_cat_ids.item_cnt_day.sum()

In [7]:
w_cat_ids['total_articles_sold_by_shop'] = w_cat_ids.groupby(['shop_id'])['item_cnt_day'].transform('sum')
w_cat_ids['number_of_items_linked_to_shop'] = w_cat_ids.groupby(['shop_id'])['item_id'].transform("nunique")
w_cat_ids['shop_percentage_of_all_sold_items'] = w_cat_ids.groupby(['shop_id'])['item_cnt_day'].transform("sum") / total_sold_items
w_cat_ids['category_percentage_of_all_sold_items'] = w_cat_ids.groupby(['item_category_id'])['item_cnt_day'].transform("sum")  / total_sold_items
w_cat_ids['number_of_items_in_category'] = w_cat_ids.groupby(['item_category_id'])['item_id'].transform("nunique")


In [8]:
w_cat_ids['global_count_month_avg'] = w_cat_ids.groupby(['month', 'year'])['item_cnt_day'].transform('sum') / total_shops / total_articles
w_cat_ids['global_item_count_month_avg'] = w_cat_ids.groupby(['month', 'year', 'item_id'])['item_cnt_day'].transform('sum') * w_cat_ids.shop_percentage_of_all_sold_items
w_cat_ids['global_category_count_month_avg'] = w_cat_ids.groupby(['month', 'year', 'item_category_id'])['item_cnt_day'].transform('sum')  * w_cat_ids.category_percentage_of_all_sold_items\
/ total_shops / w_cat_ids.number_of_items_in_category
w_cat_ids['global_shop_count_month_avg'] = w_cat_ids.groupby(['month', 'year', 'shop_id'])['item_cnt_day'].transform('sum') * w_cat_ids.shop_percentage_of_all_sold_items\
/ w_cat_ids.number_of_items_linked_to_shop
w_cat_ids['category_shop_count_month_avg'] = w_cat_ids.groupby(['month', 'year', 'shop_id', 'item_category_id'])['item_cnt_day'].transform('sum') * w_cat_ids.category_percentage_of_all_sold_items / w_cat_ids.number_of_items_in_category


Defaulting to column, but this will raise an ambiguity error in a future version
  


In [9]:
w_cat_ids['item_shop_mean'] = w_cat_ids.groupby([w_cat_ids.item_id, w_cat_ids.shop_id])['item_shop_count_month'].transform(np.mean)

In [10]:
gc.collect()

cumsums = w_cat_ids.groupby(['shop_id','item_id', 'date_block_num'])['item_shop_count_month'].first().groupby(level=[0,1]).cumsum()
w_cat_ids.set_index(['shop_id','item_id', 'date_block_num'], inplace=True)
w_cat_ids['cumulative_item_shop_month'] = cumsums
w_cat_ids.reset_index(inplace=True)

Defaulting to column, but this will raise an ambiguity error in a future version
  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
w_cat_ids = w_cat_ids.sort_values(['date_block_num', 'shop_id', 'item_id'])

In [12]:
for lag in [1,2,3]:
    lags = w_cat_ids.groupby(['shop_id','item_id', 'date_block_num'])['item_shop_count_month'].first().groupby(level=[0,1]).shift(lag)
    w_cat_ids.set_index(['shop_id','item_id', 'date_block_num'], inplace=True)
    w_cat_ids["item_shop_count_month_minus_%d" % (lag)]  = lags
    w_cat_ids.reset_index(inplace=True)

In [13]:
w_cat_ids['item_shop_count_month_avg_previous_3'] = w_cat_ids[['item_shop_count_month_minus_1', 'item_shop_count_month_minus_2', 'item_shop_count_month_minus_3']].mean(axis=1)

In [14]:
import datetime
months = []
for i in range(1,13):
    months.append((i, str.lower(datetime.date(2008, i, 1).strftime('%B'))))
    
for i, month in months:
    w_cat_ids[month] = w_cat_ids.month == i
    
years = w_cat_ids.year.unique()
for year in years:
    w_cat_ids[year] = w_cat_ids.year == year

In [15]:
def get_coeffs_for_shop(shop_id, poly_degree=1, print_chart=False):

    shop = w_cat_ids[w_cat_ids.shop_id == shop_id].groupby('date_block_num')['item_shop_count_month'].sum().reset_index(name ='item_cnt')

    datax = shop.date_block_num.values
    datay = shop.item_cnt.values

    z = np.polyfit(datax,datay, poly_degree) 
    p = np.poly1d(z)

    shop['poly'] = shop['date_block_num'].apply(lambda x: p(x))

    if print_chart:
        sns.set()
        fig, ax = plt.subplots()
        shop.plot(x='date_block_num', y='item_cnt', ax=ax, legend=False, figsize=(15,7))
        shop.plot(x='date_block_num', y='poly', ax=ax, legend=False, figsize=(15,7))

    return p.coefficients

In [16]:
shop_ids = w_cat_ids.shop_id.unique()
slopes = {}

for shop_id in shop_ids:
    slope = get_coeffs_for_shop(shop_id)[0]
    slopes[shop_id] = slope


  
  


In [17]:
w_cat_ids['shop_slope'] = w_cat_ids.apply(lambda row: slopes[row['shop_id']], axis=1)

In [18]:
w_cat_ids['positive_trend'] = w_cat_ids.shop_slope > 0

In [35]:
global_mean = w_cat_ids.groupby('item_id')['item_shop_count_month'].mean().mean()

In [36]:
gc.collect()
# YOUR CODE GOES HERE
from sklearn.model_selection import KFold
fold = KFold(n_splits=5, shuffle=False)

split = list(fold.split(w_cat_ids))
folds = [te for tr, te in split]


for idx, fold in enumerate(folds):
    other_folds = w_cat_ids[~w_cat_ids.index.isin(fold)]
    mean = other_folds.groupby('item_id')['item_shop_count_month'].mean()
    w_cat_ids.loc[fold,'mean_enc_kfold'] = w_cat_ids.loc[fold,:]['item_id'].map(mean)


# Fill NaNs
w_cat_ids['mean_enc_kfold'].fillna(global_mean, inplace=True) 

In [37]:
test_w_cat_ids = test.set_index('item_id').join(items.set_index('item_id'))
test_w_cat_ids['item_id'] = test_w_cat_ids.index

test_w_cat_ids.head()

Unnamed: 0_level_0,ID,shop_id,item_name,item_category_id,item_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
30,2587,5,007: КООРДИНАТЫ «СКАЙФОЛЛ»,40,30
30,7687,4,007: КООРДИНАТЫ «СКАЙФОЛЛ»,40,30
30,12787,6,007: КООРДИНАТЫ «СКАЙФОЛЛ»,40,30
30,17887,3,007: КООРДИНАТЫ «СКАЙФОЛЛ»,40,30
30,22987,2,007: КООРДИНАТЫ «СКАЙФОЛЛ»,40,30


In [38]:
item_shop_level_data = w_cat_ids.drop_duplicates(['item_id', 'shop_id'])[['item_id', 'shop_id', 'item_shop_mean']]

test_w_cat_ids = pd.merge(test_w_cat_ids, item_shop_level_data,  how='left', left_on=['item_id','shop_id'], right_on = ['item_id','shop_id'])

In [39]:
category_shop_level_data = w_cat_ids.drop_duplicates(['item_category_id', 'shop_id'])[['item_category_id', 'shop_id', 'category_shop_count_month_avg']]

test_w_cat_ids = pd.merge(test_w_cat_ids, category_shop_level_data,  how='left', left_on=['item_category_id','shop_id'], right_on = ['item_category_id','shop_id'])

In [40]:
shop_level_data = w_cat_ids.drop_duplicates(['shop_id'])
shop_level_data = shop_level_data.set_index('shop_id')

test_w_cat_ids['total_articles_sold_by_shop'] = test_w_cat_ids.shop_id.map(shop_level_data.total_articles_sold_by_shop)
test_w_cat_ids['number_of_items_linked_to_shop'] = test_w_cat_ids.shop_id.map(shop_level_data.number_of_items_linked_to_shop)
test_w_cat_ids['shop_percentage_of_all_sold_items'] = test_w_cat_ids.shop_id.map(shop_level_data.shop_percentage_of_all_sold_items)
test_w_cat_ids['global_shop_count_month_avg'] = test_w_cat_ids.shop_id.map(shop_level_data.global_shop_count_month_avg)

In [41]:
category_level_data = w_cat_ids.drop_duplicates(['item_category_id'])
category_level_data = category_level_data.set_index('item_category_id')

test_w_cat_ids['global_category_count_month_avg'] = test_w_cat_ids.item_category_id.map(category_level_data.global_category_count_month_avg)
test_w_cat_ids['category_percentage_of_all_sold_items'] = test_w_cat_ids.item_category_id.map(category_level_data.category_percentage_of_all_sold_items)
test_w_cat_ids['number_of_items_in_category'] = test_w_cat_ids.item_category_id.map(category_level_data.number_of_items_in_category)



In [42]:
item_level_data = w_cat_ids.drop_duplicates(['item_id'])
item_level_data = item_level_data.set_index('item_id')

test_w_cat_ids['global_item_count_month_avg'] = test_w_cat_ids.item_id.map(item_level_data.global_item_count_month_avg)

In [43]:
test_w_cat_ids['global_count_month_avg'] = w_cat_ids[w_cat_ids.month == 11].groupby(['year'])['item_cnt_day'].transform('sum') / total_shops / total_articles

In [44]:
gc.collect()

w_cat_ids['cumsums_nov'] = w_cat_ids.groupby(['shop_id','item_id', 'date_block_num'])['cumulative_item_shop_month'].transform('last') + w_cat_ids.global_item_count_month_avg


cumsums_nov = w_cat_ids.drop_duplicates(['shop_id', 'item_id'])[['shop_id', 'item_id', 'cumsums_nov']]
test_w_cat_ids = pd.merge(test_w_cat_ids, cumsums_nov,  how='left', left_on=['item_id','shop_id'], right_on = ['item_id','shop_id'])
test_w_cat_ids['cumulative_item_shop_month'] = test_w_cat_ids.cumsums_nov

In [45]:
gc.collect()
 
item_shop_level_data = w_cat_ids[w_cat_ids.date_block_num.isin([33])][['item_id', 'shop_id', 'date_block_num', 'item_shop_count_month', 'item_shop_count_month_minus_1',\
               'item_shop_count_month_minus_2','item_shop_count_month_minus_3', 'item_shop_count_month_avg_previous_3']].drop_duplicates(['shop_id', 'item_id'])

test_w_cat_ids = pd.merge(test_w_cat_ids, item_shop_level_data,  how='left', left_on=['item_id','shop_id'], right_on = ['item_id','shop_id'])


test_w_cat_ids.item_shop_count_month_minus_1 = test_w_cat_ids['item_shop_count_month']
test_w_cat_ids.item_shop_count_month_minus_2 = test_w_cat_ids['item_shop_count_month_minus_1']
test_w_cat_ids.item_shop_count_month_minus_3 = test_w_cat_ids['item_shop_count_month_minus_2']

test_w_cat_ids['item_shop_count_month_avg_previous_3'] = test_w_cat_ids[['item_shop_count_month_minus_1', 'item_shop_count_month_minus_2', 'item_shop_count_month_minus_3']].mean(axis=1)

In [46]:
test_w_cat_ids.columns

Index(['ID', 'shop_id', 'item_name', 'item_category_id', 'item_id',
       'item_shop_mean', 'category_shop_count_month_avg',
       'total_articles_sold_by_shop', 'number_of_items_linked_to_shop',
       'shop_percentage_of_all_sold_items', 'global_shop_count_month_avg',
       'global_category_count_month_avg',
       'category_percentage_of_all_sold_items', 'number_of_items_in_category',
       'global_item_count_month_avg', 'global_count_month_avg', 'cumsums_nov',
       'cumulative_item_shop_month', 'date_block_num', 'item_shop_count_month',
       'item_shop_count_month_minus_1', 'item_shop_count_month_minus_2',
       'item_shop_count_month_minus_3',
       'item_shop_count_month_avg_previous_3'],
      dtype='object')

In [47]:
x = w_cat_ids.drop(['item_price', 'date', 'item_cnt_day', 'day', 'month', 'year', 'item_name', 'item_shop_count_month'], axis=1)  
y = w_cat_ids['item_shop_count_month']

In [48]:
val_indices = w_cat_ids[w_cat_ids.date_block_num.isin([32,33])].index

x_train = x[~x.index.isin(val_indices)]
y_train = y[~y.index.isin(val_indices)]

x_val = x[x.index.isin(val_indices)]
y_val = y[y.index.isin(val_indices)]


In [49]:
lgtrain = lgbm.Dataset(x_train, label=y_train)
lgval = lgbm.Dataset(x_val, label=y_val)



#[0.00542047893814942, 29, 24, 0.39949465609514856, 1, 0.67943500, 10]
params = {
        "num_threads": 8,
        "verbosity": -1,
        #"zero_as_missing": "true",
        "boosting":'gbdt',
        "objective" : "regression",
        "metric" : "rmse",
        "seed": 42,
        "learning_rate" : 0.05,
        #"num_leaves": 29,
        #"max_depth" : 24,
        #"bagging_fraction": 0.4,
        #"bagging_freq": 1,
        #"feature_fraction": 0.68,
        #"lambda_l1": 10,
}

In [50]:
evals_result = {}
model_lgb = lgbm.train(params, lgtrain, 50000, 
                      valid_sets=[lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=100, 
                      evals_result=evals_result)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 1.52031
[200]	valid_0's rmse: 1.40593
[300]	valid_0's rmse: 1.35467
[400]	valid_0's rmse: 1.32828
[500]	valid_0's rmse: 1.31036
[600]	valid_0's rmse: 1.29825
[700]	valid_0's rmse: 1.28826
[800]	valid_0's rmse: 1.28141
[900]	valid_0's rmse: 1.27462
[1000]	valid_0's rmse: 1.27002
[1100]	valid_0's rmse: 1.26547
[1200]	valid_0's rmse: 1.26014
[1300]	valid_0's rmse: 1.25707
[1400]	valid_0's rmse: 1.25283
[1500]	valid_0's rmse: 1.25124
[1600]	valid_0's rmse: 1.24871
[1700]	valid_0's rmse: 1.24862
[1800]	valid_0's rmse: 1.24608
[1900]	valid_0's rmse: 1.24489
[2000]	valid_0's rmse: 1.24431
[2100]	valid_0's rmse: 1.24367
[2200]	valid_0's rmse: 1.24383
[2300]	valid_0's rmse: 1.24309
[2400]	valid_0's rmse: 1.24258
[2500]	valid_0's rmse: 1.24146
[2600]	valid_0's rmse: 1.24113
[2700]	valid_0's rmse: 1.24015
[2800]	valid_0's rmse: 1.24004
Early stopping, best iteration is:
[2751]	valid_0's rmse: 1.23952


In [51]:
preds = model_lgb.predict(test_w_cat_ids)

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields item_name

In [None]:
submission = test.loc[:,['ID']]
submission['Prediction'] = pred.astype(int)

submission.to_csv('submission.csv', index=False)