In [None]:
import numpy as np 
import pandas as pd 
import json
import matplotlib.pyplot as plt
from datetime import datetime,timedelta
import re as re
from itertools import product
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Directory that holds the competition CSV files.
sub_path_to_data = '/kaggle/input/competitive-data-science-predict-future-sales/'


def _read_competition_csv(file_name):
    """Read one CSV file located directly under ``sub_path_to_data``."""
    return pd.read_csv(sub_path_to_data + file_name)


categories = _read_competition_csv('item_categories.csv')
items = _read_competition_csv('items.csv')
sales_train = _read_competition_csv('sales_train.csv')
shops = _read_competition_csv('shops.csv')
test = _read_competition_csv('test.csv')

# Take an initial look at the data

In [None]:
sales_train.head(1)

In [None]:
sales_train.shape

In [None]:
test.head(1)

In [None]:
test.shape

In [None]:
items.head(1)

In [None]:
items.shape

In [None]:
categories.head(1)

In [None]:
categories.shape

In [None]:
shops.head(1)

In [None]:
shops.shape

## Explore sales data

In [None]:
sales_train.info()

getting numerical columns info

In [None]:
sales_train.describe(include=[np.number]).T

checking for missing data

In [None]:
sales_train.isnull().sum()

checking for duplicates

In [None]:
duplicate_sales = sales_train.loc[sales_train.duplicated(keep=False)]

In [None]:
duplicate_sales

Those values could be real duplicates, or we just need to sum them to get the correct value of item_cnt_day.
After comparing the score when dropping duplicates with the score when summing them, I see that the score is better when we keep the duplicates and sum them later

Another assumption is that we should have only one row for each combination of 'date', 'shop_id', 'item_id'. Let's check this, excluding the duplicates already found:

In [None]:
# Rows that share (date, shop_id, item_id) with another row but are not
# exact copies of any row.
same_day_shop_item = sales_train.duplicated(subset=['date', 'shop_id', 'item_id'], keep=False)
exact_copy = sales_train.duplicated(keep=False)
sales_train.loc[same_day_shop_item & ~exact_copy]

I see that sometimes the difference is in the price, which looks reasonable. Maybe it's because of selling used, damaged or otherwise discounted items. We need to keep this in mind for later. Now adding a 'same price' criterion to the duplicate search

In [None]:
sales_train.loc[sales_train.duplicated(subset= ['date', 'shop_id', 'item_id', 'item_price'], keep=False) & ~sales_train.duplicated(keep=False)]

No other duplicates

So, we have nothing to drop 

In [None]:
# Keep the first row of every group of fully identical rows and remove the
# others, modifying sales_train in place.
# NOTE(review): the notes above say the score was better when duplicates were
# kept and summed, yet this cell drops them — confirm which is intended.
sales_train.drop_duplicates(keep="first", inplace=True)

Some data cleaning

In [None]:
# Re-label shop ids in both the sales history and the test set:
# 0 -> 57, 1 -> 58, 10 -> 11.
# presumably each pair is the same physical shop listed twice — verify against shops.csv
for old_shop_id, new_shop_id in ((0, 57), (1, 58), (10, 11)):
    for frame in (sales_train, test):
        frame.loc[frame.shop_id == old_shop_id, 'shop_id'] = new_shop_id

### Converting type of the date to work with it later

checking that all dates are in the correct format

In [None]:
# Show the rows whose `date` string is not in dd.mm.20yy form; an empty result
# means every date matches the expected format.
# The pattern is a raw string: in a normal literal '\d' and '\.' are invalid
# escape sequences, which current Python versions warn about.
sales_train.loc[~sales_train['date'].str.match(r'^[0-3]\d\.[0-1]\d\.20\d\d$')]

All dates are good. Converting:

In [None]:
sales_train['date'] =  pd.to_datetime(sales_train['date'], format='%d.%m.%Y')

### Taking initial look on the sales dynamics

We are interested in sales sums for each product separately per month, but it could be useful to take a look at the sum sales for all products and all shops

Check for outliers first

Check for outliers

Looking at dates with number of sales more than 7000

In [None]:
# Total number of items sold per calendar day, as a one-column frame indexed by date.
sales_sum = sales_train.groupby('date')['item_cnt_day'].sum().to_frame()

In [None]:
sales_sum.loc[sales_sum.item_cnt_day > 7000]

looks good

Checking for super high prices

In [None]:
sales_train.loc[sales_train.item_price > 50000]

Drop them

In [None]:
# Remove only the rows flagged by the check above (item_price > 50000).
# The previous `< 50000` filter also discarded rows priced at exactly 50000,
# which that check never reported as outliers.
# .copy() makes the result an independent frame, so adding columns to
# sales_train later does not raise SettingWithCopyWarning.
sales_train = sales_train.loc[sales_train.item_price <= 50000].copy()

Now preparing the diagram

In [None]:
sales_sum.plot(kind='bar', color='black', figsize=(24,6))

- we see some peaks
- May be there is a global trend of sales

Both things could be useful

#### Calculating values of sales per month

Calculating monthly values and clipping them to remove outliers

In [None]:
# Sum the daily counts into one row per (month block, shop, item).
monthly_keys = ['date_block_num', 'shop_id', 'item_id']
train = sales_train.groupby(monthly_keys, as_index=False)['item_cnt_day'].sum()

In [None]:
# The summed daily counts are now monthly counts; name the column accordingly.
train = train.rename(columns={"item_cnt_day": "item_cnt_month"})

In [None]:
# Replace missing monthly counts with 0 and limit them to the range [0, 20].
monthly_counts = train['item_cnt_month'].fillna(0)
train['item_cnt_month'] = monthly_counts.clip(lower=0, upper=20)

Explore total number of sales for each shop

In [None]:
shops_list = train['shop_id'].unique()

# 30 x 2 grid of panels, flattened so that panel i belongs to the i-th shop.
fig, axs = plt.subplots(30, 2, figsize=(20, 200), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace=.5, wspace=.1)
axs = axs.ravel()

for index, shop_id in enumerate(shops_list):
    ax = axs[index]

    # Total (clipped) monthly sales of this shop, indexed by month block.
    monthly_total = (
        train.loc[train['shop_id'] == shop_id]
        .groupby('date_block_num')['item_cnt_month']
        .sum()
    )

    ax.plot(monthly_total.index, monthly_total, 'o-')
    ax.set_xticks(monthly_total.index)
    ax.grid()

    ax.title.set_text("sales for shop {0}".format(shop_id))
    ax.set_xlabel('month')
    ax.set_ylabel('number of sales')

plt.show()

Looking at the plots above we can say that:
1. For some shops, data for some months is missing
2. For some shops we have data only for the first two months. Need to check whether those shops are in the test set
3. The age of the shops is very different
4. The overall sales dynamics differ from shop to shop
5. For some shops only one month is provided
6. For some shops the last months are not provided

I think it makes sense to remove shops 0,1,11 if they are not in the test set

In [None]:
# Print, for each candidate shop id, whether it occurs in the test set.
shop_ids_test = test.shop_id.unique()
for candidate_shop_id in (0, 1, 11, 20, 8):
    is_in_test = candidate_shop_id in shop_ids_test
    print (is_in_test)

In [None]:
#train = train.loc[~(train.shop_id.isin([0,1,11,20,8]))]


### We can already add test data

In [None]:
# Tag the test rows with month block 34 and append them to the monthly
# training frame so that later feature engineering covers them as well.
test['date_block_num'] = 34

# `keys=` was removed: it listed three labels for two frames and its result is
# thrown away by ignore_index=True; recent pandas versions warn about the
# length mismatch.
train = pd.concat([train, test.drop('ID', axis=1)], ignore_index=True, sort=False)

# The appended test rows have no item_cnt_month; fill it (and any other NaN) with 0.
train = train.fillna(0)

### Mean encode item_id

In [None]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month block, build all (shop, item) combinations of the shops and
# items that appear in that block.
monthly_grids = []
for block_num in train.date_block_num.unique():
    block_rows = train[train['date_block_num'] == block_num]
    cur_shops = block_rows['shop_id'].unique()
    cur_items = block_rows['item_id'].unique()
    combinations = list(product(cur_shops, cur_items, [block_num]))
    monthly_grids.append(np.array(combinations, dtype='int32'))

grid = pd.DataFrame(np.vstack(monthly_grids), columns=index_cols, dtype=np.int32)

# Combinations without a recorded sale get item_cnt_month = 0.
train = pd.merge(grid, train, how='left', on=index_cols).fillna(0)

train = train.sort_values(['date_block_num', 'shop_id', 'item_id'])

In [None]:
# Expanding mean encoding of item_id: for each row, the mean item_cnt_month of
# the rows of the same item that come before it in the current row order.
# Sum of the target over the preceding rows of the same item (current row excluded).
cumsum = train.groupby('item_id')['item_cnt_month'].cumsum() - train['item_cnt_month']
# Number of preceding rows of the same item. The former `+ 1` also counted the
# current row, so the ratio was not the mean of the preceding rows and the
# fallback below could never apply.
cumc = train.groupby('item_id').cumcount()
# The first row of each item has no history (0 / 0 -> NaN) and gets the prior.
# 0.3343 is presumably the global target mean — TODO confirm.
# Assigning the filled series back avoids fillna(inplace=True) on a column
# selection, which newer pandas versions warn about.
train['item_target_enc'] = (cumsum / cumc).fillna(0.3343)

encoded_feature = train['item_target_enc'].values

### Add date related features

In [None]:
# Position of the block within its 12-block cycle (0..11).
train['month'] = train['date_block_num'].map(lambda block_num: block_num % 12)

In [None]:
def number_of_weekens_in_month(first_day):
    """Count the Saturdays and Sundays in a month-long window starting at ``first_day``.

    The window starts on ``first_day`` itself and spans ``first_day.daysinmonth``
    consecutive days, so passing the first day of a month covers exactly that month.

    Parameters
    ----------
    first_day : pandas.Timestamp
        Start of the window; must expose ``daysinmonth``.

    Returns
    -------
    int
        Number of days in the window whose day of week is Saturday (5) or Sunday (6).
    """
    # np.long was removed from NumPy, so the day offsets are generated with
    # pd.date_range instead of a timedelta built from np.long(i).
    days = pd.date_range(first_day, periods=first_day.daysinmonth, freq='D')
    return int((days.dayofweek >= 5).sum())

In [None]:
# Series indexed by date_block_num: the earliest sale date of each block with
# its day replaced by 1.
# NOTE(review): only blocks that occur in sales_train get an entry; any other
# block number maps to NaN when this series is used as a lookup.
first_days_map = sales_train.groupby('date_block_num')['date'].min().map(lambda date: date.replace(day=1))

In [None]:
# Weekend-day count per month block, then looked up for every row.
weekends_by_block = first_days_map.map(number_of_weekens_in_month)
train['number_of_weekends'] = train['date_block_num'].map(weekends_by_block)

In [None]:
# Position of the block within its 12-block cycle (0..11), on the raw sales frame.
sales_train['month'] = sales_train['date_block_num'].map(lambda block_num: block_num % 12)

In [None]:
# Number of calendar days per month block, then looked up for every row.
days_by_block = first_days_map.map(lambda first_day: first_day.daysinmonth)
train['days_in_month'] = train['date_block_num'].map(days_by_block)

### Calculate target value for the previous months

In [None]:
def specify_the_accuracy_group(row):
    """Return the previous block's ``item_cnt_month`` for the row's shop and item.

    Looks the value up in the module-level ``train`` frame. Returns 0 when the
    row belongs to block 0 or when ``train`` has no matching row.
    """
    previous_block = row.date_block_num - 1
    if previous_block == -1:
        return 0

    is_match = (
        (train['shop_id'] == row.shop_id)
        & (train['item_id'] == row.item_id)
        & (train['date_block_num'] == previous_block)
    )
    matches = train.loc[is_match, ['item_cnt_month']].values
    if matches.size == 0:
        return 0
    return matches[0][0]

In [None]:
def previous_months_value(df, collumn, offset):
    """Add a lagged copy of ``collumn`` taken from ``offset`` blocks earlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_block_num``, ``shop_id``, ``item_id`` and ``collumn``.
    collumn : str
        Name of the column to lag.
    offset : int
        Number of month blocks to look back.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-joined with a new column ``prev_month_<offset>_<collumn>``
        holding the value of ``collumn`` for the same shop and item ``offset``
        blocks earlier; NaN where no such row exists. ``df`` is not modified.
    """
    key_columns = ['date_block_num', 'shop_id', 'item_id']
    lag_column = 'prev_month_' + str(offset) + '_' + collumn

    # Copy only the columns the join needs instead of the whole frame, which
    # keeps memory use low on the large shop x item grid.
    previous_month_values = df[key_columns + [collumn]].copy()
    # Shifting the block number forward makes a row from block b line up with
    # the row of block b + offset in the join below.
    previous_month_values['date_block_num'] = previous_month_values['date_block_num'] + offset
    previous_month_values = previous_month_values.rename(columns={collumn: lag_column})
    return pd.merge(df, previous_month_values, on=key_columns, how='left')

In [None]:
offsets = [1,2,3,9]

In [None]:
column_to_offset = 'item_cnt_month'

In [None]:
# Add one lag column per configured offset; rows without history get 0.
for offset in offsets:
    lag_column = 'prev_month_' + str(offset) + '_' + column_to_offset
    train = previous_months_value(train, column_to_offset, offset)
    train = train.fillna(value={lag_column: 0})

### Add prices data

In [None]:
# Mean price per item, computed over the sales rows with a positive price.
priced_sales = sales_train.loc[sales_train.item_price > 0]
item_price_map = priced_sales.groupby('item_id')['item_price'].mean()

In [None]:
train['price'] = train['item_id'].map(item_price_map)

### Add feature 'new product'

In [None]:
# First month block in which each (shop, item) pair has a sale.
first_month_product_appeared = (
    sales_train.groupby(['shop_id', 'item_id'])['date_block_num']
    .min()
    .reset_index()
    .rename(columns={'date_block_num': 'first_appeared'})
)
train = pd.merge(train, first_month_product_appeared, on=['shop_id', 'item_id'], how='left')

# 1 in the month of the first sale, growing by one each month afterwards;
# 0 before the first sale and for pairs that never appear in sales_train.
months_since_first_sale = train['date_block_num'] - train['first_appeared'] + 1
train['first_appeared'] = months_since_first_sale.clip(lower=0).fillna(0)

### Add a feature 'new shop' to compare shops' age

In [None]:
# 1 for shops whose first month block in `train` is later than 10, otherwise 0.
first_block_per_shop = train.groupby('shop_id')['date_block_num'].min()
new_shop = first_block_per_shop.map(lambda first_block: int(first_block > 10))

In [None]:
train['new_shop'] = train['shop_id'].map(new_shop)

### Add a feature 'incompleate data'

We saw previously that the data for some shops is incomplete. We use a simple criterion here; of course it's possible to find a better one, or to select the shops with incomplete data manually based on the graphs above

In [None]:
# 1 for shops whose last month block in `train` is earlier than 33, otherwise 0.
last_block_per_shop = train.groupby('shop_id')['date_block_num'].max()
sales_incompleate_data = last_block_per_shop.map(lambda last_block: int(last_block < 33))

In [None]:
train['incompleate_data'] = train['shop_id'].map(sales_incompleate_data)

### Encode categorical features

In [None]:
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])

In [None]:
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'

In [None]:
# City = first space-separated token of the shop name.
# NOTE(review): the next two lines repeat the two cells directly above.
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
# One city is spelled with a leading '!'; strip it so both spellings match.
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
# Integer code per distinct city name.
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
# Keep only the join key and the encoded city. After this line `shops` no
# longer has shop_name, so this cell cannot be re-run without reloading shops.
shops = shops[['shop_id','city_code']]

In [None]:
# Category names are split on '-': the first part is the type, the second part
# (or the first again when there is no second) is the subtype.
categories['split'] = categories['item_category_name'].str.split('-')
categories['type'] = categories['split'].map(lambda parts: parts[0].strip())
categories['subtype'] = categories['split'].map(
    lambda parts: (parts[1] if len(parts) > 1 else parts[0]).strip()
)

# Integer code per distinct type / subtype label.
for label_column in ('type', 'subtype'):
    categories[label_column + '_code'] = LabelEncoder().fit_transform(categories[label_column])

categories = categories[['item_category_id', 'type_code', 'subtype_code']]


In [None]:
train = pd.merge(train, shops, on=['shop_id'], how='left')

In [None]:
# The month column is not used as a model feature; remove it.
train = train.drop(columns=['month'])

In [None]:
# Attach item metadata, then the encoded category type/subtype.
train = (
    train
    .merge(items, on=['item_id'], how='left')
    .merge(categories, on=['item_category_id'], how='left')
)

# Store the small categorical codes as 8-bit integers.
train = train.astype({
    'city_code': np.int8,
    'item_category_id': np.int8,
    'type_code': np.int8,
    'subtype_code': np.int8,
})

In [None]:
# The free-text item name is not used as a model feature; remove it.
train = train.drop(columns=['item_name'])

### Prepare data for training

In [None]:
# Time-based split: blocks 0..32 train, block 33 validation, block 34 test.
is_train_block = train.date_block_num < 33
is_valid_block = train.date_block_num == 33
is_test_block = train.date_block_num == 34

X_train = train[is_train_block].drop(columns=['item_cnt_month'])
Y_train = train[is_train_block]['item_cnt_month']
X_valid = train[is_valid_block].drop(columns=['item_cnt_month'])
Y_valid = train[is_valid_block]['item_cnt_month']
X_test = train[is_test_block].drop(columns=['item_cnt_month'])

### Prepare simple baseline

In [None]:
# Constant baseline: predict the mean training target for every validation row.
train_target_mean = train.loc[train.date_block_num < 33, 'item_cnt_month'].mean()
baseline_preds = np.full(len(X_valid), train_target_mean)

rmse_b = np.sqrt(mean_squared_error(Y_valid.values, baseline_preds))
print("RMSE: %f" % (rmse_b))

### Train

In [None]:

# Booster settings shared with the constructor below; previously this dict was
# defined but unused while tree_method was repeated inline.
parameters = {'tree_method': 'exact'}

# eval_metric and early_stopping_rounds are given to the constructor: passing
# them to fit() is deprecated and no longer accepted by newer XGBoost releases.
# NOTE(review): constructor-level early stopping needs xgboost >= 1.6 — confirm
# the version available in the runtime.
model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42,
    eval_metric="rmse",
    early_stopping_rounds=10,
    **parameters)

# The last eval_set entry (the validation block) drives early stopping; the
# training set is listed first only to log its RMSE alongside.
model.fit(
    X_train,
    Y_train,
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
    verbose=True)

### Predict test values

In [None]:
# Overwrite the calendar features of the test rows with fixed values.
# presumably block 34 is a 30-day month with 9 weekend days — TODO confirm
# against the competition calendar.
# assign() returns a new frame with both columns replaced in their existing
# positions. The earlier attribute-style assignment wrote into a slice of
# `train` (SettingWithCopyWarning) and would silently create a plain Python
# attribute instead of a column if a name were misspelled.
X_test = X_test.assign(days_in_month=30, number_of_weekends=9)

In [None]:
# Left-join the engineered test-block features onto the rows of `test` (without
# its ID column), keyed on shop, item and block, so that X_test follows the
# rows of `test`.
# NOTE(review): relies on the (shop_id, item_id, date_block_num) keys being
# unique in X_test; duplicated keys would add rows — verify.
X_test = pd.merge(test.drop('ID', axis=1), X_test, on=['shop_id', 'item_id', 'date_block_num'], how='left')

In [None]:
# Predict the test block and limit predictions to the same [0, 20] range the
# training target was clipped to.
Y_test = model.predict(X_test).clip(0, 20)

# Use the ID column shipped with the test set rather than the frame's index:
# the index equals the ID only while `test` keeps its original default index.
submission = pd.DataFrame({
    "ID": test["ID"].values,
    "item_cnt_month": Y_test
})
submission.to_csv('xgb_submission.csv', index=False)