In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from itertools import product
import gc
import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Load Data**

In [None]:
# Directory that holds the competition CSVs. Kept in one place so the notebook
# can be pointed at another location by editing a single line.
INPUT_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

# Raw tables; each one is cleaned in its own section below.
items_df = pd.read_csv(f'{INPUT_DIR}/items.csv')
item_categories_df = pd.read_csv(f'{INPUT_DIR}/item_categories.csv')
sales_df = pd.read_csv(f'{INPUT_DIR}/sales_train.csv')
shops_df = pd.read_csv(f'{INPUT_DIR}/shops.csv')
test_df = pd.read_csv(f'{INPUT_DIR}/test.csv')

# **EDA**

In [None]:
def downcast_dtypes(df):
    '''
        Downcast 64-bit numeric columns of the dataframe to 32 bits to save memory:

                `float64` type to `float32`
                `int64`   type to `int32` (only when every value fits in int32)

        An `int64` column holding values outside the int32 range is left
        untouched, because a plain `astype(np.int32)` would silently wrap them.

        The dataframe is modified in place and also returned, so both
        `downcast_dtypes(df)` and `df = downcast_dtypes(df)` work.
    '''
    int32_limits = np.iinfo(np.int32)

    def fits_int32(col):
        # An empty column has nothing that could overflow.
        return col.empty or (col.min() >= int32_limits.min and col.max() <= int32_limits.max)

    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype == "int64" and fits_int32(df[c])]

    # Downcast (skip the assignment entirely when there is nothing to convert)
    if float_cols:
        df[float_cols] = df[float_cols].astype(np.float32)
    if int_cols:
        df[int_cols] = df[int_cols].astype(np.int32)

    return df

**Preprocess item_categories_df**

In [None]:
# Coarse type: the text before the first '-' of the category name (not stripped),
# then label-encoded into `type_code`.
item_categories_df['type'] = [
    name.split('-')[0] for name in item_categories_df['item_category_name']
]
item_categories_df['type_code'] = LabelEncoder().fit_transform(item_categories_df['type'])

# Sub type: the second '-'-separated piece, stripped. Names without a '-' fall
# back to the whole (stripped) name. Label-encoded into `sub_type_code`.
item_categories_df['sub_type'] = [
    (pieces[1] if len(pieces) > 1 else pieces[0]).strip()
    for pieces in item_categories_df['item_category_name'].str.split('-')
]
item_categories_df['sub_type_code'] = LabelEncoder().fit_transform(item_categories_df['sub_type'])
item_categories_df.head()

**Preprocess shops_df**

In [None]:
# One shop name has a space inside its first two words; fuse them so that the
# first space-separated token below covers both.
shops_df['shop_name'] = shops_df['shop_name'].replace(
    'Сергиев Посад ТЦ "7Я"', 'СергиевПосад ТЦ "7Я"'
)

# City = first space-separated token of the shop name.
shops_df['city'] = [name.split(' ')[0] for name in shops_df['shop_name']]

# Drop the stray leading '!' from one city token, then label-encode the city.
shops_df['city'] = shops_df['city'].replace('!Якутск', 'Якутск')
shops_df['city_code'] = LabelEncoder().fit_transform(shops_df['city'])
shops_df.head()

**Preprocess sales_df**

In [None]:
# Replace negative item_price values with the mean positive price of
# shop 32 / item 2973 in date block 4.
mean = sales_df[(sales_df["shop_id"] == 32) & (sales_df["item_id"] == 2973) &
                (sales_df["date_block_num"] == 4) & (sales_df["item_price"] > 0)]["item_price"].mean()
sales_df.loc[sales_df.item_price < 0, 'item_price'] = mean

# Clean the outliers.
# np.percentile(..., q=100) is the column maximum, so each strict '<' filter
# removes only the rows sitting at the current maximum.
# NOTE(review): if a real percentile cut-off (e.g. 99.9) was intended, change q.
sales_df = sales_df[sales_df["item_price"] < np.percentile(sales_df["item_price"], q = 100)]
# Negative daily counts are dropped as well. The explicit copy makes the result
# an independent frame, so the column assignments that follow do not hit a
# filtered slice (SettingWithCopyWarning).
sales_df = sales_df[(sales_df["item_cnt_day"] >= 0) &
                    (sales_df["item_cnt_day"] < np.percentile(sales_df["item_cnt_day"], q = 100))].copy()

# Parse the date strings (day.month.year) into datetimes.
sales_df["date"] = pd.to_datetime(sales_df["date"], format = "%d.%m.%Y")
sales_df.head()

In [None]:
# Rewrite selected shop ids (old -> new) in both the sales history and the test set.
# Presumably these pairs are the same shop listed under two ids — verify against shops.csv.
for old_id, new_id in ((0, 57), (1, 58), (10, 11)):
    for frame in (sales_df, test_df):
        frame.loc[frame['shop_id'] == old_id, 'shop_id'] = new_id

In [None]:
# Revenue of each sales row = units sold that day * unit price.
sales_df['revenue'] = sales_df['item_cnt_day'].mul(sales_df['item_price'])

**Preprocess items_df**

In [None]:
def name_correction(x):
    """Normalise an item name for text matching.

    The name is lower-cased, cut at the first '[' and then at the first '(',
    every run of characters that is not a Latin letter, a digit or a Cyrillic
    letter is replaced by a single space, and surrounding whitespace is removed.

    Parameters
    ----------
    x : str
        Raw item name.

    Returns
    -------
    str
        The cleaned name (may be empty).
    """
    x = x.lower() # lower case
    x = x.partition('[')[0] # keep the text before the first square bracket
    x = x.partition('(')[0] # keep the text before the first parenthesis
    # 'ё'/'Ё' lie outside the а-я / А-Я ranges and must be listed explicitly,
    # otherwise that letter is stripped out of the name. Because the whole run
    # (spaces included) collapses to ONE space, no double spaces can remain.
    x = re.sub('[^A-Za-z0-9А-Яа-яЁё]+', ' ', x) # remove special characters
    x = x.strip() # remove leading and trailing white space
    return x

In [None]:
# Split item names at the first bracket.
#   name2 = text after the first '[' (NaN when there is none)
#   name1 = text before the first '(' ; name3 = text after it (NaN when there is none)
# `.str.get(i)` is used instead of tuple-unpacking the `.str` accessor, which
# newer pandas no longer allows; `n` is passed by keyword for the same reason.
paren_split = items_df['item_name'].str.split('(', n=1)
items_df['name1'] = paren_split.str.get(0)
items_df['name2'] = items_df['item_name'].str.split('[', n=1).str.get(1)
items_df['name3'] = paren_split.str.get(1)
del paren_split

# Replace special characters and turn to lower case.
# regex=True is explicit: newer pandas treats the pattern as a literal string by default.
items_df['name2'] = items_df['name2'].str.replace('[^A-Za-z0-9А-Яа-я]+', " ", regex=True).str.lower()
items_df['name3'] = items_df['name3'].str.replace('[^A-Za-z0-9А-Яа-я]+', " ", regex=True).str.lower()

# fill nulls with '0'
items_df = items_df.fillna('0')

items_df['item_name'] = items_df['item_name'].apply(name_correction)

# Drop the last character of name2 unless it is the '0' placeholder: the closing
# bracket was turned into a space by the replacement above.
items_df['name2'] = items_df['name2'].apply(lambda x: x[:-1] if x != '0' else '0')

In [None]:
# Derive a platform-like `type` from name2: its first space-separated word, or
# the first 8 characters when that word is 'xbox' (so 'xbox 360' / 'xbox one' survive).
items_df['type'] = (items_df['name2'].apply(lambda x: x[0:8] if x.split(' ')[0] == 'xbox' else x.split(' ')[0]))

# Unify the different spellings of Xbox 360.
items_df.loc[(items_df['type'] == 'x360')|(items_df['type'] == 'xbox360')|(items_df['type'] == 'xbox 360'),'type'] = 'xbox 360'
# An empty first word is mapped to 'mac'.
items_df.loc[items_df['type'] == '', 'type'] = 'mac'
# Remove spaces inside the type (e.g. 'xbox 360' -> 'xbox360').
items_df.type = (items_df['type'].apply(lambda x: x.replace(' ', '')))
# NOTE(review): the literals below appear to mix Latin and Cyrillic look-alike
# letters, and the last two comparisons look identical — confirm the exact code
# points and the intended target spelling.
items_df.loc[(items_df['type'] == 'pc' )|(items_df['type'] == 'pс')|(items_df['type'] == 'pс'),'type'] = 'pс'

# NOTE(review): the compared and assigned literals look identical here, which
# would make this line a no-op — confirm the code points (Latin vs. Cyrillic).
items_df.loc[items_df['type'] == 'рs3' , 'type'] = 'рs3'

In [None]:
# Number of items per type.
group_sum = items_df.groupby('type').agg({'item_id': 'count'}).reset_index()

# Types that cover 39 items or fewer are treated as rare.
drop_cols = group_sum.loc[group_sum['item_id'] <= 39, 'type'].tolist()

# Replace name2 values that exactly match a rare type with 'other'.
# NOTE(review): the rare-type list is matched against `name2`, not `type`, and
# `type` is dropped right after, so the normalised type never becomes a feature.
# Confirm whether `items_df['type']` was meant to be the source here.
items_df['name2'] = items_df['name2'].where(~items_df['name2'].isin(drop_cols), 'other')
items_df = items_df.drop(columns=['type'])

In [None]:
# Label-encode the two bracket-derived text columns (each with its own encoder).
for text_col in ('name2', 'name3'):
    items_df[text_col] = LabelEncoder().fit_transform(items_df[text_col])

# The raw/cleaned names are no longer needed once the codes exist.
items_df = items_df.drop(columns=['item_name', 'name1'])
items_df.head()

# **Preprocessing Data**

In [None]:
# Key columns of the monthly grid
index_cols = ['shop_id', 'item_id', 'date_block_num']

# One grid per month: every shop seen that month x every item seen that month
grid = []
for month in sales_df['date_block_num'].unique():
    month_sales = sales_df[sales_df['date_block_num'] == month]
    cur_shops = month_sales['shop_id'].unique()
    cur_items = month_sales['item_id'].unique()
    grid.append(np.array(list(product(cur_shops, cur_items, [month])), dtype='int32'))

# Stack the monthly grids into a single dataframe
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

# Monthly sales per (shop, item, month): sum of the daily counts
gb = (
    sales_df
    .groupby(index_cols, as_index=False)
    .agg({'item_cnt_day': 'sum'})
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

# Left-join onto the grid; combinations with no sales get a count of 0
train_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

# Clip the target into [0, 20] and order rows by the key columns
train_data['item_cnt_month'] = train_data['item_cnt_month'].clip(0, 20)
train_data = train_data.sort_values(index_cols)

# Downcast dtypes from 64 to 32 bit to save memory, then free the intermediates
train_data = downcast_dtypes(train_data)
del grid, gb, month_sales
gc.collect();

In [None]:
# Tag the test rows with date block 34 (the block later selected as X_test) and
# stack them under the training grid. Plain assignment instead of `insert`
# keeps the cell re-runnable: `insert` raises once the column already exists.
test_df['date_block_num'] = 34

# pd.concat replaces DataFrame.append, which newer pandas no longer provides.
# Test rows have no target, so fillna(0) sets their item_cnt_month to 0.
train_data = pd.concat([train_data, test_df.drop('ID', axis = 1)]).fillna(0)

# Preview the first and last rows
pd.concat([train_data.head(), train_data.tail()])

**Combine tables**

In [None]:
# Shop features: keep the ids/codes only, drop the text columns
shop_features = shops_df.drop(columns=['city', 'shop_name'])
train_data = train_data.merge(shop_features, on=['shop_id'], how='left')
del shop_features

# Item features (all remaining columns of items_df)
train_data = train_data.merge(items_df, on=['item_id'], how='left')

# Category features: keep the codes only, drop the text columns
category_features = item_categories_df.drop(columns=['item_category_name', 'type', 'sub_type'])
train_data = train_data.merge(category_features, on=['item_category_id'], how='left')
del category_features

In [None]:
# Calendar features: month index (0-11, taken as date_block_num modulo 12) and
# the number of days in that month (February fixed at 28). No holiday feature is added.
train_data['month'] = train_data['date_block_num'].mod(12)
days = pd.Series([31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31])
train_data['days'] = train_data['month'].map(days)

In [None]:
# Preview the combined frame
train_data.head(5)

# **Feature Engineering**

**Lag Feature**

In [None]:
def generate_lag(df, lags, lag_col):
    """Add lagged copies of `lag_col` to `df`.

    For every lag `k` in `lags`, a column `<lag_col>_lag_<k>` is added that
    holds the value `lag_col` had for the same (shop_id, item_id) `k` date
    blocks earlier; rows without such an earlier row get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `date_block_num`, `shop_id`, `item_id` and `lag_col`.
    lags : iterable of int
        Lags, in date blocks.
    lag_col : str
        Name of the column to lag.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with one extra column per lag.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    for lag in lags:
        lagged_name = lag_col + '_lag_' + str(lag)
        lagged = df[keys + [lag_col]].rename(columns={lag_col: lagged_name})
        # Moving the block forward by `lag` lines block t up with block t + lag.
        lagged['date_block_num'] = lagged['date_block_num'] + lag
        df = df.merge(lagged, on=keys, how='left')
    return df

In [None]:
# Lagged target: monthly count of the same shop/item 1, 2, 3, 6 and 12 blocks earlier
train_data = generate_lag(train_data, lags=[1, 2, 3, 6, 12], lag_col='item_cnt_month')

**Part one: lag feature with date_block_num**

In [None]:
# Mean monthly count per (date block, item); only its lags are kept as features.
group = (
    train_data
    .groupby(['date_block_num', 'item_id'])['item_cnt_month']
    .mean()
    .rename('date_item_avg_cnt')
    .reset_index()
)
train_data = train_data.merge(group, on=['date_block_num', 'item_id'], how='left')
train_data = generate_lag(train_data, [1, 2, 3, 6, 12], 'date_item_avg_cnt')
train_data = train_data.drop(columns=['date_item_avg_cnt'])

In [None]:
# Mean monthly count per (date block, shop); only its lags are kept as features.
group = (
    train_data
    .groupby(['date_block_num', 'shop_id'])['item_cnt_month']
    .mean()
    .rename('date_shop_avg_cnt')
    .reset_index()
)
train_data = train_data.merge(group, on=['date_block_num', 'shop_id'], how='left')
train_data = generate_lag(train_data, [1, 2, 3, 6, 12], 'date_shop_avg_cnt')
train_data = train_data.drop(columns=['date_shop_avg_cnt'])

In [None]:
# Mean monthly count per (date block, item category); only lags 1 and 2 are kept.
group = (
    train_data
    .groupby(['date_block_num', 'item_category_id'])['item_cnt_month']
    .mean()
    .rename('date_itemcat_avg_cnt')
    .reset_index()
)
train_data = train_data.merge(group, on=['date_block_num', 'item_category_id'], how='left')
train_data = generate_lag(train_data, [1, 2], 'date_itemcat_avg_cnt')
train_data = train_data.drop(columns=['date_itemcat_avg_cnt'])

In [None]:
# Mean monthly count over the whole date block; only lag 1 is kept.
group = (
    train_data
    .groupby(['date_block_num'])['item_cnt_month']
    .mean()
    .rename('date_avg_cnt')
    .reset_index()
)
train_data = train_data.merge(group, on=['date_block_num'], how='left')
train_data = generate_lag(train_data, [1], 'date_avg_cnt')
train_data = train_data.drop(columns=['date_avg_cnt'])

In [None]:
# Mean monthly count per (date block, city); only lag 1 is kept.
group = (
    train_data
    .groupby(['date_block_num', 'city_code'])['item_cnt_month']
    .mean()
    .rename('date_city_avg_cnt')
    .reset_index()
)
train_data = train_data.merge(group, on=['date_block_num', 'city_code'], how='left')
train_data = generate_lag(train_data, [1], 'date_city_avg_cnt')
train_data = train_data.drop(columns=['date_city_avg_cnt'])

In [None]:
# Mean monthly count per (date block, city, item); only lag 1 is kept.
group = (
    train_data
    .groupby(['date_block_num', 'city_code', 'item_id'])['item_cnt_month']
    .mean()
    .rename('date_city_item_avg_cnt')
    .reset_index()
)
train_data = train_data.merge(group, on=['date_block_num', 'city_code', 'item_id'], how='left')
train_data = generate_lag(train_data, [1], 'date_city_item_avg_cnt')
train_data = train_data.drop(columns=['date_city_item_avg_cnt'])

In [None]:
# Mean monthly count per (date block, shop, item); lags 1-3 are kept.
# NOTE(review): rows look unique per (date_block_num, shop_id, item_id), in which
# case this mean equals item_cnt_month itself and the three lags duplicate
# item_cnt_month_lag_1..3 — confirm and consider removing.
group = (
    train_data
    .groupby(['date_block_num', 'shop_id', 'item_id'])['item_cnt_month']
    .mean()
    .rename('date_shop_item_avg_cnt')
    .reset_index()
)
train_data = train_data.merge(group, on=['date_block_num', 'shop_id', 'item_id'], how='left')
train_data = generate_lag(train_data, [1, 2, 3], 'date_shop_item_avg_cnt')
train_data = train_data.drop(columns=['date_shop_item_avg_cnt'])

In [None]:
# Mean monthly count per (date block, shop, category sub type); only lag 1 is kept.
group = (
    train_data
    .groupby(['date_block_num', 'shop_id', 'sub_type_code'])['item_cnt_month']
    .mean()
    .rename('date_shop_subtype_avg_cnt')
    .reset_index()
)
train_data = train_data.merge(group, on=['date_block_num', 'shop_id', 'sub_type_code'], how='left')
train_data = generate_lag(train_data, [1], 'date_shop_subtype_avg_cnt')
train_data = train_data.drop(columns=['date_shop_subtype_avg_cnt'])

In [None]:
# Housekeeping: missing lags become 0, numeric columns go back to 32 bit,
# and memory released by the dropped helper columns is reclaimed.
train_data = downcast_dtypes(train_data.fillna(0))
gc.collect()

**Part two: lag feature with month**

In [None]:
# Mean monthly count per (calendar month, item); lags 1-3 are kept.
# NOTE(review): `month` is date_block_num % 12, so this mean pools every date
# block with the same remainder — including blocks later than the row it ends
# up describing, and the zero-filled block 34 rows. Confirm that this does not
# leak target information into the validation score.
group = (
    train_data
    .groupby(['month', 'item_id'])['item_cnt_month']
    .mean()
    .rename('month_item_avg_cnt')
    .reset_index()
)
train_data = train_data.merge(group, on=['month', 'item_id'], how='left')
train_data = generate_lag(train_data, [1, 2, 3], 'month_item_avg_cnt')
train_data = train_data.drop(columns=['month_item_avg_cnt'])

In [None]:
# Mean monthly count per (calendar month, shop); lags 1-3 are kept.
# NOTE(review): `month` is date_block_num % 12, so this mean pools every date
# block with the same remainder, later blocks and the zero-filled block 34 rows
# included — confirm that this does not leak target information.
group = (
    train_data
    .groupby(['month', 'shop_id'])['item_cnt_month']
    .mean()
    .rename('month_shop_avg_cnt')
    .reset_index()
)
train_data = train_data.merge(group, on=['month', 'shop_id'], how='left')
train_data = generate_lag(train_data, [1, 2, 3], 'month_shop_avg_cnt')
train_data = train_data.drop(columns=['month_shop_avg_cnt'])

In [None]:
# Mean monthly count per calendar month; only lag 1 is kept.
# NOTE(review): `month` is date_block_num % 12, so this mean pools every date
# block with the same remainder, later blocks and the zero-filled block 34 rows
# included — confirm that this does not leak target information.
group = (
    train_data
    .groupby(['month'])['item_cnt_month']
    .mean()
    .rename('month_avg_cnt')
    .reset_index()
)
train_data = train_data.merge(group, on=['month'], how='left')
train_data = generate_lag(train_data, [1], 'month_avg_cnt')
train_data = train_data.drop(columns=['month_avg_cnt'])

In [None]:
# Housekeeping: missing lags become 0, numeric columns go back to 32 bit,
# and memory released by the dropped helper columns is reclaimed.
train_data = downcast_dtypes(train_data.fillna(0))
gc.collect()

**Trend Feature**

In [None]:
# price
# Mean price of each item over all sales rows
group = sales_df.groupby(['item_id'])['item_price'].mean().rename('item_avg_price').reset_index()
train_data = pd.merge(train_data, group, on=['item_id'], how='left')

# Mean price of each item within each date block
group = sales_df.groupby(['date_block_num','item_id'])['item_price'].mean().rename('date_item_avg_price').reset_index()
train_data = pd.merge(train_data, group, on=['date_block_num','item_id'], how='left')

# Lag the monthly price, then express each lag as a relative deviation from
# the item's overall mean price
lags = [1,2,3,4,5,6]
train_data = generate_lag(train_data, lags, 'date_item_avg_price')

for i in lags:
    train_data['trend_price_lag_'+str(i)] = \
        (train_data['date_item_avg_price_lag_'+str(i)] - train_data['item_avg_price']) / train_data['item_avg_price']

def select_trend(row):
    """Return the most recent usable price trend of one row, or 0 if there is none.

    Lags are scanned from the most recent one; a value is usable when it is
    neither missing nor zero. NaN is truthy in Python, so a bare `if value:`
    would stop at a missing lag instead of moving on to the next one.
    """
    for i in lags:
        value = row['trend_price_lag_'+str(i)]
        if pd.notna(value) and value != 0:
            return value
    return 0

# Vectorised equivalent of `train_data.apply(select_trend, axis=1)`; a row-wise
# apply over the whole frame calls a Python function once per row.
trend_price_lag = pd.Series(np.nan, index=train_data.index)
for i in lags:
    candidate = train_data['trend_price_lag_'+str(i)]
    candidate = candidate.where(candidate != 0)          # zero counts as "no trend"
    trend_price_lag = trend_price_lag.fillna(candidate)  # keep the most recent usable lag
train_data['trend_price_lag'] = trend_price_lag.fillna(0)
del trend_price_lag, candidate

# Drop every helper column; only trend_price_lag is kept
fetures_to_drop = ['item_avg_price', 'date_item_avg_price']
for i in lags:
    fetures_to_drop += ['date_item_avg_price_lag_'+str(i)]
    fetures_to_drop += ['trend_price_lag_'+str(i)]

train_data.drop(fetures_to_drop, axis=1, inplace=True)

In [None]:
# Total revenue of each shop in each date block
group = (
    sales_df
    .groupby(['date_block_num', 'shop_id'])['revenue']
    .sum()
    .rename('sum_date_shops_revenue')
    .reset_index()
)
train_data = train_data.merge(group, on=['date_block_num', 'shop_id'], how='left')

# Mean of those monthly totals per shop
group = (
    group
    .groupby(['shop_id'])['sum_date_shops_revenue']
    .mean()
    .rename('mean_shops_revenue')
    .reset_index()
)
train_data = train_data.merge(group, on=['shop_id'], how='left')

# Relative deviation of the month's revenue from the shop's mean; only its lag 1 is kept
train_data['trend_revenue'] = (
    train_data['sum_date_shops_revenue']
    .sub(train_data['mean_shops_revenue'])
    .div(train_data['mean_shops_revenue'])
)
train_data = generate_lag(train_data, [1], 'trend_revenue')

# Drop the helper columns
train_data = train_data.drop(
    columns=['sum_date_shops_revenue', 'mean_shops_revenue', 'trend_revenue']
)

In [None]:
# Date blocks elapsed since the (item, shop) pair, and the item, first appear in train_data.
# train_data also contains zero-sale rows (the grid is a shop x item product per
# month), so "first" means first appearance in the frame, not necessarily the
# first block with a sale.
first_pair_block = train_data.groupby(['item_id', 'shop_id'])['date_block_num'].transform('min')
train_data['item_shop_first_sale'] = train_data['date_block_num'].sub(first_pair_block)

first_item_block = train_data.groupby(['item_id'])['date_block_num'].transform('min')
train_data['item_first_sale'] = train_data['date_block_num'].sub(first_item_block)

del first_pair_block, first_item_block

In [None]:
# Housekeeping: remaining missing values become 0, numeric columns go back to
# 32 bit, and memory released by the dropped helper columns is reclaimed.
train_data = downcast_dtypes(train_data.fillna(0))
gc.collect()

In [None]:
# Discard the first four date blocks (0-3).
# NOTE(review): lags of up to 12 blocks are used above, so blocks 4-11 keep
# zero-filled long lags — confirm that `> 3` (rather than `> 11`) is intended.
train_data = train_data[train_data['date_block_num'] > 3]

# Preview the first and last rows (pd.concat replaces DataFrame.append, which
# newer pandas no longer provides).
pd.concat([train_data.head(), train_data.tail()])

# **XGBoost**

In [None]:
# Time-based split on date_block_num:
#   blocks below 33 -> training, block 33 -> validation, block 34 -> test rows to predict.
X_train = train_data.loc[train_data['date_block_num'] < 33].drop(columns=['item_cnt_month'])
Y_train = train_data.loc[train_data['date_block_num'] < 33, 'item_cnt_month']
X_valid = train_data.loc[train_data['date_block_num'] == 33].drop(columns=['item_cnt_month'])
Y_valid = train_data.loc[train_data['date_block_num'] == 33, 'item_cnt_month']
X_test = train_data.loc[train_data['date_block_num'] == 34].drop(columns=['item_cnt_month'])

In [None]:
# The split frames are all that is needed from here on; release everything else.
del shops_df, items_df, item_categories_df, sales_df, train_data

gc.collect()

In [None]:
from xgboost import XGBRegressor, plot_importance
import matplotlib.pyplot as plt

In [None]:
# Gradient-boosted regressor: up to 1000 trees of depth 8, with row and column
# subsampling of 0.8 each.
# NOTE(review): recent xgboost releases expect `eval_metric` and
# `early_stopping_rounds` on the constructor rather than on fit(), and prefer
# `random_state` over `seed` — confirm against the installed version.
model = XGBRegressor(
    seed=42,
    eta=0.1,
    n_estimators=1000,
    max_depth=8,
    min_child_weight=300,
    subsample=0.8,
    colsample_bytree=0.8,
)

# RMSE is reported on both the training and the validation set; training stops
# after 10 rounds without improvement (xgboost monitors the last eval_set
# entry, here the validation block).
model.fit(
    X_train,
    Y_train,
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
    eval_metric="rmse",
    early_stopping_rounds=10,
    verbose=True,
)


In [None]:
# The sample submission is used as the template for the output file.
submission = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv'
)

In [None]:
# Predict the test block and clip to the same [0, 20] range as the training target.
y_pred = np.clip(model.predict(X_test), 0, 20)

# Predictions are assigned by position.
# NOTE(review): this assumes the X_test rows are still in the same order as the
# rows of the sample submission — confirm, or join on ID instead.
submission['item_cnt_month'] = y_pred
submission.to_csv('future_sales_pred.csv', index=False)

In [None]:
def plot_features(booster, figsize):
    """Draw xgboost's feature-importance chart for `booster`.

    A new single-axes figure of size `figsize` (width, height) is created and
    handed to `plot_importance`; whatever that call returns is returned.
    """
    _, axes = plt.subplots(1, 1, figsize=figsize)
    return plot_importance(booster=booster, ax=axes)

plot_features(model, figsize=(10, 14))