## To Predict Total sales for every product and store in the next month

We are provided with daily historical sales data. 

***The task is to forecast the total amount of products sold in every shop for the test set.***

Note that the list of shops and products slightly changes ***every month***. Creating a robust model that can handle such situations is part of the challenge.

- Submissions are evaluated by root mean squared error (RMSE). 
- True target values are clipped into [0,20] range.

### Data fields

- ID - an Id that represents a (Shop, Item) tuple within the test set
- shop_id - unique identifier of a shop
- item_id - unique identifier of a product
- item_category_id - unique identifier of item category
- item_cnt_day - number of products sold. You are predicting a monthly amount of this measure
- item_price - current price of an item
- date - date in format dd/mm/yyyy
- date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1, ..., October 2015 is 33
- item_name - name of item
- shop_name - name of shop
- item_category_name - name of item category

## 1. Feature Pre-processing

1. Use One-Hot-Vector for categorical features

In [None]:
import pandas as pd
import numpy as np
import gc, warnings
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, normalize 

import numpy as np                   # Multi-dimensional array object
import pandas as pd                  # Data Manipulation
import seaborn as sns                # Data Visualization
import matplotlib.pyplot as plt      # Data Visualization
import plotly.express as px          # Interactive Data Visualization


from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot # Offline version of the Plotly modules.



In [None]:
DATA_FOLDER = '../input/competitive-data-science-predict-future-sales/'


def _read_competition_csv(file_name):
    """Read one competition CSV located in DATA_FOLDER into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_FOLDER, file_name))


# Training-side tables: daily sales plus the item / category / shop lookups.
transactions = _read_competition_csv('sales_train.csv')
items = _read_competition_csv('items.csv')
item_categories = _read_competition_csv('item_categories.csv')
shops = _read_competition_csv('shops.csv')

# Prediction-side tables: the (shop, item) pairs to score and the sample submission.
test = _read_competition_csv('test.csv')
submission = _read_competition_csv('sample_submission.csv')

In [None]:
items.shape

In [None]:
items.isnull().sum()

In [None]:
# Distinct category ids present in the items table, then how many there are.
category_ids = items.item_category_id.unique()
print(category_ids)
print(" ")
print("Number of Unique ID : ", len(category_ids))

In [None]:
import plotly.express as px                                          # Interactive Data Visualization

fig = px.histogram(items, x = "item_category_id",
                  labels = {"item_category_id":"Category_id"},
                  title = "Item_Category_Id",
                  #color_discrete_sequence = ["Blue"]
                  )
                  
fig.show()

In [None]:
item_categories.head()

In [None]:
shops.head()

In [None]:
submission.head()

In [None]:
transactions

In [None]:
# Structural overview of the raw transactions table: one banner per section,
# sections separated by a blank line.  `isnull` and `isna` are aliases, so the
# "Missing value" and "Null value" sections print the same counts.
overview_sections = (
    ("----------Top-5- Record----------", lambda: transactions.head(5)),
    ("-----------Information-----------", transactions.info),
    ("-----------Data Types-----------", lambda: transactions.dtypes),
    ("----------Missing value-----------", lambda: transactions.isnull().sum()),
    ("----------Null value-----------", lambda: transactions.isna().sum()),
    ("----------Shape of Data----------", lambda: transactions.shape),
)

for position, (banner, describe) in enumerate(overview_sections):
    if position:
        print()
    print(banner)
    print(describe())

In [None]:
# Number of Duplicated Row
print('Number of duplicates:', len(transactions[transactions.duplicated()]))

In [None]:
test

In [None]:
# Build every calendar day from 1.1.2013 to 31.10.2015 as "day.month.year"
# strings (no zero padding): 2013 and 2014 in full, 2015 up to October.

# Day/month combinations that do not exist (none of the three years is a leap year).
day_list = ["31.2.2013", "30.2.2013", "29.2.2013", "31.4.2013", "31.6.2013", "31.9.2013", "31.11.2013",
            "31.2.2014", "30.2.2014", "29.2.2014", "31.4.2014", "31.6.2014", "31.9.2014", "31.11.2014",
            "31.2.2015", "30.2.2015", "29.2.2015", "31.4.2015", "31.6.2015", "31.9.2015",]
impossible_dates = set(day_list)

# (year, number of months covered in that year)
months_per_year = ((2013, 12), (2014, 12), (2015, 10))

# Enumerate 31 slots for every month, then keep only the real dates.
calendar_slots = (
    "{}.{}.{}".format(day, month, year)
    for year, month_count in months_per_year
    for month in range(1, month_count + 1)
    for day in range(1, 32)
)
full_period = [stamp for stamp in calendar_slots if stamp not in impossible_dates]

## Add feature Russian_public_holiday

In [None]:
# All public-holiday dates used by this notebook between 1.1.2013 and 31.10.2015,
# written in the same "day.month.year" form as `full_period`.
public_holiday = ['1.1.2013', '2.1.2013', '3.1.2013', '4.1.2013', '5.1.2013',
                  '6.1.2013', '7.1.2013', '8.1.2013', '9.1.2013', '10.1.2013',
                  '21.2.2013', '22.2.2013', '23.2.2013', '6.3.2013', '7.3.2013',
                  '8.3.2013', '1.5.2013', '2.5.2013', '3.5.2013', '8.5.2013',
                  '9.5.2013', '10.5.2013', '12.6.2013', '4.11.2013',
                  '1.1.2014', '2.1.2014', '3.1.2014', '4.1.2014', '5.1.2014',
                  '6.1.2014', '7.1.2014', '8.1.2014', '9.1.2014', '10.1.2014',
                  '21.2.2014', '22.2.2014', '23.2.2014', '6.3.2014', '7.3.2014',
                  '8.3.2014', '1.5.2014', '2.5.2014', '3.5.2014', '8.5.2014',
                  '9.5.2014', '10.5.2014', '12.6.2014', '4.11.2014',
                  '1.1.2015', '2.1.2015', '3.1.2015', '4.1.2015', '5.1.2015',
                  '6.1.2015', '7.1.2015', '8.1.2015', '9.1.2015', '10.1.2015',
                  '21.2.2015', '22.2.2015', '23.2.2015', '6.3.2015', '7.3.2015',
                  '8.3.2015', '1.5.2015', '2.5.2015', '3.5.2015', '8.5.2015',
                  '9.5.2015', '10.5.2015', '12.6.2015',
                 ]

# One row per calendar day of the whole period.
Full_TimeFrame = pd.DataFrame({'date': full_period})

# One row per holiday, flagged with 1.
Russian_public_holiday = pd.DataFrame({'date': public_holiday})
Russian_public_holiday['public_holiday'] = 1

# Attach the holiday flag to the calendar; non-holiday days get NaN here.
Full_df = Full_TimeFrame.merge(Russian_public_holiday, on=['date'], how='outer')


In [None]:
Full_TimeFrame

In [None]:
Russian_public_holiday

In [None]:
# Days that are not public holidays came out of the outer merge as NaN -> 0.0
Full_df = Full_df.fillna(0.0)

# The date strings are day-first ("day.month.year").  Parsing them without
# `dayfirst=True` reads every ambiguous date (day <= 12) month-first, so
# e.g. 1 February becomes 2 January.  The calendar frame and the transactions
# are parsed here with the same rule because they are joined on
# (year, month, day) right afterwards and must agree.
calendar_dates = pd.to_datetime(Full_df['date'], dayfirst=True)
Full_df['year'] = calendar_dates.dt.year
Full_df['month'] = calendar_dates.dt.month
Full_df['day'] = calendar_dates.dt.day

Full_df.drop(['date'], axis=1, inplace=True)

Full_df

transaction_dates = pd.to_datetime(transactions['date'], dayfirst=True)
transactions['year'] = transaction_dates.dt.year
transactions['month'] = transaction_dates.dt.month
transactions['day'] = transaction_dates.dt.day

In [None]:
transactions.head()

In [None]:
# Merge transactions with Full_df dataframe
transactions_with_public_holiday = pd.merge(transactions, Full_df, on=['year', 'month', 'day'],how='inner')

In [None]:
transactions_with_public_holiday.drop(['date'], axis=1, inplace=True)

In [None]:
transactions_with_public_holiday.isnull().sum()

## Revenue of Each Transaction

In [None]:
transactions_with_public_holiday['revenue'] = transactions_with_public_holiday['item_price'] * transactions_with_public_holiday['item_cnt_day']

In [None]:
transactions_with_public_holiday

## Average Item Price during the Period

In [None]:
# Mean price of every (shop, item) pair over all of its transactions.
Average_item_price = (
    transactions_with_public_holiday
    .groupby(['shop_id', 'item_id'])['item_price']
    .mean()
    .rename('average_item_price')
    .reset_index()
)

In [None]:
transactions_with_average_item_price = pd.merge(transactions_with_public_holiday, Average_item_price,
                                                on=['shop_id','item_id'],
                                                how='inner'
                                               )

## Item Price compare to Average Item Price

In [None]:
transactions_with_average_item_price['item_price_changed'] = transactions_with_average_item_price['item_price'] - transactions_with_average_item_price['average_item_price']

In [None]:
transactions_with_average_item_price

## Create Table to Sum Up Sale for each item by month

In [None]:
# Wide table: one row per (item_id, shop_id), one column per month
# (date_block_num) holding the summed daily sales; months without sales are 0.
# `values` is passed as a scalar so the columns form a single level, which
# avoids patching labels through `columns.values` (an Index is immutable and
# in-place edits of its backing array are not supported).
sales_by_item_shop_id = transactions.pivot_table(index=['item_id', 'shop_id'],
                                                 values='item_cnt_day',
                                                 columns='date_block_num',
                                                 aggfunc='sum',
                                                 fill_value=0)

# Month columns are addressed by string label ('0' ... '33') further down.
sales_by_item_shop_id.columns = sales_by_item_shop_id.columns.map(str)

# Bring item_id / shop_id back as the first two ordinary columns.
sales_by_item_shop_id = sales_by_item_shop_id.reset_index().rename_axis(None, axis=1)

sales_by_item_shop_id

## Downcasting 

In [None]:
def downcast_dtypes(df):
    """Shrink the numeric columns of ``df`` in place to save memory.

    float64 columns become float32.  int64/int32 columns become int16 when
    every value fits into the int16 range; otherwise int64 columns fall back
    to int32 if that fits, and are left unchanged if not.  (A blind cast to
    int16 would silently wrap out-of-range values.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    int16_range = np.iinfo(np.int16)
    int32_range = np.iinfo(np.int32)

    for col in df.columns:
        dtype = df[col].dtype

        if dtype == "float64":
            df[col] = df[col].astype(np.float32)

        elif dtype in ("int64", "int32"):
            values = df[col]

            # An empty column has no min/max; it trivially fits.
            if values.empty:
                df[col] = values.astype(np.int16)
                continue

            lowest, highest = values.min(), values.max()

            if int16_range.min <= lowest and highest <= int16_range.max:
                df[col] = values.astype(np.int16)
            elif dtype == "int64" and int32_range.min <= lowest and highest <= int32_range.max:
                df[col] = values.astype(np.int32)

    return df

transactions = downcast_dtypes(transactions)

print(transactions.info())

In [None]:
sales_by_item_shop_id.sum()[2:].plot(legend=True, label="Monthly sum")

In [None]:
sales_by_item_shop_id.mean()[2:].plot(legend=True, label="Monthly mean")

## Let's see how many products and shops are outdated (no sales for the last 6 months)

In [None]:
# Number of outdated (item, shop) pairs in the training set: rows whose
# monthly sales columns from '27' to the last column all sum to zero.
# NOTE(review): the label slice '27': is inclusive, so with columns up to '33'
# it spans seven month columns, while the section title says "last 6 months"
# - confirm which window is intended.
outdated_items_and_shop = sales_by_item_shop_id[sales_by_item_shop_id.loc[:,'27':].sum(axis=1)==0]

print('Number of Zero Shop & Sale Item since month 27 :', len(outdated_items_and_shop))

In [None]:
outdated_items_and_shop

In [None]:
# Composite "<shop_id>_<item_id>" key used to match these pairs against the test set.
# `outdated_items_and_shop` is a boolean-filtered selection of another frame;
# `assign` builds a new frame instead of writing a column into that selection,
# which avoids pandas' SettingWithCopy ambiguity.
outdated_items_and_shop = outdated_items_and_shop.assign(
    id_shop_item=outdated_items_and_shop['shop_id'].astype(str).add('_').add(
        outdated_items_and_shop['item_id'].astype(str))
)

outdated_items_and_shop

In [None]:
# Number of outdated item and shop in test set (no sales for the last 6 months)

# Same "<shop_id>_<item_id>" key on the test rows.
test_shop_part = test['shop_id'].astype(str)
test_item_part = test['item_id'].astype(str)
test['id_shop_item'] = test_shop_part + '_' + test_item_part

# Test rows whose pair is among the outdated training pairs.
is_outdated_pair = test['id_shop_item'].isin(outdated_items_and_shop['id_shop_item'])
print('Number of outdate item in test set :', len(test[is_outdated_pair]))

## Outliers by price and sales volume

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# --- Daily sales volume: boxplot over the full observed range -----------------
daily_counts = transactions['item_cnt_day']

plt.figure(figsize=(10,4))
plt.xlim(daily_counts.min(), daily_counts.max())
sns.boxplot(x = daily_counts)

print('Item_id with Sale volume outliers:',transactions.loc[daily_counts>680, 'item_id'].unique())

# --- Item price: boxplot over the full observed range -------------------------
prices = transactions['item_price']

plt.figure(figsize=(10,4))
plt.xlim(prices.min(), prices.max())
sns.boxplot(x = prices)

print('Item_id with price outliers:',transactions.loc[prices>60000, 'item_id'].unique())

In [None]:
# Cut off the outliers: keep rows priced below 60000 and with fewer than 680
# units sold in a day.  `.copy()` makes the result an independent frame so the
# later `.loc[...] = ...` write on `transactions` is not a write into a
# filtered selection.
# NOTE(review): `transactions_with_public_holiday` and
# `transactions_with_average_item_price` were derived from `transactions`
# before this filter, so they still contain the outlier rows - confirm whether
# the cleaning should happen before those merges.
is_regular_row = (transactions.item_price < 60000) & (transactions.item_cnt_day < 680)

transactions = transactions[is_regular_row].copy()

In [None]:
## -1 means a product that has  been returned. We are replacing it for now with the median
transactions[transactions['item_price']<0]

In [None]:
# Replacement price for negative `item_price` rows: the median price of the
# same shop (32), item (2973) and month (date_block_num 4).  Only positive
# prices enter the median so that the negative placeholder itself does not
# pull the replacement value down.
reference_rows = (
    (transactions.shop_id == 32)
    & (transactions.item_id == 2973)
    & (transactions.date_block_num == 4)
    & (transactions.item_price > 0)
)
median = transactions.loc[reference_rows, 'item_price'].median()

transactions.loc[transactions.item_price<0, 'item_price'] = median

## Shop Closed - No Sales for last 6 months

In [None]:
# Wide table: one row per shop, one column per month (date_block_num) with
# the summed daily sales.  `values` is a scalar so the columns are a single
# level and no label has to be patched through `columns.values` (in-place
# edits of an Index's backing array are not supported).
sales_by_shop_id = transactions.pivot_table(index=['shop_id'],
                                            values='item_cnt_day',
                                            columns='date_block_num',
                                            aggfunc='sum',
                                            fill_value=0)

# Month columns are sliced by string label ('0' ... '33') below.
sales_by_shop_id.columns = sales_by_shop_id.columns.map(str)

# shop_id becomes the first ordinary column again.
sales_by_shop_id = sales_by_shop_id.reset_index().rename_axis(None, axis=1)

# Shops with no sales at all from month 0 up to and including month i.
for i in range(6,34):

    print('Not exists in month',i,sales_by_shop_id['shop_id'][sales_by_shop_id.loc[:,'0':str(i)].sum(axis=1)==0].unique())

# Shops with no sales at all from month i up to the last month.
for i in range(6,28):

    print('Shop is outdated for month',i,sales_by_shop_id['shop_id'][sales_by_shop_id.loc[:,str(i):].sum(axis=1)==0].unique())

## Analysis Test Set 

In [None]:
# Split the test rows into three groups by how much history they have:
#   1. the exact (item, shop) pair appears in the transactions,
#   2. the item never appears in the transactions at all,
#   3. everything else (item is known, but not in that shop).
pair_keys = ['item_id', 'shop_id']

good_sales = pd.merge(test, transactions, how='left', on=pair_keys).dropna()

pair_has_history = test['ID'].isin(good_sales['ID'])
good_pairs = test[pair_has_history]

item_is_known = test['item_id'].isin(transactions['item_id'])
no_data_items = test[~item_is_known]

print('1. Number of good pairs:', len(good_pairs))
print('2. No Data Items:', len(no_data_items))
print('3. Only Item_id Info:', len(test)-len(no_data_items)-len(good_pairs))

## Create Full (Train, Validation, and Test) Set

In [None]:
from itertools import product

cols = ['date_block_num','shop_id','item_id']


def _month_grid(block_num):
    """Return every (month, shop, item) combination of shops and items sold in that month."""
    month_sales = transactions_with_average_item_price[
        transactions_with_average_item_price.date_block_num == block_num
    ]
    combos = product([block_num], month_sales.shop_id.unique(), month_sales.item_id.unique())
    return np.array(list(combos), dtype='int16')


# Stack the per-month grids for months 0..33 into one frame.
Full_set = pd.DataFrame(np.vstack([_month_grid(block_num) for block_num in range(34)]),
                        columns=cols)

# Compact integer types for the three key columns.
Full_set = Full_set.astype({'date_block_num': np.int8,
                            'shop_id': np.int8,
                            'item_id': np.int16})

Full_set.sort_values(cols,inplace=True)



In [None]:
Full_set

In [None]:
transactions_with_average_item_price

In [None]:
# Sum up daily item sold ('item_cnt_day') and merge into Full_set dataframe
group = transactions_with_average_item_price.groupby(['date_block_num',
                                                'shop_id',
                                                'item_id'
                                               ])['item_cnt_day'].sum().reset_index()

In [None]:
group = group.rename(columns={'item_cnt_day':'item_cnt_month'}, inplace=False)
group

In [None]:
# Attach the monthly totals to the full (month, shop, item) grid.
Full_set = Full_set.merge(group, how='left', on=cols)

# Combinations without any sale have no total -> 0.  The target is clipped to
# [0, 20]; the problem statement says the true targets are clipped to that range.
monthly_target = Full_set['item_cnt_month'].fillna(0)
monthly_target = monthly_target.clip(0,20)

# Stored as a float rather than an int so the column keeps its dtype once the
# test rows (which have no target, i.e. NaN) are concatenated later.
Full_set['item_cnt_month'] = monthly_target.astype(np.float16)



In [None]:
test = test.drop("id_shop_item", axis=1)

In [None]:
# Append the test rows to Full_set as month 34 (the month to predict).
test['date_block_num'] = 34

# Match the key dtypes used in Full_set.
test['date_block_num'] = test['date_block_num'].astype(np.int8)

test['shop_id'] = test['shop_id'].astype(np.int8)

test['item_id'] = test['item_id'].astype(np.int16)

# `keys=` is not passed: it labels the concatenated pieces (one key per
# frame), which is meaningless together with `ignore_index=True`, and the
# three column names previously given did not match the two frames.
Full_set = pd.concat([Full_set, test], ignore_index=True, sort=False)

# Columns present in only one of the two frames are NaN for the other frame's
# rows (e.g. item_cnt_month for the test month); they are filled with 0.
Full_set.fillna(0, inplace=True) # 34th month (prediction month)

In [None]:
Full_set

## Target Lags
- A lag is a time-series concept: we try to capture patterns in sales from previous months.

- That is more of like saying does the total items sold in a given month have a pattern to the previous month or 2 months before or a year before.

- The next question is how many months of lag we should take into consideration. This is largely a matter of trial and error. For SARIMA models we could plot the ACF and PACF and read off how many months of lag are needed. Taking a 12-month lag means asking what the pattern was at the same time in the previous year.

- Choosing lags is mostly trial and error. People usually consider the lag periods [1, 2, 3, 6, 12], but this is only a starting guess and it is up to the individual to experiment.

Note: 
- Here we are joining the Full_set dataframe and shifted dataframe by columns of date_block_num,shop_id and item_id. 
- This raises a question: the shops and items sold in a given month will not always be the same as in the previous month.
- This is correct and this would lead to a lot Nan's. We would be dealing with this towards the end and turning them to mostly 0

In [None]:
def lag_feature(df, lags, col):
    """Add lagged copies of ``col`` to ``df``.

    For every lag ``k`` in ``lags`` a column ``<col>_lag_<k>`` is added that
    holds, for each (date_block_num, shop_id, item_id) row, the value ``col``
    had for the same shop and item ``k`` months earlier.  Rows without a
    matching earlier month get NaN.

    Returns the widened frame; the frame passed in is not modified.
    """
    join_keys = ['date_block_num', 'shop_id', 'item_id']
    source = df[join_keys + [col]]

    for lag in lags:
        # Re-label the value column and move each row `lag` months forward, so
        # a left join on the keys picks up the value from `lag` months ago.
        lagged = source.rename(columns={col: col + '_lag_' + str(lag)})
        lagged['date_block_num'] = lagged['date_block_num'] + lag

        df = df.merge(lagged, how='left', on=join_keys)

    return df

In [None]:
Full_set_lag = lag_feature(Full_set, [1,2,3,6,12], 'item_cnt_month')

In [None]:
Full_set_lag

## Apply Mean Encoded Features

1. Average Sales per month
- The value of the previous months average sales as feature to current month

In [None]:
# Mean encoding by month: average item_cnt_month over all rows of each date_block_num.
group = (
    Full_set_lag
    .groupby(['date_block_num'])['item_cnt_month']
    .mean()
    .rename('date_avg_item_cnt')
    .reset_index()
)

Full_set_lag = Full_set_lag.merge(group, how='left', on=['date_block_num'])
Full_set_lag['date_avg_item_cnt'] = Full_set_lag['date_avg_item_cnt'].astype(np.float16)

# Only the previous month's average is kept as a feature (via lag_feature);
# the same-month average is dropped again.
Full_set_lag = lag_feature(Full_set_lag, [1], 'date_avg_item_cnt')
Full_set_lag = Full_set_lag.drop(columns=['date_avg_item_cnt'])

2. Calculate the average number of items that are sold for a given item_id in a given date block

In [None]:
# Mean encoding by (month, item): average item_cnt_month of each item in each month.
group_1 = (
    Full_set_lag
    .groupby(['date_block_num', 'item_id'])['item_cnt_month']
    .mean()
    .rename('date_item_avg_item_cnt')
    .reset_index()
)

Full_set_lag = Full_set_lag.merge(group_1, how='left', on=['date_block_num', 'item_id'])
Full_set_lag['date_item_avg_item_cnt'] = Full_set_lag['date_item_avg_item_cnt'].astype(np.float16)

# Keep the 1, 2, 3, 6 and 12 month lags (via lag_feature); drop the same-month value.
Full_set_lag = lag_feature(Full_set_lag, [1,2,3,6,12], 'date_item_avg_item_cnt')
Full_set_lag = Full_set_lag.drop(columns=['date_item_avg_item_cnt'])

3. Calculate the average number of items that are sold for a given shop_id in a given date block

In [None]:
# Mean encoding by (month, shop): average item_cnt_month of each shop in each month.
group_2 = (
    Full_set_lag
    .groupby(['date_block_num', 'shop_id'])['item_cnt_month']
    .mean()
    .rename('date_shop_avg_item_cnt')
    .reset_index()
)

Full_set_lag = Full_set_lag.merge(group_2, how='left', on=['date_block_num', 'shop_id'])
Full_set_lag['date_shop_avg_item_cnt'] = Full_set_lag['date_shop_avg_item_cnt'].astype(np.float16)

# Keep the 1, 2, 3, 6 and 12 month lags (via lag_feature); drop the same-month value.
Full_set_lag = lag_feature(Full_set_lag, [1,2,3,6,12], 'date_shop_avg_item_cnt')
Full_set_lag = Full_set_lag.drop(columns=['date_shop_avg_item_cnt'])

## Add number of public holiday for each month as feature

In [None]:
# Public-holiday dates ("day.month.year"); kept for reference - the per-month
# counts below are what this cell actually uses.
public_holiday = ['1.1.2013', '2.1.2013', '3.1.2013', '4.1.2013', '5.1.2013',
                  '6.1.2013', '7.1.2013', '8.1.2013', '9.1.2013', '10.1.2013',
                  '21.2.2013', '22.2.2013', '23.2.2013', '6.3.2013', '7.3.2013',
                  '8.3.2013', '1.5.2013', '2.5.2013', '3.5.2013', '8.5.2013',
                  '9.5.2013', '10.5.2013', '12.6.2013', '4.11.2013',
                  '1.1.2014', '2.1.2014', '3.1.2014', '4.1.2014', '5.1.2014',
                  '6.1.2014', '7.1.2014', '8.1.2014', '9.1.2014', '10.1.2014',
                  '21.2.2014', '22.2.2014', '23.2.2014', '6.3.2014', '7.3.2014',
                  '8.3.2014', '1.5.2014', '2.5.2014', '3.5.2014', '8.5.2014',
                  '9.5.2014', '10.5.2014', '12.6.2014', '4.11.2014',
                  '1.1.2015', '2.1.2015', '3.1.2015', '4.1.2015', '5.1.2015',
                  '6.1.2015', '7.1.2015', '8.1.2015', '9.1.2015', '10.1.2015',
                  '21.2.2015', '22.2.2015', '23.2.2015', '6.3.2015', '7.3.2015',
                  '8.3.2015', '1.5.2015', '2.5.2015', '3.5.2015', '8.5.2015',
                  '9.5.2015', '10.5.2015', '12.6.2015',
                 ]

# Number of public holidays per month, indexed by date_block_num
# (0 = January 2013 ... 34 = November 2015, the prediction month).
holiday_month = [10, 3, 3, 0, 6, 1, 0, 0, 0, 0, 1, 0,
                 10, 3, 3, 0, 6, 1, 0, 0, 0, 0, 1, 0,
                 10, 3, 3, 0, 6, 1, 0, 0, 0, 0, 1
                ]

# Look the count up for every row in one vectorised indexing step instead of
# appending to a Python list row by row.
Full_set_lag['holiday_num_month'] = np.asarray(holiday_month)[
    Full_set_lag['date_block_num'].to_numpy()
]
    

In [None]:
Full_set_lag.head()

4. Calculate the average number of items that are sold for a given public holiday in a given date block

In [None]:
# Mean encoding by (month, holiday count): average item_cnt_month per
# ('date_block_num', 'holiday_num_month') group.  holiday_num_month is looked
# up from date_block_num, so every month falls into exactly one group.
group_0 = (
    Full_set_lag
    .groupby(['date_block_num', 'holiday_num_month'])['item_cnt_month']
    .mean()
    .rename('date_holiday_avg_item_cnt')
    .reset_index()
)

Full_set_lag = Full_set_lag.merge(group_0, how='left', on=['date_block_num', 'holiday_num_month'])
Full_set_lag['date_holiday_avg_item_cnt'] = Full_set_lag['date_holiday_avg_item_cnt'].astype(np.float16)

# Keep the 1, 2, 3, 6 and 12 month lags (via lag_feature); drop the same-month value.
Full_set_lag = lag_feature(Full_set_lag, [1,2,3,6,12], 'date_holiday_avg_item_cnt')
Full_set_lag = Full_set_lag.drop(columns=['date_holiday_avg_item_cnt'])

In [None]:
Full_set_lag

## Trend Feature - Price trend for last six months

In [None]:
## Average price of each item over the whole period (one value per item_id).
group_3 = transactions_with_average_item_price.groupby(['item_id']).agg({'item_price': ['mean']})

group_3.columns = ['item_avg_item_price']

group_3.reset_index(inplace=True)

Full_set_lag = pd.merge(Full_set_lag, group_3, on=['item_id'], how='left')

# Prices are kept as float32.  float16 carries only about three significant
# decimal digits and overflows to inf above 65504, which distorts the relative
# price differences computed from these columns.
Full_set_lag['item_avg_item_price'] = Full_set_lag['item_avg_item_price'].astype(np.float32)



## Average price of each item within each month (date_block_num, item_id).
group_4 = transactions_with_average_item_price.groupby(['date_block_num','item_id']).agg({'item_price': ['mean']})

group_4.columns = ['date_item_avg_item_price']

group_4.reset_index(inplace=True)

Full_set_lag = pd.merge(Full_set_lag, group_4, on=['date_block_num','item_id'], how='left')

Full_set_lag['date_item_avg_item_price'] = Full_set_lag['date_item_avg_item_price'].astype(np.float32)

## Lag the monthly item price by 1..6 months: for every row this gives the
## item's average price one month ago, two months ago, ... up to six months
## ago (NaN where the item has no price for that month).
lags = [1,2,3,4,5,6]

Full_set_lag = lag_feature(Full_set_lag, lags, 'date_item_avg_item_price')

## Get the Price Trend

- That is it indicates the ratio of the avg item price (let it be x1) for the last month to the avg item price (x) for the whole set.

- Hence, for the last month, trend_feature = (x1 - x)/x. For example, if last month's price is greater than the average price, the value is greater than zero (positive trend).

- if last month price is lesser than avg price, then the value is less than zero (negative trend). 

- Finaly, if last month price is close to avg price then the trend will be close to zero.

**Because some items were not sold last month, we use several lags of the date_item_avg_item_price property. If we cannot get last month's trend, we take the trend from two months ago, (x2 - x)/x, and so on. The feature name 'delta_price_lag' should probably be 'price_trend'.**

In [None]:
# Relative difference between the item's price `lag` months ago and its
# overall average price: (lagged price - average) / average.
for lag in lags:

    lagged_price = Full_set_lag['date_item_avg_item_price_lag_'+str(lag)]
    overall_price = Full_set_lag['item_avg_item_price']

    Full_set_lag['delta_price_lag_'+str(lag)] = (lagged_price - overall_price) / overall_price

In [None]:
# Create a new column to understand latest trend of a given product
def select_trend(row, lag_steps=None):
    """Return the most recent available price trend of a row.

    The ``delta_price_lag_<k>`` entries of ``row`` are checked from the
    smallest lag to the largest and the first one that is not NaN is returned.
    NaN has to be tested explicitly: it is truthy, so a plain ``if value:``
    would return a missing lag instead of falling back to an older one.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must contain ``delta_price_lag_<k>`` for every lag checked.
    lag_steps : iterable of int, optional
        Lags to check, most recent first.  Defaults to the notebook-level
        ``lags`` list.

    Returns
    -------
    The first non-missing trend value, or 0 when every lag is missing.
    """
    if lag_steps is None:
        lag_steps = lags

    for step in lag_steps:

        trend = row['delta_price_lag_'+str(step)]

        if pd.notna(trend):

            return trend

    return 0

# One trend value per row, stored as float16 with missing values set to 0.
# The result of fillna is assigned back: calling `fillna(..., inplace=True)`
# on a column selection acts on an intermediate object and is not guaranteed
# to update the frame.
Full_set_lag['delta_price_lag'] = (
    Full_set_lag.apply(select_trend, axis=1)
    .astype(np.float16)
    .fillna(0)
)

In [None]:
# Remove the intermediate price columns: the two averages plus, for each lag,
# the lagged price and its per-lag delta.  Only `delta_price_lag` is kept.

fetures_to_drop = ['item_avg_item_price', 'date_item_avg_item_price']

for lag in lags:

    fetures_to_drop.extend(['date_item_avg_item_price_lag_'+str(lag),
                            'delta_price_lag_'+str(lag)])

Full_set_lag = Full_set_lag.drop(columns=fetures_to_drop)

In [None]:
Full_set_lag

## Last month shop revenue trend

- Calculate the revenue trend and then create a lag for only the previous month.
- Unlike the price trend, where we looked back over the last 1-6 months, a single lag is enough here: a shop's latest trend can be read from its previous month's revenue, because there are hardly any shops without revenue in the previous month, whereas an item may last have been sold one, two, or up to six months ago.

In [None]:
transactions_with_average_item_price.head()

In [None]:
## Total revenue of each shop in each month.
group_revenue = (
    transactions_with_average_item_price
    .groupby(['date_block_num','shop_id'])['revenue']
    .sum()
    .rename('date_shop_revenue')
    .reset_index()
)

Full_set_lag = Full_set_lag.merge(group_revenue, how='left', on=['date_block_num','shop_id'])
Full_set_lag['date_shop_revenue'] = Full_set_lag['date_shop_revenue'].astype(np.float32)

## Each shop's mean monthly revenue, taken over the months present in the table above.
group_revenue = (
    group_revenue
    .groupby(['shop_id'])['date_shop_revenue']
    .mean()
    .rename('shop_avg_revenue')
    .reset_index()
)

Full_set_lag = Full_set_lag.merge(group_revenue, how='left', on=['shop_id'])
Full_set_lag['shop_avg_revenue'] = Full_set_lag['shop_avg_revenue'].astype(np.float32)

## Relative deviation of the month's revenue from the shop's average,
## the same construction as the price trend.
monthly_revenue = Full_set_lag['date_shop_revenue']
typical_revenue = Full_set_lag['shop_avg_revenue']
Full_set_lag['delta_revenue'] = ((monthly_revenue - typical_revenue) / typical_revenue).astype(np.float16)

## Only the previous month's value is kept as a feature.
Full_set_lag = lag_feature(Full_set_lag, [1], 'delta_revenue')

Full_set_lag = Full_set_lag.drop(columns=['date_shop_revenue','shop_avg_revenue','delta_revenue'])

## Addition Feature - Number of Days for each month

In [None]:
# Days per calendar month, indexed 0 (January) .. 11 (December); February is fixed at 28.
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])

# date_block_num counts months from January 2013, so mod 12 is the month index.
Full_set_lag['month'] = Full_set_lag['date_block_num'].mod(12)

month_lengths = Full_set_lag['month'].map(days)
Full_set_lag['days'] = month_lengths.astype(np.int8)

## Months since the first sale for each shop/item pair and for item only.

In [None]:
# Months elapsed since the (item, shop) pair first appears in the grid.
Full_set_lag['item_shop_first_sale'] = Full_set_lag['date_block_num'] - Full_set_lag.groupby(['item_id','shop_id'])['date_block_num'].transform('min')

# Months elapsed since the item first appears in the grid (any shop).
Full_set_lag['item_first_sale'] = Full_set_lag['date_block_num'] - Full_set_lag.groupby('item_id')['date_block_num'].transform('min')

## Remove the first 12 months (date_block_num 0..11): the longest lag used is
## 12 months, so those rows cannot have it.
## The filter is built from Full_set_lag itself; the name `matrix` used here
## before is not defined anywhere in this notebook.
Full_set_lag = Full_set_lag[Full_set_lag.date_block_num > 11]

In [None]:
Full_set_lag

In [None]:
# Fill nulls with zeros

def fill_na(df):
    """Replace missing values with 0 in the lagged item-count columns.

    A column is filled when its name contains both ``'_lag_'`` and
    ``'item_cnt'``; every other column (including other lag columns) is left
    as it is.  The filled column is assigned back to ``df`` rather than
    calling ``fillna(inplace=True)`` on a column selection, which is not
    guaranteed to update the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for col in df.columns:
        is_count_lag = ('_lag_' in col) and ('item_cnt' in col)
        if is_count_lag and df[col].isnull().any():
            df[col] = df[col].fillna(0)
    return df

Full_set_lag = fill_na(Full_set_lag)

In [None]:
Full_set_lag.to_pickle('Full_set_lag.pkl')

## XGBoost

In [None]:
data = pd.read_pickle('Full_set_lag.pkl')

In [None]:
# Time-based split: months up to 32 train the model, month 33 validates it,
# month 34 (the appended test rows) is what gets predicted.
# 'ID' is the test-row identifier that came in with the test rows and was
# zero-filled for every other row; it is an identifier, not a feature, so it
# is removed together with the target.  errors='ignore' keeps this working if
# the column is absent.
non_feature_columns = ['item_cnt_month', 'ID']

train_rows = data[data.date_block_num < 33]
valid_rows = data[data.date_block_num == 33]
test_rows = data[data.date_block_num == 34]

X_train = train_rows.drop(non_feature_columns, axis=1, errors='ignore')
Y_train = train_rows['item_cnt_month']

X_valid = valid_rows.drop(non_feature_columns, axis=1, errors='ignore')
Y_valid = valid_rows['item_cnt_month']

X_test = test_rows.drop(non_feature_columns, axis=1, errors='ignore')

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted tree regressor with a fixed seed; up to 1000 trees of
# depth 8, with 80% row and 80% column subsampling per tree.
model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)

# Fit on the training months while reporting RMSE on both the training set
# and the validation month.
# NOTE(review): `early_stopping_rounds = 10` presumably stops training once
# the metric on the last eval_set entry has not improved for 10 rounds -
# confirm against the installed xgboost version.
# NOTE(review): newer xgboost releases expect `eval_metric` and
# `early_stopping_rounds` on the constructor rather than on `fit` - verify
# before upgrading the library.
model.fit(
    X_train, 
    Y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 
    verbose=True, 
    early_stopping_rounds = 10)


In [None]:
from xgboost import plot_importance

import time
import sys
import gc
import pickle

## Note: predictions are clipped to [0, 20] because the competition clips the
## true target values to that range.
Y_pred = model.predict(X_valid).clip(0, 20)
Y_test = model.predict(X_test).clip(0, 20)

# Use the test set's own ID column instead of relying on the frame's index
# happening to equal the IDs.
submission = pd.DataFrame({
    "ID": test["ID"].to_numpy(),
    "item_cnt_month": Y_test
})
submission.to_csv('xgb_submission_211115.csv', index=False)

# save predictions for an ensemble; `with` closes each file even if dumping fails
with open('xgb_train.pickle', 'wb') as train_file:
    pickle.dump(Y_pred, train_file)

with open('xgb_test.pickle', 'wb') as test_file:
    pickle.dump(Y_test, test_file)

In [None]:
def plot_features(booster, figsize):
    """Draw the feature-importance chart of ``booster`` on a new figure of size ``figsize``.

    Returns whatever ``plot_importance`` returns for the created axes.
    """
    _, axis = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    return plot_importance(booster=booster, ax=axis)

plot_features(model, (10,14))