In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# PREDICT FUTURE SALES
## INDEX
1. Introduction
2. Import Libraries
3. Data Overview
4. Exploratory Data Analysis
* Basic Analysis
* Analysis On Date
* Analysis on Number of Units Sold(item_cnt_day)
* Analysis On Item Price
* Q. How does sales over each day looks like?
* Q. Does the price of an item change over time?
* Analysis On Shop_id, item_id and item_category_id
* Q. Do all item ids and shop ids present in train is present in test?
* Q. Which shop id, item id and item category id have maximum sales?
* Q. Are there any items which are more than in one category.ie Do an item belongs to single category or not?
* Q. What about Monthly Sales?
* Q. Do all the shops sold items on all months from 2013 Jan to 2015 Oct?
5. Modeling

* Train validation Split
* Checking Correlation Among Train Features
* Baseline models

* Random Forest Model
* Xgboost Model
* Submission

# 1. Introduction
## Understanding Business Objective
Here we will work with a challenging time-series dataset consisting of daily sales data, provided by one of the largest Russian software firms - 1C Company. We are given sales for 34 months, from January 2013 to October 2015. We need to predict how many units of each item will be sold in each shop in November 2015, i.e. for every pair of shop_id (unique identifier of a shop) and item_id (unique identifier of an item) we have to predict the number of units of that item that will be sold in that shop during November.


Note: The list of shops and products changes slightly every month. Creating a robust model that can handle such situations is part of the challenge.


# 2. Importing Libraries


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import matplotlib.gridspec as gridspec
from termcolor import colored
import gc

# The seaborn style sheets bundled with matplotlib were renamed to 'seaborn-v0_8-*' in
# matplotlib 3.6 and the old names were removed in 3.8, so pick whichever name this
# installation provides (falling back to the default style if neither exists).
_WHITEGRID_STYLE = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid') if name in plt.style.available),
    'default',
)
plt.style.use(_WHITEGRID_STYLE)
import warnings
# Silence library warnings so the notebook output stays readable
warnings.simplefilter("ignore")
# Show floats with two decimals in every DataFrame display
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# 3. Data Overview
We have mainly 5 files:

1. train.csv  -> showing item price and number of items sold on each date. 
2. shop.csv -> Details of shops corresponding to shop id in train.csv.
3. item.csv -> Details of items corresponding to item id in train.csv
4. item_categories.csv -> Details of item category corresponding to category id in item.csv
5. test.csv -> test data for prediction

Let us take a quick look at each of the files.

train.csv
--------

- date: date in format dd/mm/yy.
- date_block_num: a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1 and so on.
- shop_id: unique identifier of a shop.
- item_id: unique identifier of a product.
- item_price: current price of an item.
- item_cnt_day: number of products sold. We are predicting a monthly amount of this measure.


shop.csv
--------
- shop_name: shop name corresponding to shop id in train.csv
- shop_id

item.csv
--------
- item_name: item name corresponding to item id in train.csv
- item_id
- item_category_id: category id of item


item_category.csv
-----------------
- item_category_name: category name of the item corresponding to item_category_id in item.csv
- item_category_id

test.csv
----------
- ID - an Id that represents a (Shop, Item) tuple within the test set
- shop_id
- item_id

# 4. Exploratory Data Analysis
Basic Analysis

In [None]:
# Locations of the competition files; every path is derived from DIR_PATH
DIR_PATH = '/kaggle/input/competitive-data-science-predict-future-sales'
TRAIN_SALES_CSV = f'{DIR_PATH}/sales_train.csv'
SHOPS_CSV = f'{DIR_PATH}/shops.csv'
ITEMS_CSV = f'{DIR_PATH}/items.csv'
ITEM_CATEGORY_CSV = f'{DIR_PATH}/item_categories.csv'
TEST_CSV = f'{DIR_PATH}/test.csv'

In [None]:
def show_data(df_train,heading='TRAIN DATA'):
    
    """
    Print a short overview of a dataframe: its shape, the first three rows
    (rendered as a markdown table) and the column/dtype summary.

    Args:
        df_train: pandas dataframe to describe
        heading: heading to display (printed in red) above the overview
    Returns:
        None
    """

    divider = '--'*50

    print(colored(heading,'red'))
    print('')
    print('Data shape')
    print(f'shape:{df_train.shape}')
    print('')
    print(divider)
    print('')
    print('Sample:')
    # to_markdown relies on the optional 'tabulate' package
    print(df_train.head(3).to_markdown())
    print('')
    print(divider)
    print('')
    print('Columns and data types:')
    print('')
    # DataFrame.info() writes its report itself and returns None,
    # so wrapping it in print() would add a stray "None" line
    df_train.info()

In [None]:
# Load the four files that together make up the training data
df_train = pd.read_csv(TRAIN_SALES_CSV)
df_shop = pd.read_csv(SHOPS_CSV)
df_item = pd.read_csv(ITEMS_CSV)
df_itemcat = pd.read_csv(ITEM_CATEGORY_CSV)

# Print the same overview for every file, with one uniform separator between them
_overviews = [
    (df_train, 'TRAIN DATA'),
    (df_shop, 'SHOP DATA'),
    (df_item, 'ITEM DETAILS DATA'),
    (df_itemcat, 'ITEM CATEGORY DATA'),
]
for position, (frame, title) in enumerate(_overviews):
    if position:
        print('')
        print('__'*40)
        print('')
    show_data(frame,heading=title)

In [None]:
# Load the test file and print the same overview as for the train files
df_test = pd.read_csv(TEST_CSV)
show_data(df_test,heading='TEST DATA')

## Observation:
As discussed earlier, for the test data we only have the shop id and the item id. The train data is spread over 4 files. From the samples above we can see that these files share common ids, so it is convenient to merge the train files on those common ids before moving forward.

In [None]:
# Join sales with item, item-category and shop details on their shared ids,
# drop the free-text item name and persist the result for the modeling section
dff = (
    df_train
    .merge(df_item, on="item_id")
    .merge(df_itemcat, on="item_category_id")
    .merge(df_shop, on="shop_id")
    .drop(columns=["item_name"])
)
dff.to_csv('merged_original.csv', index=False)

In [None]:
# making a working copy so the merged frame `dff` stays untouched during the analysis
df = dff.copy()
# (the merged frame was already written to merged_original.csv in the previous cell)
df.head()

In [None]:
# Summary statistics of the numeric columns of the merged frame
df.describe()

Ok. Now we have our merged data. Let us go through each of the features


## Analysis On Date

In [None]:
# The raw 'date' column holds strings in day.month.year form; parse them into datetimes
df["date"] = pd.to_datetime(df["date"], format='%d.%m.%Y')
# Re-bind instead of sorting in place so that re-running the cell is harmless
df = df.sort_values(by="date", ascending=True)
print(f'Minimum date present: {df["date"].min()}')
print(f'Maximum date present: {df["date"].max()}')

In [None]:
# sorting dataframe based on date and rebuilding a clean 0..n-1 index
df = df.sort_values(by='date').reset_index(drop=True)
df.head()

Observation:

- Our 'date' column was not in datetime format. We have converted it to datetime and sorted our dataframe by it.

## Analysis on number of units sold (item_cnt_day)
item_cnt_day is the number of units of that item sold in that shop on a particular day.

In [None]:
fig,axes = plt.subplots(1,2,figsize=(15,5))
# Pass the series as x= explicitly: the meaning of the first positional argument of
# sns.boxplot differs between seaborn versions
sns.boxplot(x=df['item_cnt_day'],ax=axes[0])
axes[0].set_title('Boxplot')
# sns.distplot is deprecated; histplot with a density histogram and KDE overlay is its replacement
sns.histplot(x=df['item_cnt_day'],kde=True,stat='density',bins=50,ax=axes[1])
axes[1].set_title('Distribution')
plt.suptitle('No of units sold(Item Cnt day)',fontsize="20")
plt.show()

In [None]:
# Summary statistics of the daily units sold
df['item_cnt_day'].describe()

In [None]:
# Range of the daily units sold
print(f'Minimum value {df["item_cnt_day"].min()}')
print(f'Maximum value {df["item_cnt_day"].max()}')

In [None]:
# Percentiles of item_cnt_day at three resolutions: every 10%, every 1% from 90 to 99,
# and every 0.1% from 99.1 to 99.9, with a divider line between the groups
cnt_values = df["item_cnt_day"]
percentile_groups = (
    range(0,101,10),
    range(90,100),
    [99 + step/10 for step in range(1,10)],
)
for group_no, points in enumerate(percentile_groups):
    if group_no:
        print('--'*50)
    for q in points:
        print(f'{q}th percentile value for item_cnt_day is {np.percentile(cnt_values,q)}')

In [None]:
# Inspect the rows whose item_cnt_day lies above the 95th percentile (candidate outliers)
df[df['item_cnt_day'] > df['item_cnt_day'].quantile(0.95)]

In [None]:
# Inspect the rows with a negative number of units sold
df[df['item_cnt_day'] < 0]

In [None]:
print(f'shape of data before removing outliers: {df.shape}')
# Drop negative counts; .copy() makes the result an independent frame so the
# column assignment below does not write into a slice of the previous frame
df = df[df['item_cnt_day'] >= 0].copy()
upper_quantile = df['item_cnt_day'].quantile(0.95)
print(f'Dropping values less than 0 and capping values greater than upper_quantile {upper_quantile}')
# Values above the 95th percentile are capped at it (the rows are kept)
df['item_cnt_day'] = np.where(df['item_cnt_day'] > upper_quantile, upper_quantile, df['item_cnt_day'])

print(f'shape of data after removing outliers: {df.shape}')
print(f'Minimum units of product sold a time {df["item_cnt_day"].min()}')
print(f'Maximum units of product sold a time {df["item_cnt_day"].max()}')

Observation:
- Most of the items are sold only 1 unit at a time. Even the 75th percentile value is 1.
- We have some values less than zero, which ideally should not be present, i.e. the number of units sold cannot be negative; we drop those rows. We also have some extreme outliers. After some analysis, we decided to cap all values above the 95th percentile at the 95th-percentile value.

## Analysis On Item Price

In [None]:
fig,axes = plt.subplots(1,2,figsize=(15,5))
# Pass the series as x= explicitly: the meaning of the first positional argument of
# sns.boxplot differs between seaborn versions
sns.boxplot(x=df['item_price'],ax=axes[0])
axes[0].set_title('Boxplot')
# sns.distplot is deprecated; histplot with a density histogram and KDE overlay is its replacement
sns.histplot(x=df['item_price'],kde=True,stat='density',bins=50,ax=axes[1])
axes[1].set_title('Distribution')
plt.suptitle('Item Price per unit',fontsize="20")
plt.show()

In [None]:
# Summary statistics of the item price
df['item_price'].describe()

In [None]:
# Percentiles of item_price at three resolutions (10% steps, 1% steps over 90-99,
# 0.1% steps over 99.1-99.9); a divider line is printed between consecutive groups
price_values = df["item_price"]
tenths_above_99 = [99 + step/10 for step in range(1,10)]
for is_later_group, points in ((False, range(0,101,10)), (True, range(90,100)), (True, tenths_above_99)):
    if is_later_group:
        print('--'*50)
    for q in points:
        print(f'{q}th percentile value for item_price is {np.percentile(price_values,q)}')

In [None]:
# we have an extreme outlier value in item price. Let us handle it

print(f'shape of data before removing outliers: {df.shape}')
# Drop negative prices; .copy() makes the result an independent frame so the
# column assignment below does not write into a slice of the previous frame
df = df[df['item_price'] >= 0].copy()
upper_quantile = df['item_price'].quantile(0.95)
# Prices above the 95th percentile are capped at it (the rows are kept)
df['item_price'] = np.where(df['item_price'] > upper_quantile, upper_quantile, df['item_price'])
print(f'shape of data after removing outliers: {df.shape}')

print(f'Minimum price of a single item {df["item_price"].min()}')
print(f'Maximum price of a single item {df["item_price"].max()}')

Observation:
- We have some item prices less than zero, which ideally should not be present; we drop those rows.
- We also have some extreme outliers. After some analysis, we decided to cap all values above the 95th percentile at the 95th-percentile value, similar to item_cnt_day.
- After handling outliers, we have some products with a price as low as 0.07 (might be some chocolate). We also have some items which cost about 2683 (might be a chair).

Question:
Do price of an item develop over time?

Before getting into it, let us take a sample of items sold in a shop. It is important to note that the price of the same item may differ between shops, so let us look at the price development of a sample of items within a single shop.

In [None]:
PRICE_LINE_COLORS = ['red','orange','blue','green','yellow','purple','cyan','brown']


def plot_price_development(sales, shop_id, n_items=8, colors=PRICE_LINE_COLORS):
    """
    Plot the price over time of the first `n_items` distinct items sold in one shop.

    Args:
        sales: dataframe with 'shop_id', 'date', 'item_id' and 'item_price' columns
        shop_id: shop whose items are plotted
        n_items: number of distinct items to draw, in order of first appearance
        colors: one line colour per item; at most len(colors) items are drawn
    Returns:
        None (the figure is shown)
    """
    shop_prices = sales[sales['shop_id'] == shop_id][['date','item_id','item_price']].reset_index(drop=True)
    shop_items = shop_prices['item_id'].unique()[0:n_items]

    fig,ax = plt.subplots(1,1,figsize=(25,8))
    for color,item in zip(colors,shop_items):
        item_prices = shop_prices[shop_prices['item_id'] == item][['item_price','date']]
        sns.lineplot(x=item_prices['date'],y=item_prices['item_price'],ax=ax,color=color,label=item)

    ax.set_title(f'Price development of items - shop_id {shop_id}',fontsize="28")
    ax.legend()
    plt.show()


# Same plot for three sample shops
for sample_shop_id in (31, 28, 21):
    plot_price_development(df, sample_shop_id)

Observation:
- We can see that, for some items, the price of the same item does not stay the same over time.
- Some items are no longer sold in a shop after some time (the shop might have stopped selling those products).
- The price increases and decreases over time.
- During feature engineering it may be useful to flag items whose price remains constant over time.
- Here we can see that item price depends on time. This is the beauty of time series.

Question:
How does sales over each day looks like?

Let us see what the sales distribution looks like on a daily basis. Before that, let us create a new feature: the turnover for an item, i.e. the total sales value of an item in a particular shop on a particular day (units sold multiplied by item price).

In [None]:
# creating a new feature: turnover of one sales record = units sold * unit price
df['Sales_per_item'] = df['item_cnt_day'] * df['item_price']

In [None]:
# Three side-by-side histograms of the daily sales features
fig = plt.figure(figsize=(25,7))
grid = fig.add_gridspec(1, 3)

# (column, panel title, histplot options) for each panel, left to right
panels = [
    ("Sales_per_item", 'Sales per item', dict(kde=True, bins=50, color="violet")),
    ("item_price", 'Item price distribution', dict(kde=True, bins=50, color="tomato")),
    ("item_cnt_day", 'Item count distribution', dict(kde=False, bins=20, color="cornflowerblue")),
]
panel_axes = [fig.add_subplot(grid[0, position]) for position in range(len(panels))]

for panel_ax, (column, title, hist_options) in zip(panel_axes, panels):
    # setting size of the tick labels on both axes
    panel_ax.tick_params(axis='both', labelsize=15)
    panel_ax.set_title(title, fontsize=20)
    sns.histplot(data=df, x=column, ax=panel_ax, **hist_options)

fig.subplots_adjust(top=0.8)
fig.suptitle('Sales Feature Distributions per Day', fontsize="28");

Observation:
- We are looking at sales figures on a daily basis. This is different from our target distribution, which is monthly.
- In most daily records only 1 unit of an item is sold.
- We also get a clear idea of the price range (after capping the top 5% of prices).

Analysis On Shop_id, item_id and item_category_id

Question:
* Do all shop id present in test data present in train data and viceversa?
* Do all item id present in test data present in train data and viceversa?
* Do all shop id - item id pairs present in test data present in train data and viceversa?

In [None]:
# Distinct shop and item ids on each side
train_shop_ids = set(df['shop_id'].unique())
test_shop_ids = set(df_test['shop_id'].unique())

train_item_ids = set(df['item_id'].unique())
test_item_ids = set(df_test['item_id'].unique())

print(f'There are about {len(train_shop_ids)} unique shop ids in train data and {len(test_shop_ids)} shop ids in test data')
print(f'There are about {len(train_item_ids)} unique item ids in train data and {len(test_item_ids)} item ids in test data')
print('--'*50)

# Build the "<shop_id>_<item_id>" key with vectorized string concatenation; a row-wise
# apply(axis=1) produces the same strings but runs a Python call for every row
df['pair'] = df['shop_id'].astype(str) + '_' + df['item_id'].astype(str)
df_test['pair'] = df_test['shop_id'].astype(str) + '_' + df_test['item_id'].astype(str)
train_pair_ids = set(df['pair'].unique())
test_pair_ids = set(df_test['pair'].unique())

print(f'There are {len(train_shop_ids - test_shop_ids)} shop ids present in train data which are not in test data')
print(f'There are {len(train_item_ids - test_item_ids)} item ids present in train data which are not in test data')
print(f'There are {len(train_pair_ids - test_pair_ids)} shop id item id pairs present in train data which are not in test data')

print('--'*50)

print(f'There are {len(test_item_ids - train_item_ids)} item ids present in test data which are not in train data')
print(f'There are {len(test_shop_ids - train_shop_ids)} shop ids present in test data which are not in train data')
print(f'There are {len(test_pair_ids - train_pair_ids)} shop id item id pairs present in test data which are not in train data')

Observation:
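- We have about 60 unique shop ids and 21807 unique item ids in the train data.
- It is important to note that about 363 item ids present in the test data are not present in the train data. In total, about 102796 shop id - item id pairs of the test data are not present in the train data. The unseen items alone cannot explain this (363 items * 60 shops = 21780 pairs at most), so most of these pairs combine a known shop with a known item that were never sold together. Our model therefore needs to be robust enough to handle these unseen patterns.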

Question:
Which shop id, item id and item category id have maximum sales?

In [None]:
print(f'Total number of unique shop ids: {df["shop_id"].nunique()}')
df_tmp = df[["shop_id","Sales_per_item","item_cnt_day"]]
# Total units and total turnover per shop. The aggregation is named by string ("sum"),
# as in the other pivots of this notebook; passing np.sum is deprecated in recent pandas
df_tmp= pd.pivot_table(data=df_tmp,index=["shop_id"],aggfunc={"item_cnt_day":"sum","Sales_per_item":"sum"}).reset_index()


fig, axes = plt.subplots(2,1,figsize=(20,10))
sns.barplot(x=df_tmp["shop_id"],y=df_tmp["item_cnt_day"],ax=axes[0])
axes[0].set_title("Total number of units sold among various shops")
sns.barplot(x=df_tmp["shop_id"],y=df_tmp["Sales_per_item"],ax=axes[1])
axes[1].set_title('Total turn over in various shops')
plt.suptitle('Shop id', fontsize="28")
plt.show()

In [None]:
print(f'Total number of unique item ids: {df["item_id"].nunique()}')

df_tmp = df[["item_id","Sales_per_item","item_cnt_day"]]
# Total units and total turnover per item. The aggregation is named by string ("sum"),
# as in the other pivots of this notebook; passing np.sum is deprecated in recent pandas
df_tmp= pd.pivot_table(data=df_tmp,index=["item_id"],aggfunc={"item_cnt_day":"sum","Sales_per_item":"sum"}).reset_index()
# Keep the 50 best items by turnover and by units respectively
df_tmp_sales = df_tmp.sort_values(by=['Sales_per_item'],ascending=False).head(50).reset_index(drop=True)
df_tmp_count = df_tmp.sort_values(by=['item_cnt_day'],ascending=False).head(50).reset_index(drop=True)

fig, axes = plt.subplots(2,1,figsize=(20,15))
sns.barplot(x=df_tmp_count["item_id"],y=df_tmp_count["item_cnt_day"],ax=axes[0])
axes[0].set_title("Top selling items of no of units sold")
# Rotate the tick labels without re-setting them
axes[0].tick_params(axis='x',labelrotation=45)
sns.barplot(x=df_tmp_sales["item_id"],y=df_tmp_sales["Sales_per_item"],ax=axes[1])
axes[1].set_title('Top selling items in terms of Turn over')
axes[1].tick_params(axis='x',labelrotation=45)
plt.suptitle('Item id', fontsize="28")
plt.show()

In [None]:
print(f'Total number of unique item categorical ids: {df["item_category_id"].nunique()}')

df_tmp = df[["item_category_id","Sales_per_item","item_cnt_day"]]
# Total units and total turnover per item category. The aggregation is named by string
# ("sum"), as in the other pivots of this notebook; passing np.sum is deprecated in recent pandas
df_tmp= pd.pivot_table(data=df_tmp,index=["item_category_id"],aggfunc={"item_cnt_day":"sum","Sales_per_item":"sum"}).reset_index()
# Keep the 50 best categories by turnover and by units respectively
df_tmp_sales = df_tmp.sort_values(by=['Sales_per_item'],ascending=False).head(50).reset_index(drop=True)
df_tmp_count = df_tmp.sort_values(by=['item_cnt_day'],ascending=False).head(50).reset_index(drop=True)

fig, axes = plt.subplots(2,1,figsize=(20,15))
sns.barplot(x=df_tmp_count["item_category_id"],y=df_tmp_count["item_cnt_day"],ax=axes[0])
# These panels show categories, not individual items
axes[0].set_title("Top selling item categories of no of units sold")
axes[0].tick_params(axis='x',labelrotation=45)
sns.barplot(x=df_tmp_sales["item_category_id"],y=df_tmp_sales["Sales_per_item"],ax=axes[1])
axes[1].set_title('Top selling item categories in terms of Turn over')
axes[1].tick_params(axis='x',labelrotation=45)
plt.suptitle('Item Categorical id', fontsize="28")
plt.show()
del df_tmp,df_tmp_count,df_tmp_sales

Observation:
- Shop id 31 has the maximum number of units sold as well as the maximum turnover.
- Item 20949 has the maximum number of units sold, but the maximum turnover belongs to item 3732. It is much like comparing sales of chocolate and television: many cheap units versus fewer expensive ones.
- Similarly for item categories, item_category_id 40 has the maximum number of units sold, while the maximum turnover occurred for category id 19.

Question:
Are there any items which are in more than one category, i.e. does an item belong to a single category or not?

In [None]:
# Number of distinct categories each item_id appears with
# (the previously computed list of all category ids was never used and is dropped)
tmp = df.groupby(by="item_id")["item_category_id"].nunique().reset_index()
tmp.head()

In [None]:
# Distinct values of "number of categories per item"
tmp['item_category_id'].unique()

Observation:
- We can see that there is only 1 category corresponding to each item_id.

Question:
What about Monthly Sales?

In [None]:
# Calendar features derived from the parsed 'date' column
date_parts = df["date"].dt
df = df.assign(
    Year=date_parts.year,
    Month=date_parts.month,
    day_of_month=date_parts.day,
    day_of_week=date_parts.day_of_week,
)

In [None]:
fig,axes = plt.subplots(2,1,figsize=(25,12))
# Monthly totals of turnover and units, one row per date_block_num
df_tmp = df[['date_block_num','Sales_per_item','item_cnt_day']].groupby(by=['date_block_num']).aggregate("sum").reset_index()

sns.lineplot(x=df_tmp['date_block_num'],y=df_tmp['Sales_per_item'],ax=axes[0])
axes[0].set_title('Total turn over (Total Sales)',fontsize="25")
# The x axis is the consecutive month number, not a calendar date
axes[0].set_xlabel('Month number (date_block_num)',fontsize="20")
axes[0].set_ylabel('Turn over per month',fontsize="20")

sns.lineplot(x=df_tmp['date_block_num'],y=df_tmp['item_cnt_day'],ax=axes[1])
axes[1].set_title('Total units sold',fontsize="25")
axes[1].set_xlabel('Month number (date_block_num)',fontsize="20")
# This panel shows units, so it must not carry the turnover label
axes[1].set_ylabel('Units sold per month',fontsize="20")

plt.tight_layout()
del df_tmp
plt.show()

In [None]:
fig,axes = plt.subplots(1,2,figsize=(25,8))
# One line per year, calendar month on the x axis. The aggregation is named by string
# ("sum"), as in the other pivots of this notebook; passing np.sum is deprecated in recent pandas
df_tmp = df[['Year','Month','Sales_per_item']].pivot_table(index=['Month'],columns=['Year'],aggfunc={"Sales_per_item":"sum"})
axes[0].plot(df_tmp)
axes[0].set_title('Total turn over (Total Sales)')
# Columns are (value name, year) pairs; the year is used as the legend label
axes[0].legend(labels=[i[1] for i in df_tmp.columns])

df_tmp = df[['Year','Month','item_cnt_day']].pivot_table(index=['Month'],columns=['Year'],aggfunc={"item_cnt_day":"sum"})
axes[1].plot(df_tmp)
axes[1].set_title('Total no of units sold')
axes[1].legend(labels=[i[1] for i in df_tmp.columns])
plt.suptitle('Monthly Sales - Yearly',fontsize="28")
plt.show()

Observation:
* We can see that the number of products sold and the total turnover follow almost the same pattern over the months.
* We have maximum sales in the months of November and December.
* Since we are predicting sales for November 2015, we expect a rise in the number of units sold.

Question:
* Did all the shops sell items in all months from January 2013 to October 2015?
* Do some shops become inactive for a while and then become active again?

In [None]:
# Number of distinct shops with at least one sales record in each month (date_block_num)
df_tmp = df[['date_block_num','shop_id','item_cnt_day']]
df_tmp.groupby(by='date_block_num').aggregate({'shop_id':'nunique'}).reset_index()

Let us take a sample of shops and plot the number of total items sold on each month. This gives an idea about the status of the shop in that month

In [None]:
# Month x shop table of total units sold (NaN where a shop has no sales in that month)
df_tmp = df[['date_block_num','shop_id','item_cnt_day']]
dt = (
    df_tmp
    .pivot_table(index='date_block_num',columns='shop_id',values='item_cnt_day',aggfunc="sum")
    .reset_index(drop=True)
)
# Label shown in the table corner: the rows are months
dt.columns.name = 'Month'
dt

Observation:
- We have a total of 60 shops, but not all of them are active during all the months.
- For example, shop id 9 is active only in the 9th, 21st and 33rd months. Shop id 0 and shop id 1 are active only in the first two months. Shop id 52 is active in all the months (from the pivot table above).
- We can see that some of the shops are not active in certain months and become active again later.
- During feature engineering we can create a lagged average of shop sales as a feature.

# 5. Modeling

In [None]:
def remove_outliers(df, upper_q=0.95):
    """
    Drop rows with negative values and cap extreme values of the sales columns.

    For 'item_cnt_day' first and 'item_price' second: rows with a negative
    value are dropped, then values above the `upper_q` quantile of the
    remaining rows are replaced by that quantile (the rows are kept).

    Args:
        df: pandas dataframe with 'item_cnt_day' and 'item_price' columns;
            it is not modified
        upper_q: quantile in [0, 1] used as the upper cap
    Returns:
        A new dataframe with the filtered rows and capped columns
    """
    cleaned = df
    for column in ('item_cnt_day', 'item_price'):
        # .copy() makes the filtered frame independent, so the assignment below
        # never writes into a slice of the caller's dataframe
        cleaned = cleaned[cleaned[column] >= 0].copy()
        upper_quantile = cleaned[column].quantile(upper_q)
        cleaned[column] = np.where(cleaned[column] > upper_quantile, upper_quantile, cleaned[column])
    return cleaned

In [None]:
#load the merged data written during the analysis, and a fresh copy of the test file
df_train = pd.read_csv('merged_original.csv')
# Reuse the path constant instead of repeating the literal path
df_test = pd.read_csv(TEST_CSV)
print(df_train.shape,df_test.shape)

In [None]:
#removing outliers as per our analysis (negative rows dropped, high values capped)
df_train = remove_outliers(df_train)
df_train.shape

In [None]:
#clipping values as per kaggle discussion
# NOTE(review): this clips the *daily* counts, which remove_outliers() has already capped at
# their 95th percentile, so this is presumably a no-op. Confirm whether the monthly totals
# built in the next cell were meant to be clipped to [0, 20] instead.
df_train['item_cnt_day'] = df_train['item_cnt_day'].clip(0,20)

In [None]:
# creating train data: one row per (shop_id, item_id), one column per month
# (date_block_num) holding the total units sold; months without sales become 0
df = df_train[['date_block_num','shop_id','item_id','item_cnt_day']]
data = (
    df
    .pivot_table(index=['shop_id','item_id'],columns=['date_block_num'],values='item_cnt_day',aggfunc="sum",fill_value=0)
    .reset_index()
)
# Drop the leftover 'date_block_num' label of the column axis
data.columns.name = None
print(df.shape)
print(data.shape)
data.head(4)

In [None]:
# Generating test data: attach the monthly history of every test (shop_id, item_id) pair;
# pairs without any history get 0 for every month, then the id columns are dropped
test_data = (
    df_test
    .drop(columns=['ID'])
    .merge(data,on=['shop_id','item_id'],how="left")
    .fillna(0)
    .drop(columns=['shop_id','item_id'])
)
test_data.head()

## Train validation Split
We must be cautious while doing a train validation split. We have data from January 2013 to October 2015 and we need to predict the sales of November 2015, i.e. we have data for 34 months and we need to predict the sales of the 35th month.

- Here we will take data from the 1st month to the 32nd month as train data (and we will predict the 33rd month)
- We will take data from the 2nd month to the 33rd month as validation data (and we will predict the 34th month)
- Finally we will have data from the 3rd to the 34th month as test data (to predict the 35th month)

In [None]:
# Sliding windows over the monthly columns (labelled 0..33 by date_block_num)
# train -> features: months 0 to 31, target: month 32
X_train = data.drop(columns=['shop_id','item_id',32,33])
y_train = data[32]

# validation -> features: months 1 to 32, target: month 33
X_val = data.drop(columns=['shop_id','item_id',0,33])
y_val = data[33]

# test -> features: months 2 to 33 (the target is the unseen following month)
X_test = test_data.drop(columns=[0,1])

# The three windows cover different months and therefore carry different column labels.
# Estimators that validate feature names at predict time would reject X_val / X_test,
# so the columns are renamed by position: lag_1 is the month right before the target.
assert X_train.shape[1] == X_val.shape[1] == X_test.shape[1], 'feature windows must have the same width'
lag_names = [f'lag_{k}' for k in range(X_train.shape[1], 0, -1)]
for window in (X_train, X_val, X_test):
    window.columns = lag_names

print(X_train.shape,X_val.shape,X_test.shape)

## Checking Correlation Among Train Features

In [None]:
# Pairwise correlation between the monthly feature columns, drawn as an annotated heatmap
df_corr = X_train.corr()
fig, heatmap_ax = plt.subplots(figsize=(20,20))
sns.heatmap(df_corr,annot=True,ax=heatmap_ax)
plt.show()

Observation:
- We did not find any strongly correlated features

### Baseline models
### Random Forest
We now have enough information to move on to modeling. Let us create a baseline model to work with. We will start with a Random Forest model.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

Hyper parameter tuning and modeling

In [None]:
# from itertools import product
# # params
# max_depth = [2,5,10]
# n_estimators = [50,100,150,300]
# min_samples_split = [2,4]

# params =  [max_depth,n_estimators,min_samples_split]
# parameters = list(product(*params))
# min_rmse = float('inf')
# best_params = parameters[0]
# for p in parameters:
#     depth = p[0]
#     estimators = p[1]
#     min_sample_split = p[2]
    
#     print(f"Fitting params -> max_depth: {depth},n_estimators: {estimators} , min_samples_split:{min_sample_split}")
#     model = RandomForestRegressor(random_state=42,max_depth=depth,n_estimators=estimators,min_samples_split=min_sample_split)
#     model.fit(X_train,y_train)
#     y_train_pred = model.predict(X_train)
#     y_val_pred = model.predict(X_val)
    
#     train_rmse = mean_squared_error(y_train,y_train_pred,squared=False)
#     val_rmse = mean_squared_error(y_val,y_val_pred,squared=False)
    
#     print(f'Train rmse: {train_rmse}')
#     print(f'Val rmse: {val_rmse}')
    
#     if val_rmse < min_rmse:
#         min_rmse = val_rmse
#         best_params = p
        
#     print('--'*50)

    
# print(f'Found following best parameters: max_depth: {best_params[0]},\
# n_estimators: {best_params[1]} , min_samples_split:{best_params[2]} with validation loss {min_rmse}')

In [None]:
#model
# parameters found by grid search


model = RandomForestRegressor(random_state=42,max_depth=5,n_estimators=50,min_samples_split=4)
model.fit(X_train,y_train)
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# RMSE is the square root of the MSE. The `squared=False` shortcut of mean_squared_error
# is deprecated and removed in recent scikit-learn releases, so take the root explicitly
train_rmse = np.sqrt(mean_squared_error(y_train,y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val,y_val_pred))

print(f'Train rmse: {train_rmse}')
print(f'Val rmse: {val_rmse}')

## Xgboost Model

In [None]:
import xgboost as xgb

Hyperparameter tuning (Random Search)

In [None]:
# import xgboost as xgb
# from itertools import product
# import random

# #params
# booster = 'gbtree'
# silent = 0

# eta = [0.01,0.1,0.2,0.3]  #learning rate in gbm
# min_child_weight = [1,2,0.5]  #minimum sum of weights of all observations required in a child
# max_depth = [3,6,8]  #maximum depth of the tree
# gamma = [0,1]  #a split will occur only when reduction in loss function > gamma 
# subsample = [0.5,0.7,1] #fraction of rows to be sampled to make a decision tree
# colsample_bytree = [0.5,0.7,1] #fraction of columns to be sampled to make a decision tree
# # lambda_ = [0.4,0.8,1] #L2 regularization weights
# alpha = [0,1]  #L1 reg weights



# params =  [eta,min_child_weight,max_depth,gamma,subsample,colsample_bytree,alpha]
# parameters = list(product(*params))
# parameters = random.sample(parameters,100)
# len(parameters)


# min_rmse = float('inf')
# best_params = parameters[0]
# for p in parameters:
#     eta = p[0]
#     min_child_weight = p[1]
#     max_depth = p[2]
#     gamma = p[3]
#     subsample = p[4]
#     colsample_bytree = p[5]
#     alpha = p[6]

#     print('Random Search On Hyperparamters Xgboost')
#     print(f"Fitting params -> eta: {eta},\
# min_child_weight: {min_child_weight} , max_depth:{max_depth}, gamma: {gamma}, subsample : {subsample}, \
# col_sample_bytree: {colsample_bytree:},alpha: {alpha}")
#     model = xgb.XGBRegressor(random_state=42,booster = 'gbtree',verbosity=0,
#                              eta=eta,min_child_weight=min_child_weight,max_depth=max_depth,gamma=gamma,
#                              subsample=subsample,colsample_bytree=colsample_bytree,alpha=alpha)
    
#     model.fit(X_train,y_train)
#     y_train_pred = model.predict(X_train)
#     y_val_pred = model.predict(X_val)
    
#     train_rmse = mean_squared_error(y_train,y_train_pred,squared=False)
#     val_rmse = mean_squared_error(y_val,y_val_pred,squared=False)
    
#     print(f'Train rmse: {train_rmse}')
#     print(f'Val rmse: {val_rmse}')
    
#     if val_rmse < min_rmse:
#         min_rmse = val_rmse
#         best_params = p
        
#     print('--'*50)

    
# print(f'Found following best parameters: eta: {best_params[0]},\
# min_child_weight: {best_params[1]} , max_depth:{best_params[2]}, gamma: {best_params[3]}, subsample : {best_params[4]}, \
# col_sample_bytree: {best_params[5]},alpha: {best_params[6]} with validation loss {min_rmse}')

In [None]:
# parameters found by the random search
eta = 0.01
min_child_weight = 2
max_depth = 8
gamma = 0
subsample  = 1
colsample_bytree = 0.5
alpha = 1

model = xgb.XGBRegressor(random_state=42,booster = 'gbtree',verbosity=0,
                             eta=eta,min_child_weight=min_child_weight,max_depth=max_depth,gamma=gamma,
                             subsample=subsample,colsample_bytree=colsample_bytree,alpha=alpha)
model.fit(X_train,y_train)
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# RMSE is the square root of the MSE. The `squared=False` shortcut of mean_squared_error
# is deprecated and removed in recent scikit-learn releases, so take the root explicitly
train_rmse = np.sqrt(mean_squared_error(y_train,y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val,y_val_pred))

print(f'Train rmse: {train_rmse}')
print(f'Val rmse: {val_rmse}')

# Submission

In [None]:
# The competition scores against monthly targets clipped to the [0, 20] range, so a
# prediction outside that range can only add error; clip before writing the file
y_test_clipped = np.clip(y_test_pred, 0, 20)
submission = pd.DataFrame({'ID':df_test['ID'],'item_cnt_month':y_test_clipped})
submission.to_csv('submission.csv',index=False)
submission.head(10)