## 01: Imports  & Data Load: 

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, explained_variance_score
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# Load every competition CSV from the Kaggle input directory.
PATH = "/kaggle/input/competitive-data-science-predict-future-sales/"


def _read_competition_csv(file_name):
    """Return the CSV named ``file_name`` located under PATH as a DataFrame."""
    return pd.read_csv(PATH + file_name)


items_df = _read_competition_csv("items.csv")
item_categories_df = _read_competition_csv("item_categories.csv")
shops_df = _read_competition_csv("shops.csv")
train_df = _read_competition_csv("sales_train.csv")
test_df = _read_competition_csv("test.csv")

## 02. First Look: 

Make a copy of data:

In [None]:
eda_df = train_df.copy(deep=True)

Take a look at the first few rows: 

In [None]:
train_df.head()

In [None]:
items_df.head()

In [None]:
item_categories_df.head()

In [None]:
shops_df.head()

In [None]:
test_df.head()

In [None]:
# Denormalise the training sales: attach item, category and shop attributes.
super_set = (
    train_df
    .merge(items_df, on='item_id')
    .merge(item_categories_df, on='item_category_id')
    .merge(shops_df, on='shop_id')
)
super_set

In [None]:
# Same enrichment for the test rows: item -> category -> shop attributes.
super_set_test = (
    test_df
    .merge(items_df, on='item_id')
    .merge(item_categories_df, on='item_category_id')
    .merge(shops_df, on='shop_id')
)
super_set_test

-----------------------------------------------------------

Five Number Summary: 

In [None]:
train_df.describe()

__Comments:__

i. `item_cnt_day` & `item_price` have extreme values - probably outliers. 

ii. `item_price` has negative value which doesn't make sense -- probably incorrect data.

iii. `item_cnt_day` has negative values, which suggests returned items - these entries will be removed. 

Negative `Item_price` :

In [None]:
eda_df[eda_df['item_price'] <= 0 ]

__Comments__:

i. Only one such product (item price < 0) - remove it. 

In [None]:
# Keep only the rows whose price is strictly positive.
has_positive_price = eda_df['item_price'] > 0
eda_df = eda_df[has_positive_price]

Extreme `Item Price`: 

In [None]:
# Box plot (left) and histogram (right) of item prices.
fig, ax = plt.subplots(ncols=2, figsize=(20, 5))
box_ax, hist_ax = ax
sns.boxplot(x='item_price', data=eda_df, ax=box_ax)
box_ax.set_title('Item_price - Boxplot')
hist_ax.hist(eda_df['item_price'])
hist_ax.set_title('Item_price - Histogram')
plt.show()

Extreme Values `Item_cnt_day`: 

In [None]:
# Box plot (left) and histogram (right) of the daily item counts.
fig, ax = plt.subplots(ncols=2, figsize=(20, 5))
box_ax, hist_ax = ax
sns.boxplot(x='item_cnt_day', data=eda_df, ax=box_ax)
box_ax.set_title('item_cnt_day - Boxplot')
hist_ax.hist(eda_df['item_cnt_day'])
hist_ax.set_title('item_cnt_day - Histogram')
plt.show()

Remove `return item` (`item_cnt_day` < 0 )  

In [None]:
eda_df[eda_df['item_cnt_day'] < 0].shape

In [None]:
eda_df[eda_df['item_cnt_day'] == 0].shape

In [None]:
# Drop returns and zero-count rows: keep strictly positive daily counts only.
has_positive_count = eda_df['item_cnt_day'] > 0
eda_df = eda_df[has_positive_count]

In [None]:
eda_df.shape

Cap extreme values at the 99th percentile: 

In [None]:
# Cap each column at its own 99th percentile so that a handful of extreme
# values no longer dominate the distribution.
columns = ['item_cnt_day','item_price']
for col in columns:
    # 99th percentile of the column (the 0.99 quantile).
    percentile = eda_df[col].quantile(0.99)
    print(col,percentile)
    # Values above the threshold are replaced by the threshold itself.
    eda_df[col] = eda_df[col].clip(upper=percentile)

Checking Now.

In [None]:
eda_df.describe()

In [None]:
# Re-draw the daily count distribution after the capping step.
fig, ax = plt.subplots(ncols=2, figsize=(20, 5))
box_ax, hist_ax = ax
sns.boxplot(x='item_cnt_day', data=eda_df, ax=box_ax)
box_ax.set_title('item_cnt_day - Boxplot')
hist_ax.hist(eda_df['item_cnt_day'])
hist_ax.set_title('item_cnt_day - Histogram')
plt.show()

In [None]:
# Re-draw the price distribution after the capping step.
fig, ax = plt.subplots(ncols=2, figsize=(20, 5))
box_ax, hist_ax = ax
sns.boxplot(x='item_price', data=eda_df, ax=box_ax)
box_ax.set_title('item_price - Boxplot')
hist_ax.hist(eda_df['item_price'])
hist_ax.set_title('item_price - Histogram')
plt.show()

Datatypes: 

In [None]:
train_df.info()

Convert Datatypes: 

In [None]:
# The competition's sales_train.csv stores dates day-first as dd.mm.yyyy
# (e.g. "02.01.2013" is 2 January). Parse with an explicit format so day and
# month are never swapped, which would corrupt the month-level aggregation.
# NOTE(review): format taken from the dataset description - confirm against the CSV.
eda_df['date'] = pd.to_datetime(eda_df['date'], format='%d.%m.%Y')

In [None]:
eda_df['date'].head()

In [None]:
# Derive calendar month and year columns from the parsed sale date.
sale_date = eda_df['date']
eda_df['Month'] = sale_date.dt.month
eda_df['Year'] = sale_date.dt.year

Missing Values: 

In [None]:
train_df.isnull().sum()

Basic Statistics: 

i. No of shops: 

In [None]:
# Number of distinct shops in each split.
n_train_shops = train_df['shop_id'].unique().size
n_test_shops = test_df['shop_id'].unique().size
print('Total Shops - Training: ', n_train_shops)
print('Total Shops - testing: ', n_test_shops)

ii. Mismatch of shops between train & test dataset: 

In [None]:
# Test rows whose shop never occurs in the training data.
known_shops = train_df['shop_id'].unique()
test_df[~test_df['shop_id'].isin(known_shops)]

__comments__:

- Every shop in the test data also appears in the training data, so there are no surprises as far as shops are concerned.

iii. No of Products: 

In [None]:
# Number of distinct products in each split.
n_train_items = train_df['item_id'].unique().size
n_test_items = test_df['item_id'].unique().size
print('Total Products - Training: ', n_train_items)
print('Total Products - Testing:', n_test_items)

iv. Mismatch of items between test & train data: 

In [None]:
# How many distinct test items never occur in the training data.
known_items = train_df['item_id'].unique()
unseen_items = test_df.loc[~test_df['item_id'].isin(known_items), 'item_id']
len(unseen_items.unique())

__Comments:__

- 363 products not part of training data but appear in test data. 

v. No of Item Categories: 

In [None]:
# Number of distinct item categories reachable from each split.
n_train_categories = super_set['item_category_id'].unique().size
n_test_categories = super_set_test['item_category_id'].unique().size
print('Total item categories - Training: ', n_train_categories)
print('Total item categories - Testing: ', n_test_categories)

vi. Mismatch of items category between test & train data: 

In [None]:
train_df

In [None]:
# How many distinct test categories never occur in the training data.
known_categories = super_set['item_category_id'].unique()
is_unseen = ~super_set_test['item_category_id'].isin(known_categories)
len(super_set_test.loc[is_unseen, 'item_category_id'].unique())

__Comments:__

- All item categories in the test data also appear in the training data - no surprises.  

vii. Does the price of each product vary by shop id?

Take few most frequent products: 

In [None]:
# The five item ids with the most rows in the training data.
item_row_counts = train_df['item_id'].value_counts()
frq_prod = item_row_counts.index[:5].tolist()
frq_prod

Sample of Shops:

In [None]:
# Draw 10 rows at random and keep their distinct shop ids. The seed is fixed
# so the sampled shops, and every chart built from them, are reproducible.
shops = train_df['shop_id'].sample(10, random_state=42).unique().tolist()
shops

In [None]:
# Restrict to the sampled shops and the most frequent products.
in_sampled_shop = eda_df['shop_id'].isin(shops)
is_frequent_item = eda_df['item_id'].isin(frq_prod)
sample_data = eda_df[in_sampled_shop & is_frequent_item]

In [None]:
# Summary statistics of the price for every (item, shop) pair in the sample.
price_stat_names = ['sum', 'min', 'max', 'mean', 'median', 'std']
price_by_item_shop = sample_data.groupby(by=['item_id', 'shop_id'])['item_price']
item_price_stats = price_by_item_shop.agg(price_stat_names).reset_index()
item_price_stats.head()

In [None]:
metrics = ['mean', 'median','sum', 'min', 'max', 'std']

In [None]:
# One bar chart per price metric on a 2x3 grid, filled row by row.
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(20,10))
for panel, metric in zip(ax.flat, metrics):
    sns.barplot(x='item_id', y=metric, hue='shop_id', data=item_price_stats, ax=panel)
    panel.set_title('Same product on different shops --'+metric)
plt.show()

__Comments__:

i. `item_price` for same `item_id` vary based on `shop_id`. 

ii. In fact, even within a single shop the price of the same product varies - maybe the price varies by month. Let's check that. 

iii. The maximum price suggests that the same product was sold for anywhere between roughly 3,000 and 6,000 - a considerable variation in price.

Lets convert daily sales into monthly sales:

In [None]:
# Calendar month and year of each sale, used as keys for the monthly roll-up.
sale_dates = pd.to_datetime(eda_df['date'])
eda_df['month'] = sale_dates.dt.month
eda_df['year'] = sale_dates.dt.year

In [None]:
# Roll daily rows up to one row per (month, year, shop, item):
# mean price and total units sold.
monthly_keys = ['month', 'year', 'shop_id', 'item_id']
monthly_aggregations = {'item_price': 'mean', 'item_cnt_day': 'sum'}
agg_train_df = eda_df.groupby(by=monthly_keys).agg(monthly_aggregations).reset_index()

In [None]:
agg_train_df

In [None]:
agg_train_df.describe()

i. Distribution of item price: 

In [None]:
# Distribution of the monthly mean item price. Both panels read from the
# monthly aggregate so the histogram matches the boxplot beside it.
fig, ax = plt.subplots(ncols=2,figsize=(20,5))
sns.boxplot(x='item_price', data=agg_train_df,ax=ax[0])
ax[1].hist(agg_train_df['item_price'])
ax[0].set_title('item_price - Boxplot')
ax[1].set_title('item_price - Histogram')
plt.show()

ii. Distribution of items sold on monthly basis:

In [None]:
# Distribution of units sold per month. 'item_cnt_day' holds the monthly sum
# in agg_train_df; both panels read from that aggregate so they agree.
fig, ax = plt.subplots(ncols=2,figsize=(20,5))
sns.boxplot(x='item_cnt_day', data=agg_train_df,ax=ax[0])
ax[1].hist(agg_train_df['item_cnt_day'])
ax[0].set_title('item_cnt_day - Boxplot')
ax[1].set_title('item_cnt_day - Histogram')
plt.show()

iii. Variation in price of products: 

In [None]:
# Standard deviation of the monthly price per item (0 where it is undefined).
price_std_per_item = agg_train_df.groupby(by=['item_id']).agg({'item_price':'std'}).fillna(0)
price_std_per_item.plot(figsize=(10,5), title='Variation in price')
plt.show()

iii. Variation in sales of products: 

In [None]:
# Standard deviation of the monthly units sold per item (0 where it is undefined).
sales_std_per_item = agg_train_df.groupby(by=['item_id']).agg({'item_cnt_day':'std'}).fillna(0)
sales_std_per_item.plot(figsize=(10,5), title='Variation in sales')
plt.show()

In [None]:
temp = agg_train_df.groupby(by=['item_id']).agg({'item_cnt_day':'std','item_price':'std'}).fillna(0)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale both std columns with a MinMaxScaler, fitting and transforming in one call.
scaler = MinMaxScaler()
temp_scaled = scaler.fit_transform(temp)

In [None]:
pd.DataFrame(temp_scaled).corr()

__Learning__: 

- Correlation is independent of scale so transformation before correlation doesn't help.

__Comments__: 
    
- Variation in price is not impacting significantly to sales of the product. 

iv. Adding shops: 

In [None]:
temp = agg_train_df.groupby(by=['item_id','shop_id']).agg({'item_cnt_day':'std','item_price':'std'}).fillna(0)

In [None]:
temp

In [None]:
temp.corr()

__Comments__: 

- Shops have varying prices for various products which in result impact sales. Slightly improved correlation when added shop id in to the data. 

v. Adding timeline: 

In [None]:
temp = agg_train_df.groupby(by=['item_id','shop_id','month']).agg({'item_cnt_day':'std','item_price':'std'}).fillna(0)

In [None]:
temp.corr()

Yearly Trend: 

In [None]:
# Total units sold per year.
agg = agg_train_df.groupby(by=['year'])['item_cnt_day'].sum().reset_index()
fig, ax = plt.subplots(figsize=(10,5))
sns.barplot(x='year', y='item_cnt_day', data=agg, ax=ax)
ax.set_title('Yearly Trend')
plt.show()

Monthly Trend: 

In [None]:
# Total units sold per calendar month, pooled over all years.
agg = agg_train_df.groupby(by=['month'])['item_cnt_day'].sum().reset_index()
fig, ax = plt.subplots(figsize=(10,5))
sns.barplot(x='month', y='item_cnt_day', data=agg, ax=ax)
ax.set_title('month Trend')
plt.show()

Year-Monthly Trend: 

In [None]:
# Work on a copy so the monthly aggregate keeps its original columns.
temp = agg_train_df.copy(deep=True)
# Build a "<year>-<month>" label column-wise instead of with a row-by-row apply.
temp['year_month'] = temp['year'].astype(str) + '-' + temp['month'].astype(str)

In [None]:
# Units sold per year-month label, shown in chronological order.
agg = temp.groupby(by=['year_month'])['item_cnt_day'].sum().reset_index()
# Split the "<year>-<month>" label back into integers so the sort is numeric.
label_parts = agg['year_month'].str.split('-', expand=True).astype(int)
agg['year'] = label_parts[0]
agg['month'] = label_parts[1]
agg = agg.sort_values(by=['year','month']).reset_index(drop=True)

fig, ax = plt.subplots(figsize=(20,5))
sns.barplot(x='year_month', y='item_cnt_day', data=agg, ax=ax)
ax.set_title('Year-Month Trend')
plt.xticks(rotation=90)
plt.show()

Mean Price of Products: 

In [None]:
# Average monthly item price per year.
agg = agg_train_df.groupby(by=['year'])['item_price'].mean().reset_index()
fig, ax = plt.subplots(figsize=(10,5))
sns.barplot(x='year', y='item_price', data=agg)
# The chart shows yearly averages, so the title says so.
plt.title('Yearly Average Item Price')
plt.ylabel('Average Price of Item')
plt.xticks(rotation=90)
plt.show()

In [None]:
agg_train_df.groupby(by=['year']).agg({'item_cnt_day':'sum', 'item_price':'mean'}).corr()

__Comments__:

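- The average price of products seems to have a linear relation with total sales. Note that this correlation is computed from only one data point per year, so it should be treated as indicative at best. 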

------------------

## 03. Deep Dive: 

#### 01. Shop Analysis: 

i. Shops Sales Trend: 

In [None]:
# Total units sold per shop, best-selling shop first.
agg_sales = (
    eda_df.groupby(by=['shop_id'])['item_cnt_day'].sum()
    .reset_index()
    .sort_values(by=['item_cnt_day'], ascending=False)
)
fig, ax = plt.subplots(figsize=(20,5))
# `order` must list the x-axis categories (shop ids) in display order. The
# frame's row labels only coincide with shop ids when the ids are exactly 0..N-1.
sns.barplot(x='shop_id', y='item_cnt_day', data=agg_sales,
            order=agg_sales['shop_id'].tolist(), ax=ax)
ax.set_title('Sales of Shops')
plt.show()

ii. Having more products translate to more sales? 

In [None]:
# Number of distinct items sold by each shop, largest assortment first.
# A single reset_index() is enough; a second one would only add a
# meaningless 'index' column.
agg_items = (
    eda_df.groupby(by=['shop_id'])['item_id'].nunique()
    .reset_index()
    .rename(columns={"item_id": 'Total Items'})
    .sort_values(by=['Total Items'], ascending=False)
)

In [None]:
# Combo chart: shop sales on the left y-axis against the number of distinct
# items on the right y-axis, both drawn as lines over shop_id.
fig, ax1 = plt.subplots(figsize=(20,6))
sales_color = 'tab:green'
items_color = 'tab:red'

# Sales line on the primary axis. The axes is passed explicitly so the plot
# does not depend on which axes happens to be current.
ax1.set_title('Total Items Vs Sales of Shop', fontsize=16)
ax1.set_xlabel('shop_id', fontsize=16)
ax1.set_ylabel('Sales', fontsize=16)
sns.lineplot(x='shop_id', y='item_cnt_day', data=agg_sales, color=sales_color, ax=ax1)
ax1.tick_params(axis='y')
ax1.legend(['Sales'], loc='upper left')

# Item-count line on a twin axis that shares the same x-axis.
ax2 = ax1.twinx()
ax2.set_ylabel('Total Items', fontsize=16)
sns.lineplot(x='shop_id', y='Total Items', data=agg_items, color=items_color, ax=ax2)
ax2.tick_params(axis='y', color=items_color)
# Opposite corner from the first legend so the two do not overlap.
ax2.legend(['Total Items'], loc='upper right')
plt.show()

__Comments__: 

- Seems like having more items translate to more sales. 

iii. How variation in price impact sales of product? 

In [None]:
agg_train_df.groupby(by=['shop_id']).agg({'item_cnt_day':'sum','item_price':'std'}).fillna(0).corr()

__Comments__:

- Sales are negatively impacted by variation in price, but the impact is quite minimal. 

iv. Having a lower average value of product on specific shop can really increase sales?

In [None]:
agg_train_df.groupby(by=['shop_id']).agg({'item_cnt_day':'sum','item_price':'mean'}).fillna(0).corr()

__Comments__:

- Sales have inverse relation with average value of product on a specific shop - if average product price low it could attract more people hence more sales. 

v. Having a variety of product on each shop can really contribute to more sales? 

In [None]:
# Pair each shop's mean monthly units sold with its distinct-item count.
mean_sales_per_shop = agg_train_df.groupby(by=['shop_id']).agg({'item_cnt_day':'mean'}).reset_index()
shop_sales_and_items = mean_sales_per_shop.merge(agg_items, on='shop_id')
temp = shop_sales_and_items[['item_cnt_day','Total Items']]

In [None]:
temp.corr()

__Comments__:

- Seems like having more items can actually increase sales. 

#### 02. Item Category Analysis: 

In [None]:
super_set['item_category_name'].value_counts()

In [None]:
def spliter(x):
    """Return the second ' - '-separated segment of a category name.

    Returns an empty string when ``x`` is not a string (for example a
    missing value) or contains no ' - ' separator. Unlike a bare
    ``except``, unrelated errors are not silently swallowed.
    """
    if not isinstance(x, str):
        return ''
    segments = x.split(' - ')
    return segments[1] if len(segments) > 1 else ''

In [None]:
super_set['item_short_category'] = super_set['item_category_name'].apply(spliter)

Mapping Item Category to our monthly sales dataset: 

In [None]:
mapping_categories = super_set[['item_id','item_short_category']]

In [None]:
mapping_categories = mapping_categories.drop_duplicates()

In [None]:
# Attach the short category name to every monthly row via item_id.
agg_train_df = agg_train_df.merge(mapping_categories, on='item_id')

i. Whether there is any relation between item category & its sales? 

In [None]:
agg_train_df.head(2)

In [None]:
fig, ax = plt.subplots(figsize=(10,15))
sns.boxplot(y='item_short_category', x='item_cnt_day', data=agg_train_df)

In [None]:
from scipy.stats import f_oneway

In [None]:
# Restrict to the five short categories with the most monthly rows.
category_row_counts = agg_train_df['item_short_category'].value_counts()
sample_categories = category_row_counts.index[:5].tolist()
in_sampled_category = agg_train_df['item_short_category'].isin(sample_categories)
sample = agg_train_df[in_sampled_category]

In [None]:
# One list of monthly sales per sampled category, in sample_categories order.
cache = [
    sample.loc[sample['item_short_category'] == category, 'item_cnt_day'].tolist()
    for category in sample_categories
]

In [None]:
# One-way ANOVA across every sampled category. Unpacking the list keeps the
# call correct whatever number of groups was collected.
f_oneway(*cache)

__Comments__: 

- Took 5 categories as a sample to run a one-way ANOVA test. The result suggests that the category of an item is related to its sales, but ANOVA does not tell us the direction of that relationship. 

In [None]:
agg_train_df

## 04. Data Prep For Model: 

i. Price variance of specific product on specific shop: 

In [None]:
# Standard deviation of the monthly price for every (shop, item) pair;
# pairs where it is undefined become 0.
prod_shop_variance = agg_train_df.groupby(by=['shop_id','item_id']).agg({'item_price':'std'}).fillna(0).reset_index()
# "<shop_id>-<item_id>" join key, built column-wise rather than row by row.
prod_shop_variance['Key'] = (
    prod_shop_variance['shop_id'].astype(str) + '-' + prod_shop_variance['item_id'].astype(str)
)

In [None]:
# Keep only the join key and the renamed spread column.
renamed_variance = prod_shop_variance.rename(columns={'item_price':'price_variance'})
prod_shop_variance = renamed_variance.loc[:, ['Key','price_variance']]
prod_shop_variance.head()

ii. Average price of specific product on specific shop:

In [None]:
# Mean monthly price for every (shop, item) pair.
prod_shop_mean = agg_train_df.groupby(by=['shop_id','item_id']).agg({'item_price':'mean'}).fillna(0).reset_index()
# "<shop_id>-<item_id>" join key, built column-wise rather than row by row.
prod_shop_mean['Key'] = (
    prod_shop_mean['shop_id'].astype(str) + '-' + prod_shop_mean['item_id'].astype(str)
)

In [None]:
prod_shop_mean.head(2)

In [None]:
# Keep only the join key and the renamed mean-price column.
renamed_mean = prod_shop_mean.rename(columns={'item_price':'price_mean'})
prod_shop_mean = renamed_mean.loc[:, ['Key','price_mean']]
prod_shop_mean.head()

iii. Total Items per shop: 

In [None]:
# Number of distinct items sold by each shop.
total_item_shop = (
    agg_train_df.groupby(by=['shop_id'])['item_id'].nunique()
    .reset_index()
    .rename(columns={"item_id": 'Total Items'})
)

In [None]:
total_item_shop.head(2)

iv. mapping data: 

In [None]:
agg_train_df_copy = agg_train_df.copy(deep=True)

In [None]:
agg_train_df.head(2)

In [None]:
# "<shop_id>-<item_id>" join key matching the one on the lookup tables,
# built column-wise rather than with a row-by-row apply.
agg_train_df['Key'] = agg_train_df['shop_id'].astype(str) + '-' + agg_train_df['item_id'].astype(str)

In [None]:
# Attach the engineered features one lookup table at a time, in this order.
feature_lookups = (
    (prod_shop_variance, 'Key'),
    (prod_shop_mean, 'Key'),
    (total_item_shop, 'shop_id'),
)
for lookup_table, join_column in feature_lookups:
    agg_train_df = agg_train_df.merge(lookup_table, on=join_column)


In [None]:
agg_train_df

In [None]:
# Drop the join key and the short category name from the frame, in place.
agg_train_df.drop(columns=['Key', 'item_short_category'], inplace=True)

v. Average Price - Yearly basis: 

In [None]:
temp = agg_train_df.groupby(by=['year'])['item_price'].mean()

In [None]:
# Look up each row's year in the per-year mean price series.
agg_train_df['yearly_avg_price'] = agg_train_df['year'].map(temp)

vi. Clipping data into discrete boundaries: 

In [None]:
le = LabelEncoder()

In [None]:
# Replace each listed column with the integer codes LabelEncoder assigns
# to its values.
# NOTE(review): LabelEncoder gives every distinct value its own code, so this
# does not bin the continuous columns; scikit-learn documents it for target
# labels. Confirm whether real binning (e.g. quantile bins) was intended.
encoded_columns = ['price_variance', 'price_mean', 'item_price', 'Total Items']
for feature in encoded_columns:
    agg_train_df[feature] = le.fit_transform(agg_train_df[feature])

In [None]:
agg_train_df

In [None]:
# Store the year as a string instead of a number.
agg_train_df['year'] = agg_train_df['year'].astype(str)

In [None]:
data_for_model = pd.get_dummies(agg_train_df, drop_first=True)

In [None]:
data_for_model

In [None]:
# Separate the target (units sold per month) from the feature matrix.
target_column = 'item_cnt_day'
y = data_for_model[target_column]
X = data_for_model.drop(columns=[target_column])

In [None]:
# Hold out a test set with a fixed seed so the split, and every metric
# computed from it, is reproducible between runs.
# NOTE(review): the rows are monthly sales, so a random split mixes past and
# future months - confirm whether a time-based split is more appropriate.
train_X, test_X, train_Y, test_Y = train_test_split(X, y, random_state=42)

In [None]:
# scaler = StandardScaler()
# train_X = scaler.fit_transform(train_X)
# test_X = scaler.transform(test_X)

## 05. Model Training & Evaluation: 

In [None]:
train_X

In [None]:
# Candidate regressors and the labels used for reporting and for model_cache;
# the two lists are parallel.
models = [LinearRegression, RandomForestRegressor, XGBRegressor]
models_name = ['LinearRegression', 'RandomForestRegressor', 'XGBRegressor']
model_cache = {}

for name, model in zip(models_name, models):

    # 01. Train the model with its default hyper-parameters and keep it.
    ml_model = model()
    ml_model.fit(train_X, train_Y)
    model_cache[name] = ml_model
    # 02. Predict on the hold-out set.
    predictions = ml_model.predict(test_X)
    # 03. Evaluate. The label is printed directly; slicing str(model) yields
    # fragments of the class repr rather than the model name.
    print('--------------------*---------------------------')
    print('Model: ', name)
    print('Mean Absolute Error: ',mean_absolute_error(test_Y, predictions))
    print('Explained Variance: ',explained_variance_score(test_Y, predictions))

## To be continued.