# Importing Packages

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as mno

In [None]:
sns.set_theme(style="darkgrid")

# Dataset Analysis

## Reading CSV Dataset

In [None]:
df_sales = pd.read_csv("../input/competitive-data-science-predict-future-sales/sales_train.csv") 

In [None]:
df_items = pd.read_csv("../input/competitive-data-science-predict-future-sales/items.csv")

In [None]:
df_item_cat = pd.read_csv("../input/competitive-data-science-predict-future-sales/item_categories.csv")

In [None]:
df_shops = pd.read_csv("../input/competitive-data-science-predict-future-sales/shops.csv")

In [None]:
df_item_cat_merge = pd.merge(df_items, df_item_cat, on='item_category_id', how='left')

In [None]:
df_train = pd.merge(df_item_cat_merge, df_sales, on='item_id', how='left')

In [None]:
df_train.head()

In [None]:
df_train.shape

In [None]:
df_train.columns

In [None]:
df_train.info()

In [None]:
df_train.isnull().sum()

In [None]:
df_test = pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv")

In [None]:
df_test.head()

In [None]:
df_test.shape

In [None]:
numerical_cols = df_train.columns[df_train.dtypes != object]

In [None]:
numerical_cols

In [None]:
categorical_cols = df_train.columns[df_train.dtypes == object]

In [None]:
categorical_cols

## Univariate Analysis

In [None]:
def create_hist_for_num_feature(feature):
    """Draw a histogram of one ``df_train`` column on a fresh 10x8 inch figure.

    Parameters
    ----------
    feature : str
        Column name handed to seaborn as the x variable; it is also used
        verbatim as the x-axis label.
    """
    plt.figure(figsize=(10, 8))
    hist_axes = sns.histplot(data=df_train, x=feature)

    # Both axis labels share one font size; tick labels are kept smaller.
    for set_label, text in ((hist_axes.set_xlabel, feature), (hist_axes.set_ylabel, "Count")):
        set_label(text, fontsize=15)
    hist_axes.tick_params(labelsize=10)

In [None]:
create_hist_for_num_feature('shop_id')

- shop_id 31 appears the highest number of times in the dataset.



In [None]:
create_hist_for_num_feature('item_id')

- We can see that a few items appear far more often than the others (we can deduce this from the few peaks present in the graph).

In [None]:
np.set_printoptions(suppress=True) 
df_train.groupby('item_cnt_day')['item_id'].agg({'count'}).reset_index().sort_values('count', ascending=False)

## Bivariate Analysis

### Which category has more items?

In [None]:
categorized_items = df_train.groupby(['item_category_id', 'item_category_name'])['item_id'].agg({'count'}).reset_index().sort_values('count')

In [None]:
categorized_items

In [None]:
plt.figure(figsize=(30, 10))
sns.barplot(x='item_category_id', y='count', data=categorized_items)

- Item Category with id 51 has lowest item count (only 1).
- Item Category with id 40 has highest item count (564652)

### Which category has costliest item?

We will find costliest item of each category.

In [None]:
df_train.groupby(['item_category_id', 'item_id'])['item_price'].agg({'max'}).reset_index().sort_values('max', ascending = False)

- Item Category id 75 has costliest item which is priced at 307980 with id as 6066. 

### Which category has more costly items?

In [None]:
categorized_price = df_train.groupby(['item_category_id'])['item_price'].agg({'sum'}).reset_index().round(2)

In [None]:
categorized_price

In [None]:
plt.figure(figsize=(30, 10))
sns.barplot(x='item_category_id', y='sum', data=categorized_price)

- Item category id 19 has the highest total item price.

### Which item is costing more?

In [None]:
df_train[['item_id','item_name', 'item_price']].sort_values('item_price', ascending=False)

### Which shops sell more unique items?

In [None]:
shop_items = df_train[['shop_id', 'item_id']].groupby(['shop_id', 'item_id']).count().reset_index()

In [None]:
shop_items

In [None]:
shop_unique_item_count = shop_items.groupby('shop_id').agg(unique_item_id_count = ('item_id', 'count')).reset_index().sort_values('unique_item_id_count', ascending=False)

In [None]:
shop_unique_item_count

In [None]:
plt.figure(figsize=(30, 10))
sns.barplot(x='shop_id', y='unique_item_id_count', data=shop_unique_item_count)

- Shop Id 25 sells highest number of unique items.
- Shop Id 36 sells least number of unique items.

### Which shop has more unique category of items?

In [None]:
shop_category = df_train.groupby(['shop_id', 'item_category_id']).count().reset_index()

In [None]:
shop_category

In [None]:
shop_unique_item_category = shop_category.groupby('shop_id').agg(unique_category_cnt = ('item_category_id', 'count')).reset_index().sort_values('unique_category_cnt', ascending=False)

In [None]:
shop_unique_item_category

In [None]:
plt.figure(figsize=(30, 10))
sns.barplot(x='shop_id', y='unique_category_cnt', data=shop_unique_item_category)

- Shop Id 25 has highest number of unique category.
- Shop Id 55 has least number of unique category.

### Which shops earn more?

In [None]:
# Total of `item_price` per shop, sorted from highest to lowest.
# NOTE(review): this adds up the price on each sales row without multiplying by
# `item_cnt_day`, so it is a sum of listed prices rather than price x quantity —
# confirm that this is the intended definition of "earning".
shop_earning = df_train[['shop_id', 'item_price']].groupby(['shop_id']).agg(total_item_price = ('item_price', 'sum')).reset_index().sort_values('total_item_price', ascending=False)

In [None]:
shop_earning

In [None]:
plt.figure(figsize=(30, 10))
sns.barplot(x='shop_id', y='total_item_price', data=shop_earning)

- shop id 31 has highest earning.
- Shop Id 36 has lowest earning.

### Which shops sell more items per day?

In [None]:
shop_item_cnt = df_train[['shop_id', 'item_cnt_day']].groupby(['shop_id']).agg(item_cnt = ('item_cnt_day', 'sum')).reset_index().sort_values('item_cnt', ascending=False)

In [None]:
shop_item_cnt

In [None]:
plt.figure(figsize=(30, 10))
sns.barplot(x='shop_id', y='item_cnt', data=shop_item_cnt)

- Shop Id 31 sells highest number of item per day.
- Shop Id 36 sells lowest number of item per day.

### Which item is sold more per day?

In [None]:
df_train[['item_id', 'item_name', 'item_cnt_day']].sort_values('item_cnt_day', ascending=False)

Item Id 11373 was sold highest number of times per day.

### Which category of item is sold more per day?

In [None]:
category_cnt = df_train[['item_category_id', 'item_cnt_day']].groupby(['item_category_id']).agg(cnt_sum = ('item_cnt_day', 'sum')).reset_index().sort_values('cnt_sum', ascending=False)

In [None]:
category_cnt

In [None]:
plt.figure(figsize=(30, 10))
sns.barplot(x='item_category_id', y= 'cnt_sum', data= category_cnt)

- Item Category Id 40 was sold more per day.
- Item Category Id 51 was sold least per day.

In [None]:
# Parse day-first, matching how create_new_df reads this same column; without
# dayfirst=True pandas reads ambiguous values month-first and swaps day and month.
pd.to_datetime(df_train['date'], dayfirst=True).dt.month

In [None]:
# We will extract the year, month and day from date column to perform monthly and yearly analysis.
def extract_date_details_into_columns(source=None):
    """Return a copy of a sales frame with ``year``, ``month`` and ``day`` columns added.

    Parameters
    ----------
    source : pandas.DataFrame, optional
        Frame containing a ``date`` column. When omitted, the notebook-level
        ``df_train`` is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    if source is None:
        source = df_train

    # Parse the column once, day-first, to agree with create_new_df. With the
    # pandas default (month-first) a value such as "02.01.2013" would be read as
    # February 1st, silently swapping day and month.
    parsed = pd.to_datetime(source['date'], dayfirst=True)

    return source.assign(
        year=parsed.dt.year,
        month=parsed.dt.month,
        day=parsed.dt.day,
    )

In [None]:
df_train_new = extract_date_details_into_columns()

### Which month has more sale?

In [None]:
monthly_sale_df =  df_train_new[['month', 'item_cnt_day']].groupby('month').agg( sale_per_month = ('item_cnt_day', 'sum') ).reset_index().sort_values('sale_per_month', ascending=False)

In [None]:
monthly_sale_df

In [None]:
plt.figure(figsize=(20, 10))
sns.barplot(x='month', y='sale_per_month', data=monthly_sale_df)

- December has highest sale.

### Which month generates more revenue?

In [None]:
# Total of `item_price` per calendar month, sorted from highest to lowest.
# NOTE(review): prices are summed without multiplying by `item_cnt_day`, so this is
# a sum of listed prices rather than price x quantity — confirm this is the intended
# definition of "revenue".
monthly_rev_df =  df_train_new[['month', 'item_price']].groupby('month').agg( rev_per_month = ('item_price', 'sum') ).reset_index().sort_values('rev_per_month', ascending=False)

In [None]:
monthly_rev_df

In [None]:
plt.figure(figsize=(20, 10))
sns.barplot(x='month', y='rev_per_month', data=monthly_rev_df)

### Which year has more sale?

In [None]:
yearly_sale_df =  df_train_new[['year', 'item_cnt_day']].groupby('year').agg( sale_per_year = ('item_cnt_day', 'sum') ).reset_index().sort_values('sale_per_year', ascending=False)

In [None]:
yearly_sale_df

In [None]:
plt.figure(figsize=(10, 5))
sns.barplot(x='year', y='sale_per_year', data=yearly_sale_df)

Year 2013 has highest sale.

### Which year is more profitable?

In [None]:
# Total of `item_price` per year, sorted from highest to lowest.
# NOTE(review): prices are summed without multiplying by `item_cnt_day`, so this is
# a sum of listed prices rather than price x quantity — confirm this is the intended
# definition of "profitable".
yearly_rev_df =  df_train_new[['year', 'item_price']].groupby('year').agg( rev_per_year = ('item_price', 'sum') ).reset_index().sort_values('rev_per_year', ascending=False)

In [None]:
yearly_rev_df

In [None]:
plt.figure(figsize=(10, 5))
sns.barplot(x='year', y='rev_per_year', data=yearly_rev_df)

Year 2014 was more profitable.

# Feature Engineering

In [None]:
def create_new_df(source=None):
    """Build one row per (shop_id, item_id) with a monthly sales-count column per month.

    Steps:
      1. Drop rows containing any null and derive a ``YYYY-MM`` label from ``date``
         (parsed day-first).
      2. Sum ``item_cnt_day`` per (month, shop, item, price), because the target is
         a monthly count rather than a daily one.
      3. Pivot months into columns, filling months without sales with 0.
      4. Collapse rows that differ only in price: month counts are summed and
         ``item_price`` is averaged over the distinct prices seen.

    Parameters
    ----------
    source : pandas.DataFrame, optional
        Merged sales frame. When omitted, the notebook-level ``df_train`` is used
        (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns ``shop_id``, ``item_id``, ``item_price`` followed by one column per
        ``YYYY-MM`` label present in the data.
    """
    if source is None:
        source = df_train

    key_cols = ['shop_id', 'item_id']
    sales = source.dropna()
    year_month = pd.to_datetime(sales['date'], dayfirst=True).dt.strftime('%Y-%m')

    # - date and date_block_num are dropped because year_month now carries the
    #   time information needed for the monthly aggregation.
    # - item_name, item_category_name and item_category_id are dropped because
    #   they are not used as features here.
    # - shop_id and item_id are kept because test.csv is joined on them.
    sales = sales.assign(year_month=year_month).drop(
        columns=['date', 'date_block_num', 'item_name', 'item_category_name', 'item_category_id']
    )

    monthly = (
        sales
        .groupby(['year_month', 'shop_id', 'item_id', 'item_price'])
        .agg(item_cnt_month=('item_cnt_day', 'sum'))
        .reset_index()
    )
    wide = monthly.pivot_table(
        index=key_cols + ['item_price'],
        columns='year_month',
        values='item_cnt_month',
        fill_value=0,
    ).reset_index()

    # Derive the month columns from the pivot instead of hard-coding them, so the
    # function keeps working when the data covers a different range of months.
    month_cols = [col for col in wide.columns if col not in key_cols + ['item_price']]
    agg_spec = {'item_price': 'mean', **{month: 'sum' for month in month_cols}}

    return wide.groupby(key_cols).agg(agg_spec).reset_index()

In [None]:
df = create_new_df()

In [None]:
df.columns

In [None]:
df

In [None]:
df.isnull().sum()

In [None]:
df.head(100)

In [None]:
df_test_new = pd.merge(df_test, df, on=['shop_id', 'item_id'], how='left')

In [None]:
df_test_new.head(100)

In [None]:
# The model is trained on (item_price, every month except the last) -> last month.
# To forecast the month AFTER the last one, the test features must be the most
# recent window: drop the oldest month (not the newest) and relabel the remaining
# month columns one step back so their names line up with the training features.
# Assumes df's month columns are in chronological order.
lag_months = [col for col in df.columns if col not in ('shop_id', 'item_id', 'item_price')]
df_test_new = (
    df_test_new
    .drop(columns=['ID', 'shop_id', 'item_id', lag_months[0]])
    .rename(columns=dict(zip(lag_months[1:], lag_months[:-1])))
)

In [None]:
df_test_new.fillna(0, inplace = True)

In [None]:
train_dataset = df.copy()
Y = train_dataset['2015-10']
X = train_dataset.drop(['2015-10', 'shop_id', 'item_id'], axis = 1)

In [None]:
X.shape

In [None]:
X_test = df_test_new.copy()

In [None]:
X_test.shape

# Model Selection

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

skf = KFold(n_splits=5, shuffle=True, random_state=1)

In [None]:
def apply_k_fold_validation(model, x=None, y=None):
    """Cross-validate ``model`` with the notebook-level ``skf`` splitter and print mean metrics.

    For every fold the model is re-fitted on the training split, then MSE and
    ``model.score`` (R^2 for scikit-learn style regressors) are measured on both
    the training split and the held-out split. The means over all folds are printed.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``. It is fitted in place,
        so after the call it holds the fit from the final fold.
    x : pandas.DataFrame, optional
        Feature matrix; defaults to the notebook-level ``X`` at call time.
    y : pandas.Series, optional
        Target vector; defaults to the notebook-level ``Y`` at call time.
    """
    # Resolve the defaults when called, not when defined, so re-running the cell
    # that builds X / Y is picked up without re-defining this function.
    x = X if x is None else x
    y = Y if y is None else y

    train_mse, test_mse = [], []
    train_score, test_score = [], []

    for train_index, test_index in skf.split(x, y):
        x_train_fold, x_test_fold = x.iloc[train_index], x.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

        model.fit(x_train_fold, y_train_fold)

        train_mse.append(mean_squared_error(y_train_fold, model.predict(x_train_fold)))
        test_mse.append(mean_squared_error(y_test_fold, model.predict(x_test_fold)))
        train_score.append(model.score(x_train_fold, y_train_fold))
        # Score on the held-out split as well: a score on data the model was
        # fitted on says nothing about how it generalises.
        test_score.append(model.score(x_test_fold, y_test_fold))

    print("\n")
    print('Train Set MSE:', np.mean(train_mse))

    print("\n")
    print('Test Set MSE:', np.mean(test_mse))

    print("\n")
    print('Train Set Score:', np.mean(train_score))

    print("\n")
    print('Test Set Score:', np.mean(test_score))
    

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression()
apply_k_fold_validation(linear_model)

## Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Seed the forest (same seed as the KFold splitter and the XGBoost model) so the
# cross-validation numbers are reproducible from one run to the next.
random_forest_model = RandomForestRegressor(n_estimators=10, max_depth=16, random_state=1)
apply_k_fold_validation(random_forest_model)

## XGBRegressor

In [None]:
from xgboost import XGBRegressor
xgb_model = XGBRegressor(max_depth=16,n_estimators=200,seed=1)
apply_k_fold_validation(xgb_model)

Out of Linear Regression, Random Forest and XGBRegressor, XGBRegressor performed best, with a score of 96% and a low MSE on the train set. Therefore we are going to use XGBRegressor for predicting sales in November 2015.

# Submission

In [None]:
# Predict with XGBRegressor, the model chosen in the Model Selection section.
# The k-fold helper leaves the model fitted on the last fold's training split only,
# so refit on the full training set before predicting.
xgb_model.fit(X, Y)
prediction = xgb_model.predict(X_test)

In [None]:
# Take ID from test.csv itself instead of the positional index, so the submission
# stays correct even if the test IDs are not exactly 0..n-1.
# Assumes the predictions are in the same row order as df_test (the left merge on
# df_test preserves it) — verify if the merge keys are ever non-unique.
df_submission = pd.DataFrame({
    'ID': df_test['ID'].to_numpy(),
    'item_cnt_month': prediction,
})

In [None]:
df_submission.rename(columns={'index':'ID'}, inplace=True)

In [None]:
df_submission.to_csv('submission.csv', index=False)