In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Sales Demand Forecasting

### Business Problem
Forecasting sales is one of the most fundamental problems that most retail chains face.

### Use-case Definition
Retail Chain A has used traditional forecasting methods to estimate projected sales for each
item across stores, but this method is not accurate. Therefore, they need to improve forecasting through advanced analytics and machine learning techniques.

### Data Description
CategoryCode: The items are classified into 4 different categories; this column refers to its code.

ItemCode: An identifier for each item

DateID: The date for the respective sales value

DailySales: Sales value in quantity units for the respective item on the given date.

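Week: Label of the week (`w1`–`w4` in the validation and test files) that the weekly sales value refers to.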

WeeklySales: Sales value in quantity units for the respective item for the given week.


# Import the data

In [2]:
# Read the raw competition files: tr_sales holds daily sales (DailySales),
# val_sales holds weekly sales (WeeklySales) for the validation weeks.
tr_sales=pd.read_csv('../input/data-storm-30/train_data.csv')
val_sales=pd.read_csv('../input/data-storm-30/validation_data.csv')

### Validation Data

In [3]:
val_sales.head()  # Preview the first five rows of the validation data

In [4]:
# Show the distinct week labels and the (rows, columns) shape of the validation data
val_sales['Week'].unique(),val_sales.shape

In [5]:
# Convert the week labels (w1..w4) to calendar dates and keep only the modelling columns.
# The replaced values are assigned back to the column: calling replace(..., inplace=True)
# on `val_sales['Week']` is chained assignment, which pandas does not guarantee will
# update `val_sales` (with Copy-on-Write enabled it never does).
val_sales['Week'] = val_sales['Week'].replace(to_replace = ['w1', 'w2','w3','w4'],value =['2/20/2022','2/27/2022','3/6/2022','3/13/2022'])
val_sales['Date'] = pd.to_datetime(val_sales['Week'])
val_sales = val_sales[['CategoryCode','ItemCode','Date','WeeklySales']]
val_sales.head()

In [6]:
# Derive the calendar features (month, day of month) from the week date.
# `assign` returns a new frame instead of writing columns into `val_sales`, which is a
# column subset of an earlier frame; this avoids SettingWithCopyWarning.
val_sales = val_sales.assign(
    month=val_sales['Date'].dt.month,
    day=val_sales['Date'].dt.day,
)
val_sales.head()

In [7]:
#Dummies = pd.get_dummies(val_sales.CategoryCode)
#val_sales_Merged=pd.concat([val_sales.drop('CategoryCode',axis='columns'),Dummies.drop('category_4',axis='columns')],axis='columns')
#val_sales_Merged

In [36]:
# Validation split: feature matrix (item code + calendar features) and weekly-sales target
# as NumPy arrays. The CategoryCode dummy columns are not part of the feature set.
val_features = val_sales[['ItemCode', 'month', 'day']]
val_Xtes = val_features.to_numpy()
val_ytes = val_sales['WeeklySales'].to_numpy()

### Train Data

In [37]:
tr_sales.head()  # Preview the first five rows of the raw training data

In [38]:
tr_sales.isnull().sum()  # Number of missing values in each column

In [39]:
tr_sales.info()  # Column dtypes, non-null counts and memory usage

In [40]:
# Parse DateID into real timestamps so the daily rows can be resampled by week later on.
sale_dates = pd.to_datetime(tr_sales['DateID'])
tr_sales['Date'] = sale_dates

# Working frame restricted to the columns used by the following cells.
trr_sales = tr_sales[['CategoryCode', 'ItemCode', 'Date', 'DailySales']]
trr_sales.head()

In [41]:
# Category/item columns of the training data.
# NOTE(review): `t` is not referenced again in the visible notebook - confirm it is still needed.
t = trr_sales[['CategoryCode','ItemCode']]

In [42]:
# Keep only the item, date and daily-sales columns ahead of the weekly aggregation.
# CategoryCode is dropped here; it is not converted to a number.
trr_sales = trr_sales.loc[:, ['ItemCode', 'Date', 'DailySales']]
trr_sales.head()

In [43]:
# Aggregate the daily sales into weekly totals per item ('W-MON': weekly bins anchored on Monday).
#
# DailySales is selected before resampling so that only the sales are summed and ItemCode
# is carried through unchanged as the group key. Summing the whole group frame, as before,
# also aggregated the ItemCode grouping column on pandas versions that pass it to the
# resampler (and newer versions deprecate/omit it), so the ItemCode feature used for
# training no longer matched the raw ItemCode used for validation and test.
#
# NOTE(review): the validation/test week labels are mapped to 2/20/2022, 2/27/2022, ...,
# which are Sundays, whereas 'W-MON' anchors the weekly bins on Mondays - confirm the
# intended week boundary so the `day` feature is comparable between train and validation.
tr_weekly_data = (
    trr_sales
    .set_index('Date')
    .groupby('ItemCode')['DailySales']
    .resample('W-MON')
    .sum()
    .reset_index()
)
tr_weekly_data.head()

In [44]:
# After aggregation the weekly totals still carry the DailySales name;
# expose them as WeeklySales, the column name used by the validation data.
weekly_columns = ['Date', 'ItemCode', 'WeeklySales']
tr_weekly_data['WeeklySales'] = tr_weekly_data['DailySales']
train_weekly_data = tr_weekly_data[weekly_columns]
train_weekly_data.head()

In [45]:
# Calendar features (month, day of month) taken from the week date.
# `assign` builds a new frame rather than writing into `train_weekly_data`, which was
# created as a column subset of `tr_weekly_data`; this avoids SettingWithCopyWarning.
train_weekly_data = train_weekly_data.assign(
    month=train_weekly_data['Date'].dt.month,
    day=train_weekly_data['Date'].dt.day,
)

In [49]:
# Training split: the same three features as the validation split, weekly sales as target.
# The CategoryCode dummy columns are not part of the feature set.
train_features = train_weekly_data[['ItemCode', 'month', 'day']]
tra_Xtr = train_features.to_numpy()
tra_ytr = train_weekly_data['WeeklySales'].to_numpy()
tra_Xtr.shape, tra_ytr.shape

In [50]:
train_weekly_data['WeeklySales'].describe()  # Summary statistics of the weekly sales target

In [101]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [102]:
import pickle

# Persist the fitted decision tree `Dt`.
# This cell sits above the cell that trains `Dt`, so on a fresh kernel (Restart & Run All)
# the name does not exist yet. Previously that raised NameError *after* open(..., 'wb') had
# already truncated any existing pickle file. Check first, and only touch the file when
# there is a model to write.
if 'Dt' in globals():
    with open('weeklysalespred_model.pickle','wb') as f:
        pickle.dump(Dt,f)
else:
    print("Model `Dt` has not been trained yet - skipping save. Train it first, then run this cell again.")

In [53]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,Y):
    """Compare several regressors using a small hyper-parameter grid search.

    Each candidate (linear regression, lasso, decision tree) is tuned with
    ``GridSearchCV`` on five shuffled 80/20 splits of the supplied data, and the
    best cross-validated score and parameter set of every candidate is collected.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``GridSearchCV.fit``.
    Y : array-like
        Target values handed to ``GridSearchCV.fit``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate with the columns ``model``, ``best_score``
        (mean cross-validated score of the estimator's default scorer) and
        ``best_params``.
    """
    # One seed for the CV splitter and every stochastic estimator so repeated runs
    # return the same table.
    seed = 0
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                # `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2
                # (passing it now makes the search fail); tune the intercept instead.
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            # random_state only matters for selection='random'
            'model': Lasso(random_state=seed),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(random_state=seed),
            'params': {
                # 'mse' was renamed to 'squared_error' in scikit-learn 1.0 and the old
                # spelling was removed in 1.2.
                'criterion' : ['squared_error','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores=[]
    # NOTE(review): ShuffleSplit mixes weeks at random; for a forecasting task a
    # time-ordered split may give a more honest estimate - confirm this is intended.
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=seed)
    for algorithm_name, config in algos.items():
        grid_search =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        grid_search.fit(X,Y)
        scores.append({
            'model': algorithm_name,
            'best_score': grid_search.best_score_,
            'best_params': grid_search.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(tra_Xtr, tra_ytr)

In [103]:
import pickle
from sklearn.tree import DecisionTreeRegressor

# Fixed seed so the fitted tree, and every number derived from it, is the same on re-run.
Dt = DecisionTreeRegressor(random_state=0)
Dt.fit(tra_Xtr, tra_ytr)

# Save the model right after fitting, so a top-to-bottom run always writes the model
# that produced the predictions below.
with open('weeklysalespred_model.pickle', 'wb') as f:
    pickle.dump(Dt, f)

# Forecasts for the validation weeks.
DT_pred = Dt.predict(val_Xtes)

# R^2 on the training data itself: an optimistic check of fit, not of forecast quality.
Dt.score(tra_Xtr, tra_ytr)

In [111]:
# Score the validation forecasts with the mean absolute percentage error (MAPE).
# NOTE(review): the evaluation metric is described as "Total MAPE"; this scikit-learn
# function averages the per-row percentage errors - confirm that it matches.
from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(y_true=val_ytes, y_pred=DT_pred)

In [112]:
# Load the test set for which weekly sales have to be predicted
test_sales=pd.read_csv('../input/data-storm-30/test_data.csv')
test_sales.head()

In [113]:
# Convert the week labels (w1..w4) to dates and derive the same ItemCode/month/day
# features that the model was trained on.
# The replaced values are assigned back to the column: calling replace(..., inplace=True)
# on `test_sales['Week']` is chained assignment, which pandas does not guarantee will
# update `test_sales` (with Copy-on-Write enabled it never does).
test_sales['Week'] = test_sales['Week'].replace(to_replace = ['w1', 'w2','w3','w4'],value =['2/20/2022','2/27/2022','3/6/2022','3/13/2022'])
test_sales['Date'] = pd.to_datetime(test_sales['Week'])
test_sales['month']=test_sales['Date'].dt.month
test_sales['day']=test_sales['Date'].dt.day
test_sales = test_sales[['ItemCode','month','day']]
test_sales.head()

In [114]:
# Re-read the raw test file for its identifier columns; `test_sales` was reduced
# to the model features in the previous cell.
test = pd.read_csv('../input/data-storm-30/test_data.csv')[['CategoryCode', 'ItemCode', 'Week']]

In [115]:
# Predict weekly sales for the test rows.
# The model was fitted on a plain NumPy array, so pass an array here as well; predicting on
# the DataFrame makes scikit-learn warn that X has feature names the model was not fitted with.
Dt_pred = Dt.predict(test_sales.values)

# Give the predictions the same index as `test` so the column-wise concat lines the rows
# up explicitly instead of relying on both frames happening to share a default index.
Dt_pr = pd.DataFrame(Dt_pred, columns=['PredictedSales'], index=test.index)

PredictedSales = pd.concat([test,Dt_pr],axis=1)
PredictedSales.head()

In [108]:
# index=False: the frame only carries a default integer index, which would otherwise be
# written to the file as an extra unnamed first column.
PredictedSales.to_csv('PredictedSales.csv', index=False)