In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        file_path = os.path.join(root_dir, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install py7zr
import py7zr

In [None]:
import py7zr
from subprocess import check_output

# Extract every 7z archive found under the input directory into /kaggle/working.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        archive_path = os.path.join(dirname, filename)
        # Skip files that are not 7z archives instead of crashing on them.
        if not py7zr.is_7zfile(archive_path):
            continue
        # The context manager closes the archive even if extraction fails.
        with py7zr.SevenZipFile(archive_path, mode='r') as archive:
            archive.extractall(path="/kaggle/working")

# List the extracted files.
print(check_output(["ls", "../working"]).decode("utf8"))

In [None]:
# Load the extracted CSV files from the working directory.
train = pd.read_csv("../working/train.csv")
# `test` is used by the shape/info summaries and by the forecasting section,
# so it has to be loaded for the notebook to run top to bottom.
test = pd.read_csv("../working/test.csv")
#sample_sub = pd.read_csv("../working/sample_submission.csv")
stores = pd.read_csv("../working/stores.csv")
items = pd.read_csv("../working/items.csv")
# `transactions` is referenced by the shape/info summaries below.
transactions = pd.read_csv("../working/transactions.csv")
oil = pd.read_csv("../working/oil.csv")
holiday = pd.read_csv("../working/holidays_events.csv")

# Basic EDA

In [None]:
# Print the shape of every dataset. A frame that has not been loaded is
# reported as such instead of stopping the cell with a NameError.
for frame_name in ["train", "test", "stores", "items", "transactions", "oil", "holiday"]:
    frame = globals().get(frame_name)
    if frame is None:
        print("Shape of " + frame_name + ":", "not loaded")
    else:
        print("Shape of " + frame_name + ":", frame.shape)

In [None]:
# Show column dtypes and non-null counts for every dataset.
# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that would print a stray "None" after each summary).
for frame_name in ["train", "test", "stores", "items", "transactions", "oil", "holiday"]:
    frame = globals().get(frame_name)
    if frame is None:
        # Report frames that were not loaded instead of raising NameError.
        print(frame_name, "is not loaded", "\n")
        continue
    print(frame_name)
    frame.info()
    print()

## 1) Train Set

In [None]:
# Show the first rows of `train`.
train.head()

In [None]:
# Show the last rows of `train`.
train.tail()

In [None]:
# Show the first rows of `stores`.
stores.head()

In [None]:
# Parse the date column as datetime and report its memory footprint
# (bytes * 1e-6) before and after the conversion.
size_before = train["date"].memory_usage(deep=True) * 1e-6
print("size before:", size_before)

train["date"] = pd.to_datetime(train["date"])

size_after = train["date"].memory_usage(deep=True) * 1e-6
print("size after: ", size_after)

### 1) Filtering out stores where city = 'Daule','Quito','Santo Domingo'
### 2) Filtering out stores where family = 'Dairy','Bread/Bakery'
### NOTE: The filtering is done manually because a join (or any similar technique) on such a large file is not feasible under the CPU and RAM restrictions.

In [None]:
#The below code can be used with a better CPU and RAM access
#merged_df = train.merge(stores, on='store_nbr', how='left')

In [None]:
# Collect the store numbers located in the three cities of interest.
in_selected_city = stores['city'].isin(['Daule', 'Quito', 'Santo Domingo'])
store_number = stores.loc[in_selected_city, 'store_nbr'].tolist()
print("Stores which are present in these 3 citites:","\n",store_number)

In [None]:
# Collect the item numbers that belong to the two product families of interest.
in_selected_family = items['family'].isin(['BREAD/BAKERY', 'DAIRY'])
item_number = items.loc[in_selected_family, 'item_nbr'].tolist()

In [None]:
# Keep only the rows whose store and item are both in the selected lists.
store_selected = train['store_nbr'].isin(store_number)
item_selected = train['item_nbr'].isin(item_number)
train_subset = train[store_selected & item_selected]
print(train_subset.shape)

In [None]:
# Show the first rows of the filtered training data.
train_subset.head()

### Left join on "Train & Stores","Train & Items", "Train & Oil" and "Train & Holiday". 
### Note: Here we are using the train set for LEFT JOIN after applying given filters so that it will consume lesser amount of memory.

In [None]:
# Attach the store attributes to each training row (left join on store_nbr).
train_subset = train_subset.merge(stores, how='left', on='store_nbr')
train_subset.head()

In [None]:
# Attach the item attributes to each training row (left join on item_nbr).
train_subset = train_subset.merge(items, how='left', on='item_nbr')
train_subset.head()

In [None]:
# Left Join - Train & Oil
# `train['date']` was converted to datetime earlier while `oil['date']` is still
# the string read from the CSV; pandas refuses to merge a datetime key with an
# object key. Both keys are parsed here. `assign` works on copies, so `oil`
# itself is left unchanged for later cells.
train_subset = pd.merge(
    train_subset.assign(date=pd.to_datetime(train_subset['date'])),
    oil.assign(date=pd.to_datetime(oil['date'])),
    on='date',
    how='left',
)
train_subset.head()

### ASSUMPTION 1: Dates having type 'Additional','Bridge','Event',and 'Transfer' are considered to be a holiday. Hence there will be only two types of day categories available - a) Work Day and b) Holiday 

In [None]:
# Collapse all special day types into 'Holiday' (ASSUMPTION 1).
holiday['type'] = holiday['type'].replace(['Additional','Bridge','Event','Transfer'], 'Holiday')
# Rows flagged as transferred are relabelled as a Work Day.
mask = (holiday['transferred'] == True)
# Use .loc instead of chained indexing (`holiday['type'][mask] = ...`), which
# raises SettingWithCopyWarning and has no effect under copy-on-write.
holiday.loc[mask, 'type'] = 'Work Day'
print(holiday['type'].value_counts())

In [None]:
# Left Join - Train & Holiday
# Reduce the holiday table to exactly one row per date before joining: if a
# date appears more than once, a plain left join would duplicate the matching
# training rows. A date is a 'Work Day' only when every entry for it is one.
day_types = (
    holiday.assign(date=pd.to_datetime(holiday['date']),
                   is_work_day=holiday['type'].eq('Work Day'))
    .groupby('date')['is_work_day'].all()
    .map({True: 'Work Day', False: 'Holiday'})
    .rename('day_type')
    .reset_index()
)
# Parse the training key as well so both join keys are datetime.
train_subset = train_subset.assign(date=pd.to_datetime(train_subset['date']))
n_rows_before = len(train_subset)
train_subset = pd.merge(train_subset, day_types, on='date', how='left')
assert len(train_subset) == n_rows_before, "holiday join must not add rows"
# Only the date and the day type are joined, so the store `type` column keeps
# its name and only the oil column needs renaming.
train_subset = train_subset.rename(columns={"dcoilwtico": "oil_price"})
train_subset.head()

### Handling Missing Values in Train Set

In [None]:
# Count missing values per column, largest count first.
train_subset.isnull().sum().sort_values(ascending=False)

### ASSUMPTION 2: Dates which are not present in the holiday dataset are considered to be Work Day. 

In [None]:
# Dates without a holiday entry are treated as a Work Day (ASSUMPTION 2).
day_type_filled = train_subset['day_type'].fillna("Work Day")
train_subset['day_type'] = day_type_filled

In [None]:
# Replace missing values in oil_price.
# `fillna(method='ffill')` is deprecated; `.ffill()` is the direct equivalent.
# Forward fill carries the previous row's price forward. The trailing
# `.bfill()` covers rows at the very start that have no earlier value.
train_subset['oil_price'] = train_subset['oil_price'].ffill().bfill()

### ASSUMPTION 3: Considering the missing values in onpromotion field as "Not Mentioned". Hence there will be 3 categories available in this field - 'True', 'False', and 'Not Mentioned'.

In [None]:
# Missing promotion flags become their own category, "Not Mentioned" (ASSUMPTION 3).
onpromotion_filled = train_subset['onpromotion'].fillna("Not Mentioned")
train_subset['onpromotion'] = onpromotion_filled

### Removing the datasets which are not required anymore to clear some space.

In [None]:
#del oil
#del holiday
#del items
#del stores

### Adding additional fields (for EDA purpose) - Month & Year using Date column

In [None]:
# Parse the date once, then derive the month name and the four-digit year as strings.
parsed_dates = pd.to_datetime(train_subset["date"])
train_subset["date"] = parsed_dates
train_subset['Month'] = parsed_dates.dt.strftime('%B')
train_subset['Year'] = parsed_dates.dt.strftime('%Y')

In [None]:
# Show the first rows after adding the Month and Year columns.
train_subset.head()

## Detailed EDA

In [None]:
import altair as alt

### 1) Yearly Transactions

In [None]:
# Number of rows per year, sorted by year.
# groupby().size() is used instead of value_counts().reset_index().rename(...):
# the column names produced by value_counts changed in pandas 2.0, which makes
# the old rename produce no 'Year' column and the sort fail.
year_df = (
    train_subset.groupby('Year').size()
    .reset_index(name='count')
    .sort_values(by='Year')
)
print(year_df)

In [None]:
# Bar chart of the yearly row counts with the count printed above each bar.
yearly_base = alt.Chart(year_df).encode(
    x=alt.X('Year'),
    y=alt.Y('count'),
    tooltip=[alt.Tooltip('count:Q')],
)

bars = yearly_base.mark_bar(color="purple")

# Text layer: same encodings as the bars, shifted slightly upwards.
text = yearly_base.mark_text(
    align='center', baseline='middle', dy=-7, size=15
).encode(text='count')

yearly_chart = bars + text
yearly_chart.properties(width=400, height=400, title="Yearly Transactions")


#### Interpretation: The retail store has seen a constant annual growth rate of around 20% in all the years except for 2013-2014 where the growth was 40%.

### 2) Monthly Transactions over 4 years

In [None]:
# Number of rows per (Month, Year) pair, plus a combined "Month-Year" label.
month_df = train_subset.groupby(['Month','Year']).size().reset_index(name='count')
month_df = month_df.astype({'Year': 'category', 'Month': 'category'})
month_label = month_df['Month'].astype(str)
year_label = month_df['Year'].astype(str)
month_df['Month_Year'] = month_label.str.cat(year_label, sep="-")

In [None]:
# Month names would otherwise be sorted alphabetically on the axis; an explicit
# calendar order lets the chart be read as a time progression. The same sort
# is applied to both layers so they share one axis.
MONTH_ORDER = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

# Stacked horizontal bars: one bar per month, one coloured segment per year.
bars=alt.Chart(month_df).mark_bar().encode(
    x='count',
    y=alt.Y('Month', sort=MONTH_ORDER, title="Month"),
    color=alt.Color('Year',title='Year'),
    tooltip=[alt.Tooltip('Month_Year:N'),
             alt.Tooltip('Month:N'),
             alt.Tooltip('count:Q'),
            ]
).properties(
    width=550,
    height=400,
    title="Monthly Transactions over 4 years -  (Hover over each segment of Bar to understand distribution)")

# Text layer: the count of each (month, year) segment, stacked like the bars.
text = alt.Chart(month_df).mark_text(dx=-20, dy=3, color='white').encode(
    x=alt.X('count', stack='zero'),
    y=alt.Y('Month', sort=MONTH_ORDER, title="Month"),
    detail='Month_Year',
    text=alt.Text('count'))

bars+text


#### Interpretation: Every year, sales start from a low point and increase as we move into the second half of the year, which can be seen as a monthly seasonality. We can also see that month-on-month sales increase every year for all months, which is a good sign for the retail store.

### 3) How many stores are there in a city ?

In [None]:
# Count the distinct stores per city.
unique_store_city = train_subset[['store_nbr','city']].drop_duplicates()
city_store_df = unique_store_city.groupby('city').size().reset_index(name='count')


In [None]:
# Horizontal bar chart of the store count per city, longest bar first,
# with the count printed next to each bar.
city_base = alt.Chart(city_store_df).encode(
    x=alt.X('count'),
    y=alt.Y('city', sort='-x'),
    tooltip=[alt.Tooltip('count:Q')],
)

bars = city_base.mark_bar(color="darkorange")

text = city_base.mark_text(
    align='center', baseline='middle', dx=7, size=12
).encode(text='count')

city_chart = bars + text
city_chart.properties(width=600, height=200, title="Stores in each city")


#### Interpretation: Quito has the maximum number of stores while Santo Domingo and Daule seem to be  new markets for the retail store with only 3 and 1 stores respectively.  

### 4) Footfall Comparison in each store

In [None]:
# Number of rows per store, sorted by store number.
# groupby().size() is used instead of value_counts().reset_index().rename(...):
# the column names produced by value_counts changed in pandas 2.0, which makes
# the old rename produce no 'Store_No' column and the sort fail.
store_df = (
    train_subset.groupby('store_nbr').size()
    .rename_axis('Store_No')
    .reset_index(name='total')
    .sort_values(by='Store_No')
)

In [None]:
# Share of all rows contributed by each store: the per-store total is divided
# by the grand total inside the chart's transform pipeline.
store_share = (
    alt.Chart(store_df)
    .transform_joinaggregate(TotalTime='sum(total)')
    .transform_calculate(Percent_Of_Total_Transactions="datum.total / datum.TotalTime")
)

store_share.mark_bar(color="maroon").encode(
    x=alt.X('Percent_Of_Total_Transactions:Q', axis=alt.Axis(format='.0%')),
    y=alt.Y('Store_No:N'),
    tooltip=[alt.Tooltip('Store_No:N'),
             alt.Tooltip('Percent_Of_Total_Transactions:Q')],
).properties(
    height=500,
    width=300,
    title="Distribution of footfalls in each store-(Hover over each segment of Bar to understand distribution)",
)


#### Interpretation: Out of all 22 stores, Store 10, Store 16, Store 17, Store 20 and Store 21 are amongst the bottom 5 stores in terms of customers footfall.

### 5) Correlation between Oil Prices and Unit Sales

In [None]:
# Per date: mean oil price and total units sold.
daily = train_subset[['date','oil_price','unit_sales']].groupby('date')
oil_and_sales = daily.agg({'oil_price':'mean', 'unit_sales':'sum'})
oil_and_sales = oil_and_sales.rename(columns={'unit_sales':'total_sales'})

In [None]:
# Scatter plot of daily oil price against total units sold, on a sample of days.
# A fixed seed keeps the figure (and the interpretation written below it)
# reproducible. The sample size is capped at the number of available days so
# the cell cannot fail on a short date range.
oil_sales_sample = oil_and_sales.sample(n=min(500, len(oil_and_sales)), random_state=42)

alt.Chart(oil_sales_sample).mark_circle().encode(
    alt.X('oil_price',title="Oil Price"),
    alt.Y('total_sales',title="Total_Sales"),
    tooltip=[alt.Tooltip('oil_price'),
             alt.Tooltip('total_sales')]
).configure_mark(color='green').properties(width=600,height=400,title="Oil Price vs Total Sales")


#### Interpretation: When oil prices are in the range of 30-60, the total units sold per day do not vary much, but when the oil price rises beyond 60, the total units sold per day drop by around 10k to 15k. This suggests that the citizens of Ecuador are affected by oil prices and that their buying behaviour changes when oil prices increase beyond 60.

### 6) Product family demand comparison

In [None]:
# Keep only the columns needed for the product-family comparison.
family_columns = ['family','unit_sales']
family_df = train_subset.loc[:, family_columns]

In [None]:
# Box plot of unit sales per product family, drawn on a sample of rows.
# A fixed seed keeps the figure reproducible; the sample size is capped at the
# number of available rows so the cell cannot fail on a small subset.
domain = ['BREAD/BAKERY', 'DAIRY']
range_ = ['green','orange']
family_sample = family_df.sample(n=min(5000, len(family_df)), random_state=42)

alt.Chart(family_sample).mark_boxplot().encode(
    alt.X('family:O',sort=['BREAD/BAKERY','DAIRY']),
    alt.Y('unit_sales:Q',title="Unit Sales under each product family"),
    color=alt.Color('family', scale=alt.Scale(domain=domain, range=range_)),
    tooltip=[alt.Tooltip('family:O'),
             alt.Tooltip('unit_sales')]
).configure_mark().properties(width=600,height=400,title="Unit Sales vs Product Family (Five point summary)")


#### Interpretation: The units sold for both the product categories (i.e. Bread/bakery and Dairy) are almost similar and therefore their median, Quartile1 and Quartile 2 are approximately same. The only difference is in their Max value of Units sold and according to the visualised figure, we can observe that bread/bakery product has more units sold. We can also observe that the outliers for dairy products have more variation compared to bread/bakery product.

### 7) Citywise Sales of each product family

In [None]:
# Keep only the columns needed for the city / product-family comparison.
city_columns = ['city','family','unit_sales']
city_df = train_subset.loc[:, city_columns]

In [None]:
# Heat map of the average unit sales per (city, family).
# The chart is titled "Avg Unit Sales", so the mean is computed explicitly over
# all rows. Plotting sampled raw rows draws one rectangle per row on top of
# each other, so the visible colour is that of a single arbitrary row.
city_family_avg = (
    city_df.groupby(['city', 'family'], as_index=False)['unit_sales'].mean()
)

alt.Chart(city_family_avg).mark_rect().encode(
    x='city',
    y='family',
    color=alt.Color('unit_sales',
                    title='Avg unit sales',
                    scale=alt.Scale(type='log',scheme='reds')),
    tooltip=['city','family','unit_sales']
).properties(width=600,height=400,title="Citywise Avg Unit Sales for each product family - Hover over each block to see the numbers")


#### Interpretation: Based on the random sample taken from our dataset for this visualisation, we observe that people living in Quito consume the most bread/bakery products compared to the other two cities. In Santo Domingo, people prefer dairy products over bread/bakery products, and the same holds for people living in Daule.
#### Note: The results of this visualisation may vary in every attempt to run the code as I am picking up 5000 random samples.

### Forecasting

In [None]:
!pip install sklearn
import xgboost as xgb
import random
from sklearn.model_selection import train_test_split

In [None]:
# Only use this when you have removed all the files after EDA for space issues. 
# train_subset = pd.read_csv('./mycsvfile.csv')

### 1) Adding day and quarter fields using date field

In [None]:
#train_subset['date'] = pd.to_datetime(train_subset['date'],format = '%Y-%m-%d')
# Derive numeric calendar features from the datetime `date` column.
date_parts = train_subset['date'].dt
train_subset = train_subset.assign(
    day=date_parts.day,
    quarter=date_parts.quarter,
    month=date_parts.month,
    year=date_parts.year,
)

In [None]:
# Show the first rows after adding the day, quarter, month and year columns.
train_subset.head()

### 2) Dropping columns which are correlated and hence will not be used

In [None]:
# Remove the columns that are not used as model features.
unused_columns = ['city','state','perishable','type','cluster','class','date','Month','Year']
train_subset = train_subset.drop(columns=unused_columns)

In [None]:
# Show the first rows after dropping the unused columns.
train_subset.head()

In [None]:
# Encode the categorical columns as integers.
# An explicit mapping followed by astype(int) guarantees numeric columns for
# XGBoost. Chained `replace` calls rely on pandas implicitly downcasting an
# object column, which is deprecated and can leave the column as object dtype.
label_codes = {
    'onpromotion': {True: 1, False: 0, 'True': 1, 'False': 0, 'Not Mentioned': 2},
    'family': {'BREAD/BAKERY': 0, 'DAIRY': 1},
    'day_type': {'Holiday': 0, 'Work Day': 1},
}
for column, codes in label_codes.items():
    encoded = train_subset[column].map(codes)
    # Values without a mapping (e.g. codes from a previous run of this cell)
    # keep their original value. astype(int) then fails loudly on anything
    # that is not a valid integer code.
    train_subset[column] = encoded.fillna(train_subset[column]).astype(int)

In [None]:
# Save the prepared training data. The file gets a .csv extension and the row
# index is not written, so reading it back does not add an extra
# 'Unnamed: 0' column.
train_subset.to_csv('train_subset.csv', index=False)

In [None]:
#train_subset['onpromotion'] = train_subset['onpromotion'].astype(int)
# train1: feature frame (everything except the target).
train1 = train_subset.drop(['unit_sales'], axis = 1)
print(train1.head())

# train2: target frame. Selecting the target column directly is robust to
# extra columns; dropping a hard-coded list of feature names would leave any
# unlisted column inside the target.
train2 = train_subset[['unit_sales']]
print(train2.head())


In [None]:
# Split features and target in a single call so their rows are guaranteed to
# stay aligned, instead of relying on two separate calls sharing a seed.
Xg_train, Xg_valid, Yg_train, Yg_valid = train_test_split(
    train1, train2, test_size=0.20, random_state=10
)
features1 = list(train1.columns.values)
features2 = list(train2.columns.values)

In [None]:
# Show the column names of the feature frame and of the target frame.
print(features1)
print(features2)


In [None]:
# Train on log1p(unit_sales): the evaluation metric, the validation cell and
# the submission cell all apply np.expm1 to the model output, which is only
# correct if the labels were log1p-transformed. Values are clipped at 0 first
# because log1p is undefined for values <= -1.
dtrain = xgb.DMatrix(Xg_train[features1], np.log1p(Yg_train[features2].clip(lower=0)))
dvalid = xgb.DMatrix(Xg_valid[features1], np.log1p(Yg_valid[features2].clip(lower=0)))

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
def rmspe(y, yhat):
    """Return the Root Mean Squared Percentage Error between `y` and `yhat`.

    RMSPE = sqrt(mean((yhat / y - 1) ** 2)), computed over the observations
    whose true value is non-zero (the percentage error is undefined for y == 0).

    Parameters
    ----------
    y : array-like
        True values.
    yhat : array-like
        Predicted values, same length as `y`.

    Returns
    -------
    float
        The RMSPE, or 0.0 when no true value is non-zero.
    """
    y = np.asarray(y, dtype=float).ravel()
    yhat = np.asarray(yhat, dtype=float).ravel()
    nonzero = y != 0
    if not nonzero.any():
        return 0.0
    pct_error = yhat[nonzero] / y[nonzero] - 1
    return float(np.sqrt(np.mean(pct_error ** 2)))

In [None]:
import math
from sklearn.preprocessing import minmax_scale

In [None]:

def rmsle(y, y_pred):
    """Return the Root Mean Squared Logarithmic Error (RMSLE).

    RMSLE = sqrt(mean((log(y_pred + 1) - log(y + 1)) ** 2)).

    Parameters
    ----------
    y : array-like
        True values (must be > -1).
    y_pred : array-like
        Predicted values (must be > -1), same length as `y`.

    Returns
    -------
    float
        The RMSLE, or NaN for empty input.

    Raises
    ------
    AssertionError
        If `y` and `y_pred` differ in length.
    """
    assert len(y) == len(y_pred)
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y.size == 0:
        return float("nan")
    log_diff = np.log1p(y_pred) - np.log1p(y)
    return float(np.sqrt(np.mean(log_diff ** 2)))

In [None]:
def rmspe_xg(yhat, y):
    """Custom XGBoost evaluation metric: RMSPE on the original sales scale.

    Parameters
    ----------
    yhat : array-like
        Model predictions, passed in by XGBoost.
    y : object exposing ``get_label()``
        The evaluated data set; its labels are read with ``get_label()``.

    Returns
    -------
    tuple
        ``("rmspe", value)`` where ``value`` is what ``rmspe`` returns.
    """
    actual = np.expm1(y.get_label())
    # Both sides must be back-transformed; the back-transformed predictions
    # were previously computed but the raw `yhat` was passed on instead.
    predicted = np.expm1(yhat)
    return "rmspe", rmspe(actual, predicted)


In [None]:
# XGBoost training configuration.
params = {"objective": "reg:squarederror",  # replaces the deprecated alias "reg:linear"
          "booster" : "gbtree",
          "eta": 0.3,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "verbosity": 0,  # replaces the deprecated "silent": 1
          "seed": 1301
          }
num_boost_round = 15
# Data sets evaluated after every boosting round.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]


In [None]:
# Train the booster, evaluating on the watchlist with the custom metric and
# stopping early after 5 rounds without improvement.
gbm = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=5,
    feval=rmspe_xg,
    verbose_eval=True,
)


In [None]:
# Score the validation set and report the error on the original sales scale.
yhat = gbm.predict(xgb.DMatrix(Xg_valid[features1]))
error = rmspe(Yg_valid.unit_sales.values, np.expm1(yhat))
# The metric was computed but never shown; print it so the result is visible.
print("Validation RMSPE:", error)

## Prepare Test set for Forecasting
### Note: All the assumptions taken on Train Set are same for Test Set as well.

In [None]:
#test = pd.read_csv('test.csv')
#del test

In [None]:
# Keep only the test rows whose store and item are both in the selected lists.
test_store_selected = test['store_nbr'].isin(store_number)
test_item_selected = test['item_nbr'].isin(item_number)
test_subset = test[test_store_selected & test_item_selected]
print(test_subset.shape)

In [None]:
# Attach the store attributes to each test row (left join on store_nbr).
test_subset = test_subset.merge(stores, how='left', on='store_nbr')
test_subset.head()

In [None]:
# Attach the item attributes to each test row (left join on item_nbr).
test_subset = test_subset.merge(items, how='left', on='item_nbr')
test_subset.head()

In [None]:
# Attach the oil price of each date to the test rows (left join on date).
test_subset = test_subset.merge(oil, how='left', on='date')
test_subset.head()

In [None]:
# Left Join - Test & Holiday
# Reduce the holiday table to exactly one row per date before joining: if a
# date appears more than once, a plain left join would duplicate the matching
# test rows (and their ids in the submission). A date is a 'Work Day' only
# when every entry for it is one.
test_day_types = (
    holiday.assign(date=pd.to_datetime(holiday['date']),
                   is_work_day=holiday['type'].eq('Work Day'))
    .groupby('date')['is_work_day'].all()
    .map({True: 'Work Day', False: 'Holiday'})
    .rename('day_type')
    .reset_index()
)
# Parse the test key as well so both join keys are datetime.
test_subset = test_subset.assign(date=pd.to_datetime(test_subset['date']))
n_test_rows_before = len(test_subset)
test_subset = pd.merge(test_subset, test_day_types, on='date', how='left')
assert len(test_subset) == n_test_rows_before, "holiday join must not add rows"
# Only the date and the day type are joined, so the store `type` column keeps
# its name and only the oil column needs renaming.
test_subset = test_subset.rename(columns={"dcoilwtico": "oil_price"})
test_subset.head()

### Handling Missing Values in Test Set

In [None]:
# Count missing values per column in the test subset, largest count first.
test_subset.isnull().sum().sort_values(ascending=False)

In [None]:
# Replacing NA values in day_type column with Work Day
test_subset['day_type'] = test_subset['day_type'].fillna("Work Day")
# Replace missing values in oil_price.
# `fillna(method='ffill')` is deprecated; `.ffill()` is the direct equivalent.
# The trailing `.bfill()` covers rows at the very start that have no earlier
# value to carry forward.
test_subset['oil_price'] = test_subset['oil_price'].ffill().bfill()
# Creating a new category in onpromotion column, where NA values are replaced with "Not Mentioned"
test_subset['onpromotion'] = test_subset['onpromotion'].fillna("Not Mentioned")

In [None]:
# Parse the date once, then derive day / month / year as strings and the
# quarter as a number.
test_dates = pd.to_datetime(test_subset["date"])
test_subset["date"] = test_dates
test_subset['Day'] = test_dates.dt.strftime('%d')
test_subset['Month'] = test_dates.dt.strftime('%m')
test_subset['Year'] = test_dates.dt.strftime('%Y')
test_subset['quarter'] = test_dates.dt.quarter
test_subset.head()

### Keeping only those columns which are used in Train set

In [None]:
# Remove the columns that are not used as model features.
unused_test_columns = ['date','city','state','perishable','type','cluster','class']
test_subset = test_subset.drop(columns=unused_test_columns)
test_subset.head()

### Converting Categorical variables into Numerical variables.

In [None]:
# Encode the categorical columns with the same integer codes as the train set.
# The boolean False is mapped explicitly: replacing the *string* 'False' never
# matches a boolean value, so the old code only worked through a later cast.
test_label_codes = {
    'onpromotion': {True: 1, False: 0, 'True': 1, 'False': 0, 'Not Mentioned': 2},
    'family': {'BREAD/BAKERY': 0, 'DAIRY': 1},
    'day_type': {'Holiday': 0, 'Work Day': 1},
}
for column, codes in test_label_codes.items():
    encoded = test_subset[column].map(codes)
    # Values without a mapping (e.g. codes from a previous run of this cell)
    # keep their original value. astype(int) then fails loudly on anything
    # that is not a valid integer code.
    test_subset[column] = encoded.fillna(test_subset[column]).astype(int)

In [None]:
#test_subset['onpromotion'] = test_subset['onpromotion'].replace(0.0,'0')
#test_subset['onpromotion'] = test_subset['onpromotion'].replace(1.0,'1')
#test_subset['onpromotion'].value_counts()

In [None]:
# Cast the promotion flag to int and create integer day / month / year
# columns from their string counterparts.
test_subset = test_subset.astype({'onpromotion': int})
test_subset = test_subset.assign(
    day=test_subset['Day'].astype(int),
    month=test_subset['Month'].astype(int),
    year=test_subset['Year'].astype(int),
)
test_subset.dtypes

In [None]:
# Saving the file in the kernel
#test_subset.to_csv('test_subset.csv')

In [None]:
# Build the prediction matrix from the same feature columns used for training.
test_features = test_subset[features1]
test_dmatrix = xgb.DMatrix(test_features)

In [None]:
# Predict on the test matrix with the trained booster.
test_prediction = gbm.predict(data=test_dmatrix)
print("Predictions")

In [None]:
# Back-transform the predictions with expm1 and clip at zero: expm1 of a
# negative model output is negative, which is not a meaningful sales forecast.
predicted_sales = np.clip(np.expm1(test_prediction), 0, None)
result = pd.DataFrame({"id": test_subset["id"], 'unit_sales': predicted_sales})
result.to_csv("final_submission.csv", index=False)
print("Submitted the final output file in Kaggle's kernel")