## Importing libraries

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

## Reading data

In [None]:
# Load the evaluation sales file and reshape it so that every series id is a
# column and every remaining (non-descriptor) field is a row.
os.chdir('/kaggle/input/wallmart-sales/')
evaluation_df = pd.read_csv('sales_train_evaluation.csv')

# Sort by id so the transposed columns come out in lexicographic id order.
# reset_index(drop=True) discards the old index directly instead of
# materialising it as an 'index' column and dropping it again.
evaluation_df = evaluation_df.sort_values(by=['id']).reset_index(drop=True)

# `drop(labels, 1)` passed `axis` positionally, which pandas 2.0 no longer
# accepts; naming the columns explicitly works on old and new versions.
eval_T_df = (
    evaluation_df
    .set_index('id')
    .drop(columns=['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])
    .T
)

# Replace the transposed index (the former column labels) with 0..n-1.
eval_T_df = eval_T_df.reset_index(drop=True)

df2 = eval_T_df.copy()
# Swap the last 10 characters of every column name for "validation".
# NOTE(review): presumably this turns an "...evaluation" suffix into
# "...validation" so the names match the price keys built later - confirm.
df2 = df2.rename(lambda x: x[:-10] + "validation", axis='columns')
df2.head()

In [None]:
# Switch to the second input directory and load the calendar, the validation
# sales and the sell prices, in that order.
os.chdir('/kaggle/input/wallmart/')
cal, sales, sell_prices = (
    pd.read_csv(csv_name)
    for csv_name in ('calendar.csv', 'sales_train_validation.csv', 'sell_prices.csv')
)

### Calculating quantities

In [None]:
# Parse the calendar dates, relabel the `d` column as `id`, and keep the
# first 1941 calendar rows.
cal = (
    cal
    .assign(date=pd.to_datetime(cal['date']))
    .rename(columns={'d': 'id'})
)
cal_1941 = cal.iloc[:1941]
cal_1941.head()

In [None]:
# Put the calendar columns and the per-series sales columns side by side,
# matched on the row index.
cal_sales = pd.concat((cal_1941, df2), axis='columns')
cal_sales.head()

In [None]:
# Preview the last five rows of the combined frame.
cal_sales.tail(n=5)

### Calculating prices

In [None]:
# Build the per-series key "<item_id>_<store_id>_validation" for every price
# row.  NOTE: the column is called `state_id` although it holds this key; the
# name is kept because other cells look prices up through it.
# Parentheses replace the backslash continuations: the original expression
# ended in a dangling "\", so any code placed on the following line would
# have been silently joined onto it.
sell_prices['state_id'] = (
    sell_prices['item_id'].map(str)
    + '_'
    + sell_prices['store_id'].map(str)
    + '_validation'
)

sell_prices.head()

In [None]:
# Show the price rows of one series for weeks after 11613.
sell_prices.loc[
    sell_prices['state_id'].eq('HOBBIES_1_001_CA_1_validation')
    & sell_prices['wm_yr_wk'].gt(11613)
]

In [None]:
# Build a (rows of cal_sales) x (series columns) matrix of sell prices so it
# can be multiplied element-wise with the sales quantities.
#
# The previous version stacked one 7-row block per week with np.vstack inside
# the loop (re-copying the growing array every iteration), hard-coded the
# number of series (30490) and trimmed the surplus rows with a fixed [1:-5].
# It also relied on the sorted price keys happening to match the column order
# of `cal_sales`.  Here the alignment is done by label instead.

# As elsewhere in this notebook, columns from position 14 onwards are treated
# as the series columns.
item_columns = cal_sales.columns[14:]
main_set = set(item_columns)                   # Set of all items

# One row per week, one column per series.  Reindexing on `item_columns`
# puts prices in exactly the column order of the sales data and leaves NaN
# for every series that has no price in a given week.
weekly_prices = (
    sell_prices
    .pivot(index='wm_yr_wk', columns='state_id', values='sell_price')
    .reindex(columns=item_columns)
)

# Expand to one row per row of `cal_sales` by looking up that row's week.
# A week without any price rows yields an all-NaN row, as before.
prices_arr = weekly_prices.reindex(cal_sales['wm_yr_wk']).to_numpy()
prices_arr

### Multiplying quantities and prices

In [None]:
# Element-wise product of the series columns (position 14 onwards) and the
# price matrix.
total_value = cal_sales.iloc[:, 14:].mul(prices_arr)

### Data pre-processing

In [None]:
# Attach the calendar date to every row (parsed once, in a single step).
total_value['date'] = pd.to_datetime(cal_sales['date'])

# Keep rows from 2015-02-22 onwards, then drop every column that still has a
# missing value in that window.  dropna() returns a new frame here; calling
# it with inplace=True on the filtered result can raise a
# SettingWithCopyWarning, which the global warnings filter was hiding.
df3 = total_value[total_value['date'] >= '2015-02-22'].dropna(axis=1)

In [None]:
# Add one total column per "<store part 1>_<store part 2>_<category>" label
# (pieces 3, 4 and 0 of each series name split on "_"), holding the row-wise
# sum of every series column carrying that label.
#
# The series columns are everything to the left of `date`.
# NOTE(review): the original looped over a hard-coded range(29922),
# presumably this same count - confirm against the data.
n_series = df3.columns.get_loc('date')
series_cols = df3.columns[:n_series]

name_parts = (col.split('_') for col in series_cols)
group_labels = np.array([f"{p[3]}_{p[4]}_{p[0]}" for p in name_parts])

# One grouped sum replaces two Python loops that touched df3 once per series.
# sort=False keeps the labels in order of first appearance, which is the
# order the loops appended them in and which the plotting cells index by
# position.
totals = df3[series_cols].T.groupby(group_labels, sort=False).sum().T

# Append all totals at once.  Any totals left over from an earlier run of
# this cell are replaced rather than added to again, so re-running the cell
# no longer double-counts.
df3 = pd.concat([df3.drop(columns=totals.columns, errors='ignore'), totals], axis=1)

In [None]:
# Removing Christmas holidays
import re


def func(datetime):
    """Return 1 if `datetime` falls on 25 December of 2011-2016, otherwise 0.

    `datetime` is any object with a ``strftime`` method; it is rendered as
    ``YYYY-MM-DD`` and matched against a fixed pattern.
    """
    day_text = datetime.strftime("%Y-%m-%d")
    return 1 if re.match('^201[1-6]-12-25$', day_text) else 0

# Keep only the rows whose date is not flagged (value 1) by `func`.
df3 = df3.loc[df3['date'].apply(func).ne(1)]

In [None]:
# Keep rows from 2014-05-22 onwards, then drop every column that still has a
# missing value in that window.  dropna() returns a new frame here; calling
# it with inplace=True on the filtered result can raise a
# SettingWithCopyWarning, which the global warnings filter was hiding.
df4 = total_value[total_value['date'] >= '2014-05-22'].dropna(axis=1)

In [None]:
# Add one total column per "<store part 1>_<store part 2>_<category>" label
# (pieces 3, 4 and 0 of each series name split on "_"), holding the row-wise
# sum of every series column carrying that label.
#
# The series columns are everything to the left of `date`.
# NOTE(review): the original looped over a hard-coded range(28012),
# presumably this same count - confirm against the data.
n_series = df4.columns.get_loc('date')
series_cols = df4.columns[:n_series]

name_parts = (col.split('_') for col in series_cols)
group_labels = np.array([f"{p[3]}_{p[4]}_{p[0]}" for p in name_parts])

# One grouped sum replaces two Python loops that touched df4 once per series.
# sort=False keeps the labels in order of first appearance, which is the
# order the loops appended them in and which the plotting cells index by
# position.
totals = df4[series_cols].T.groupby(group_labels, sort=False).sum().T

# Append all totals at once.  Any totals left over from an earlier run of
# this cell are replaced rather than added to again, so re-running the cell
# no longer double-counts.
df4 = pd.concat([df4.drop(columns=totals.columns, errors='ignore'), totals], axis=1)

In [None]:
# Removing Christmas holidays
import re


def func(datetime):
    """Flag 25 December of the years 2011-2016.

    The argument is formatted with ``strftime("%Y-%m-%d")`` and tested against
    a fixed pattern; the result is 1 on a match and 0 otherwise.
    """
    matched = re.match('^201[1-6]-12-25$', datetime.strftime("%Y-%m-%d"))
    return int(matched is not None)

# Keep only the rows whose date is not flagged (value 1) by `func`.
df4 = df4.loc[df4['date'].apply(func).ne(1)]

## EDA

In [None]:
import plotly.graph_objects as go

# Plot the 7-row rolling mean of three df3 columns over the whole of df3.
# The columns are picked by position (29925, 29935, 29945).
# NOTE(review): per the plot title these are expected to be the California
# store 3 category totals - confirm the positions against df3.columns.
fig = go.Figure()

for col_pos in range(29925, 29946, 10):
    col_name = df3.columns[col_pos]
    print(col_name)
    smoothed = df3[col_name].rolling(window=7).mean()
    # The third "_"-separated piece of the column name labels the trace.
    trace_label = col_name.split('_')[2]
    fig.add_trace(
        go.Scatter(x=df3['date'], y=smoothed, mode='lines', name=trace_label)
    )

layout_options = {
    "autosize": False,
    "width": 1000,
    "height": 700,
    "margin": {"l": 50, "r": 50, "b": 100, "t": 100, "pad": 4},
    "paper_bgcolor": "LightSteelBlue",
    "title": "Walmart California store 3 category wise sales",
    "xaxis_title": "Date",
    "yaxis_title": "Sales",
    "font": {"family": "Courier New, monospace", "size": 18, "color": "#042a30"},
}
fig.update_layout(**layout_options)


fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [None]:
# Same plot as for the full range, restricted to rows dated 2016-02-22 or
# later.
df = df3.loc[df3['date'] >= '2016-02-22']


fig = go.Figure()

# Columns are picked by position (29925, 29935, 29945).
for position in range(29925, 29946, 10):
    column = df.columns[position]
    print(column)
    rolling_mean = df[column].rolling(window=7).mean()
    fig.add_trace(
        go.Scatter(
            x=df['date'],
            y=rolling_mean,
            mode='lines',
            name=column.split('_')[2],
        )
    )

figure_margin = dict(l=50, r=50, b=100, t=100, pad=4)
figure_font = dict(family="Courier New, monospace", size=18, color="#042a30")
fig.update_layout(
    autosize=False,
    width=1000,
    height=700,
    margin=figure_margin,
    paper_bgcolor="LightSteelBlue",
    title="Walmart California store 3 category wise sales",
    xaxis_title="Date",
    yaxis_title="Sales",
    font=figure_font,
)


fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [None]:
# Same plot again, restricted to 2015-02-22 .. 2015-05-22 (both inclusive).
in_window = df3['date'].between('2015-02-22', '2015-05-22')
df = df3[in_window]

fig = go.Figure()

# Columns are picked by position (29925, 29935, 29945).
for idx in range(29925, 29946, 10):
    series_name = df.columns[idx]
    print(series_name)
    fig.add_trace(
        go.Scatter(
            x=df['date'],
            y=df[series_name].rolling(window=7).mean(),
            mode='lines',
            name=series_name.split('_')[2],
        )
    )

fig.update_layout(
    **{
        "autosize": False,
        "width": 1000,
        "height": 700,
        "margin": {"l": 50, "r": 50, "b": 100, "t": 100, "pad": 4},
        "paper_bgcolor": "LightSteelBlue",
        "title": "Walmart California store 3 category wise sales",
        "xaxis_title": "Date",
        "yaxis_title": "Sales",
        "font": {
            "family": "Courier New, monospace",
            "size": 18,
            "color": "#042a30",
        },
    }
)


fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [None]:
# Overlay two periods of df3: rows from 2016-02-22 onwards (`df`) and rows
# from 2015-02-22 to 2015-05-22 (`df1`).  Both traces of a column are drawn
# against the dates of `df`, so the two periods share one x axis.
# NOTE(review): `df` and `df1` are not guaranteed to have the same number of
# rows - confirm the overlay lines up as intended.
df = df3.loc[df3['date'] >= '2016-02-22']
df1 = df3.loc[(df3['date'] >= '2015-02-22') & (df3['date'] <= '2015-05-22')]


fig = go.Figure()

# Columns are picked by position (29925, 29935, 29945).
for col_idx in range(29925, 29946, 10):
    col = df.columns[col_idx]
    label = col.split('_')[2]
    period_traces = (
        (df1, label + "_2015"),
        (df, label + "_2016"),
    )
    for frame, trace_name in period_traces:
        fig.add_trace(
            go.Scatter(
                x=df['date'],
                y=frame[col].rolling(window=7).mean(),
                mode='lines',
                name=trace_name,
            )
        )


layout_kwargs = dict(
    autosize=False,
    width=1000,
    height=700,
    margin={"l": 50, "r": 50, "b": 100, "t": 100, "pad": 4},
    paper_bgcolor="LightSteelBlue",
    title="Walmart California store 3 category wise sales",
    xaxis_title="Date",
    yaxis_title="Sales",
    font={"family": "Courier New, monospace", "size": 18, "color": "#042a30"},
)
fig.update_layout(**layout_kwargs)


fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [None]:
# Overlay the same 23 May - 19 June window of two years of df4: 2014 (`df`)
# and 2015 (`df1`).  Both traces of a column are drawn against the 2014
# dates of `df`, so the two years share one x axis.
df = df4[df4['date'].between('2014-05-23', '2014-06-19')]
df1 = df4[df4['date'].between('2015-05-23', '2015-06-19')]

fig = go.Figure()

# Columns are picked by position (28015, 28025, 28035).
for column_position in range(28015, 28036, 10):
    column_label = df.columns[column_position]
    print(column_label)
    short_name = column_label.split('_')[2]

    mean_2015 = df1[column_label].rolling(window=7).mean()
    fig.add_trace(
        go.Scatter(x=df['date'], y=mean_2015, mode='lines', name=short_name + "_2015")
    )

    mean_2014 = df[column_label].rolling(window=7).mean()
    fig.add_trace(
        go.Scatter(x=df['date'], y=mean_2014, mode='lines', name=short_name + "_2014")
    )


plot_margin = {"l": 50, "r": 50, "b": 100, "t": 100, "pad": 4}
plot_font = {"family": "Courier New, monospace", "size": 18, "color": "#042a30"}
fig.update_layout(
    autosize=False,
    width=1000,
    height=700,
    margin=plot_margin,
    paper_bgcolor="LightSteelBlue",
    title="Walmart California store 3 category wise sales",
    xaxis_title="Date",
    yaxis_title="Sales",
    font=plot_font,
)


fig.update_xaxes(rangeslider_visible=True)
fig.show()