# Exploratory Data Analysis

In [3]:
import pandas as pd
import numpy as np
import plotly.express as px

In [36]:
# Read the training set from disk.
df = pd.read_csv('data/train.csv')

# Preview the first and last rows in one call, then report (rows, columns).
display(df.head(), df.tail())
df.shape

Unnamed: 0,Date,store,product,number_sold
0,2010-01-01,0,0,801
1,2010-01-02,0,0,810
2,2010-01-03,0,0,818
3,2010-01-04,0,0,796
4,2010-01-05,0,0,808


Unnamed: 0,Date,store,product,number_sold
230085,2018-12-27,6,9,890
230086,2018-12-28,6,9,892
230087,2018-12-29,6,9,895
230088,2018-12-30,6,9,899
230089,2018-12-31,6,9,912


(230090, 4)

In [37]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230090 entries, 0 to 230089
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Date         230090 non-null  object
 1   store        230090 non-null  int64 
 2   product      230090 non-null  int64 
 3   number_sold  230090 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 7.0+ MB


We'll convert the `Date` column to datetime and set it as the index of the DataFrame, and convert the `store` and `product` columns to the categorical dtype.

In [38]:
# Parse `Date` and move it into the index. The guard makes the cell safe to
# re-run: once `Date` is the index it is no longer a column, and looking it up
# again would raise a KeyError.
if 'Date' in df.columns:
    df = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')

# `store` and `product` are identifiers, not quantities, so store them as categories.
df = df.astype({'store': 'category', 'product': 'category'})

In [130]:
# Example of what a time series would look like: one figure per randomly
# chosen store-product pair.
N_EXAMPLES = 3
SEED = 42  # fixed so that a fresh Restart & Run All picks the same examples
swatch = px.colors.qualitative.Dark2

# Draw from the pairs that actually occur in the data, without replacement,
# so the same combination is never plotted twice.
store_product_pairs = df[['store', 'product']].drop_duplicates()
example_pairs = store_product_pairs.sample(
    n=min(N_EXAMPLES, len(store_product_pairs)),
    random_state=SEED,
)

for i, (random_store, random_product) in enumerate(
    example_pairs.itertuples(index=False, name=None)
):
    df_ = df[(df['store'] == random_store) & (df['product'] == random_product)]

    # Rows are one per day, so the title and axis labels say so. The label keys
    # must match the plotted column names, otherwise plotly ignores them.
    fig = px.line(df_.reset_index(),
                  x='Date',
                  y='number_sold',
                  title=f'Daily Units Sold for Store {random_store}, Product {random_product}',
                  labels={'number_sold': 'Units sold', 'Date': 'Date'})

    fig.update_layout(height=300, plot_bgcolor='white', paper_bgcolor='white', showlegend=False)
    # Step through the palette so each example gets a distinct, reproducible colour.
    fig.update_traces(line=dict(color=swatch[i % len(swatch)]))

    fig.show()

Clearly, each store-product combination behaves differently. We can see both trend and seasonality in the data.

There are 7 stores and 10 products.
We can create a time series for each store and product combination. This would result in 70 different time series, one for each unique store-product combination.

We can also create a time series for each store, which would be an aggregation of all products sold in that store. This would result in 7 time series.

In [106]:
# Total units sold per store per day, summed over all products.
# `observed=True` keeps only the (store, Date) groups present in the data; it is
# stated explicitly because `store` is categorical and relying on the default
# triggers a FutureWarning in recent pandas versions.
df_store_groupedSales = df.groupby(['store', 'Date'], observed=True)['number_sold'].sum()

# The grouping is per calendar day, so the title says "Daily".
px.line(
    df_store_groupedSales.reset_index(),
    x='Date',
    y='number_sold',
    color='store',
    title='Daily Sales per Store',
    labels={'number_sold': 'Units sold', 'store': 'Store'},
)



