# Init Analysis
Explore the input data  
Oct 5th 2022

In [1]:
# General modules:
import os
import time
import itertools
# Date-related modules:
import calendar as cal
from datetime import datetime, timedelta
# Data manipulation:
import pandas as pd
import numpy as np
# Plot modules:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
from IPython.display import display
# Time-series related:
from scipy.signal import periodogram
from statsmodels.tsa.stattools import pacf, acf
from statsmodels.tsa.seasonal import seasonal_decompose, DecomposeResult
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from statsmodels.graphics.tsaplots import month_plot

In [2]:
# Pandas display settings:
# show every float with thousands separators and four decimal places.
pd.set_option("display.float_format", "{:,.4f}".format)

# 1) Input data

### 1.1) Training data
Represents 99% of the data

In [3]:
# Training split: read the raw CSV, report how many rows it holds, then show the frame.
training_df = pd.read_csv(filepath_or_buffer="../data/raw_data/train.csv")
# The blank in the format spec reserves a sign column, hence the extra space before the number.
print(f"Num of rows: {len(training_df) : ,.2f}")
# Last expression of the cell -> rendered by the notebook (pandas truncates the display itself).
training_df

Num of rows:  3,000,888.00


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0000,0
1,1,2013-01-01,1,BABY CARE,0.0000,0
2,2,2013-01-01,1,BEAUTY,0.0000,0
3,3,2013-01-01,1,BEVERAGES,0.0000,0
4,4,2013-01-01,1,BOOKS,0.0000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.1330,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.5530,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.7290,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.0000,8


In [4]:
# Earliest value in the training "date" column.
training_df.loc[:, "date"].min()

'2013-01-01'

In [5]:
# Latest value in the training "date" column.
training_df.loc[:, "date"].max()

'2017-08-15'

### 1.2) Test data
Represents 1% of the data

In [6]:
# Test split: read the raw CSV, report how many rows it holds, then show the frame.
test_df = pd.read_csv(filepath_or_buffer="../data/raw_data/test.csv")

print(f"Num of rows: {len(test_df):,.2f}")
# Last expression of the cell -> rendered by the notebook.
test_df

Num of rows: 28,512.00


Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0
...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,1
28508,3029396,2017-08-31,9,PREPARED FOODS,0
28509,3029397,2017-08-31,9,PRODUCE,1
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9


### 1.3) Holidays

In [7]:
# Holidays and events table: read the raw CSV and show it.
holidays_df = pd.read_csv(filepath_or_buffer="../data/raw_data/holidays_events.csv")

# Last expression of the cell -> rendered by the notebook.
holidays_df

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False
...,...,...,...,...,...,...
345,2017-12-22,Additional,National,Ecuador,Navidad-3,False
346,2017-12-23,Additional,National,Ecuador,Navidad-2,False
347,2017-12-24,Additional,National,Ecuador,Navidad-1,False
348,2017-12-25,Holiday,National,Ecuador,Navidad,False


### 1.4) Oil dataset

In [8]:
# Oil price series: read the raw CSV, report how many rows it holds, then show the frame.
oil_df = pd.read_csv(filepath_or_buffer="../data/raw_data/oil.csv")

# The blank in the format spec reserves a sign column, hence the extra space before the number.
print(f"Num of rows: {len(oil_df): ,.2f}")
# Last expression of the cell -> rendered by the notebook.
oil_df

Num of rows:  1,218.00


Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.1400
2,2013-01-03,92.9700
3,2013-01-04,93.1200
4,2013-01-07,93.2000
...,...,...
1213,2017-08-25,47.6500
1214,2017-08-28,46.4000
1215,2017-08-29,46.4600
1216,2017-08-30,45.9600


### 1.5) Stores

In [9]:
# Store metadata: read the raw CSV, report how many rows it holds, then preview the first rows.
stores_df = pd.read_csv(filepath_or_buffer="../data/raw_data/stores.csv")

# The blank in the format spec reserves a sign column, hence the extra space before the number.
print(f"Num of rows of store_df: {len(stores_df): ,.2f}")
# head() with its default size, i.e. the first five rows.
stores_df.head()

Num of rows of store_df:  54.00


Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


### 1.6) Transactions

In [10]:
# Transactions table: read the raw CSV, report how many rows it holds, then show the frame.
transactions_df = pd.read_csv(filepath_or_buffer="../data/raw_data/transactions.csv")

# The blank in the format spec reserves a sign column, hence the extra space before the number.
print(f"Num of rows: {len(transactions_df): ,.2f}")
# Last expression of the cell -> rendered by the notebook.
transactions_df

Num of rows:  83,488.00


Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
...,...,...,...
83483,2017-08-15,50,2804
83484,2017-08-15,51,1573
83485,2017-08-15,52,2255
83486,2017-08-15,53,932
