# References

https://www.kaggle.com/code/howoojang/first-kaggle-notebook-following-ts-tutorial/notebook#Equador's-economy-is-dependent-on-the-crude-oil-price.-Let's-examine-first-the-relationship-between-crude-oil-and-grocery-sales-and-transactions.

# Imports

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt        
import seaborn as sns
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_file_path = os.path.join(dirname, filename)
        print(input_file_path)

In [None]:
# Directory holding the competition CSV files. The trailing slash matters:
# the loading cell builds file names by plain string concatenation (path + 'x.csv').
path = '/kaggle/input/store-sales-time-series-forecasting/'
# Show which files are available in that directory.
os.listdir(path)

In [None]:
# Main train/test tables; the first CSV column is used as the row index.
train_data = pd.read_csv(f'{path}train.csv', index_col=0)
test_data = pd.read_csv(f'{path}test.csv', index_col=0)
samp_subm = pd.read_csv(f'{path}sample_submission.csv')

# Auxiliary tables: oil prices, holidays/events, store metadata, transactions.
data_oil = pd.read_csv(f'{path}oil.csv')
data_holi = pd.read_csv(f'{path}holidays_events.csv')
data_store = pd.read_csv(f'{path}stores.csv')
data_trans = pd.read_csv(f'{path}transactions.csv')

# Data details

In [None]:
# Report the shape (rows, columns) of every loaded table, in loading order.
loaded_frames = {
    'data_oil': data_oil,
    'train_data': train_data,
    'test_data': test_data,
    'samp_subm': samp_subm,
    'data_holi': data_holi,
    'data_store': data_store,
    'data_trans': data_trans,
}
for frame_name, frame in loaded_frames.items():
    print(f'Number of {frame_name} samples: {frame.shape}')

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
train_data.info()
print(train_data.columns)

# First rows of the training, transactions and oil tables.
print(train_data.head())
print(data_trans.head())
print(data_oil.head())

# Relationships

## Crude oil price

Net sales may depend on the data in each of the individual CSV files, so we examine the relationship between sales and each of them.

In [None]:
# Plot the oil table against its date column.
oil_by_date = data_oil.set_index('date')
ax = oil_by_date.plot(figsize=(16, 8))
ax.set_xlabel('Date', fontsize='large')
ax.set_ylabel("Crude Oil", fontsize='large')

In [None]:
# Preview the first rows of the oil table (same output as the earlier preview cell).
print(data_oil.head())

## Sales - weekly vs sparse data

In [None]:
# Mean sales per date across all rows of the training table.
avg_sales = train_data.groupby('date')['sales'].mean().reset_index()

# Smooth the daily series with an exponentially weighted mean (span of 7 rows).
daily_mean_sales = avg_sales['sales']
avg_sales['weekly_avg_sales'] = daily_mean_sales.ewm(span=7, adjust=False).mean()

# Overlay the raw daily mean and the smoothed curve.
avg_sales.plot(x='date', y=['sales', 'weekly_avg_sales'], figsize=(18, 6))
print(avg_sales.head())

## Transactions - weekly vs sparse data

In [None]:
# Mean transactions per date across all rows of the transactions table.
avg_transactions = data_trans.groupby('date')['transactions'].mean().reset_index()

# Smooth with an exponentially weighted mean (span of 7 rows), the same
# treatment applied to the sales series.
daily_mean_transactions = avg_transactions['transactions']
avg_transactions['weekly_avg_transactions'] = (
    daily_mean_transactions.ewm(span=7, adjust=False).mean()
)

# Overlay the raw daily mean and the smoothed curve.
avg_transactions.plot(
    x='date',
    y=['transactions', 'weekly_avg_transactions'],
    figsize=(18, 6),
)
print(avg_transactions.head())

In [None]:
# Build one table of daily mean sales, daily mean transactions and oil price,
# aligned on the calendar date.
#
# The three sources do not share a row index: avg_sales and avg_transactions
# are indexed by their own date groups and data_oil by its own raw rows, so
# assigning the columns positionally pairs values from different days whenever
# the sets of dates differ. Left merges on 'date' keep every sales day and
# leave NaN where a source has no value for that day; DataFrame.corr() then
# uses pairwise-complete observations.
#
# NOTE(review): the column name 'transacions' (sic) is kept as-is in case later
# cells reference it; rename it everywhere at once if it is to be corrected.
sales_transaction = (
    avg_sales[['date', 'sales']]
    .merge(
        avg_transactions[['date', 'transactions']].rename(
            columns={'transactions': 'transacions'}
        ),
        on='date',
        how='left',
    )
    .merge(
        data_oil[['date', 'dcoilwtico']].rename(columns={'dcoilwtico': 'oil'}),
        on='date',
        how='left',
    )
    # Drop the join key so the frame keeps the original three numeric columns.
    .drop(columns='date')
)
sales_transaction.corr()



So sales and transactions appear to be only weakly related, whereas oil price and sales do show a relationship.

## Category sales' influence in train data

In [None]:
# List the distinct product families and how many there are.
print(train_data.family.unique())
print(train_data.family.nunique())

# Encode the family as a pandas categorical and keep its integer code.
train_data['family'] = train_data['family'].astype('category')
train_data['family_category'] = train_data['family'].cat.codes

# Map code -> family name. The code of a category is its position in
# .cat.categories, so enumerating the categories gives the mapping directly.
# This avoids zipping every row of the table, and guarantees the dict is
# ordered by code rather than by first appearance in the data.
family_category = dict(enumerate(train_data['family'].cat.categories))
family_category

In [None]:
# Mean sales and mean promotion count per product family (indexed by family code).
# The numeric columns are selected *before* aggregating: calling .mean() on the
# whole frame would also try to average the non-numeric columns (e.g. the date
# strings), which raises a TypeError in pandas >= 2.0.
data_grouped_family_types = (
    train_data.groupby('family_category')[['sales', 'onpromotion']].mean()
)

# Each family's share (in percent) of the summed per-family mean sales.
data_grouped_family_types['%_s'] = (
    100 * data_grouped_family_types['sales'] / data_grouped_family_types['sales'].sum()
).round(decimals=3)
percent = data_grouped_family_types['%_s']

patches, texts = plt.pie(percent, startangle=90, radius=1.5)

# Look each family name up by its code (the groupby index) instead of relying
# on the dict's iteration order matching the groupby order.
lables_2 = [
    '{0} - {1:1.2f} %'.format(family_category[code], share)
    for code, share in percent.items()
]

# Default to the unsorted labels so the legend call also works when
# sort_legend is switched off.
labels = lables_2
sort_legend = True
if sort_legend:
    # Order the legend entries from the largest share to the smallest.
    patches, labels, dummy = zip(*sorted(zip(patches, lables_2, percent),
                                         key=lambda x: x[2],
                                         reverse=True))

plt.legend(patches, labels, loc='best', bbox_to_anchor=(-0.1, 1.),
           fontsize=8)

In [None]:
# Mean sales and mean promotion count per product family (indexed by family code).
# The numeric columns are selected *before* aggregating: calling .mean() on the
# whole frame would also try to average the non-numeric columns (e.g. the date
# strings), which raises a TypeError in pandas >= 2.0.
data_grouped_family_types = (
    train_data.groupby('family_category')[['sales', 'onpromotion']].mean()
)

# Each family's share (in percent) of the summed per-family mean 'onpromotion'.
data_grouped_family_types['%_p'] = (
    100
    * data_grouped_family_types['onpromotion']
    / data_grouped_family_types['onpromotion'].sum()
).round(decimals=3)
percent = data_grouped_family_types['%_p']

patches, texts = plt.pie(percent, startangle=90, radius=1.5)

# Look each family name up by its code (the groupby index) instead of relying
# on the dict's iteration order matching the groupby order.
lables_2 = [
    '{0} - {1:1.2f} %'.format(family_category[code], share)
    for code, share in percent.items()
]

# Default to the unsorted labels so the legend call also works when
# sort_legend is switched off.
labels = lables_2
sort_legend = True
if sort_legend:
    # Order the legend entries from the largest share to the smallest.
    patches, labels, dummy = zip(*sorted(zip(patches, lables_2, percent),
                                         key=lambda x: x[2],
                                         reverse=True))

plt.legend(patches, labels, loc='best', bbox_to_anchor=(-0.1, 1.),
           fontsize=8)

The five best-selling families are grocery, beverages, cleaning, dairy, and produce. Grocery and beverages together account for more than 50% of total sales.

## Sales in different time frames