In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import scipy
from scipy import stats

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Register pandas' formatters/converters with matplotlib so pandas date/time
# values can be used directly on plot axes.
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import seaborn as sns
# Marker so the cell output shows that the setup ran to the end.
print("Setup Complete")

## Load datasets

Define the paths of all .csv files

In [None]:
# Directory that holds every CSV file used by this notebook. It is kept as a
# plain string prefix (joined with '/') so the resulting paths are exactly the
# same strings as before.
DATA_DIR = '../input/competitive-data-science-predict-future-sales'

# Logical dataset name -> CSV file name inside DATA_DIR.
DATASET_FILES = {
    'categories': 'item_categories.csv',
    'items': 'items.csv',
    'sales': 'sales_train.csv',
    'shops': 'shops.csv',
    'test': 'test.csv',
}

# Logical dataset name -> path of its CSV file (same keys, same order as
# DATASET_FILES).
dataset_paths = {
    name: f'{DATA_DIR}/{file_name}'
    for name, file_name in DATASET_FILES.items()
}

Load the data from the files

In [None]:
# Read every CSV into a DataFrame, keyed by the same name as in dataset_paths.
dataset = {
    name: pd.read_csv(csv_path)
    for name, csv_path in dataset_paths.items()
}

## Preview the sales data (before joining the datasets)

In [None]:
# Show the first rows of the sales table to check its columns and values.
dataset['sales'].head()

# Get common statistics 

In [None]:
# Create a large figure; the histogram below is drawn on it.
plt.figure(figsize=(28, 20))

# Number of sales rows recorded for each distinct `item_cnt_day` value.
sell_quantity = dataset['sales'].groupby('item_cnt_day').size()

# `sns.distplot` is deprecated; `sns.histplot` is its replacement for a plain
# histogram. It draws no KDE curve by default, which matches the previous
# `kde=False`. The automatic bin edges may differ slightly from distplot's.
# NOTE(review): this is a histogram of the per-value row counts, not of
# `item_cnt_day` itself — confirm that this is the intended view.
sns.histplot(data=sell_quantity)

## Total item sales per month

In [None]:
# Sum of `item_cnt_day` for every `date_block_num` (the month index used in
# this notebook).
month_total_cnt = (
    dataset['sales']
    .groupby('date_block_num')['item_cnt_day']
    .sum()
)

sns.lineplot(data=month_total_cnt)

In [None]:
# Row-wise product of `item_cnt_day` and `item_price` for every sales record.
sales = dataset['sales']['item_cnt_day'].mul(dataset['sales']['item_price'])
sales

In [None]:
# Pair every row's sales value with its month index, then total it per month.
month_sales = (
    pd.DataFrame({'month': dataset['sales']['date_block_num'], 'sales': sales})
    .groupby('month')['sales']
    .sum()
)

In [None]:
# Line plot of the per-month sales totals (index on x, value on y).
sns.lineplot(data=month_sales)

## Does item price change over time?

In [None]:
# Mean price of each item within each month, regrouped by item so that
# statistics across months can be computed per item.
prices = (
    dataset["sales"]
    .groupby(['date_block_num', 'item_id'])['item_price']
    .mean()
    .groupby('item_id')
)

# Spread of each item's monthly mean price.
prices.std()

In [None]:
# Compute the per-item standard deviation once and reuse it; the grouped
# aggregation was previously recomputed for every use in this cell.
price_std = prices.std()
most_volatile_item = price_std.idxmax()

# Largest standard deviation and the item id it belongs to.
print(f"{price_std.max()} - idx {most_volatile_item}")

# Monthly mean price of that item, to see how its price moves over time.
dataset['sales'][dataset['sales'].item_id == most_volatile_item].groupby('date_block_num').item_price.mean()

## Merge item_category_id into main dataset

In [None]:
# Attach the columns of the items table to every sales row via `item_id`
# (pandas' default inner join).
data_joined = pd.merge(dataset['sales'], dataset['items'], on='item_id')
data_joined

In [None]:
# Print a concise summary of the joined table (columns, dtypes, non-null counts).
data_joined.info()

In [None]:
# Descriptive statistics of the joined table using pandas' default settings.
data_joined.describe()

In [None]:
# help(sns.scatterplot)
# help(np.log)
# help(pd.DataFrame)

## Analyse category sell rate

In [None]:
# No `plt.figure` call here: this cell plots nothing, so creating a figure
# only produced an empty one in the output.

# Keep just the columns needed to total sales per month and category.
frame = data_joined.loc[:, ["date_block_num", "item_category_id", "item_cnt_day"]]
frame = frame.set_index(["date_block_num"]).sort_values("date_block_num")

# Rows: month index, columns: category id, values: summed `item_cnt_day`.
# Month/category pairs without any sales become 0 instead of NaN.
category_matrix = (
    frame
    .groupby(["date_block_num", "item_category_id"])
    .item_cnt_day.sum()
    .unstack(-1)
    .fillna(0)
)

In [None]:
# Display the month x category table. No `plt.figure` call here: nothing is
# plotted in this cell, so it would only emit an empty figure.
category_matrix

In [None]:
# Large canvas, because one line is drawn per category column.
plt.figure(figsize=(28,20))

# Wide-form input: each column of category_matrix becomes its own line,
# plotted against the table's index.
sns.lineplot(data=category_matrix)

## Analyse shop sell rate

In [None]:
# No `plt.figure` call here: this cell plots nothing, so creating a figure
# only produced an empty one in the output.

# Keep just the columns needed to total sales per month and shop.
frame = data_joined.loc[:, ["date_block_num", "shop_id", "item_cnt_day"]]
frame = frame.set_index(["date_block_num"]).sort_values("date_block_num")

# Rows: month index, columns: shop id, values: summed `item_cnt_day`.
# Month/shop pairs without any sales become 0 instead of NaN.
shop_stat = (
    frame
    .groupby(["date_block_num", "shop_id"])
    .item_cnt_day.sum()
    .unstack(-1)
    .fillna(0)
)

In [None]:
# Large canvas, because one line is drawn per shop column.
plt.figure(figsize=(28,20))

# Wide-form input: each column of shop_stat becomes its own line, plotted
# against the table's index.
sns.lineplot(data=shop_stat)

In [None]:
# Pairwise correlation between the columns of category_matrix
# (one column per category).
cat_corr = category_matrix.corr()

# Draw the correlation matrix as a heatmap on a wide figure.
plt.figure(figsize=(25, 14))
sns.heatmap(cat_corr)

In [None]:
# Pairwise correlation between the columns of shop_stat (one column per shop).
shop_corr = shop_stat.corr()

# Draw the correlation matrix as a heatmap on a wide figure.
plt.figure(figsize=(25, 14))
sns.heatmap(shop_corr)

## Absolute correlation in shop sale time series

In [None]:
# Open the figure first; the heatmap below is drawn on it.
plt.figure(figsize=(25, 14))

# Magnitude of the shop correlations, ignoring their sign.
abs_shop = shop_corr.abs()
sns.heatmap(abs_shop)

## Absolute correlation in category sale time series

In [None]:
# Open the figure first; the heatmap below is drawn on it.
plt.figure(figsize=(25, 14))

# Magnitude of the category correlations, ignoring their sign.
abs_cat = cat_corr.abs()
sns.heatmap(abs_cat)