## Data Analysis

In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import matplotlib.pyplot as plt 
plt.rcParams['figure.figsize'] = [10, 7]
plt.style.use('seaborn-v0_8')

import seaborn as sns
sns.set(style="darkgrid")

import numpy as np 
import pandas as pd 

## Constants

In [2]:
# A notebook kernel has no __file__, so the project root is derived from the
# working directory: the notebook is expected to run from a directory located
# directly under the project root.
# The previous Path("__file__") form resolved the *literal string* "__file__"
# against the cwd; it reached the same directory only indirectly and breaks on
# platforms/versions where resolve() keeps a non-existent relative path relative.
PROJECT_ROOT = Path.cwd().resolve().parent

DATA_DPATH = PROJECT_ROOT / "data"
# Fail early, and say which path was probed, if the kernel was started elsewhere.
assert DATA_DPATH.exists(), f"Data directory not found: {DATA_DPATH}"

## Data Loading 

In [None]:
# Read the source product table; the first CSV column is used as the index.
df_fpath = DATA_DPATH.joinpath("source_data", "product.csv")
df_raw = pd.read_csv(df_fpath, index_col=0)

# (rows, columns) of the raw table.
df_raw.shape

In [None]:
# Preview the first rows of the table exactly as loaded.
df_raw.head()

In [None]:
# Build the working frame from the raw one (df_raw itself is left untouched)
# with the "timestamp" column parsed into datetimes.
df = df_raw.assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))

df.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the working frame.
df.info()

## Duplicates

In [None]:
# Rows that repeat an earlier row across all columns.
df.loc[df.duplicated()]

## Missing Values

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Fraction of missing values in each column (mean of the boolean NaN mask).
df.isna().mean()

In [None]:
# Discard every row that has at least one missing value, then report the
# remaining (rows, columns).
df = df.dropna(axis="index", how="any")
df.shape

## Date Limits

In [None]:
# Summary statistics of the timestamp column, used here to read off the
# earliest and latest dates covered by the data.
df["timestamp"].describe()

## Target

In [None]:
# Summary statistics of the target column.
df["quantity"].describe()

In [None]:
# 95th and 99th percentiles of the target, computed in a single call and
# shown as a (p95, p99) tuple.
tuple(np.percentile(df["quantity"], [95, 99]))

In [None]:
# Two side-by-side views of the target distribution.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# Left panel: histogram of "quantity" with a KDE overlay.
sns.histplot(
    data=df,
    x="quantity",
    kde=True,
    bins=30,
    log_scale=False,
    ax=axs[0],
)
axs[0].set_xlabel("Объём продаж")
axs[0].set_ylabel("Количество записей")
axs[0].set_title("Гистограмма распределения объёма продаж")

# Right panel: boxplot of the same column. data=/y= are passed explicitly so
# the box is always vertical regardless of how the installed seaborn version
# interprets a positional Series.
sns.boxplot(data=df, y="quantity", ax=axs[1])
# The y-axis carries the quantity values (a boxplot has no count axis), so it
# is labelled with the same sales-volume label as the histogram's x-axis.
axs[1].set_ylabel("Объём продаж")
axs[1].set_title("Boxplot объёма продаж")

plt.tight_layout()
plt.show()

## Viz - Raw Timestamps

In [None]:
# Records in chronological order, restricted to those after the cut-off date.
plot_data = (
    df.sort_values("timestamp")
    .loc[lambda frame: frame["timestamp"] > "2020-03-15"]
)

# Quantity over time at the original (non-aggregated) timestamps.
plt.figure(figsize=(15, 6))
plt.plot("timestamp", "quantity", ".-", data=plot_data)
plt.show()

## Viz - Resampled by Day

In [None]:
# Aggregate to one row per calendar day by summing within each day.
# NOTE(review): .sum() is applied to every column of df, not only "quantity" —
# confirm that is intended if df carries other columns.
resampled_df = (
    df.resample("1D", on="timestamp")
    .sum()
    .reset_index()
    .assign(
        # NOTE: just for study purposes, make target variable more realistic
        quantity=lambda frame: (frame["quantity"] / 1000).round(),
        # Dates are stored as "YYYY-MM-DD" strings from here on.
        timestamp=lambda frame: frame["timestamp"].dt.strftime("%Y-%m-%d"),
    )
)
resampled_df.shape

In [None]:
# Preview the first rows of the daily aggregate.
resampled_df.head()

In [None]:
# Daily rows in date order, restricted to those after the cut-off date.
# NOTE(review): "timestamp" is expected to hold "YYYY-MM-DD" strings at this
# point, so both the sort and the comparison are lexicographic — confirm.
plot_data = (
    resampled_df.sort_values("timestamp")
    .loc[lambda frame: frame["timestamp"] > "2020-03-15"]
)

# Daily quantity over time; tick labels are rotated to keep them readable.
plt.figure(figsize=(15, 6))
plt.plot("timestamp", "quantity", ".-", data=plot_data)
plt.xticks(rotation=90)

plt.show()

In [19]:
# Create the output directory if needed (no error when it already exists).
output_dpath = DATA_DPATH.joinpath("preprocessed_data")
output_dpath.mkdir(parents=True, exist_ok=True)

# Persist the daily aggregate; the frame's index is written as the first column.
resampled_df.to_csv(output_dpath.joinpath("resampled_product.csv"))