In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

### 1 Read data

In [85]:
# Load the raw sales table, using the first CSV column as the row index.
df = pd.read_csv(
    filepath_or_buffer="../data/raw/df.csv",
    index_col=0,
)

### 2 EDA

**Features description:**
- date — Date of sale.
- sku_id — Stock Keeping Unit; unique identifier for each product.
- sales_quantity — Number of units sold on that date.
- sales_price — Average sales price per unit on that date.
- category_id — Category code of the product.

#### 2.1 General information

In [86]:
# Preview the first and last three rows of the frame
display(df.head(3), df.tail(3))

Unnamed: 0,date,category_id,sku_id,sales_price,sales_quantity
78161,2016-11-25,7,1108,457.6,1.0
81079,2016-11-18,7,2999,470.6,1.0
81177,2016-11-18,7,1428,717.6,1.0


Unnamed: 0,date,category_id,sku_id,sales_price,sales_quantity
20778611,2020-10-14,7,723588,401.7,3.0
20779790,2020-10-17,7,792124,778.7,1.0
20794630,2020-10-23,7,798049,219.7,1.0


In [87]:
# Convert the date column to datetime format, then order rows chronologically
# and rebuild a clean 0..n-1 index.
# A stable sort keeps same-day rows in their existing relative order, so
# re-running this cell does not reshuffle ties (the default quicksort is
# not stable). `assign` builds a new frame instead of mutating `df` in place.
df = (
    df.assign(date=pd.to_datetime(df["date"]))
    .sort_values("date", kind="stable")
    .reset_index(drop=True)
)

In [88]:
# Report the frame's dimensions followed by the dtype of every column
print(f"Shape: {df.shape}\nDataTypes:\n{df.dtypes}")

Shape: (226486, 5)
DataTypes:
date              datetime64[ns]
category_id                int64
sku_id                     int64
sales_price              float64
sales_quantity           float64
dtype: object


In [98]:
# Number of distinct values in each column
print("Unique values per column:", df.nunique(), sep="\n")

Unique values per column:
date              1420
category_id          3
sku_id            1059
sales_price       1373
sales_quantity     979
dtype: int64


In [89]:
# Count missing entries per column; the bare last expression is shown as the cell output
print("Missing values:")
df.isna().sum()

Missing values:


date              0
category_id       0
sku_id            0
sales_price       0
sales_quantity    0
dtype: int64

In [90]:
# Count fully duplicated rows and the number of distinct dates
print(f"Duplicate rows: {df.duplicated().sum()}")
print(f"Unique dates: {df['date'].nunique()}")

Duplicate rows: 0
Unique dates: 1420


In [91]:
# Earliest and latest values in the date column
print(f"Min date: {df['date'].min()}")
print(f"Max date: {df['date'].max()}")

Min date: 2016-11-18 00:00:00
Max date: 2020-10-31 00:00:00


#### 2.2