# P1 — Retail Sales Analytics : 01_eda

In [None]:
import pandas as pd, numpy as np
from sqlalchemy import create_engine, text
import matplotlib.pyplot as plt

# --- Load the staged sales table and print a first overview
DB = 'sqlite:///../../da4.db'
TABLE = 'p1_sales_stage'

engine = create_engine(DB)
select_all = text(f'SELECT * FROM {TABLE}')
with engine.begin() as conn:
    df = pd.read_sql(select_all, conn)

# Row/column counts, a sample of rows, and summary statistics of the numeric columns
# (transposed so each column becomes one row of the summary).
print('Shape:', df.shape)
display(df.head())
numeric_summary = df.describe(numeric_only=True).T
display(numeric_summary)

# Percentage of missing values per column, worst 15 columns first.
print('\nNull % (top):')
null_pct = df.isna().mean() * 100
print(null_pct.sort_values(ascending=False).head(15))


# --- Basic checks
# Convert order_date with pd.to_datetime before taking its min/max.
df['order_date'] = pd.to_datetime(df['order_date'])
print('Date range:', df['order_date'].min(), '→', df['order_date'].max())

# Duplicates are counted on the (order_id, product_id) pair, not on order_id alone,
# so the printed label names the key that is actually checked.
dup_key = ['order_id', 'product_id']
print('Duplicated (order_id, product_id) rows:', df.duplicated(dup_key).sum())


# --- Monthly revenue
# Tag every row with its order month (monthly period rendered as a string),
# total revenue per month, and order the result by that month label.
month_label = df['order_date'].dt.to_period('M').astype(str)
mrev = df.assign(ym=month_label).groupby('ym', as_index=False)['revenue'].sum()
mrev = mrev.sort_values('ym')

# Line chart of the monthly totals; x labels are rotated to stay readable.
plt.figure()
plt.plot(mrev['ym'], mrev['revenue'])
plt.xticks(rotation=90)
plt.title('Monthly Revenue')
plt.tight_layout()
plt.show()

# --- Top sub-categories by gross profit
# Sum gross_profit per (category, sub_category) pair and keep the 10 largest totals.
top_sub = (df.groupby(['category','sub_category'], as_index=False)['gross_profit']
             .sum().sort_values('gross_profit', ascending=False).head(10))

# NOTE(review): bars are positioned by sub_category alone while the grouping is by
# (category, sub_category); if one sub_category name exists under two categories the
# bars would share an x position — confirm sub_category names are unique.
plt.figure()
plt.bar(top_sub['sub_category'], top_sub['gross_profit'])
plt.xticks(rotation=45, ha='right')
plt.title('Top Sub-categories by Gross Profit')
plt.tight_layout()
plt.show()

# A bare `top_sub` expression only renders when it is the last statement of a notebook
# cell; display() shows the table wherever this code sits.
display(top_sub)

# --- Region x Segment revenue pivot
# Rows are regions, columns are customer segments, and each cell holds summed revenue;
# fill_value=0 puts 0 in cells that would otherwise be empty.
pv = pd.pivot_table(
    df,
    values='revenue',
    index='region',
    columns='customer_segment',
    aggfunc='sum',
    fill_value=0,
)
pv
