In [1]:

from sqlalchemy import create_engine
import pandas as pd

engine = create_engine("mysql+pymysql://root:1234qwert@localhost:3306/retail_analytics")

df = pd.read_sql("SELECT * FROM cleaned_sales", engine)
print(df.columns.tolist())


['ordernumber', 'quantityordered', 'priceeach', 'orderlinenumber', 'sales', 'orderdate', 'status', 'qtr_id', 'month_id', 'year_id', 'productline', 'msrp', 'productcode', 'customername', 'phone', 'addressline1', 'addressline2', 'city', 'state', 'postalcode', 'country', 'territory', 'contactlastname', 'contactfirstname', 'dealsize', 'cost_each', 'profit_est', 'discount_pct', 'inventory_age_days', 'is_profitable']


In [2]:
df.columns = [c.strip().lower() for c in df.columns]

In [3]:
sku_terr = df.groupby(['productcode','territory']).agg(
    total_sales=('sales','sum'),
    total_profit=('profit_est','sum'),
    avg_margin=('profit_est','mean'),
    total_qty=('quantityordered','sum'),
    order_count=('ordernumber','nunique')
).reset_index()

sku_terr['is_loss_leader'] = (sku_terr['total_profit'] < 0) | (sku_terr['avg_margin'] < 0.05)

In [4]:
print(df.dtypes)
print(df['orderdate'].head(10))


ordernumber            object
quantityordered         int64
priceeach             float64
orderlinenumber         int64
sales                 float64
orderdate              object
status                 object
qtr_id                  int64
month_id                int64
year_id                 int64
productline            object
msrp                  float64
productcode            object
customername           object
phone                  object
addressline1           object
addressline2           object
city                   object
state                  object
postalcode             object
country                object
territory              object
contactlastname        object
contactfirstname       object
dealsize               object
cost_each             float64
profit_est            float64
discount_pct          float64
inventory_age_days    float64
is_profitable           int64
dtype: object
0    2003-01-06
1    2003-01-06
2    2003-01-06
3    2003-01-06
4    2003-01-09
5    2

In [5]:
df['orderdate'] = pd.to_datetime(df['orderdate'], errors='coerce', infer_datetime_format=True)
print("After conversion:", df['orderdate'].dtypes)

After conversion: datetime64[ns]


  df['orderdate'] = pd.to_datetime(df['orderdate'], errors='coerce', infer_datetime_format=True)


In [6]:
monthly = (
    df.assign(order_month=df['orderdate'].dt.to_period('M').dt.to_timestamp())
      .groupby(['territory', 'order_month'])['sales'].sum()
      .reset_index()
      .rename(columns={'order_month': 'ds', 'sales': 'y'})
)


In [8]:
import numpy as np
elast_df = df.copy()
elast_df['profit_margin'] = np.where(elast_df['sales']>0, elast_df['profit_est']/elast_df['sales'], 0)


In [9]:
df.to_csv("Data1/cleaned_sales.csv", index=False)
sku_terr.to_csv("Data1/sku_territory_features.csv", index=False)
monthly.to_csv("Data1/monthly_sales.csv", index=False)
elast_df.to_csv("Data1/elasticity_data.csv", index=False)
