In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw Online Retail workbook into a DataFrame.
df = pd.read_excel("../data/raw/Online Retail.xlsx")

# Quick overview: dimensions, column summary, and a preview of the first rows.
print("Shape:", df.shape)
# DataFrame.info() prints its report directly and returns None, so it is
# called on its own; wrapping it in print() would emit an extra "None" line.
df.info()
display(df.head())

# Count missing values per column.
print(df.isnull().sum())

# Discard transactions that carry no CustomerID.
df = df.dropna(subset=['CustomerID'])

# Make sure InvoiceDate is a datetime column (it may already be one).
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Derived columns: per-row sales amount and the invoice's calendar month.
line_amount = df['Quantity'] * df['UnitPrice']
df['TotalPrice'] = line_amount
invoice_month = df['InvoiceDate'].dt.to_period('M')
df['YearMonth'] = invoice_month

# Total sales per month.
# NOTE(review): rows with non-positive Quantity/UnitPrice have not been
# filtered out at this point, so they contribute to these totals — confirm
# that this is the intended definition of monthly sales.
monthly_sales = df['TotalPrice'].groupby(df['YearMonth']).sum()

# Line chart of the monthly totals.
ax = monthly_sales.plot.line(figsize=(12, 5), title="Monthly Sales Trend")
ax.set_ylabel("Sales (£)")
ax.grid(True)
plt.show()


import os

# Ensure the output directory for processed data exists before writing.
os.makedirs("../data/processed", exist_ok=True)

# Cleaning pipeline applied to `df`. If your working DataFrame has another
# name (for example df_clean), substitute it below.

# Keep only rows whose quantity and unit price are both strictly positive,
# then discard any rows that lack a CustomerID.
has_positive_qty = df['Quantity'] > 0
has_positive_price = df['UnitPrice'] > 0
df = df[has_positive_qty & has_positive_price].dropna(subset=["CustomerID"])

# Target variable: quantity multiplied by unit price for each row.
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# Renumber the rows from 0, discarding the old index.
df = df.reset_index(drop=True)

# Write the cleaned data to CSV without the index column.
df.to_csv("../data/processed/retail_clean.csv", index=False)

print("Cleaned dataset saved to: ../data/processed/retail_clean.csv")



: 

In [None]:
# Preview the first rows of `df` as left by the preceding cell.
df.head()


: 