In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import sys



In [40]:
# Raw sales export that the pipeline reads.
DATA_PATH = Path("data/sales_data.csv")

# Folder that receives the charts and the cleaned CSV; created up front so
# later cells can write into it without checking.
OUTPUT_DIR = Path("output")
OUTPUT_DIR.mkdir(exist_ok=True)

In [41]:
def clean_data(df):
    """Return a cleaned copy of the raw sales frame.

    Steps, in order:

    1. drop exact duplicate rows;
    2. normalise column names (strip whitespace, lower-case, spaces -> ``_``);
    3. coerce ``quantity`` and ``price`` to numbers (unparseable values -> NaN);
    4. add ``revenue = quantity * price``;
    5. drop every row that still holds a missing value in any column.

    The frame passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; after name normalisation it must contain ``quantity`` and
        ``price`` columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, including the derived ``revenue`` column.

    Raises
    ------
    KeyError
        If ``quantity`` or ``price`` is absent after name normalisation.
    """
    print("\nCleaning data...")

    # Work on an explicit copy: assigning columns directly on the result of
    # drop_duplicates() can raise SettingWithCopyWarning on pandas < 3.
    cleaned = df.drop_duplicates().copy()

    # regex=False keeps the replacement literal regardless of pandas version.
    cleaned.columns = (
        cleaned.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)
    )

    required = ("quantity", "price")
    missing = [name for name in required if name not in cleaned.columns]
    if missing:
        raise KeyError(
            f"missing required column(s) {missing}; "
            f"columns after normalisation: {list(cleaned.columns)}"
        )

    # errors="coerce" turns unparseable entries into NaN so dropna() below
    # removes those rows instead of the conversion raising.
    for name in required:
        cleaned[name] = pd.to_numeric(cleaned[name], errors="coerce")

    cleaned["revenue"] = np.multiply(cleaned["quantity"], cleaned["price"])

    return cleaned.dropna()

In [42]:
def analyze_data(df):
    """Print headline sales metrics and return the two revenue figures.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``revenue`` and ``quantity`` columns.

    Returns
    -------
    tuple
        ``(total_revenue, avg_revenue)``. Total quantity is printed only; it
        is not part of the return value.
    """
    # The emoji was stored as mojibake (UTF-8 bytes decoded as cp1252);
    # restored to the intended character.
    print("\n📊 Performing analysis...")

    total_revenue = np.sum(df["revenue"])
    avg_revenue = np.mean(df["revenue"])
    total_quantity = np.sum(df["quantity"])

    print("\nSummary Metrics")
    print("------------------------")
    print(f"Total Revenue      : {total_revenue:,.2f}")
    print(f"Average Revenue    : {avg_revenue:,.2f}")
    print(f"Total Quantity     : {total_quantity:,.0f}")

    return total_revenue, avg_revenue

In [43]:
def _save_series_chart(series, kind, xlabel, title, filename):
    """Plot ``series`` with a "Revenue" y-axis and save it as ``OUTPUT_DIR / filename``."""
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        series.plot(kind=kind, ax=ax)
        ax.set_xlabel(xlabel)
        ax.set_ylabel("Revenue")
        ax.set_title(title)
        fig.tight_layout()
        fig.savefig(OUTPUT_DIR / filename)
    finally:
        # Release the figure even when plotting or saving raises, so a failed
        # run does not leave open figures behind in the kernel.
        plt.close(fig)


def create_charts(df):
    """Write the two revenue charts into ``OUTPUT_DIR``.

    * ``top_products_bar.png`` - bar chart of the five products with the
      highest summed revenue.
    * ``daily_revenue_line.png`` - line chart of revenue summed per date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``product``, ``revenue`` and ``date`` columns. It is read
        only: dates are parsed into a local Series rather than written back
        into ``df``.
    """
    top_products = (
        df.groupby("product")["revenue"]
        .sum()
        .sort_values(ascending=False)
        .head(5)
    )
    _save_series_chart(
        top_products,
        kind="bar",
        xlabel="Product",
        title="Top 5 Products by Revenue",
        filename="top_products_bar.png",
    )

    # -------- Line Chart --------
    # Group by a parsed copy of the date column instead of overwriting
    # df["date"], so the caller's frame is left untouched.
    dates = pd.to_datetime(df["date"])
    daily = df.groupby(dates)["revenue"].sum()
    _save_series_chart(
        daily,
        kind="line",
        xlabel="Date",
        title="Daily Revenue Trend",
        filename="daily_revenue_line.png",
    )


In [44]:
def save_data(df):
    """Write ``df`` to ``cleaned_sales_data.csv`` inside ``OUTPUT_DIR``, without the index."""
    destination = OUTPUT_DIR / "cleaned_sales_data.csv"
    df.to_csv(destination, index=False)

In [45]:
def main():
    """Run the sales pipeline end to end.

    Reads ``DATA_PATH``, cleans the frame, prints the summary metrics,
    writes the charts, saves the cleaned CSV, and prints a completion
    message.
    """
    df = pd.read_csv(DATA_PATH)
    df = clean_data(df)

    analyze_data(df)
    create_charts(df)
    save_data(df)

    # The check mark was stored as mojibake (UTF-8 bytes decoded as cp1252);
    # restored to the intended character.
    print("\n✅ Analysis Complete!")
    print("Files saved in /output folder")

In [38]:
# Run the whole pipeline: load, clean, analyse, chart, and save.
# NOTE(review): this cell's execution count (38) is lower than the counts of
# the definition cells above (39-45), so the output below may come from an
# earlier version of the code. Re-run with Restart Kernel -> Run All to confirm.
main()


Cleaning data...

📊 Performing analysis...

Summary Metrics
------------------------
Total Revenue      : 12,365,048.00
Average Revenue    : 123,650.48
Total Quantity     : 478

✅ Analysis Complete!
Files saved in /output folder
