In [1]:
import pandas as pd

df = pd.read_csv("../data/regional_sales.csv")

print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nData Types:")
print(df.dtypes)
print("\nMissing Values:")
print(df.isnull().sum())

Dataset Shape: (8, 5)

First few rows:
   order_id region   product  quantity  price
0      2001  North  Keyboard       2.0   1500
1      2002  South     Mouse       1.0    500
2      2003  North   Monitor       1.0  12000
3      2004   East     Mouse       4.0    500
4      2005   West  Keyboard       1.0   1500

Data Types:
order_id      int64
region       object
product      object
quantity    float64
price         int64
dtype: object

Missing Values:
order_id    0
region      0
product     0
quantity    1
price       0
dtype: int64


In [2]:
clean_df = df.dropna(subset=["quantity"])
print(f"Rows before cleaning: {len(df)}")
print(f"Rows after cleaning: {len(clean_df)}")


Rows before cleaning: 8
Rows after cleaning: 7


In [3]:
clean_df["quantity"] = pd.to_numeric(clean_df["quantity"], errors='coerce')
clean_df["price"] = pd.to_numeric(clean_df["price"], errors='coerce')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df["quantity"] = pd.to_numeric(clean_df["quantity"], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df["price"] = pd.to_numeric(clean_df["price"], errors='coerce')


In [4]:
duplicates_before = clean_df.duplicated().sum()
clean_df = clean_df.drop_duplicates()
print(f"Duplicates removed: {duplicates_before}")

Duplicates removed: 0


In [5]:
print("\nCleaned Dataset:")
print(clean_df)
print(f"\nMissing values after cleaning:\n{clean_df.isnull().sum()}")


Cleaned Dataset:
   order_id region   product  quantity  price
0      2001  North  Keyboard       2.0   1500
1      2002  South     Mouse       1.0    500
2      2003  North   Monitor       1.0  12000
3      2004   East     Mouse       4.0    500
4      2005   West  Keyboard       1.0   1500
6      2007   East  Keyboard       3.0   1500
7      2008   West     Mouse       2.0    500

Missing values after cleaning:
order_id    0
region      0
product     0
quantity    0
price       0
dtype: int64


In [6]:
clean_df["revenue"] = clean_df["quantity"] * clean_df["price"]
print("Dataset with revenue:")
print(clean_df[["order_id", "region", "quantity", "price", "revenue"]])

Dataset with revenue:
   order_id region  quantity  price  revenue
0      2001  North       2.0   1500   3000.0
1      2002  South       1.0    500    500.0
2      2003  North       1.0  12000  12000.0
3      2004   East       4.0    500   2000.0
4      2005   West       1.0   1500   1500.0
6      2007   East       3.0   1500   4500.0
7      2008   West       2.0    500   1000.0


In [7]:
regional_summary = clean_df.groupby("region").agg({
    "revenue": "sum",
    "quantity": "sum"
}).reset_index()

regional_summary.columns = ["Region", "Total_Revenue", "Total_Quantity"]
print("\nRegional Summary:")
print(regional_summary)



Regional Summary:
  Region  Total_Revenue  Total_Quantity
0   East         6500.0             7.0
1  North        15000.0             3.0
2  South          500.0             1.0
3   West         2500.0             3.0


In [10]:
top_region = regional_summary.sort_values("Total_Revenue", ascending=False).iloc[0]
print(f"\nTop-Performing Region:")
print(f"Region: {top_region['Region']}")
print(f"Total Revenue: {top_region['Total_Revenue']:,.2f}")



Top-Performing Region:
Region: North
Total Revenue: 15,000.00


In [11]:
regional_summary_sorted = regional_summary.sort_values("Total_Revenue", ascending=False)
print("\nRegions Sorted by Revenue (High to Low):")
print(regional_summary_sorted)



Regions Sorted by Revenue (High to Low):
  Region  Total_Revenue  Total_Quantity
1  North        15000.0             3.0
0   East         6500.0             7.0
3   West         2500.0             3.0
2  South          500.0             1.0


In [12]:
low_performing = regional_summary[regional_summary["Total_Revenue"] < 5000]
print("\nLow-Performing Regions (Revenue < 5,000):")
print(low_performing)



Low-Performing Regions (Revenue < 5,000):
  Region  Total_Revenue  Total_Quantity
2  South          500.0             1.0
3   West         2500.0             3.0


In [None]:
regional_summary_sorted.to_csv("../output/regional_revenue_summary.csv", index=False)
print("Saved: regional_revenue_summary.csv")


Saved: regional_revenue_summary.csv


In [None]:


def analyze_regional_sales(input_file, revenue_threshold=5000):
    """
    Analyze regional sales performance and generate insights.
    
    Args:
        input_file (str): Path to the sales CSV file
        revenue_threshold (float): Threshold for low-performing regions
    
    Returns:
        tuple: (regional_summary, low_performing_regions)
    """
    # Load data
    df = pd.read_csv(input_file)
    print("Step 1: Dataset loaded")
    
    # Clean data
    clean_df = df.dropna(subset=["quantity"])
    clean_df["quantity"] = pd.to_numeric(clean_df["quantity"], errors='coerce')
    clean_df["price"] = pd.to_numeric(clean_df["price"], errors='coerce')
    clean_df = clean_df.drop_duplicates()
    print("Step 2: Data cleaned")
    
    # Compute revenue
    clean_df["revenue"] = clean_df["quantity"] * clean_df["price"]
    print("Step 3: Revenue calculated")
    
    # Aggregate by region
    regional_summary = clean_df.groupby("region").agg({
        "revenue": "sum",
        "quantity": "sum"
    }).reset_index()
    regional_summary.columns = ["Region", "Total_Revenue", "Total_Quantity"]
    regional_summary = regional_summary.sort_values("Total_Revenue", ascending=False)
    print("Step 4: Regional aggregations computed")
    
    # Identify low-performing regions
    low_performing = regional_summary[regional_summary["Total_Revenue"] < revenue_threshold]
    print("Step 5: Key insights identified")
    
    # Generate outputs
    regional_summary.to_csv("../output/regional_revenue_summary.csv", index=False)
    if len(low_performing) > 0:
        low_performing.to_csv("../output/low_performing_regions.csv", index=False)
    else:
        pd.DataFrame(columns=["Region", "Total_Revenue", "Total_Quantity"]).to_csv(
            "../output/low_performing_regions.csv", index=False
        )
    print("Step 6: Output files generated")
    
    # Display insights
    top_region = regional_summary.iloc[0]
    print("\n=== Key Insights ===")
    print(f"Top-Performing Region: {top_region['Region']} (Revenue: {top_region['Total_Revenue']:,.2f})")
    print(f"Low-Performing Regions: {len(low_performing)}")
    if len(low_performing) > 0:
        print("Regions needing attention:")
        for _, row in low_performing.iterrows():
            print(f"  - {row['Region']}: {row['Total_Revenue']:,.2f}")
    
    return regional_summary, low_performing

regional_summary, low_performing = analyze_regional_sales("../data/regional_sales.csv")