# RETAIL STORE SALES ANALYSIS - SECTION C
## Data Analysis (Questions 6-8)

This notebook covers:
- Q6: Sales Trend Analysis
- Q7: Customer Insights
- Q8: Payment & Channel Analysis

## Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# The project root is taken to be the parent of the current working directory;
# the processed CSV is looked up under <root>/data/processed/.
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
transformed_path = os.path.join(
    project_root,
    'data',
    'processed',
    'transformed_sales_data.csv',
)

# Read the transformed sales data and parse the transaction dates so that
# min()/max() below operate on real timestamps rather than strings.
df = pd.read_csv(transformed_path)
df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])

# Quick sanity summary: size of the frame and the period it covers.
row_count, column_count = df.shape
transaction_dates = df['Transaction Date']
print("Dataset loaded!")
print(f"Shape: {row_count} rows √ó {column_count} columns")
print(f"Date range: {transaction_dates.min().date()} to {transaction_dates.max().date()}")

# Q6: Sales Trend Analysis
## Calculate total sales per year, month, category and identify top items

## Total Sales per Year

In [None]:
print("üìä TOTAL SALES PER YEAR")
print("="*50)

sales_per_year = df.groupby('Year')['Total Spent'].agg(['sum', 'mean', 'count']).round(2)
sales_per_year.columns = ['Total Revenue', 'Avg Transaction', 'Transaction Count']

for year, row in sales_per_year.iterrows():
    print(f"\n{int(year)}:")
    print(f"  Revenue: ${row['Total Revenue']:,.2f}")
    print(f"  Transactions: {int(row['Transaction Count']):,}")
    print(f"  Average: ${row['Avg Transaction']:.2f}")

# Visualize
plt.figure(figsize=(10, 5))
plt.bar(sales_per_year.index.astype(str), sales_per_year['Total Revenue'], color='skyblue', edgecolor='navy')
plt.title('Total Revenue by Year', fontsize=14, fontweight='bold')
plt.xlabel('Year')
plt.ylabel('Revenue ($)')
for i, (year, row) in enumerate(sales_per_year.iterrows()):
    plt.text(i, row['Total Revenue'] + 500, f'${row["Total Revenue"]:,.0f}', 
             ha='center', fontweight='bold')
plt.show()

## Total Sales per Month

In [None]:
print("\nüìä TOTAL SALES PER MONTH")
print("="*50)

sales_per_month = df.groupby('Month_Name')['Total Spent'].sum().round(2)

month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
sales_per_month = sales_per_month.reindex(month_order)

print("Monthly revenue (all years combined):")
for month, revenue in sales_per_month.items():
    if not pd.isna(revenue):
        print(f"  {month:<10}: ${revenue:10,.2f}")

top_months = sales_per_month.nlargest(3)
print(f"\nüèÜ Peak months: {', '.join(top_months.index[:2])} and {top_months.index[2]}")

plt.figure(figsize=(12, 5))
plt.plot(range(1, 13), sales_per_month.values, marker='o', linewidth=2, markersize=8, color='crimson')
plt.xticks(range(1, 13), [m[:3] for m in month_order], rotation=45)
plt.title('Monthly Sales Trend (All Years Combined)', fontsize=14, fontweight='bold')
plt.xlabel('Month')
plt.ylabel('Total Revenue ($)')
plt.grid(True, alpha=0.3)
plt.show()

## Total Sales per Category

In [None]:
print("\nüìä TOTAL SALES PER CATEGORY")
print("="*50)

sales_per_category = df.groupby('Category')['Total Spent'].agg(['sum', 'mean', 'count']).round(2)
sales_per_category = sales_per_category.sort_values('sum', ascending=False)
sales_per_category.columns = ['Total Revenue', 'Avg Transaction', 'Transaction Count']

for cat, row in sales_per_category.iterrows():
    pct = (row['Total Revenue'] / sales_per_category['Total Revenue'].sum()) * 100
    print(f"\n{cat}:")
    print(f"  Revenue: ${row['Total Revenue']:10,.2f} ({pct:.1f}%)")
    print(f"  Transactions: {int(row['Transaction Count']):4d}")
    print(f"  Average: ${row['Avg Transaction']:.2f}")

plt.figure(figsize=(10, 6))
colors = plt.cm.Set3(np.linspace(0, 1, len(sales_per_category)))
plt.barh(sales_per_category.index, sales_per_category['Total Revenue'], color=colors)
plt.title('Revenue by Category', fontsize=14, fontweight='bold')
plt.xlabel('Total Revenue ($)')
plt.ylabel('Category')
for i, (cat, row) in enumerate(sales_per_category.iterrows()):
    plt.text(row['Total Revenue'] + 500, i, f'${row["Total Revenue"]:,.0f}', va='center')
plt.tight_layout()
plt.show()

## Top 5 Revenue-Generating Items

In [None]:
print("\nüèÜ TOP 5 REVENUE-GENERATING ITEMS")
print("="*50)

top_items = df.groupby('Item').agg({
    'Total Spent': ['sum', 'mean', 'count']
}).round(2)
top_items.columns = ['Total Revenue', 'Avg Transaction', 'Transaction Count']
top_items = top_items.sort_values('Total Revenue', ascending=False).head(5)

for i, (item, row) in enumerate(top_items.iterrows(), 1):
    revenue_pct = (row['Total Revenue'] / df['Total Spent'].sum()) * 100
    print(f"\n{i}. {item}")
    print(f"   Revenue: ${row['Total Revenue']:10,.2f} ({revenue_pct:.1f}% of total)")
    print(f"   Transactions: {int(row['Transaction Count']):4d}")
    print(f"   Average: ${row['Avg Transaction']:.2f}")

plt.figure(figsize=(10, 5))
plt.bar(range(1, 6), top_items['Total Revenue'], color=['gold', 'silver', '#cd7f32', 'skyblue', 'lightgreen'])
plt.xticks(range(1, 6), [item[:20] + '...' if len(item) > 20 else item for item in top_items.index], rotation=45, ha='right')
plt.title('Top 5 Revenue-Generating Items', fontsize=14, fontweight='bold')
plt.xlabel('Item')
plt.ylabel('Total Revenue ($)')
for i, (item, row) in enumerate(top_items.iterrows(), 1):
    plt.text(i, row['Total Revenue'] + 50, f'${row["Total Revenue"]:,.0f}', ha='center', fontweight='bold')
plt.tight_layout()
plt.show()

# Q7: Customer Insights
## Average spending, highest spender, discount analysis

## Average Spending per Customer

In [None]:
print("üë• CUSTOMER SPENDING ANALYSIS")
print("="*50)

customer_stats = df.groupby('Customer ID').agg({
    'Total Spent': ['sum', 'mean', 'count'],
    'Discount Applied': 'mean'
}).round(2)

customer_stats.columns = ['Lifetime Value', 'Avg Transaction', 'Transaction Count', 'Discount Rate']

print(f"\nAverage spending per customer:")
print(f"  Mean lifetime value: ${customer_stats['Lifetime Value'].mean():,.2f}")
print(f"  Median lifetime value: ${customer_stats['Lifetime Value'].median():,.2f}")
print(f"  Range: ${customer_stats['Lifetime Value'].min():.2f} - ${customer_stats['Lifetime Value'].max():,.2f}")

print(f"\nAverage transaction per customer:")
print(f"  Mean: ${customer_stats['Avg Transaction'].mean():.2f}")
print(f"  Median: ${customer_stats['Avg Transaction'].median():.2f}")

plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.hist(customer_stats['Lifetime Value'], bins=30, edgecolor='black', alpha=0.7, color='purple')
plt.axvline(customer_stats['Lifetime Value'].mean(), color='red', linestyle='--', label=f"Mean: ${customer_stats['Lifetime Value'].mean():.0f}")
plt.axvline(customer_stats['Lifetime Value'].median(), color='blue', linestyle='--', label=f"Median: ${customer_stats['Lifetime Value'].median():.0f}")
plt.xlabel('Lifetime Value ($)')
plt.ylabel('Number of Customers')
plt.title('Distribution of Customer Lifetime Value')
plt.legend()

plt.subplot(1, 2, 2)
plt.hist(customer_stats['Transaction Count'], bins=20, edgecolor='black', alpha=0.7, color='teal')
plt.xlabel('Number of Transactions')
plt.ylabel('Number of Customers')
plt.title('Transaction Frequency Distribution')
plt.tight_layout()
plt.show()

## Customer with Highest Lifetime Spending

In [None]:
print("\nüèÜ TOP CUSTOMERS")
print("="*50)

top_customers = customer_stats.sort_values('Lifetime Value', ascending=False).head(10)

print("\nTop 10 customers by lifetime spending:")
for i, (cust, row) in enumerate(top_customers.iterrows(), 1):
    print(f"\n{i}. {cust}")
    print(f"   Lifetime Value: ${row['Lifetime Value']:10,.2f}")
    print(f"   Transactions: {int(row['Transaction Count']):4d}")
    print(f"   Avg Transaction: ${row['Avg Transaction']:7,.2f}")
    print(f"   Discount Rate: {row['Discount Rate']*100:.1f}%")

top_customer = customer_stats['Lifetime Value'].idxmax()
top_value = customer_stats.loc[top_customer, 'Lifetime Value']
top_transactions = int(customer_stats.loc[top_customer, 'Transaction Count'])

print(f"\n{'‚≠ê'*20}")
print(f"üèÜ CUSTOMER OF THE YEAR: {top_customer}")
print(f"   Total spent: ${top_value:,.2f}")
print(f"   Number of purchases: {top_transactions}")
print(f"{'‚≠ê'*20}")

## Discount Analysis

In [None]:
print("\nüí∞ DISCOUNT ANALYSIS")
print("="*50)

total_discounted = df['Discount Applied'].sum()
discount_pct = (total_discounted / len(df)) * 100

print(f"Transactions with discount: {int(total_discounted):,} ({discount_pct:.1f}%)")
print(f"Transactions without discount: {len(df) - int(total_discounted):,} ({100-discount_pct:.1f}%)")

avg_discounted = df[df['Discount Applied']]['Total Spent'].mean()
avg_regular = df[~df['Discount Applied']]['Total Spent'].mean()

print(f"\nAverage transaction value:")
print(f"  With discount: ${avg_discounted:.2f}")
print(f"  Without discount: ${avg_regular:.2f}")
print(f"  Difference: ${avg_regular - avg_discounted:.2f} ({(avg_regular/avg_discounted - 1)*100:.1f}% higher without discount)")

discount_by_category = df.groupby('Category')['Discount Applied'].mean().sort_values(ascending=False) * 100

print(f"\nDiscount rate by category:")
for cat, rate in discount_by_category.items():
    print(f"  {cat:<30}: {rate:.1f}%")

plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.pie([total_discounted, len(df)-total_discounted], 
        labels=['Discounted', 'Regular Price'],
        autopct='%1.1f%%', colors=['lightgreen', 'lightcoral'], startangle=90)
plt.title('Transactions with Discount')

plt.subplot(1, 2, 2)
discount_by_category.plot(kind='barh', color='skyblue', edgecolor='navy')
plt.xlabel('Discount Rate (%)')
plt.title('Discount Rate by Category')
plt.tight_layout()
plt.show()

# Q8: Payment & Channel Analysis
## Payment method revenue and Online vs In-Store comparison

## Payment Method Analysis

In [None]:
print("üí≥ PAYMENT METHOD ANALYSIS")
print("="*50)

payment_analysis = df.groupby('Payment Method').agg({
    'Total Spent': ['sum', 'mean', 'count']
}).round(2)
payment_analysis.columns = ['Total Revenue', 'Avg Transaction', 'Transaction Count']
payment_analysis = payment_analysis.sort_values('Total Revenue', ascending=False)

print("Revenue by payment method:")
total_revenue = payment_analysis['Total Revenue'].sum()
for method, row in payment_analysis.iterrows():
    revenue_pct = (row['Total Revenue'] / total_revenue) * 100
    trans_pct = (row['Transaction Count'] / len(df)) * 100
    print(f"\n  {method}:")
    print(f"    Revenue: ${row['Total Revenue']:10,.2f} ({revenue_pct:.1f}%)")
    print(f"    Transactions: {int(row['Transaction Count']):4d} ({trans_pct:.1f}%)")
    print(f"    Average: ${row['Avg Transaction']:7,.2f}")

top_payment = payment_analysis.index[0]
top_payment_revenue = payment_analysis.loc[top_payment, 'Total Revenue']

print(f"\n{'üí≥'*20}")
print(f"üèÜ TOP PAYMENT METHOD: {top_payment}")
print(f"   Revenue: ${top_payment_revenue:,.2f}")
print(f"{'üí≥'*20}")

## Online vs In-Store Analysis

In [None]:
print("\nüåê ONLINE VS IN-STORE ANALYSIS")
print("="*50)

channel_analysis = df.groupby('Location').agg({
    'Total Spent': ['sum', 'mean', 'median', 'count']
}).round(2)
channel_analysis.columns = ['Total Revenue', 'Avg Transaction', 'Median Transaction', 'Transaction Count']

total_rev = channel_analysis['Total Revenue'].sum()
total_trans = channel_analysis['Transaction Count'].sum()

print("\nChannel performance:")
for location, row in channel_analysis.iterrows():
    revenue_pct = (row['Total Revenue'] / total_rev) * 100
    trans_pct = (row['Transaction Count'] / total_trans) * 100
    print(f"\n  {location}:")
    print(f"    Revenue: ${row['Total Revenue']:10,.2f} ({revenue_pct:.1f}%)")
    print(f"    Transactions: {int(row['Transaction Count']):5d} ({trans_pct:.1f}%)")
    print(f"    Average: ${row['Avg Transaction']:7,.2f}")
    print(f"    Median: ${row['Median Transaction']:7,.2f}")

plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.bar(channel_analysis.index, channel_analysis['Total Revenue'], color=['lightblue', 'lightgreen'])
plt.title('Revenue by Channel')
plt.ylabel('Revenue ($)')
for i, (loc, row) in enumerate(channel_analysis.iterrows()):
    plt.text(i, row['Total Revenue'] + 500, f'${row["Total Revenue"]:,.0f}', ha='center', fontweight='bold')

plt.subplot(1, 2, 2)
plt.bar(channel_analysis.index, channel_analysis['Transaction Count'], color=['lightblue', 'lightgreen'])
plt.title('Transaction Volume by Channel')
plt.ylabel('Number of Transactions')
for i, (loc, row) in enumerate(channel_analysis.iterrows()):
    plt.text(i, row['Transaction Count'] + 5, f'{int(row["Transaction Count"])}', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

# SECTION C SUMMARY

✅ **Q6 Completed: Sales Trend Analysis**
- Analyzed sales by year, month, and category
- Identified top 5 revenue-generating items

✅ **Q7 Completed: Customer Insights**
- Average customer spend calculated
- Top customer identified
- Discount analysis complete

✅ **Q8 Completed: Payment & Channel Analysis**
- Top payment method identified
- Online vs In-store comparison complete

**Ready to proceed to Section D: Visualizations**