# RETAIL STORE SALES ANALYSIS - SECTION B
## Data Transformation (Questions 4-5)

This notebook covers:
- Q4: Feature Engineering (Revenue Category, Is_Online)
- Q5: Categorization (Unique counts)

## Import Libraries and Load Cleaned Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Locate the cleaned dataset relative to the kernel's working directory.
# NOTE(review): this assumes the kernel is started one level below the project
# root (e.g. in a notebooks/ folder) -- os.getcwd() is the working directory,
# not necessarily the folder that holds this notebook.
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
cleaned_path = os.path.join(project_root, 'data', 'processed', 'cleaned_sales_data.csv')

# read_csv raises FileNotFoundError if the cleaning step has not been run yet.
df = pd.read_csv(cleaned_path)

print("Cleaned dataset loaded!")
# The multiplication sign was previously mis-encoded and printed as garbage.
print(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns")
print("\nFirst 5 rows:")
df.head()

# Q4: Feature Engineering
## Create Revenue Category and Is_Online fields

## Analyze Total Spent Distribution

In [None]:
# Summarise the spend column before choosing category cut-offs
spent = df['Total Spent']

print("TOTAL SPENT DISTRIBUTION")
print("="*50)
summary_rows = (
    ('Min', spent.min()),
    ('Max', spent.max()),
    ('Mean', spent.mean()),
    ('Median', spent.median()),
)
for stat_name, stat_value in summary_rows:
    print(f"{stat_name}: ${stat_value:.2f}")

print("\nPercentiles:")
for pct in (25, 50, 75, 90, 95):
    pct_value = spent.quantile(pct/100)
    print(f"{pct}th percentile: ${pct_value:.2f}")

# Histogram with the two category boundaries drawn as dashed guides
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(spent, bins=50, edgecolor='black', alpha=0.7)
boundaries = (
    (100, 'red', 'Low/Medium boundary ($100)'),
    (300, 'green', 'Medium/High boundary ($300)'),
)
for x_pos, line_color, line_label in boundaries:
    ax.axvline(x_pos, color=line_color, linestyle='--', label=line_label)
ax.set_xlabel('Total Spent ($)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Transaction Amounts')
ax.legend()
plt.show()

## Create Revenue Category

In [None]:
# Define categorization function
def categorize_revenue(amount):
    """Bucket a transaction amount into a revenue category label.

    Parameters
    ----------
    amount : int or float
        Transaction total; may be missing (None / NaN).

    Returns
    -------
    str or float
        'Low (< 100)' for amounts below 100, 'Medium (100–300)' for amounts
        from 100 to 300 inclusive, 'High (> 300)' for anything larger.
        A missing amount yields NaN rather than a label.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing amount would fall through to the last branch and be counted
    # as a "High" transaction.
    if pd.isna(amount):
        return np.nan
    if amount < 100:
        return 'Low (< 100)'
    if amount <= 300:
        # The range separator is an en dash; it was previously mis-encoded.
        return 'Medium (100–300)'
    return 'High (> 300)'

# Label every transaction with its revenue bucket
df['Revenue Category'] = df['Total Spent'].apply(categorize_revenue)

print("REVENUE CATEGORY CREATED")
print("="*50)
print("\nCategory distribution:")
category_counts = df['Revenue Category'].value_counts()
n_rows = len(df)
for label, n_trans in category_counts.items():
    in_category = df['Revenue Category'] == label
    category_total = df.loc[in_category, 'Total Spent'].sum()
    share = n_trans / n_rows * 100
    print(f"  {label}: {n_trans:,} transactions ({share:.1f}%)")
    print(f"    Total value: ${category_total:,.2f}")

# Bar chart of transaction counts, annotated with the count above each bar
fig, ax = plt.subplots(figsize=(8, 5))
colors = ['#ff9999', '#66b3ff', '#99ff99']
category_counts.plot(kind='bar', color=colors, edgecolor='black', ax=ax)
ax.set_title('Transactions by Revenue Category')
ax.set_xlabel('Revenue Category')
ax.set_ylabel('Number of Transactions')
plt.setp(ax.get_xticklabels(), rotation=45)
for position, height in enumerate(category_counts.values):
    ax.text(position, height + 5, str(height), ha='center', fontweight='bold')
fig.tight_layout()
plt.show()

## Create Is_Online Field

In [None]:
# Binary flag: 1 where Location is exactly 'Online', 0 for everything else
# (including missing locations, which never compare equal to 'Online')
df['Is_Online'] = df['Location'].eq('Online').astype('int64')

print("IS_ONLINE FIELD CREATED")
print("="*50)
print("Location distribution:")
location_dist = df['Location'].value_counts()
total_rows = len(df)
for place, n_trans in location_dist.items():
    print(f"  {place}: {n_trans:,} transactions ({n_trans/total_rows*100:.1f}%)")

online_total = df['Is_Online'].sum()
print(f"\nOnline transactions: {online_total:,}")
print(f"In-store transactions: {total_rows - online_total:,}")

# Pie chart of the location split
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    location_dist.values,
    labels=location_dist.index,
    autopct='%1.1f%%',
    colors=['lightgreen', 'lightblue'],
    startangle=90,
)
ax.set_title('Online vs In-Store Transactions')
plt.show()

# Q5: Categorization
## Count number of unique values

## Count Unique Values

In [None]:
# Distinct-value counts for the four identifier/dimension columns
# (nunique ignores missing values by default).
unique_customers = df['Customer ID'].nunique()
unique_categories = df['Category'].nunique()
unique_payments = df['Payment Method'].nunique()
unique_items = df['Item'].nunique()

print("UNIQUE VALUE COUNTS")
print("="*50)
print(f"Unique Customers: {unique_customers:,}")
print(f"Unique Categories: {unique_categories}")
print(f"Unique Payment Methods: {unique_payments}")
print(f"Unique Items: {unique_items:,}")

# Per-category breakdown
print("\n" + "="*50)
print("DETAILED BREAKDOWN")
print("="*50)

print("\n📦 CATEGORIES:")
# One grouped pass instead of filtering the frame twice per category.
# groupby sorts the keys and skips missing categories, so a NaN in
# 'Category' no longer breaks the sort (sorted() cannot compare str to float).
category_stats = df.groupby('Category')['Total Spent'].agg(['size', 'sum'])
# itertuples(name=None) yields (category, row count, revenue) and keeps the
# row count an integer, which the ':4d' format below requires.
for i, (cat, cat_count, cat_revenue) in enumerate(category_stats.itertuples(name=None), 1):
    print(f"  {i:2d}. {cat:<30} {cat_count:4d} transactions  ${cat_revenue:8,.0f}")

## Payment Methods Detail

In [None]:
print("\n💳 PAYMENT METHODS:")
# Transactions and revenue per payment method
payment_stats = df.groupby('Payment Method').agg({
    'Transaction ID': 'count',
    'Total Spent': 'sum'
}).round(2)
payment_stats.columns = ['Transaction Count', 'Total Revenue']
payment_stats['% of Transactions'] = (payment_stats['Transaction Count'] / len(df) * 100).round(1)
payment_stats['% of Revenue'] = (payment_stats['Total Revenue'] / df['Total Spent'].sum() * 100).round(1)

payment_stats = payment_stats.sort_values('Transaction Count', ascending=False)
# iterrows() would build one Series per row and upcast the integer count to
# float64 (the other columns are floats), which makes the ':4d' format raise
# "Unknown format code 'd' for object of type 'float'". itertuples keeps each
# column's own dtype; name=None returns plain tuples in column order:
# (index, Transaction Count, Total Revenue, % of Transactions, % of Revenue).
for method, n_trans, revenue, pct_trans, pct_revenue in payment_stats.itertuples(name=None):
    print(f"  • {method:<15} {n_trans:4d} trans  ${revenue:8,.0f}  "
          f"({pct_trans}% trans, {pct_revenue}% revenue)")

## Items Summary

In [None]:
print("\n🛒 ITEMS SUMMARY:")
print(f"Total unique items: {unique_items:,}")

# Compute frequencies and per-item revenue once, rather than re-counting for
# the bottom list and re-filtering the whole frame for every item printed.
item_counts = df['Item'].value_counts()
revenue_by_item = df.groupby('Item')['Total Spent'].sum()

print("\nTop 10 most frequent items:")
top_items = item_counts.head(10)
for item, count in top_items.items():
    # Pad the singular form so the revenue column stays aligned
    unit = 'time ' if count == 1 else 'times'
    print(f"  • {item:<15} {count:3d} {unit}  ${revenue_by_item.loc[item]:7,.0f}")

# With fewer than 20 distinct items this list overlaps the top-10 list.
print("\nBottom 10 least frequent items:")
bottom_items = item_counts.tail(10)
for item, count in bottom_items.items():
    unit = 'time ' if count == 1 else 'times'
    print(f"  • {item:<15} {count:3d} {unit}  ${revenue_by_item.loc[item]:6,.0f}")

## Save Transformed Data

In [None]:
# Write the transformed dataset next to the cleaned one; create the folder if
# it does not exist yet so the save cannot fail on a fresh checkout.
output_dir = os.path.join(project_root, 'data', 'processed')
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'transformed_sales_data.csv')
# index=False: the row index is not written as a column.
df.to_csv(output_path, index=False)

# The check mark was previously mis-encoded and printed as garbage.
print(f"✅ Transformed data saved to: {output_path}")
print(f"Dataset now has {df.shape[1]} columns (added: Revenue Category, Is_Online)")

# SECTION B SUMMARY

✅ **Q4 Completed:**
- Created Revenue Category (Low/Medium/High)
- Created Is_Online field

✅ **Q5 Completed** (the exact counts are printed by the "Count Unique Values" cell above; markdown cells cannot interpolate Python variables):
- Unique Customers
- Unique Categories
- Unique Payment Methods
- Unique Items

**Ready to proceed to Section C: Data Analysis**