# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from fpdf import FPDF

# Load datasets
# The three input tables are read, in this order, from CSV files in the
# current working directory.
customers, products, transactions = map(
    pd.read_csv,
    ("Customers.csv", "Products.csv", "Transactions.csv"),
)

# Task 1: Exploratory Data Analysis (EDA)
# Basic Information
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line per table.
print("Dataset Information:")
customers.info()
products.info()
transactions.info()

# Check for missing values (per-column count of null cells)
print("\nMissing Values:")
print("Customers:")
print(customers.isnull().sum())
print("Products:")
print(products.isnull().sum())
print("Transactions:")
print(transactions.isnull().sum())

# Descriptive statistics
# Customers and products are summarised across all column types; transactions
# keep describe()'s default column selection.
print("\nDescriptive Statistics:")
print("Customers:")
print(customers.describe(include='all'))
print("Products:")
print(products.describe(include='all'))
print("Transactions:")
print(transactions.describe())

# Visualizations
print("\nGenerating visualizations...")

# 1. Customer distribution by region
# One bar per region, counting the rows of `customers` that fall in it.
region_fig, region_ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=customers, x='Region', palette='viridis', ax=region_ax)
region_ax.set_title("Customer Distribution by Region")
region_ax.set_xlabel("Region")
region_ax.set_ylabel("Count")
plt.setp(region_ax.get_xticklabels(), rotation=45)
region_fig.tight_layout()
region_fig.savefig("customer_distribution_by_region.png", dpi=300)
plt.close(region_fig)

# 2. Most popular product categories
# Horizontal bars counting products per category, most frequent category first.
category_order = products['Category'].value_counts().index
category_fig, category_ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=products,
    y='Category',
    order=category_order,
    palette='coolwarm',
    ax=category_ax,
)
category_ax.set_title("Product Categories Popularity")
category_ax.set_xlabel("Count")
category_ax.set_ylabel("Category")
category_fig.tight_layout()
category_fig.savefig("product_categories_popularity.png", dpi=300)
plt.close(category_fig)

# 3. Total transaction value over time
# TransactionDate is converted to datetime in place; the converted column is
# then grouped by calendar date and TotalValue is summed per day.
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])
transaction_day = transactions['TransactionDate'].dt.date
transactions_by_date = (
    transactions.groupby(transaction_day)['TotalValue']
    .sum()
    .reset_index()
)
daily_fig, daily_ax = plt.subplots(figsize=(12, 8))
daily_ax.plot(
    transactions_by_date['TransactionDate'],
    transactions_by_date['TotalValue'],
    marker='o',
)
daily_ax.set_title("Total Transaction Value Over Time")
daily_ax.set_xlabel("Date")
daily_ax.set_ylabel("Total Value (USD)")
daily_ax.grid(True)
plt.setp(daily_ax.get_xticklabels(), rotation=45)
daily_fig.tight_layout()
daily_fig.savefig("total_transaction_value_over_time.png", dpi=300)
plt.close(daily_fig)

# 4. Top 10 customers by transaction value
# Sum TotalValue per customer, keep the ten largest totals, then join the
# customer table to obtain the names shown on the chart.
customer_totals = transactions.groupby('CustomerID')['TotalValue'].sum()
top_customers = customer_totals.nlargest(10).reset_index()
top_customers = top_customers.merge(customers, on='CustomerID')
top_fig, top_ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=top_customers,
    x='TotalValue',
    y='CustomerName',
    palette='mako',
    ax=top_ax,
)
top_ax.set_title("Top 10 Customers by Transaction Value")
top_ax.set_xlabel("Total Value (USD)")
top_ax.set_ylabel("Customer Name")
top_fig.tight_layout()
top_fig.savefig("top_10_customers.png", dpi=300)
plt.close(top_fig)

# Additional Visualization: Monthly Revenue Trend
# De-duplicate the product keys before the left join so that a repeated
# ProductID in `products` cannot duplicate transaction rows and inflate revenue.
merged_data = transactions.merge(
    products[['ProductID']].drop_duplicates(),
    on='ProductID',
    how='left',
)
# Derive the month from merged_data's own rows: a column-on-column merge
# returns a frame with a new index, so a column taken from `transactions`
# only lines up when both frames happen to share the same index.
# to_datetime leaves an already-converted column unchanged.
merged_data['Month'] = pd.to_datetime(merged_data['TransactionDate']).dt.to_period('M')
monthly_transactions = merged_data.groupby('Month')['TotalValue'].sum()
plt.figure(figsize=(12, 8))
monthly_transactions.plot(kind='line', marker='o', color='green')
plt.title('Monthly Revenue Trend')
plt.xlabel('Month')
plt.ylabel('Revenue')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
# Same resolution as the other report figures.
plt.savefig('monthly_revenue_trend.png', dpi=300)
plt.close()

# Business Insights
def build_business_insights(customers, products, transactions):
    """Derive the report's headline statements from the loaded tables.

    The statements are computed from the data so the report text agrees with
    the generated figures instead of naming placeholder regions/categories.
    An insight is omitted when the data it needs is empty.

    Args:
        customers: DataFrame with a 'Region' column.
        products: DataFrame with a 'Category' column.
        transactions: DataFrame with 'CustomerID', 'TransactionDate' and
            'TotalValue' columns.

    Returns:
        A list of insight sentences restricted to Latin-1 characters.
    """
    insights = []

    # value_counts() sorts descending, so the first entry is the largest group.
    region_counts = customers['Region'].value_counts()
    if not region_counts.empty:
        insights.append(
            f"{region_counts.index[0]} has the highest number of customers "
            f"({region_counts.iloc[0]}), indicating a potential market focus area."
        )

    # Same measure as the category chart: number of products per category.
    category_counts = products['Category'].value_counts()
    if not category_counts.empty:
        insights.append(
            f"{category_counts.index[0]} is the most popular product category "
            f"({category_counts.iloc[0]} products), suggesting it should be "
            "prioritized in promotions and stock."
        )

    # to_datetime leaves an already-converted column unchanged.
    months = pd.to_datetime(transactions['TransactionDate']).dt.to_period('M')
    monthly_revenue = transactions.groupby(months)['TotalValue'].sum()
    if not monthly_revenue.empty:
        insights.append(
            f"Transaction values peak in {monthly_revenue.idxmax()} "
            f"({monthly_revenue.max():,.2f} USD), hinting at seasonal trends."
        )

    customer_revenue = transactions.groupby('CustomerID')['TotalValue'].sum()
    total_revenue = customer_revenue.sum()
    if total_revenue > 0:
        top_count = min(10, len(customer_revenue))
        top_share = customer_revenue.nlargest(10).sum() / total_revenue
        insights.append(
            f"The top {top_count} of {len(customer_revenue)} customers account "
            f"for {top_share:.1%} of total revenue."
        )

    # FPDF's built-in (core) fonts only cover Latin-1; replace anything else so
    # data-derived labels cannot break PDF output.
    return [
        text.encode('latin-1', 'replace').decode('latin-1') for text in insights
    ]


business_insights = build_business_insights(customers, products, transactions)

# Save EDA insights and plots to a PDF
class PDF(FPDF):
    """FPDF document with helpers for the EDA report's text and plot pages."""

    def _write_bold_line(self, text, align):
        """Write *text* in bold 12 pt Arial as one full-width line of height 10.

        Args:
            text: The string to print.
            align: Alignment code passed to ``cell`` ('L' or 'C' here).
        """
        self.set_font('Arial', 'B', 12)
        # w=0 spans to the right margin, no border, ln=1 moves to the next line.
        self.cell(0, 10, text, 0, 1, align)

    def header(self):
        """Write the centered report title (FPDF's per-page header hook)."""
        self._write_bold_line('EDA Business Insights', 'C')

    def chapter_title(self, chapter_title):
        """Write a left-aligned bold section heading followed by a 5-unit gap."""
        self._write_bold_line(chapter_title, 'L')
        self.ln(5)

    def chapter_body(self, body):
        """Write *body* as wrapped regular 12 pt text followed by a line break."""
        self.set_font('Arial', '', 12)
        self.multi_cell(0, 10, body)
        self.ln()

    def add_plot(self, image_path, title):
        """Start a new page with a centered bold *title* and the image below it.

        Args:
            image_path: Path of the image file to embed.
            title: Caption printed above the image.
        """
        self.add_page()
        self._write_bold_line(title, 'C')
        self.ln(10)
        # Fixed placement: left edge at x=10, top at y=30, scaled to width 180.
        self.image(image_path, x=10, y=30, w=180)

# Assemble the report: one page of numbered insights, then one page per plot.
pdf = PDF()
pdf.add_page()
pdf.set_auto_page_break(auto=True, margin=15)

# Adding insights
pdf.chapter_title("Business Insights")
for number, statement in enumerate(business_insights, start=1):
    pdf.chapter_body(f"{number}. {statement}")

# Adding visualizations to PDF
# Each entry pairs a saved image file with the caption printed above it.
plots = [
    ("customer_distribution_by_region.png", "Customer Distribution by Region"),
    ("product_categories_popularity.png", "Product Categories Popularity"),
    ("total_transaction_value_over_time.png", "Total Transaction Value Over Time"),
    ("top_10_customers.png", "Top 10 Customers by Transaction Value"),
    ("monthly_revenue_trend.png", "Monthly Revenue Trend"),
]

for plot_entry in plots:
    pdf.add_plot(*plot_entry)

pdf.output("FirstName_LastName_EDA.pdf")
print("\nEDA complete. Insights and visualizations saved to FirstName_LastName_EDA.pdf.")