# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from fpdf import FPDF
import os

# Read the three input tables; the unpacking order follows the path order.
customers, products, transactions = (
    pd.read_csv('../data/Customers.csv'),
    pd.read_csv('../data/Products.csv'),
    pd.read_csv('../data/Transactions.csv'),
)

# Destination folder for the plot images and the PDF report.
# exist_ok=True makes re-running the cell harmless when the folder is already there.
output_dir = '../outputs/'
os.makedirs(output_dir, exist_ok=True)

# Report document: an FPDF subclass with a fixed page header plus small
# helpers for section titles, body paragraphs and embedded plot images.
class PDF(FPDF):
    """EDA report document with a page header and section helpers.

    Every helper selects the built-in 'Arial' core font. FPDF's core fonts
    can only encode Latin-1 text (see the FPDF documentation), so titles and
    bodies are sanitised before being drawn.
    """

    @staticmethod
    def _latin1_safe(text):
        """Return *text* as a string with non-Latin-1 characters replaced by '?'.

        Without this, a single unsupported character (for example one that
        comes from the CSV data embedded in a summary) aborts PDF generation
        with an encoding error.
        """
        return str(text).encode('latin-1', 'replace').decode('latin-1')

    def header(self):
        """Draw the report title at the top of the page.

        This overrides FPDF's header hook, which the library calls when a
        page is started.
        """
        self.set_font('Arial', 'B', 12)
        # Width 0 stretches the cell from the left margin to the right margin,
        # so align='C' centres the title on the page. The previous fixed
        # width of 200 overran the right margin and shifted the title.
        self.cell(0, 10, 'Task 1: Exploratory Data Analysis (EDA) and Business Insights', ln=True, align='C')
        self.ln(10)  # Line break

    def chapter_title(self, title):
        """Write *title* as a bold, full-width section heading."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, self._latin1_safe(title), 0, 1)
        self.ln(5)

    def chapter_body(self, body):
        """Write *body* as wrapped regular-weight text followed by a gap."""
        self.set_font('Arial', '', 12)
        self.multi_cell(0, 10, self._latin1_safe(body))
        self.ln(10)

    def add_plot(self, plot_filename, spacing=5):
        """Embed the image at *plot_filename*, 180 units wide, at x=10.

        Args:
            plot_filename: Path of the image file to place on the page.
            spacing: Vertical gap left below the image before the next
                element.
        """
        # With y=None FPDF places the image at the current vertical position
        # and moves the cursor to the bottom of the image, so only a small
        # gap is needed afterwards. The former fixed ln(85) came on top of
        # the image height and left a large blank area after every plot.
        self.image(plot_filename, x=10, y=None, w=180)
        self.ln(spacing)

# Build the report document and open its first page.
pdf = PDF()
pdf.add_page()

# Sections of the report follow: each one adds a heading, then text and/or a plot.

# 1. General data summary
pdf.chapter_title('1. General Data Summary')

# Assemble the summary line by line. The leading and trailing empty entries
# give the text its opening and closing newline once the lines are joined.
summary_lines = [
    "",
    f"Number of Customers: {len(customers)}",
    f"Number of Products: {len(products)}",
    f"Number of Transactions: {len(transactions)}",
    "",
    "Data Summary for Customers:",
    f"{customers.describe()}",
    "",
    "Data Summary for Products:",
    f"{products.describe()}",
    "",
    "Data Summary for Transactions:",
    f"{transactions.describe()}",
    "",
]
summary = "\n".join(summary_lines)
pdf.chapter_body(summary)

# 2. Distinct customers, products and transactions (counted on their ID columns)
num_customers, num_products, num_transactions = (
    customers['CustomerID'].nunique(),
    products['ProductID'].nunique(),
    transactions['TransactionID'].nunique(),
)

pdf.chapter_title('2. Unique Counts')
unique_counts_text = (
    "Number of Unique Customers: {}\n"
    "Number of Unique Products: {}\n"
    "Number of Transactions: {}\n"
).format(num_customers, num_products, num_transactions)
pdf.chapter_body(unique_counts_text)

# 3. Top-Selling Products (Bar Chart)
# Per product: summed TotalValue and number of transactions.
product_sales = transactions.groupby('ProductID').agg(
    total_sales=('TotalValue', 'sum'),
    num_sales=('TransactionID', 'count'),
)
# Keep the ten products with the most transactions.
top_selling = (
    product_sales.reset_index()
    .sort_values(by='num_sales', ascending=False)
    .head(10)
)

# Draw the horizontal bar chart and store it as a PNG for embedding.
top_products_plot_path = os.path.join(output_dir, 'top_selling_products.png')
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='num_sales', y='ProductID', data=top_selling, palette='viridis', ax=ax)
ax.set_title('Top 10 Most Purchased Products')
ax.set_xlabel('Number of Purchases')
ax.set_ylabel('Product ID')
fig.savefig(top_products_plot_path)
plt.close(fig)

# Heading, chart, then the written interpretation.
pdf.chapter_title('3. Top-Selling Products')
pdf.add_plot(top_products_plot_path)
top_selling_text = (
    "The bar chart titled 'Top 10 Most Purchased Products' shows that the 'ActiveWear Smartwatch' leads in sales, "
    "followed by 'SoundWave Headphones' and 'BookWorld Biography.' This data is essential for managing inventory, "
    "refining marketing strategies, and considering potential product expansions."
)
pdf.chapter_body(top_selling_text)

# 4. Regional Market Share (Pie Chart)
# Number of customers per region, as a two-column table.
region_counts = customers['Region'].value_counts()
customer_region_dist = region_counts.reset_index()
customer_region_dist.columns = ['Region', 'Count']

# Draw the pie chart and store it as a PNG for embedding.
region_plot_path = os.path.join(output_dir, 'customer_distribution_by_region.png')
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    customer_region_dist['Count'],
    labels=customer_region_dist['Region'],
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title('Customer Distribution by Region')
fig.savefig(region_plot_path)
plt.close(fig)

# Heading, chart, then the written interpretation.
pdf.chapter_title('4. Regional Market Share')
pdf.add_plot(region_plot_path)
regional_text = (
    "The pie chart 'Customer Distribution by Region' reveals that 'South America' holds the largest share of "
    "customers, making up around 29.5% of the total. This insight can inform targeted marketing efforts, regional "
    "growth initiatives, and inventory planning based on demand in different areas."
)
pdf.chapter_body(regional_text)

# 5. Seasonal Sales Trends (Line Graph)
# Group on the first seven characters of TransactionDate
# (presumably 'YYYY-MM' - confirm against the CSV) and sum TotalValue per group.
month_key = transactions['TransactionDate'].str.slice(stop=7)
monthly_sales = (
    transactions.groupby(month_key)
    .agg(total_sales=('TotalValue', 'sum'))
    .reset_index()
)
# Convert the month strings to datetimes so the x-axis is a real time axis.
monthly_sales['TransactionDate'] = pd.to_datetime(monthly_sales['TransactionDate'])

# Draw the line chart and store it as a PNG for embedding.
seasonal_sales_plot_path = os.path.join(output_dir, 'monthly_sales_trends.png')
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x='TransactionDate', y='total_sales', data=monthly_sales, marker='o', ax=ax)
ax.set_title('Monthly Sales Trends')
ax.set_xlabel('Month')
ax.set_ylabel('Total Sales')
plt.xticks(rotation=45)  # acts on the current axes, i.e. the one created above
fig.savefig(seasonal_sales_plot_path)
plt.close(fig)

# Heading, chart, then the written interpretation.
pdf.chapter_title('5. Seasonal Sales Trends')
pdf.add_plot(seasonal_sales_plot_path)
seasonal_text = (
    "The line graph 'Monthly Sales Trends' displays a fluctuating sales pattern, with a notable spike in January 2024, "
    "followed by a dip and another peak in July. This trend can help forecast future sales cycles, plan for seasonal "
    "promotions, and optimize inventory levels to align with demand."
)
pdf.chapter_body(seasonal_text)

# 6. Product Category Popularity (Insights)
# Text-only section: no chart is produced for it.
pdf.chapter_title('6. Product Category Popularity')
category_text = "".join([
    "While not directly visualized, the data on product categories offers valuable insights into which types of products ",
    "are most popular and generate the highest revenue. This can guide decisions about refining the product range, reallocating ",
    "marketing resources, and spotting new opportunities for growth.",
])
pdf.chapter_body(category_text)

# 7. Customer Lifetime Value (CLTV) (Insights)
# Text-only section: no chart is produced for it.
pdf.chapter_title('7. Customer Lifetime Value (CLTV) Insight')
# NOTE: keep this text within Latin-1 (a plain ASCII apostrophe). The previous
# literal held a mis-encoded curly apostrophe whose characters fall outside
# Latin-1, which FPDF's built-in fonts cannot encode.
pdf.chapter_body(
    "Although CLTV isn't explicitly calculated, the available data on customer sign-up dates and purchase history can be "
    "analyzed to estimate CLTV across different regions. This can help identify high-value customer segments, create targeted "
    "loyalty programs, and enhance customer retention strategies to drive revenue growth."
)

# Write the finished report into the same folder as the plot images,
# then report the location on stdout.
pdf_output_path = os.path.join(output_dir, 'Task1_EDA_Report.pdf')
pdf.output(pdf_output_path)

print("PDF report generated: {}".format(pdf_output_path))
