In [1]:
!pip install pandas matplotlib seaborn scikit-learn reportlab

Collecting reportlab
  Downloading reportlab-4.2.5-py3-none-any.whl.metadata (1.5 kB)
Downloading reportlab-4.2.5-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.2.5


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import seaborn as sns
from datetime import datetime

In [4]:
sns.set(style="whitegrid")

In [5]:
customers_path = "/content/drive/My Drive/data/Customers.csv"
products_path = "/content/drive/My Drive/data/Products.csv"
transactions_path = "/content/drive/My Drive/data/Transactions.csv"

In [6]:
# Load datasets
customers = pd.read_csv(customers_path)
products = pd.read_csv(products_path)
transactions = pd.read_csv(transactions_path)

In [7]:
# Preview datasets
print("Customers Dataset:")
print(customers.head(), "\n")

print("Products Dataset:")
print(products.head(), "\n")

print("Transactions Dataset:")
print(transactions.head(), "\n")

Customers Dataset:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15 

Products Dataset:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31 

Transactions Dataset:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166 

In [8]:
# --- Data Cleaning ---
# Checking for missing values
print("Missing Values in Datasets:")
print("Customers:", customers.isnull().sum(), "\n")
print("Products:", products.isnull().sum(), "\n")
print("Transactions:", transactions.isnull().sum(), "\n")

# Check data types
print("Data Types:")
print("Customers:", customers.dtypes, "\n")
print("Products:", products.dtypes, "\n")
print("Transactions:", transactions.dtypes, "\n")

# Convert dates to datetime format
customers["SignupDate"] = pd.to_datetime(customers["SignupDate"], errors='coerce')
transactions["TransactionDate"] = pd.to_datetime(transactions["TransactionDate"], errors='coerce')

Missing Values in Datasets:
Customers: CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64 

Products: ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64 

Transactions: TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64 

Data Types:
Customers: CustomerID      object
CustomerName    object
Region          object
SignupDate      object
dtype: object 

Products: ProductID       object
ProductName     object
Category        object
Price          float64
dtype: object 

Transactions: TransactionID       object
CustomerID          object
ProductID           object
TransactionDate     object
Quantity             int64
TotalValue         float64
Price              float64
dtype: object 



In [9]:
# Generate PDF for Business Insights
pdf_path = "/content/drive/My Drive/outputs/Sameer_Ahmed_EDA.pdf"
pdf = canvas.Canvas(pdf_path, pagesize=letter)
pdf.setTitle("EDA and Business Insights")
pdf.setFont("Helvetica-Bold", 16)
pdf.drawString(100, 750, "EDA and Business Insights")
pdf.setFont("Helvetica", 12)

In [10]:
# 1. Customer Signups Over Time
customers["SignupYear"] = customers["SignupDate"].dt.year
signup_counts = customers["SignupYear"].value_counts().sort_index()

plt.figure(figsize=(10, 5))
sns.barplot(x=signup_counts.index, y=signup_counts.values, palette="viridis")
plt.title("Customer Signups Over Time")
plt.xlabel("Year")
plt.ylabel("Number of Signups")
plt.savefig("/content/drive/My Drive/outputs/signup_counts.png")
plt.close()

pdf.drawString(100, 720, "1. Customer Signups Over Time:")
pdf.drawImage("/content/drive/My Drive/outputs/signup_counts.png", 100, 500, width=400, height=200)



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=signup_counts.index, y=signup_counts.values, palette="viridis")


(1000, 500)

In [11]:
# 2. Most Popular Product Categories
category_counts = products["Category"].value_counts()

plt.figure(figsize=(10, 5))
sns.barplot(x=category_counts.index, y=category_counts.values, palette="magma")
plt.title("Most Popular Product Categories")
plt.xlabel("Category")
plt.ylabel("Count")
plt.savefig("/content/drive/My Drive/outputs/category_counts.png")
plt.close()

pdf.drawString(100, 460, "2. Most Popular Product Categories:")
pdf.drawImage("/content/drive/My Drive/outputs/category_counts.png", 100, 240, width=400, height=200)



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=category_counts.index, y=category_counts.values, palette="magma")


(1000, 500)

In [12]:
# 3. Revenue Over Time
transactions["YearMonth"] = transactions["TransactionDate"].dt.to_period("M")
monthly_revenue = transactions.groupby("YearMonth")["TotalValue"].sum()

plt.figure(figsize=(10, 5))
monthly_revenue.plot(kind="line", marker="o", color="blue")
plt.title("Monthly Revenue Trend")
plt.xlabel("Month")
plt.ylabel("Total Revenue (USD)")
plt.grid(True)
plt.savefig("/content/drive/My Drive/outputs/monthly_revenue.png")
plt.close()

pdf.drawString(100, 220, "3. Monthly Revenue Trend:")
pdf.drawImage("/content/drive/My Drive/outputs/monthly_revenue.png", 100, 40, width=400, height=200)


(1000, 500)

In [13]:
pdf.showPage()

In [14]:
# 4. Customer Region Analysis
region_counts = customers["Region"].value_counts()

plt.figure(figsize=(8, 5))
region_counts.plot(kind="pie", autopct="%1.1f%%", startangle=140, colors=sns.color_palette("pastel"))
plt.title("Customer Distribution by Region")
plt.ylabel("")
plt.savefig("/content/drive/My Drive/outputs/customer_region_distribution.png")
plt.close()

pdf.drawString(100, 720, "4. Customer Distribution by Region:")
pdf.drawImage("/content/drive/My Drive/outputs/customer_region_distribution.png", 100, 500, width=400, height=200)


(800, 500)

In [15]:
# 5. Top Customers by Spending
customer_spending = transactions.groupby("CustomerID")["TotalValue"].sum().sort_values(ascending=False).head(10)

plt.figure(figsize=(10, 5))
sns.barplot(x=customer_spending.index, y=customer_spending.values, palette="coolwarm")
plt.title("Top 10 Customers by Total Spending")
plt.xlabel("CustomerID")
plt.ylabel("Total Spending (USD)")
plt.xticks(rotation=45)
plt.savefig("/content/drive/My Drive/outputs/top_customers_spending.png")
plt.close()

pdf.drawString(100, 220, "5. Top 10 Customers by Total Spending:")
pdf.drawImage("/content/drive/My Drive/outputs/top_customers_spending.png", 100, 40, width=400, height=200)



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=customer_spending.index, y=customer_spending.values, palette="coolwarm")


(1000, 500)

In [16]:
pdf.save()
print(f"Business insights PDF saved at {pdf_path}.")

# Save cleaned data
customers.to_csv("/content/drive/My Drive/data/Cleaned_Customers.csv", index=False)
products.to_csv("/content/drive/My Drive/data/Cleaned_Products.csv", index=False)
transactions.to_csv("/content/drive/My Drive/data/Cleaned_Transactions.csv", index=False)

print("EDA Completed. Business insights generated.")


Business insights PDF saved at /content/drive/My Drive/outputs/Sameer_Ahmed_EDA.pdf.
EDA Completed. Business insights generated.
