In [22]:
# ===============================================
# 📚 Dynamic EDA + Auto ML + XAI + PDF Generator
# ===============================================

# Install necessary packages
#!pip install pandas numpy matplotlib seaborn scikit-learn shap lime fpdf2 plotly

In [40]:
# 📦 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import shap
import lime
import lime.lime_tabular
import joblib
import datetime
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from fpdf import FPDF
from IPython.display import display

In [23]:
# ===============================
# 📥 Step 1: File Upload
# ===============================

from google.colab import files
uploaded = files.upload()

# Auto-detect file
for fn in uploaded.keys():
    dataset = pd.read_csv(fn)

Saving StudentsPerformance.csv to StudentsPerformance (3).csv


In [42]:
# ===============================
# 📊 Step 2: Dynamic EDA (Fixed)
# ===============================

# Detect numerical and categorical columns
df = dataset.copy()
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Create folders for images
os.makedirs("eda_charts", exist_ok=True)

# Function to sanitize filenames
def sanitize_filename(name):
    return name.replace('/', '_').replace('\\', '_')

# Descriptive Statistics
descriptive_stats = df.describe(include='all')

# Missing Values
missing_values = df.isnull().sum()

# Correlation Matrix
correlation_matrix = df[numeric_cols].corr()

# 📈 Plot Numerical Distributions
for col in numeric_cols:
    safe_col = sanitize_filename(col)
    plt.figure(figsize=(8,5))
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.savefig(f"eda_charts/{safe_col}_dist.png")
    plt.close()

# 📈 Boxplots and Violin Plots
for col in numeric_cols:
    safe_col = sanitize_filename(col)
    fig, ax = plt.subplots(1, 2, figsize=(12,5))
    sns.boxplot(y=df[col], ax=ax[0])
    sns.violinplot(y=df[col], ax=ax[1])
    ax[0].set_title(f'Boxplot of {col}')
    ax[1].set_title(f'Violinplot of {col}')
    plt.savefig(f"eda_charts/{safe_col}_box_violin.png")
    plt.close()

# 📈 Categorical Countplots
for col in categorical_cols:
    safe_col = sanitize_filename(col)
    plt.figure(figsize=(8,5))
    sns.countplot(x=df[col])
    plt.xticks(rotation=45)
    plt.title(f'Countplot of {col}')
    plt.savefig(f"eda_charts/{safe_col}_countplot.png")
    plt.close()

# 📈 Heatmap
plt.figure(figsize=(12,8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.savefig("eda_charts/correlation_heatmap.png")
plt.close()

In [43]:
# ===============================
# 🧠 Step 3: Model Building (Fixed)
# ===============================

# Auto Problem Type Detection
target = df.columns[-1]

if df[target].dtype == 'object':
    problem_type = 'classification'
else:
    problem_type = 'regression'

# Prepare Data
X = df.drop(columns=[target])
y = df[target]

# Encode Categorical Variables if any
X_encoded = pd.get_dummies(X, drop_first=True)
if y.dtype == 'object':
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    y = le.fit_transform(y)

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train Model
if problem_type == 'classification':
    model = RandomForestClassifier(random_state=42)
else:
    model = RandomForestRegressor(random_state=42)

model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Metrics
if problem_type == 'classification':
    model_score = accuracy_score(y_test, y_pred)
    model_report = classification_report(y_test, y_pred)
else:
    model_score = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    model_rmse = np.sqrt(mse)

# Save Model
model_filename = f"trained_model_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.pkl"
joblib.dump(model, model_filename)

['trained_model_20250428_165949.pkl']

In [45]:
# ===============================
# 🔍 Step 4: Explainable AI (SHAP + LIME)
# ===============================

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# SHAP Summary
plt.figure()
shap.summary_plot(shap_values, X_test, show=False)
plt.savefig("eda_charts/shap_summary.png")
plt.close()

# SHAP Dependence (only first feature for demo)
plt.figure()
shap.dependence_plot(X_test.columns[0], shap_values, X_test, show=False)
plt.savefig("eda_charts/shap_dependence.png")
plt.close()

# LIME (Fixed)
try:
    if problem_type == 'classification':
        lime_explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values, feature_names=X_train.columns, class_names=[str(i) for i in set(y_train)], discretize_continuous=True)
        lime_exp = lime_explainer.explain_instance(X_test.iloc[0].values, model.predict_proba, num_features=5)
    else:
        lime_explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values, feature_names=X_train.columns, discretize_continuous=True)
        lime_exp = lime_explainer.explain_instance(X_test.iloc[0].values, model.predict, num_features=5)
    lime_exp.save_to_file("eda_charts/lime_explanation.html")
except Exception as e:
    print(f"LIME explanation skipped due to error: {e}")




LIME explanation skipped due to error: LIME does not currently support classifier models without probability scores. If this conflicts with your use case, please let us know: https://github.com/datascienceinc/lime/issues/16


<Figure size 640x480 with 0 Axes>

In [53]:
#!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/244.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [64]:
# ===============================
# 📄 Step 5: Generate Professional Unicode-safe PDF/Word Report (Fixed Font + Safe Filenames)
# ===============================

!pip install python-docx

import re
from fpdf import FPDF, YPos
from docx import Document
from docx.shared import Pt
import datetime
import os
from google.colab import files

# Helper function to sanitize filenames
def sanitize_filename(name):
    name = re.sub(r'[\W_]+', '_', name)
    return name.strip('_')

class PDF(FPDF):
    def header(self):
        self.set_font('Helvetica', '', 24)
        self.cell(0, 10, 'Automated ML Analysis Report', new_y=YPos.NEXT, align='C')
        self.ln(5)

    def chapter_title(self, title):
        self.set_font('Helvetica', '', 18)
        self.cell(0, 10, title, new_y=YPos.NEXT, align='L')
        self.ln(3)

    def chapter_body(self, body):
        self.set_font('Helvetica', '', 12)
        self.multi_cell(0, 10, body)
        self.ln(5)

    def add_image(self, path, w=180):
        if os.path.exists(path):
            self.image(path, w=w)
            self.ln(5)

now = datetime.datetime.now()
pdf = PDF()
pdf.set_auto_page_break(auto=True, margin=15)

# Cover Page
pdf.add_page()
pdf.set_font('Helvetica', '', 28)
pdf.cell(0, 20, 'Dynamic Machine Learning Report', new_y=YPos.NEXT, align='C')
pdf.ln(20)

# Start Proper Report on Page 2
pdf.add_page()

# About Dataset
pdf.chapter_title('About the Dataset')
pdf.chapter_body(f"The dataset contains {df.shape[0]} observations across {df.shape[1]} columns. Among them, {len(numeric_cols)} are numerical and {len(categorical_cols)} are categorical variables. A robust understanding of the data structure aids in selecting appropriate analysis and modeling techniques.")

# Exploratory Data Analysis
pdf.chapter_title('Exploratory Data Analysis')
for col in numeric_cols:
    safe_col = sanitize_filename(col)
    min_val = df[col].min()
    max_val = df[col].max()
    mean_val = df[col].mean()
    std_val = df[col].std()
    variability = 'low' if std_val < 0.1 * mean_val else 'moderate' if std_val < 0.5 * mean_val else 'high'
    pdf.chapter_body(f"The feature '{col}' ranges from {min_val:.2f} to {max_val:.2f}. The mean value is {mean_val:.2f} with a standard deviation of {std_val:.2f}, indicating {variability} variability. Visual analysis confirms that values cluster within this range, with few potential outliers influencing distribution tails.")
    pdf.add_image(f"eda_charts/{safe_col}_dist.png")
    pdf.chapter_body(f"Boxplots and violin plots of {col} reveal the central tendency around the median and help detect asymmetry and extreme values that may affect model assumptions.")
    pdf.add_image(f"eda_charts/{safe_col}_box_violin.png")

for col in categorical_cols:
    safe_col = sanitize_filename(col)
    pdf.chapter_body(f"The categorical feature '{col}' shows the frequency of each category. Understanding dominant or underrepresented classes is important for ensuring model fairness and effectiveness.")
    pdf.add_image(f"eda_charts/{safe_col}_countplot.png")

pdf.chapter_body("The correlation heatmap visualizes relationships among numerical features. High correlations may indicate multicollinearity, which can bias model coefficients and reduce performance. Addressing these during preprocessing improves model robustness.")
pdf.add_image("eda_charts/correlation_heatmap.png")

# Model Building Section
pdf.chapter_title('Model Building and Evaluation')
pdf.chapter_body(f"The task was identified as {problem_type.capitalize()}. A Random Forest model was used for its ability to capture non-linear relationships effectively without intensive preprocessing. The model achieved a performance score of {model_score:.4f}. {f'An RMSE of {model_rmse:.4f} indicates acceptable prediction accuracy for this domain.' if problem_type == 'regression' else ''}")

# XAI Section
pdf.chapter_title('Explainable AI (XAI) Results')
pdf.chapter_body("SHAP analysis identifies major feature influences globally, while dependence plots highlight feature interactions. LIME provides local, instance-based explanations enhancing model transparency.")
pdf.add_image("eda_charts/shap_summary.png")
pdf.add_image("eda_charts/shap_dependence.png")

# Conclusion Section
pdf.chapter_title('Conclusion and Future Recommendations')
pdf.chapter_body("This pipeline successfully automated dataset analysis, model building, evaluation, and interpretability. Future work can focus on model tuning, stacking methods like Gradient Boosting, and adding fairness and accountability checks for responsible AI deployment.")

# Save PDF
pdf_filename = f"ML_Report_{now.strftime('%Y-%m-%d_%H-%M-%S')}.pdf"
pdf.output(pdf_filename)
print(f"\n✅ PDF Report Generated: {pdf_filename}")

# Ask for Word Copy Option
want_word = input("\nDo you want a Word (.docx) copy too? (yes/no): ").strip().lower()
if want_word == 'yes':
    doc = Document()
    doc.add_heading('Dynamic Machine Learning Report', 0)
    doc.add_paragraph(f"Automated ML Analysis Report\nGenerated on: {now.strftime('%Y-%m-%d %H:%M:%S')}")

    doc.add_heading('About the Dataset', level=1)
    doc.add_paragraph(f"{df.shape[0]} observations, {df.shape[1]} columns. Numerical Columns: {numeric_cols}. Categorical Columns: {categorical_cols}.")

    doc.add_heading('Exploratory Data Analysis', level=1)
    for col in numeric_cols:
        doc.add_paragraph(f"Feature '{col}': Range {df[col].min():.2f} to {df[col].max():.2f}. Mean {df[col].mean():.2f}. Std {df[col].std():.2f}.")
    for col in categorical_cols:
        doc.add_paragraph(f"Feature '{col}': categorical distribution analyzed.")

    doc.add_heading('Model Building and Evaluation', level=1)
    doc.add_paragraph(f"Task: {problem_type.capitalize()}\nModel: Random Forest\nPerformance Score: {model_score:.4f}{f', RMSE: {model_rmse:.4f}' if problem_type == 'regression' else ''}")

    doc.add_heading('Explainable AI Results', level=1)
    doc.add_paragraph("SHAP and LIME analyses provided critical global and local model explanations.")

    doc.add_heading('Conclusion and Future Recommendations', level=1)
    doc.add_paragraph("Future improvements include hyperparameter optimization and fairness evaluations.")

    word_filename = f"ML_Report_{now.strftime('%Y-%m-%d_%H-%M-%S')}.docx"
    doc.save(word_filename)
    files.download(word_filename)
    print(f"✅ Word Report Generated: {word_filename}")

# Download the generated PDF file
files.download(pdf_filename)



✅ PDF Report Generated: ML_Report_2025-04-28_17-53-21.pdf

Do you want a Word (.docx) copy too? (yes/no): no


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>