In [None]:
# ============================================================
# NEWS CLASSIFICATION MODEL TRAINING
# ============================================================

# Step 1: Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.pipeline import make_pipeline
import numpy as np


In [None]:
# Step 2: Read the raw articles (relative path, so the CSV must sit in the working directory)
df = pd.read_csv("Global_News_Dataset.csv")

# Quick sanity check: dimensions plus the first few rows
print(f"Dataset shape: {df.shape}")
print(df.head())


In [None]:
# Step 3: Report the per-column null counts, then the dtypes pandas inferred
print("\nMissing values per column:\n", df.isna().sum())
print("\nData types:\n", df.dtypes)


In [None]:
# Step 4: Data Cleaning
# Rows without a category label cannot be used for supervised training.
# The explicit copy makes the column assignments below operate on an
# independent frame instead of a possible view of the raw data.
df = df.dropna(subset=["category"]).copy()

# Metadata columns: fall back to a sibling column where one exists, then to a sentinel.
df["author"] = df["author"].fillna("unknown")
df["source_id"] = df["source_id"].fillna(df["source_name"]).fillna("unknown")
df["url_to_image"] = df["url_to_image"].fillna("no_image")

text_columns = ["author", "title", "description", "content", "full_content"]
for col in text_columns:
    # Capture missing cells BEFORE casting: astype(str) never turns a missing
    # value into "", so checking for empty strings alone lets missing text
    # through (on object columns it becomes the literal string "nan").
    was_missing = df[col].isna()
    stripped = df[col].astype(str).str.strip()
    # Missing and whitespace-only cells both collapse to the same sentinel.
    df[col] = stripped.mask(was_missing | (stripped == ""), "unknown")

# Lower-cased article body: this is the model's input feature.
df["content_clean"] = df["content"].astype(str).str.lower()

# Counts nulls across ALL columns, including ones this cell does not clean.
missing_total = df.isnull().sum().sum()
print(f"\nMissing values after cleaning: {missing_total}")


In [None]:
# Step 5: Train-Test Split
# Features are the lower-cased article text; the target is the category label.
X, y = df["content_clean"], df["category"]

# Hold out 20% for evaluation, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"\nTraining size: {len(X_train)}")
print(f"Testing size: {len(X_test)}")


In [None]:
# Step 6: Build Pipeline (Vectorizer + Model)
# TF-IDF features, vocabulary capped at 5,000 terms
vectorizer = TfidfVectorizer(max_features=5000)
# Logistic regression: at most 200 solver iterations, parallelism requested via n_jobs=-1
model = LogisticRegression(n_jobs=-1, max_iter=200)

# Chain both steps so raw text can be handed straight to fit/predict
pipeline = make_pipeline(vectorizer, model)


In [None]:
# Step 7: Train the model
# Fits the whole pipeline (vectorizer, then classifier) on the training split only.
print("\nTraining model...")
# Kept as the cell's final expression, so the notebook also renders the fitted pipeline.
pipeline.fit(X_train, y_train)


In [None]:
# Step 8: Evaluate model
# Predict on the held-out split; y_pred is reused by the later reporting cells.
y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# "weighted" averages the per-class F1 scores by class support.
f1 = f1_score(y_test, y_pred, average="weighted")

print(
    f"\nAccuracy: {accuracy:.4f} ({accuracy*100:.2f}%)",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)



In [None]:
# Step 9: Confusion Matrix (Top 10 categories)
# Limit the matrix to the ten most frequent training labels, and keep only
# the test rows whose true label is one of them.
top_classes = y_train.value_counts().nlargest(10).index
mask = y_test.isin(top_classes)

cm = confusion_matrix(
    y_test[mask],
    y_pred[mask],
    labels=top_classes,
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=top_classes)

fig, ax = plt.subplots(figsize=(10, 8))
disp.plot(ax=ax, cmap="Blues", xticks_rotation=45)
plt.title("Confusion Matrix (Top 10 Categories)")
fig.tight_layout()
plt.savefig("confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

print("\nConfusion matrix saved as 'confusion_matrix.png'")

# Step 10: Classification Report
# Unlike the matrix above, the report covers every category in the test split.
print("\n" + "=" * 60, "CLASSIFICATION REPORT", "=" * 60, sep="\n")
print(classification_report(y_test, y_pred))

print("\n" + "=" * 60, "MODEL TRAINING COMPLETE!", "=" * 60, sep="\n")


In [None]:
# ============================================================
#  LIME EXPLAINABILITY ANALYSIS
# ============================================================

# Import LIME
from lime.lime_text import LimeTextExplainer

# Section banner: blank line, rule, title, rule
print("\n" + "=" * 60, "LIME EXPLAINABILITY ANALYSIS", "=" * 60, sep="\n")



In [None]:
# Step 1: Initialize LIME Explainer
# Reuse the fitted pipeline's label array so LIME's label indices can be
# mapped back to category names later on.
class_names = pipeline.classes_
explainer = LimeTextExplainer(class_names=class_names)

print("\nInitialized LIME explainer for {} categories".format(len(class_names)))


In [None]:
# Step 2: Function to explain a single prediction
def explain_prediction(idx, num_features=10, save_html=True):
    """
    Print and return a LIME explanation for one held-out article.

    The sample is looked up by position in the notebook-level ``X_test`` /
    ``y_test``. It is classified with ``pipeline`` and explained with
    ``explainer`` for the three most probable labels. The word weights and
    the five highest class probabilities are written to the console.

    Parameters:
    -----------
    idx : int
        Positional index (``.iloc``) of the test sample to explain
    num_features : int
        Number of words LIME reports per explained label
    save_html : bool
        When true, also write ``lime_explanation_sample_<idx>.html``

    Returns:
    --------
    The explanation object produced by ``explainer.explain_instance``.
    """
    rule = "=" * 60
    thin_rule = "-" * 60

    # Fetch the sample and classify it
    text = X_test.iloc[idx]
    true_label = y_test.iloc[idx]
    predicted_label = pipeline.predict([text])[0]

    print(f"\n{rule}")
    print(f"EXPLANATION FOR TEST SAMPLE #{idx}")
    print(rule)
    print(f"\nTrue Category: {true_label}")
    print(f"Predicted Category: {predicted_label}")
    print("\nArticle Text (first 300 chars):")
    print(f"{text[:300]}...")

    # Ask LIME which words drive the three most probable labels
    print("\nGenerating LIME explanation...")
    exp = explainer.explain_instance(
        text,
        pipeline.predict_proba,
        num_features=num_features,
        top_labels=3,
    )

    # Console view of the per-label word weights
    print(f"\nTop {num_features} words influencing the prediction:")
    print(thin_rule)

    for label in exp.available_labels()[:3]:
        # str() because a class label is not guaranteed to be a string
        print(f"\n{str(class_names[label]).upper()}:")
        for word, weight in exp.as_list(label=label):
            if weight > 0:
                direction = "POSITIVE"
            else:
                direction = "NEGATIVE"
            print(f"  {word:20s} | {weight:+.4f} | {direction}")

    # Five highest class probabilities, most likely first
    probs = pipeline.predict_proba([text])[0]
    print("\nPrediction Probabilities (Top 5):")
    print(thin_rule)
    for i in np.argsort(probs)[::-1][:5]:
        category_name = str(class_names[i])
        print(f"{category_name:20s} | {probs[i]:.4f} ({probs[i]*100:.2f}%)")

    if not save_html:
        return exp

    # Persist LIME's HTML visualisation next to the notebook
    html_file = f"lime_explanation_sample_{idx}.html"
    exp.save_to_file(html_file)
    print(f"\nHTML explanation saved to: {html_file}")

    return exp


In [None]:
# Step 3: Explain multiple predictions
print("\n" + "=" * 60, "GENERATING EXPLANATIONS FOR SAMPLE PREDICTIONS", "=" * 60, sep="\n")

# Draw 3 distinct test positions; seeding the global RNG makes the draw repeatable
np.random.seed(42)
sample_indices = np.random.choice(len(X_test), size=3, replace=False)

# Collect the explanation objects so later cells can inspect them
explanations = []
for idx in sample_indices:
    exp = explain_prediction(idx, num_features=10, save_html=True)
    explanations.append(exp)
    print(f"\n{'=' * 60}\n")


In [None]:
# Step 4: Explain misclassifications
# Section banner: blank line, rule, title, rule
print("\n" + "=" * 60, "EXPLAINING MISCLASSIFICATIONS", "=" * 60, sep="\n")

def explain_misclassifications(num_samples=3, random_state=None):
    """
    Explain a random sample of the test predictions that were incorrect.

    Compares the notebook-level ``y_test`` with ``y_pred``, reports how many
    predictions disagree, and runs ``explain_prediction`` on a random subset
    of them (saving the HTML explanation for each).

    Parameters:
    -----------
    num_samples : int
        Maximum number of misclassified samples to explain; capped at the
        number of misclassifications that actually exist
    random_state : int or None
        Seed used to pick which errors are explained. ``None`` (the default)
        draws from NumPy's global RNG, so the selection then depends on
        whatever seeding happened earlier in the session

    Returns:
    --------
    None
    """
    misclassified_mask = y_test != y_pred
    # Positional indices, because explain_prediction looks samples up with .iloc
    misclassified_indices = np.where(misclassified_mask)[0]
    total_errors = len(misclassified_indices)

    print(f"\nTotal misclassifications: {total_errors}")

    # Bail out before announcing any work: nothing to explain for a perfect score
    if total_errors == 0:
        print("No misclassifications found!")
        return

    sample_size = min(num_samples, total_errors)
    print(f"Explaining {sample_size} examples...")

    # A dedicated generator keeps a seeded draw independent of global RNG state
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    selected = rng.choice(misclassified_indices, size=sample_size, replace=False)

    for idx in selected:
        explain_prediction(idx, num_features=10, save_html=True)
        print("\n" + "-"*60 + "\n")

# Walk through up to three wrongly predicted test samples.
explain_misclassifications(num_samples=3)
        

In [None]:
# Step 5: Summary
print("\n" + "=" * 60, "EXPLAINABILITY SUMMARY", "=" * 60, sep="\n")

# Headline counts over the whole test split
print(f"\nTotal test samples: {len(X_test)}")
print(f"Correctly classified: {(y_test == y_pred).sum()}")
print(f"Misclassified: {(y_test != y_pred).sum()}")

print("\nLIME explanations generated for sample predictions")
print("HTML files saved in current directory")

print("\n" + "=" * 60, "ANALYSIS COMPLETE!", "=" * 60, sep="\n")