# Text Classification Demo

This notebook demonstrates text classification techniques using various machine learning models and evaluation metrics.

## Setup and Data Loading

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from src.analysis import load_data, preprocess_df

# Create the synthetic CSV on first run only; when the file already exists
# the generator is never imported or called.
# NOTE(review): presumably generate(800) writes 800 rows to DATA — confirm in generate_data.
DATA = Path('data', 'synthetic_texts.csv')
if not DATA.exists():
    from generate_data import generate
    generate(800)

# Read the CSV and pass it straight through the project's preprocessing step
df = preprocess_df(load_data(DATA))
print(f"Dataset shape: {df.shape}")
print("Label distribution:", df['label'].value_counts(), sep="\n")

## Train-Test Split

In [None]:
# Features are the 'joined_tokens' column; targets are the 'label' column
X, y = df['joined_tokens'], df['label']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions of both splits close to those of the full dataset, and the
# fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42,
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
for split_name, split_labels in (("Training", y_train), ("Test", y_test)):
    print(f"{split_name} label distribution:")
    print(split_labels.value_counts())

## Model Training and Evaluation

In [None]:
# Define models to evaluate
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100)
}

# Train and evaluate each model.
# `results` maps model name -> dict with the test accuracy, the per-fold CV
# scores, the fitted pipeline and its test-set predictions.
results = {}
for name, model in models.items():
    print(f"\n=== {name} ===")

    # TF-IDF lives inside the pipeline so the vocabulary is learned only from
    # whatever data the pipeline is fitted on (training split or CV fold).
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=2000)),
        ('classifier', model)
    ])

    # Train on the training split and score once on the held-out test split
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.3f}")

    # Cross-validate on the training split only. Using the full X, y here
    # would fold the held-out test rows into the CV estimate, so the CV score
    # and the test score would no longer be independent. cross_val_score
    # fits clones of the pipeline, so the fitted `pipeline` stored below is
    # left untouched.
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean CV accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

    # Store results
    results[name] = {
        'accuracy': accuracy,
        'cv_scores': cv_scores,
        'pipeline': pipeline,
        'predictions': y_pred
    }

## Model Comparison

In [None]:
# Gather one test accuracy and one CV mean/std per model, in `models` order
model_names = list(models)
accuracies = [results[n]['accuracy'] for n in model_names]
cv_means = [results[n]['cv_scores'].mean() for n in model_names]
cv_stds = [results[n]['cv_scores'].std() for n in model_names]

# Two side-by-side bar charts: held-out test accuracy (left) and mean
# cross-validation accuracy with +/- one std error bars (right).
fig, (ax_test, ax_cv) = plt.subplots(1, 2, figsize=(12, 6))

ax_test.bar(model_names, accuracies)
ax_test.set_title('Test Accuracy by Model')
ax_test.set_ylabel('Accuracy')

ax_cv.bar(model_names, cv_means, yerr=cv_stds, capsize=5)
ax_cv.set_title('Cross-Validation Accuracy by Model')
ax_cv.set_ylabel('Mean Accuracy')

# Shared cosmetics: tilt the model names and pin the y-axis to [0, 1] so the
# two panels are directly comparable.
for ax in (ax_test, ax_cv):
    ax.tick_params(axis='x', rotation=45)
    ax.set_ylim(0, 1)

fig.tight_layout()
plt.show()

print("\nModel Performance Summary:")
for name, test_acc, cv_mean in zip(model_names, accuracies, cv_means):
    print(f"{name}: Test Accuracy={test_acc:.3f}, CV Mean={cv_mean:.3f}")

## Detailed Classification Report

In [None]:
# Show detailed classification report for best model.
# The winner is picked by mean cross-validation accuracy rather than by test
# accuracy: choosing the model with the highest test score and then reporting
# that same test score would make the numbers below optimistically biased.
best_model_name = max(results, key=lambda name: results[name]['cv_scores'].mean())
best_model = results[best_model_name]['pipeline']
y_pred_best = results[best_model_name]['predictions']

print(f"\n=== Detailed Report for {best_model_name} ===")
print(classification_report(y_test, y_pred_best))

# Confusion matrix.
# Pass the label order explicitly and reuse it for the tick labels. Without
# `labels=`, confusion_matrix derives its rows/columns from the labels present
# in y_test and y_pred together, which need not match sorted(y_test.unique())
# and would silently mislabel the heatmap axes.
class_labels = list(best_model.named_steps['classifier'].classes_)
cm = confusion_matrix(y_test, y_pred_best, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

## Feature Importance Analysis

In [None]:
# Analyze feature importance for interpretable models.
# Linear models expose `coef_`; tree ensembles expose `feature_importances_`.
# A classifier with neither attribute prints nothing.
classifier = best_model.named_steps['classifier']
vectorizer = best_model.named_steps['tfidf']

if hasattr(classifier, 'coef_'):
    # For linear models
    feature_names = vectorizer.get_feature_names_out()
    coefs = classifier.coef_
    # Use the classes the model was fitted on (in the model's own order)
    # instead of re-deriving them from y_test.
    class_names = list(classifier.classes_)

    if coefs.shape[0] == 1 and len(class_names) == 2:
        # Binary problems have a single coefficient row describing
        # classes_[1]; indexing coefs[1] would raise IndexError. For a linear
        # decision function the negated row ranks the features that push
        # towards classes_[0].
        per_class_weights = [(class_names[0], -coefs[0]), (class_names[1], coefs[0])]
    elif coefs.shape[0] == len(class_names):
        per_class_weights = list(zip(class_names, coefs))
    else:
        # Some layouts (e.g. one-vs-one) have rows that do not correspond to
        # individual classes; skip rather than print misleading rankings.
        per_class_weights = []

    print("\nTop Features by Class:")
    if not per_class_weights:
        print("  (coefficient rows do not map one-to-one onto classes; skipped)")
    for class_name, weights in per_class_weights:
        # Ten largest weights, strongest first
        top_features_idx = np.argsort(weights)[-10:][::-1]

        print(f"\n{class_name}:")
        for idx in top_features_idx:
            print(f"  {feature_names[idx]}: {weights[idx]:.3f}")

elif hasattr(classifier, 'feature_importances_'):
    # For tree-based models
    feature_names = vectorizer.get_feature_names_out()
    importances = classifier.feature_importances_

    # Twenty most important features overall, strongest first
    top_features_idx = np.argsort(importances)[-20:][::-1]

    print("\nTop Features Overall:")
    for idx in top_features_idx:
        print(f"  {feature_names[idx]}: {importances[idx]:.3f}")

## Hyperparameter Tuning

In [None]:
# Hyperparameter tuning for best model
from sklearn.model_selection import GridSearchCV

# Classifier-specific search spaces, keyed by the names used in `models`.
# A model without an entry here is tuned on the TF-IDF vocabulary size only.
classifier_grids = {
    'Logistic Regression': {
        'classifier__C': [0.1, 1, 10],
        'classifier__solver': ['liblinear', 'saga'],
    },
    'SVM': {
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf'],
    },
    'Random Forest': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20],
    },
}

# Every search varies the vocabulary size; the classifier grid is layered on top
param_grid = {
    'tfidf__max_features': [1000, 2000, 3000],
    **classifier_grids.get(best_model_name, {}),
}

print(f"\n=== Hyperparameter Tuning for {best_model_name} ===")
# 3-fold grid search over the training split, using all available cores
grid_search = GridSearchCV(
    estimator=best_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.3f}")

# Score the best configuration found on the held-out test split and compare
# it with the untuned pipeline's test accuracy
tuned_model = grid_search.best_estimator_
y_pred_tuned = tuned_model.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
untuned_accuracy = results[best_model_name]['accuracy']
print(f"Tuned model accuracy: {tuned_accuracy:.3f}")
print(f"Improvement: {tuned_accuracy - untuned_accuracy:.3f}")

## Model Deployment and Prediction Examples

In [None]:
# Test the tuned model on held-out examples
print("\n=== Prediction Examples ===")

# Draw examples only from rows in the test split, so the model has not seen
# them during fitting or tuning. X_test keeps the index labels of `df`.
held_out = df.loc[X_test.index]

# Up to two examples per class. `sample_texts` holds the raw text for display;
# `sample_inputs` holds the matching 'joined_tokens' strings, which is the
# column the pipeline was trained on — predicting on raw 'text' would feed
# the vectorizer a different representation than it was fitted with.
sample_texts = []
sample_inputs = []
for label in sorted(y_test.unique()):
    class_rows = held_out[held_out['label'] == label]
    # Guard against classes with a single test row; fixed seed for repeatability
    picked = class_rows.sample(n=min(2, len(class_rows)), random_state=42)
    sample_texts.extend(picked['text'].tolist())
    sample_inputs.extend(picked['joined_tokens'].tolist())

# Make predictions
predictions = tuned_model.predict(sample_inputs)

# Not every classifier exposes predict_proba (SVC only does so when created
# with probability=True), so degrade to label-only output in that case.
if hasattr(tuned_model, 'predict_proba'):
    probabilities = tuned_model.predict_proba(sample_inputs)
else:
    probabilities = [None] * len(sample_inputs)

# predict_proba columns follow the fitted classifier's class order
class_names = tuned_model.named_steps['classifier'].classes_

for i, (text, pred, prob) in enumerate(zip(sample_texts, predictions, probabilities)):
    print(f"\nExample {i+1}:")
    print(f"Text: {text[:100]}...")
    print(f"Predicted: {pred}")
    if prob is None:
        continue
    print("Probabilities:")
    for class_name, p in zip(class_names, prob):
        print(f"  {class_name}: {p:.3f}")

## Text Classification Summary

This notebook demonstrates:
1. Text classification with multiple ML models
2. Model evaluation and comparison
3. Hyperparameter tuning
4. Feature importance analysis
5. Prediction examples

Key insights:
- Different models perform differently on text classification tasks
- TF-IDF features work well with various classifiers
- Hyperparameter tuning can significantly improve performance
- Feature analysis provides interpretability
- The best model depends on the specific dataset and requirements