In [3]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Once data has finished processing, load data from folder structure
# text-data/
# ├─ advertisement/
# ├─ email/
# ├─ invoice/
# ....
data = load_files('../text-data', encoding='utf-8', decode_error='ignore')

X = data.data               
y = data.target             
class_names = data.target_names 

# split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# vectorize text using TF-IDF
vectorizer = TfidfVectorizer(
    # this is all first pass so might want to update after check in
    # limit vocab
    max_features=5000,
    # ignore terms that appear in fewer than 2 documents     
    min_df=2,               
    # ignore terms that appear in more than 80% of documents
    max_df=0.8,
    # using unigrams and bigrams             
    ngram_range=(1, 2), 
    # removing common English words    
    stop_words='english'
)

X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

In [5]:
# train baseline Logistic Regression
base_model = LogisticRegression(max_iter=1000, random_state=42)
base_model.fit(X_train_vectors, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [6]:
# eval on test set
y_pred = base_model.predict(X_test_vectors)
print(f"Test Accuracy: {base_model.score(X_test_vectors, y_test)}\n")

print("Baseline Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

Test Accuracy: 0.742

Baseline Classification Report:
                        precision    recall  f1-score   support

         advertisement       0.85      0.78      0.81        64
                budget       0.74      0.80      0.77        64
                 email       0.93      0.90      0.92        60
           file_folder       0.50      0.76      0.61        72
                  form       0.77      0.66      0.71        62
           handwritten       0.77      0.57      0.65        65
               invoice       0.80      0.77      0.79        48
                letter       0.74      0.62      0.67        63
                  memo       0.73      0.70      0.71        66
          news_article       0.73      0.75      0.74        61
          presentation       0.72      0.61      0.66        70
         questionnaire       0.93      0.66      0.77        65
                resume       0.95      0.95      0.95        62
scientific_publication       0.74      0.91      

In [7]:
# tuning with grid search

# chains vectorizer and model
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])

param_grid = {
    # max vocab size
    'tfidf__max_features': [3000, 5000, 7000],
    # unigrams only, unigrams + bigrams, unigrams + bigrams + trigrams
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    # min document frequency - removes rare words
    'tfidf__min_df': [2, 3, 5],
    # max document frequency - removes common words
    'tfidf__max_df': [0.7, 0.8, 0.9],
    # inverse regularization strength
    'clf__C': [0.1, 1, 10],
    # optimization algo
    'clf__solver': ['saga', 'lbfgs'],
    # L2 regularization
    'clf__penalty': ['l2']
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits




KeyboardInterrupt: 

In [None]:
# evaluate tuned model
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)
print(f"Tuned Test Accuracy: {best_model.score(X_test, y_test)}")

print("Tuned Classification Report:")
print(classification_report(y_test, y_pred_tuned, target_names=class_names))

In [None]:
cm = confusion_matrix(y_test, y_pred_tuned)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names, cmap='Blues')
plt.title("Confusion Matrix (Tuned LR)")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.tight_layout()
plt.show()

plt.savefig('confusion_matrix_tuned_lr.png')

In [None]:
# Error Analysis for Logistic Regression

# Get misclassified samples
y_pred_tuned = best_model.predict(X_test)
misclassified_mask = y_test != y_pred_tuned
misclassified_indices = np.where(misclassified_mask)[0]

print(f"Total misclassified samples: {len(misclassified_indices)} out of {len(y_test)}")
print(f"Misclassification rate: {len(misclassified_indices)/len(y_test)*100:.2f}%\n")

# Analyze misclassification patterns
misclass_df = pd.DataFrame({
    'true_label': [class_names[y_test[i]] for i in misclassified_indices],
    'predicted_label': [class_names[y_pred_tuned[i]] for i in misclassified_indices],
    'text': [X_test[i][:200] for i in misclassified_indices]  # first 200 chars
})

# Most common misclassification pairs
misclass_pairs = misclass_df.groupby(['true_label', 'predicted_label']).size().reset_index(name='count')
misclass_pairs = misclass_pairs.sort_values('count', ascending=False)
print("Top 10 Misclassification Patterns:")
print(misclass_pairs.head(10))
print()

# Visualize misclassification patterns
plt.figure(figsize=(12, 8))
top_pairs = misclass_pairs.head(15)
plt.barh(range(len(top_pairs)), top_pairs['count'])
plt.yticks(range(len(top_pairs)), 
           [f"{row['true_label']} → {row['predicted_label']}" 
            for _, row in top_pairs.iterrows()])
plt.xlabel('Number of Misclassifications')
plt.title('Top 15 Misclassification Patterns')
plt.tight_layout()
plt.savefig('misclassification_patterns_lr.png', dpi=300, bbox_inches='tight')
plt.show()

# Show example misclassified texts for top patterns
for idx, row in misclass_pairs.head(5).iterrows():
    true_label = row['true_label']
    pred_label = row['predicted_label']
    
    # Get examples of this specific misclassification
    examples = misclass_df[
        (misclass_df['true_label'] == true_label) & 
        (misclass_df['predicted_label'] == pred_label)
    ]
    
    print(f"TRUE: {true_label} | PREDICTED: {pred_label} | Count: {row['count']}")

    # Show 3 examples
    for i, (_, example) in enumerate(examples.head(3).iterrows(), 1):
        text_preview = example['text'][:300].replace('\n', ' ')
        print(f"\nExample {i}:")
        print(f"{text_preview}...")

# Per-class error rate analysis
error_by_class = pd.DataFrame({
    'class': class_names,
    'total_samples': [np.sum(y_test == i) for i in range(len(class_names))],
    'misclassified': [np.sum((y_test == i) & (y_pred_tuned != i)) for i in range(len(class_names))]
})
error_by_class['error_rate'] = error_by_class['misclassified'] / error_by_class['total_samples'] * 100
error_by_class = error_by_class.sort_values('error_rate', ascending=False)

print(error_by_class.to_string(index=False))

# Visualize error rates by class
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Error rate
ax1.barh(error_by_class['class'], error_by_class['error_rate'])
ax1.set_xlabel('Error Rate (%)')
ax1.set_title('Misclassification Rate by Class')
ax1.grid(axis='x', alpha=0.3)

# Absolute misclassifications
ax2.barh(error_by_class['class'], error_by_class['misclassified'])
ax2.set_xlabel('Number of Misclassifications')
ax2.set_title('Absolute Misclassifications by Class')
ax2.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('error_analysis_by_class_lr.png', dpi=300, bbox_inches='tight')
plt.show()

# Confidence analysis for misclassifications
decision_scores = best_model.decision_function(X_test_vectors)
probabilities = np.exp(decision_scores) / np.exp(decision_scores).sum(axis=1, keepdims=True)
predicted_probs = probabilities[np.arange(len(y_test)), y_pred_tuned]
true_class_probs = probabilities[np.arange(len(y_test)), y_test]

# Compare confidence for correct vs incorrect predictions
correct_mask = ~misclassified_mask
correct_confidence = predicted_probs[correct_mask]
incorrect_confidence = predicted_probs[misclassified_mask]

plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.hist(correct_confidence, bins=30, alpha=0.7, label='Correct', color='green')
plt.hist(incorrect_confidence, bins=30, alpha=0.7, label='Incorrect', color='red')
plt.xlabel('Model Confidence (Probability)')
plt.ylabel('Frequency')
plt.title('Model Confidence: Correct vs Incorrect Predictions')
plt.legend()
plt.grid(alpha=0.3)

plt.subplot(1, 2, 2)
plt.boxplot([correct_confidence, incorrect_confidence], labels=['Correct', 'Incorrect'])
plt.ylabel('Model Confidence')
plt.title('Confidence Distribution Comparison')
plt.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('confidence_analysis_lr.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nAverage confidence for correct predictions: {correct_confidence.mean():.3f}")
print(f"Average confidence for incorrect predictions: {incorrect_confidence.mean():.3f}")

# Low confidence misclassifications (model was uncertain)
low_conf_threshold = 0.5
low_conf_errors = misclassified_indices[predicted_probs[misclassified_indices] < low_conf_threshold]
print(f"\nMisclassifications with low confidence (<{low_conf_threshold}): {len(low_conf_errors)}")

# High confidence errors (model was very wrong)
high_conf_threshold = 0.8
high_conf_errors = misclassified_indices[predicted_probs[misclassified_indices] > high_conf_threshold]
print(f"Misclassifications with high confidence (>{high_conf_threshold}): {len(high_conf_errors)}")

if len(high_conf_errors) > 0:
    print("HIGH CONFIDENCE ERRORS (Model was very confident but wrong)")
    for idx in high_conf_errors[:3]:
        print(f"\nTrue: {class_names[y_test[idx]]} | Predicted: {class_names[y_pred_tuned[idx]]}")
        print(f"Confidence: {predicted_probs[idx]:.3f}")
        print(f"Text preview: {X_test[idx][:300]}...")

In [None]:
# feature analysis
tuned_vectorizer = best_model.named_steps['tfidf']
tuned_clf = best_model.named_steps['clf']
feature_names = np.array(tuned_vectorizer.get_feature_names_out())

top_n = 10
for i, class_label in enumerate(class_names):
    # get and print top weights and features
    coefficient = tuned_clf.coef_[i]
    top_indices = np.argsort(coefficient)[-top_n:]
    top_features = feature_names[top_indices]
    top_weights = coefficient[top_indices]
    print(f"Top features for class '{class_label}':")
    for feature, weight in zip(top_features, top_weights):
        print(f"{feature}: {weight:.4f}")

In [None]:
# save model
import joblib
joblib.dump(best_model, 'lr_tuned.pkl')