In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

# Once data has finished processing, load data from folder structure
# text-data/
# ├─ advertisement/
# ├─ email/
# ├─ invoice/
# ....
# load_files treats each sub-folder name as a class label; bytes that cannot be
# decoded as UTF-8 are dropped (decode_error='ignore') instead of raising.
data = load_files('../text-data', encoding='utf-8', decode_error='ignore')

X = data.data                    # raw document texts
y = data.target                  # integer class index for each document
class_names = data.target_names  # class (folder) names, indexed by the integers in y

# split data for training and testing (80% / 20%)
# stratify=y keeps each class's share the same in both splits, so rare classes are
# not under-represented (or missing entirely) in the test set.
# NOTE: stratification requires at least 2 documents in every class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# vectorize text using TF-IDF
# (first-pass settings; may want to revisit after check-in)
tfidf_settings = dict(
    max_features=5000,     # limit vocab
    min_df=2,              # ignore terms that appear in fewer than 2 documents
    max_df=0.8,            # ignore terms that appear in more than 80% of documents
    ngram_range=(1, 2),    # unigrams and bigrams
    stop_words='english',  # remove common English words
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# fit the vocabulary / IDF weights on the training split only, then apply the
# same fitted vectorizer to the test split
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

In [None]:
# Baseline: logistic regression trained on the TF-IDF training vectors
# (fixed random_state so the fit is repeatable)
base_model = LogisticRegression(random_state=42, max_iter=1000)
base_model.fit(X=X_train_vectors, y=y_train)

In [None]:
# eval on test set
y_pred = base_model.predict(X_test_vectors)
print(f"Test Accuracy: {base_model.score(X_test_vectors, y_test)}\n")

# Pass the full label list explicitly. Without it, classification_report infers
# the labels from y_test / y_pred and raises a ValueError when their count differs
# from len(class_names), e.g. when a rare class is missing from the test split.
all_labels = np.arange(len(class_names))

print("Baseline Classification Report:")
print(classification_report(y_test, y_pred, labels=all_labels, target_names=class_names))

In [None]:
# tuning with grid search

# pipeline: raw text -> TF-IDF features -> logistic regression
tfidf_step = ('tfidf', TfidfVectorizer(stop_words='english'))
clf_step = ('clf', LogisticRegression(max_iter=1000, random_state=42))
pipeline = Pipeline(steps=[tfidf_step, clf_step])

# grid keys are "<step name>__<parameter name>"
param_grid = dict(
    # max vocab size
    tfidf__max_features=[3000, 5000, 7000],
    # unigrams only, unigrams + bigrams, unigrams + bigrams + trigrams
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    # min document frequency - removes rare words
    tfidf__min_df=[2, 3, 5],
    # max document frequency - removes common words
    tfidf__max_df=[0.7, 0.8, 0.9],
    # inverse regularization strength
    clf__C=[0.1, 1, 10],
    # optimization algo
    clf__solver=['saga', 'lbfgs'],
    # L2 regularization
    clf__penalty=['l2'],
)

# 5-fold cross-validation over every combination, scored by accuracy,
# with n_jobs=-1 to run fits in parallel
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# the pipeline vectorizes internally, so fit on the raw training texts
grid_search.fit(X_train, y_train)

In [None]:
# evaluate tuned model
# best_estimator_ is the whole pipeline (TF-IDF + classifier), so it is given the
# raw test texts rather than pre-computed vectors
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)
print(f"Tuned Test Accuracy: {best_model.score(X_test, y_test)}")

# Pass the full label list explicitly. Without it, classification_report infers
# the labels from y_test / y_pred_tuned and raises a ValueError when their count
# differs from len(class_names), e.g. when a rare class is missing from the test split.
all_labels = np.arange(len(class_names))

print("Tuned Classification Report:")
print(classification_report(y_test, y_pred_tuned, labels=all_labels, target_names=class_names))

In [None]:
# confusion matrix for the tuned model
# labels= pins the row/column order to class_names so the tick labels always line
# up with the matrix, even if some class never occurs in y_test or y_pred_tuned
cm = confusion_matrix(y_test, y_pred_tuned, labels=np.arange(len(class_names)))
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names, cmap='Blues')
plt.title("Confusion Matrix (Tuned LR)")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.tight_layout()

# Save BEFORE showing: once plt.show() has rendered the figure, pyplot no longer
# has it as the current figure, so a later savefig() writes an empty image.
plt.savefig('confusion_matrix_tuned_lr.png')
plt.show()

In [None]:
# --- Misclassified samples -------------------------------------------------
# Re-predict here so this cell does not depend on y_pred_tuned from an earlier cell.
y_pred_tuned = best_model.predict(X_test)
misclassified_mask = np.not_equal(y_test, y_pred_tuned)
misclassified_indices = np.flatnonzero(misclassified_mask)

# One row per misclassified test document: true class name vs. predicted class name
misclass_df = pd.DataFrame({
    'true_label': [class_names[label] for label in y_test[misclassified_indices]],
    'predicted_label': [class_names[label] for label in y_pred_tuned[misclassified_indices]],
})

# Count each (true, predicted) pair, most frequent confusion first
misclass_pairs = (
    misclass_df
    .groupby(['true_label', 'predicted_label'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

# --- Plot 1: the ten most frequent confusion pairs -------------------------
top_pairs = misclass_pairs.head(10)
bar_positions = range(len(top_pairs))
pair_labels = [
    f"{true_name} → {predicted_name}"
    for true_name, predicted_name in zip(top_pairs['true_label'], top_pairs['predicted_label'])
]

plt.figure(figsize=(10, 6))
plt.barh(bar_positions, top_pairs['count'], color='#3498db')
plt.yticks(bar_positions, pair_labels)
plt.xlabel('Number of Misclassifications', fontsize=12)
plt.title('Top 10 Misclassification Patterns (LR)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('presentation_misclass_patterns_lr.png', dpi=300, bbox_inches='tight')
plt.show()

# --- Per-class error rate ---------------------------------------------------
# Total  = test documents whose true class is the given class
# Errors = those of them that were predicted as some other class
class_ids = range(len(class_names))
error_by_class = pd.DataFrame({
    'Class': class_names,
    'Total': [np.sum(y_test == class_id) for class_id in class_ids],
    'Errors': [np.sum((y_test == class_id) & (y_pred_tuned != class_id)) for class_id in class_ids],
})
error_by_class = (
    error_by_class
    .assign(**{
        'Error Rate (%)': lambda frame: (frame['Errors'] / frame['Total'] * 100).round(1),
    })
    .sort_values('Error Rate (%)', ascending=False)
)

# --- Plot 2: error rate per class, classes above the median shown in red ----
median_error_rate = error_by_class['Error Rate (%)'].median()
colors = [
    '#e74c3c' if rate > median_error_rate else '#3498db'
    for rate in error_by_class['Error Rate (%)']
]

plt.figure(figsize=(10, 6))
plt.barh(error_by_class['Class'], error_by_class['Error Rate (%)'], color=colors)
plt.xlabel('Error Rate (%)', fontsize=12)
plt.title('Classification Error Rate by Class (LR)', fontsize=14, fontweight='bold')
plt.axvline(median_error_rate, color='gray', linestyle='--',
            linewidth=1, label='Median')
plt.legend()
plt.tight_layout()
plt.savefig('presentation_error_by_class_lr.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# feature analysis: highest-weighted TF-IDF terms for each class
tuned_vectorizer = best_model.named_steps['tfidf']
tuned_clf = best_model.named_steps['clf']
feature_names = np.array(tuned_vectorizer.get_feature_names_out())

# coef_ holds one row per entry of tuned_clf.classes_, except in the two-class case
# where it holds a single row for classes_[1]; negating that row gives the weights
# for classes_[0]. Indexing coef_ by position in class_names would raise IndexError there.
class_coefficients = tuned_clf.coef_
if class_coefficients.shape[0] == 1 and len(tuned_clf.classes_) == 2:
    class_coefficients = np.vstack([-class_coefficients, class_coefficients])

top_n = 10
# Walk classes_ (the labels seen while fitting) rather than class_names, so each
# coefficient row is paired with the right name even if a class was absent from training.
for row, class_id in enumerate(tuned_clf.classes_):
    class_label = class_names[class_id]
    # get and print top weights and features
    coefficient = class_coefficients[row]
    # argsort is ascending: keep the last top_n entries and reverse for largest-first
    top_indices = np.argsort(coefficient)[-top_n:][::-1]
    top_features = feature_names[top_indices]
    top_weights = coefficient[top_indices]
    print(f"Top features for class '{class_label}':")
    for feature, weight in zip(top_features, top_weights):
        print(f"{feature}: {weight:.4f}")

In [None]:
# save model
# Persists the whole tuned pipeline (TF-IDF vectorizer + classifier), so the saved
# file can be loaded and given raw text directly. joblib is imported with the other
# imports at the top of the notebook.
joblib.dump(best_model, 'lr_tuned.pkl')