In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import load_npz, save_npz, hstack
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.datasets import fetch_20newsgroups
import joblib
import time
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
plt.style.use('seaborn-v0_8-darkgrid')

print("Step 5: Advanced Feature Engineering & Optimization")
print("Current Best: SVM with 0.7362 Macro-F1")
print("Target: 0.80+ Macro-F1")
print("="*80)


Step 5: Advanced Feature Engineering & Optimization
Current Best: SVM with 0.7362 Macro-F1
Target: 0.80+ Macro-F1


In [2]:
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, 
                                       remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, 
                                      remove=('headers', 'footers', 'quotes'))

X_train_full = newsgroups_train.data
y_train_full = newsgroups_train.target
X_test = newsgroups_test.data
y_test = newsgroups_test.target

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.15, stratify=y_train_full, random_state=42
)

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")


Train: 9616, Val: 1698, Test: 7532


In [3]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

additional_stopwords = {'ax', 'max', 'article', 'writes', 'posting', 'post', 'subject', 'lines', 'organization'}
stop_words = list(ENGLISH_STOP_WORDS.union(additional_stopwords))

def clean_text_advanced(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', ' ', text, flags=re.MULTILINE)
    text = re.sub(r'\S+@\S+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

X_train_clean_adv = [clean_text_advanced(doc) for doc in X_train]
X_val_clean_adv = [clean_text_advanced(doc) for doc in X_val]
X_test_clean_adv = [clean_text_advanced(doc) for doc in X_test]

print("Advanced cleaning applied")
print(f"Sample: {X_train_clean_adv[0][:200]}")


Advanced cleaning applied
Sample: y all lighten up on harry skip ll be like that in a couple of years harry s a great personality he s the reason i like cubs broadcasts it s certainly not the quality of the team chop chop michael mule


In [4]:
tfidf_v1 = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=50000,
    min_df=2,
    max_df=0.7,
    sublinear_tf=True,
    stop_words=stop_words
)

X_train_v1 = tfidf_v1.fit_transform(X_train_clean_adv)
X_val_v1 = tfidf_v1.transform(X_val_clean_adv)

print(f"\nStrategy 1 - Optimized Word TF-IDF:")
print(f"  Features: {X_train_v1.shape[1]:,}")
print(f"  Sparsity: {(1.0 - X_train_v1.nnz / (X_train_v1.shape[0] * X_train_v1.shape[1]))*100:.2f}%")

svm_v1 = LinearSVC(C=1.0, max_iter=2000, dual=False, random_state=42)
svm_v1.fit(X_train_v1, y_train)
y_val_pred_v1 = svm_v1.predict(X_val_v1)
f1_v1 = f1_score(y_val, y_val_pred_v1, average='macro')

print(f"  SVM Macro-F1: {f1_v1:.4f}")
print(f"  Gain over baseline: {f1_v1 - 0.7362:+.4f}")



Strategy 1 - Optimized Word TF-IDF:
  Features: 50,000
  Sparsity: 99.86%
  SVM Macro-F1: 0.7581
  Gain over baseline: +0.0219


In [5]:
tfidf_word_v2 = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=40000,
    min_df=3,
    max_df=0.8,
    sublinear_tf=True,
    stop_words=stop_words,
    analyzer='word'
)

tfidf_char_v2 = TfidfVectorizer(
    ngram_range=(3, 6),
    max_features=30000,
    min_df=3,
    max_df=0.95,
    sublinear_tf=True,
    analyzer='char'
)

X_train_word_v2 = tfidf_word_v2.fit_transform(X_train_clean_adv)
X_train_char_v2 = tfidf_char_v2.fit_transform(X_train)

X_train_v2 = hstack([X_train_word_v2, X_train_char_v2])

X_val_word_v2 = tfidf_word_v2.transform(X_val_clean_adv)
X_val_char_v2 = tfidf_char_v2.transform(X_val)
X_val_v2 = hstack([X_val_word_v2, X_val_char_v2])

print(f"\nStrategy 2 - Enhanced Hybrid (Word+Char):")
print(f"  Features: {X_train_v2.shape[1]:,}")
print(f"  Sparsity: {(1.0 - X_train_v2.nnz / (X_train_v2.shape[0] * X_train_v2.shape[1]))*100:.2f}%")

svm_v2 = LinearSVC(C=1.0, max_iter=2000, dual=False, random_state=42)
svm_v2.fit(X_train_v2, y_train)
y_val_pred_v2 = svm_v2.predict(X_val_v2)
f1_v2 = f1_score(y_val, y_val_pred_v2, average='macro')

print(f"  SVM Macro-F1: {f1_v2:.4f}")
print(f"  Gain over baseline: {f1_v2 - 0.7362:+.4f}")



Strategy 2 - Enhanced Hybrid (Word+Char):
  Features: 70,000
  Sparsity: 97.81%
  SVM Macro-F1: 0.7454
  Gain over baseline: +0.0092


In [6]:
tfidf_v3 = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=30000,
    min_df=5,
    max_df=0.5,
    sublinear_tf=True,
    stop_words=stop_words
)

X_train_v3 = tfidf_v3.fit_transform(X_train_clean_adv)
X_val_v3 = tfidf_v3.transform(X_val_clean_adv)

print(f"\nStrategy 3 - Aggressive Filtering (max_df=0.5):")
print(f"  Features: {X_train_v3.shape[1]:,}")

svm_v3 = LinearSVC(C=1.0, max_iter=2000, dual=False, random_state=42)
svm_v3.fit(X_train_v3, y_train)
y_val_pred_v3 = svm_v3.predict(X_val_v3)
f1_v3 = f1_score(y_val, y_val_pred_v3, average='macro')

print(f"  SVM Macro-F1: {f1_v3:.4f}")
print(f"  Gain over baseline: {f1_v3 - 0.7362:+.4f}")



Strategy 3 - Aggressive Filtering (max_df=0.5):
  Features: 21,447
  SVM Macro-F1: 0.7406
  Gain over baseline: +0.0044


In [7]:
best_strategy = max([(f1_v1, 'v1', X_train_v1, X_val_v1), 
                      (f1_v2, 'v2', X_train_v2, X_val_v2),
                      (f1_v3, 'v3', X_train_v3, X_val_v3)], key=lambda x: x[0])

best_f1, best_name, X_train_best, X_val_best = best_strategy

print(f"\nBest Feature Strategy: {best_name} with F1={best_f1:.4f}")
print("\nStarting Grid Search on SVM hyperparameters...")

param_grid = {
    'C': [0.5, 1.0, 2.0, 5.0, 10.0],
    'max_iter': [2000]
}

svm_grid = LinearSVC(dual=False, random_state=42)
grid_search = GridSearchCV(
    svm_grid,
    param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1
)

start = time.time()
grid_search.fit(X_train_best, y_train)
grid_time = time.time() - start

print(f"\nGrid Search Complete ({grid_time:.1f}s)")
print(f"Best params: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

best_svm = grid_search.best_estimator_
y_val_pred_best = best_svm.predict(X_val_best)
f1_tuned = f1_score(y_val, y_val_pred_best, average='macro')

print(f"Validation Macro-F1: {f1_tuned:.4f}")
print(f"Gain over baseline: {f1_tuned - 0.7362:+.4f}")



Best Feature Strategy: v1 with F1=0.7581

Starting Grid Search on SVM hyperparameters...
Fitting 5 folds for each of 5 candidates, totalling 25 fits

Grid Search Complete (9.5s)
Best params: {'C': 1.0, 'max_iter': 2000}
Best CV score: 0.7455
Validation Macro-F1: 0.7581
Gain over baseline: +0.0219


In [8]:
print("\nTesting Logistic Regression with optimal features...")

param_grid_lr = {
    'C': [0.5, 1.0, 2.0, 5.0, 10.0],
    'solver': ['lbfgs'],
    'multi_class': ['multinomial']
}

lr_grid = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)
grid_search_lr = GridSearchCV(
    lr_grid,
    param_grid_lr,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1
)

grid_search_lr.fit(X_train_best, y_train)

print(f"\nBest params: {grid_search_lr.best_params_}")
print(f"Best CV score: {grid_search_lr.best_score_:.4f}")

best_lr = grid_search_lr.best_estimator_
y_val_pred_lr = best_lr.predict(X_val_best)
f1_lr_tuned = f1_score(y_val, y_val_pred_lr, average='macro')

print(f"Validation Macro-F1: {f1_lr_tuned:.4f}")



Testing Logistic Regression with optimal features...
Fitting 5 folds for each of 5 candidates, totalling 25 fits

Best params: {'C': 10.0, 'multi_class': 'multinomial', 'solver': 'lbfgs'}
Best CV score: 0.7396
Validation Macro-F1: 0.7481


In [9]:
print("\nBuilding Voting Ensemble...")

ensemble_voting = VotingClassifier(
    estimators=[
        ('svm', best_svm),
        ('lr', best_lr),
        ('nb', MultinomialNB(alpha=0.1))
    ],
    voting='hard',
    n_jobs=-1
)

ensemble_voting.fit(X_train_best, y_train)
y_val_pred_ensemble = ensemble_voting.predict(X_val_best)
f1_ensemble = f1_score(y_val, y_val_pred_ensemble, average='macro')

print(f"Ensemble Voting Macro-F1: {f1_ensemble:.4f}")
print(f"Gain over best single model: {f1_ensemble - max(f1_tuned, f1_lr_tuned):+.4f}")



Building Voting Ensemble...
Ensemble Voting Macro-F1: 0.7567
Gain over best single model: -0.0014


In [12]:
results_step5 = pd.DataFrame({
    'Approach': [
        'Baseline (Step 4)',
        'Strategy 1: Word(1-3) + stopwords',
        'Strategy 2: Word+Char Enhanced',
        'Strategy 3: Aggressive Filter',
        'Best + SVM GridSearch',
        'Best + LogReg GridSearch',
        'Voting Ensemble'
    ],
    'Macro_F1': [
        0.7362,
        f1_v1,
        f1_v2,
        f1_v3,
        f1_tuned,
        f1_lr_tuned,
        f1_ensemble
    ],
    'Gain': [
        0,
        f1_v1 - 0.7362,
        f1_v2 - 0.7362,
        f1_v3 - 0.7362,
        f1_tuned - 0.7362,
        f1_lr_tuned - 0.7362,
        f1_ensemble - 0.7362
    ]
})

results_step5 = results_step5.sort_values('Macro_F1', ascending=False)

print("\n" + "="*80)
print("STEP 5 RESULTS: Feature Engineering & Optimization")
print("="*80)
print(results_step5.round(4).to_string(index=False))
print("="*80)

best_approach = results_step5.iloc[0]


STEP 5 RESULTS: Feature Engineering & Optimization
                         Approach  Macro_F1   Gain
Strategy 1: Word(1-3) + stopwords    0.7581 0.0219
            Best + SVM GridSearch    0.7581 0.0219
                  Voting Ensemble    0.7567 0.0205
         Best + LogReg GridSearch    0.7481 0.0119
   Strategy 2: Word+Char Enhanced    0.7454 0.0092
    Strategy 3: Aggressive Filter    0.7406 0.0044
                Baseline (Step 4)    0.7362 0.0000


In [13]:
if best_approach['Approach'] == 'Voting Ensemble':
    joblib.dump(ensemble_voting, 'models/best_step5_ensemble.pkl')
    final_model = ensemble_voting
    print("\nðŸ’¾ Saved: models/best_step5_ensemble.pkl")
elif 'SVM' in best_approach['Approach']:
    joblib.dump(best_svm, 'models/best_step5_svm.pkl')
    final_model = best_svm
    print("\nðŸ’¾ Saved: models/best_step5_svm.pkl")
else:
    joblib.dump(best_lr, 'models/best_step5_lr.pkl')
    final_model = best_lr
    print("\nðŸ’¾ Saved: models/best_step5_lr.pkl")

if best_name == 'v1':
    joblib.dump(tfidf_v1, 'models/tfidf_best_step5.pkl')
elif best_name == 'v2':
    joblib.dump(tfidf_word_v2, 'models/tfidf_word_best_step5.pkl')
    joblib.dump(tfidf_char_v2, 'models/tfidf_char_best_step5.pkl')
else:
    joblib.dump(tfidf_v3, 'models/tfidf_best_step5.pkl')

print("ðŸ’¾ Saved vectorizers")



ðŸ’¾ Saved: models/best_step5_lr.pkl
ðŸ’¾ Saved vectorizers


In [14]:
category_names = newsgroups_train.target_names

if best_approach['Approach'] == 'Voting Ensemble':
    y_val_final = y_val_pred_ensemble
elif 'SVM' in best_approach['Approach']:
    y_val_final = y_val_pred_best
else:
    y_val_final = y_val_pred_lr

print("\n" + "="*80)
print(f"CLASSIFICATION REPORT - {best_approach['Approach']}")
print("="*80)
print(classification_report(y_val, y_val_final, target_names=category_names, digits=4))



CLASSIFICATION REPORT - Strategy 1: Word(1-3) + stopwords
                          precision    recall  f1-score   support

             alt.atheism     0.7000    0.5833    0.6364        72
           comp.graphics     0.7128    0.7614    0.7363        88
 comp.os.ms-windows.misc     0.7011    0.6854    0.6932        89
comp.sys.ibm.pc.hardware     0.6250    0.6818    0.6522        88
   comp.sys.mac.hardware     0.7901    0.7356    0.7619        87
          comp.windows.x     0.8171    0.7528    0.7836        89
            misc.forsale     0.7805    0.7273    0.7529        88
               rec.autos     0.4931    0.7978    0.6094        89
         rec.motorcycles     0.8462    0.7333    0.7857        90
      rec.sport.baseball     0.7938    0.8652    0.8280        89
        rec.sport.hockey     0.9512    0.8667    0.9070        90
               sci.crypt     0.8831    0.7640    0.8193        89
         sci.electronics     0.7228    0.8202    0.7684        89
                