In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
from sklearn.datasets import fetch_20newsgroups

In [4]:
with open('train_test_data.pkl', 'rb') as f:
    data = pickle.load(f)
X_train_full = data['X_train']
X_test = data['X_test']
y_train_full = data['y_train']
y_test = data['y_test']
feature_names = data['feature_names']
newsgroups = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
category_names = newsgroups.target_names

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_unlabeled, y_train, y_unlabeled = train_test_split(
    X_train_full, y_train_full,
    train_size=0.1,      
    random_state=42,
    stratify=y_train_full  
)

In [6]:
unique, counts = np.unique(y_train, return_counts=True)
for label, count in zip(unique, counts):
    print(f"Class {category_names[label]}: {count} samples")
    print(f"        ... ({len(unique)} categories total)")

Class alt.atheism: 37 samples
        ... (20 categories total)
Class comp.graphics: 45 samples
        ... (20 categories total)
Class comp.os.ms-windows.misc: 45 samples
        ... (20 categories total)
Class comp.sys.ibm.pc.hardware: 46 samples
        ... (20 categories total)
Class comp.sys.mac.hardware: 45 samples
        ... (20 categories total)
Class comp.windows.x: 47 samples
        ... (20 categories total)
Class misc.forsale: 46 samples
        ... (20 categories total)
Class rec.autos: 45 samples
        ... (20 categories total)
Class rec.motorcycles: 46 samples
        ... (20 categories total)
Class rec.sport.baseball: 46 samples
        ... (20 categories total)
Class rec.sport.hockey: 47 samples
        ... (20 categories total)
Class sci.crypt: 47 samples
        ... (20 categories total)
Class sci.electronics: 46 samples
        ... (20 categories total)
Class sci.med: 46 samples
        ... (20 categories total)
Class sci.space: 46 samples
        ... (20 categor

In [12]:
model=LogisticRegression(max_iter=1000,
                         solver='lbfgs',
                         verbose=1,
                         random_state = 42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

#accuracy
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.6075
Precision: 0.6305
Recall: 0.6075
F1 Score: 0.5921


In [17]:
report_dict = classification_report(
    y_test, y_pred,
    target_names=category_names,
    output_dict=True
)
class_f1 = {cat: report_dict[cat]['f1-score'] for cat in category_names}
best_classes=sorted(class_f1.items(), key=lambda x: x[1], reverse=True)[:5]
worst_classes=sorted(class_f1.items(), key=lambda x: x[1])[:5]

for cat,f1_score in best_classes:
    print(f"Best class: {cat} with F1-score: {f1_score:.4f}")
for cat,f1_score in worst_classes:
    print(f"Worst class: {cat} with F1-score: {f1_score:.4f}")

Best class: rec.sport.hockey with F1-score: 0.7837
Best class: talk.politics.mideast with F1-score: 0.7714
Best class: soc.religion.christian with F1-score: 0.6931
Best class: rec.sport.baseball with F1-score: 0.6693
Best class: sci.med with F1-score: 0.6615
Worst class: talk.religion.misc with F1-score: 0.0541
Worst class: alt.atheism with F1-score: 0.3750
Worst class: talk.politics.misc with F1-score: 0.4923
Worst class: comp.os.ms-windows.misc with F1-score: 0.5210
Worst class: comp.graphics with F1-score: 0.5213


In [20]:
with open('baseline_model.pkl', 'wb') as f:
    pickle.dump(model, f)

baseline_metrics = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'training_size': len(y_train),
    'test_size': len(y_test),
    'best_classes': best_classes,
    'worst_classes': worst_classes
}
with open('baseline_metrics.pkl', 'wb') as f:
    pickle.dump(baseline_metrics, f)