In [12]:
import numpy as np
import pandas as pd
import pickle
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.ensemble import BaggingClassifier as Bagging
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [13]:
# Integer class id (the dataset's `label` column) -> human-readable category.
# Insertion order matters: the values are used, in order, as report row names.
CATEGORY_MAPPING = {
    0: 'World',
    1: 'Sports',
    2: 'Business',
    3: 'Sci/Tech',
}

# Load and preprocess the data
def load_and_preprocess_data(dataset_name="fancyzhx/ag_news"):
    """Load the train/test splits of a dataset and add a text category column.

    Args:
        dataset_name: identifier handed to ``load_dataset``. Defaults to the
            AG News dataset id this notebook was written for.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames. Each frame is the
        corresponding split converted with ``to_pandas()`` plus a ``Category``
        column obtained by mapping the ``label`` column through
        ``CATEGORY_MAPPING``. Labels missing from the mapping become NaN
        (standard ``Series.map`` behaviour).
    """
    dataset = load_dataset(dataset_name)

    frames = []
    for split in ('train', 'test'):
        split_df = dataset[split].to_pandas()
        # Map numeric labels to text categories
        split_df['Category'] = split_df['label'].map(CATEGORY_MAPPING)
        frames.append(split_df)

    train_df, test_df = frames
    return train_df, test_df

In [21]:
# Bagging Classifier
class BaggingClassifier:
    """Bagged ensemble whose members are soft-voting classifiers.

    Every bagging member is a ``VotingClassifier`` (``voting='soft'``) over a
    logistic regression, a multinomial naive Bayes model and a decision tree
    limited to depth 10. The composed scikit-learn estimator is available as
    ``self.model``.

    NOTE: this wrapper shares its name with scikit-learn's own
    ``BaggingClassifier``; the file imports that class under the alias
    ``Bagging`` to avoid the clash.

    Args:
        n_estimators: number of voting ensembles built by the bagging wrapper.
        random_state: seed passed to the bagging wrapper.
        n_jobs: forwarded unchanged to the bagging wrapper's ``n_jobs``
            argument; the default ``None`` keeps the library default.
    """

    def __init__(self, n_estimators=50, random_state=42, n_jobs=None):
        self.base_models = [
            ('lr', LogisticRegression(max_iter=1000)),
            ('nb', MultinomialNB()),
            ('dt', DecisionTreeClassifier(max_depth=10))
        ]
        self.voting_clf = VotingClassifier(estimators=self.base_models, voting='soft')
        self.model = Bagging(
            estimator=self.voting_clf,
            n_estimators=n_estimators,
            random_state=random_state,
            n_jobs=n_jobs,
        )

    def fit(self, X, y):
        """Fit the underlying bagging model on ``X``/``y``.

        Returns:
            ``self``, so calls can be chained in the usual estimator style.
        """
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the underlying model's predictions for ``X``."""
        return self.model.predict(X)

    def save(self, filename):
        """Pickle the underlying model (not this wrapper) to ``filename``.

        The file is opened in binary write mode and overwritten if it exists.
        """
        with open(filename, "wb") as f:
            pickle.dump(self.model, f)

In [22]:
# Boosting Classifier
class BoostingClassifier:
    """AdaBoost ensemble whose base estimator is a weighted soft-voting classifier.

    The base estimator is a ``VotingClassifier`` (``voting='soft'``,
    ``weights=[2, 1, 1]``) over a logistic regression, a multinomial naive
    Bayes model and a decision tree limited to depth 100. The composed
    scikit-learn estimator is available as ``self.model``.

    NOTE(review): the base estimator here is a fairly strong learner (three
    models, tree depth 100); boosting is usually applied to weak learners, so
    confirm this configuration is intentional.

    Args:
        n_estimators: number of boosting rounds passed to ``AdaBoostClassifier``.
        random_state: seed passed to ``AdaBoostClassifier``.
        learning_rate: forwarded unchanged to ``AdaBoostClassifier``; the
            default ``1.0`` matches the library default.
    """

    def __init__(self, n_estimators=10, random_state=42, learning_rate=1.0):
        self.base_models = [
            ('lr', LogisticRegression(max_iter=1000)),
            ('nb', MultinomialNB()),
            ('dt', DecisionTreeClassifier(max_depth=100))
        ]
        # Logistic regression counts double in the probability average.
        self.voting_clf = VotingClassifier(estimators=self.base_models, voting='soft', weights=[2, 1, 1])
        self.model = AdaBoostClassifier(
            estimator=self.voting_clf,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=random_state,
        )

    def fit(self, X, y):
        """Fit the underlying AdaBoost model on ``X``/``y``.

        Returns:
            ``self``, so calls can be chained in the usual estimator style.
        """
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the underlying model's predictions for ``X``."""
        return self.model.predict(X)

    def save(self, filename):
        """Pickle the underlying model (not this wrapper) to ``filename``.

        The file is opened in binary write mode and overwritten if it exists.
        """
        with open(filename, "wb") as f:
            pickle.dump(self.model, f)

In [23]:
def evaluate_model(model, X_test, y_test, name):
    """Print accuracy and a per-class report for ``model`` on a test set.

    Args:
        model: any object exposing ``predict(X)``.
        X_test: features passed to ``model.predict``.
        y_test: true labels, expected to use the integer ids that are the keys
            of ``CATEGORY_MAPPING``.
        name: label used in the printed accuracy line.

    Returns:
        None. Results are written to stdout.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name} Accuracy: {accuracy:.4f}")

    # Pin labels and names together so each row is named after the right class
    # even if some class is absent from y_test / y_pred; pass real lists rather
    # than a dict view.
    print(
        classification_report(
            y_test,
            y_pred,
            labels=list(CATEGORY_MAPPING),
            target_names=list(CATEGORY_MAPPING.values()),
        )
    )

In [24]:
# Fetch both splits as DataFrames.
train_df, test_df = load_and_preprocess_data()

# Raw text is the model input; the integer `label` column is the target.
X_train_text, y_train = train_df['text'], train_df['label']
X_test_text, y_test = test_df['text'], test_df['label']

# Learn the TF-IDF vocabulary from the training text only, then apply the
# same fitted vectorizer to the test text.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Both ensembles are built with their default hyperparameters.
bagging_clf, boosting_clf = BaggingClassifier(), BoostingClassifier()

In [25]:
# Train, score and persist each ensemble in turn (bagging first, then boosting).
for ensemble, display_name, model_path in (
    (bagging_clf, "Bagging Classifier", "bagging_model.pkl"),
    (boosting_clf, "Boosting Classifier", "boosting_model.pkl"),
):
    ensemble.fit(X_train, y_train)
    evaluate_model(ensemble, X_test, y_test, display_name)
    ensemble.save(model_path)

# Persist the fitted vectorizer alongside the models.
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)


Bagging Classifier Accuracy: 0.9030
              precision    recall  f1-score   support

       World       0.92      0.90      0.91      1900
      Sports       0.94      0.98      0.96      1900
    Business       0.87      0.86      0.87      1900
    Sci/Tech       0.88      0.88      0.88      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600


Boosting Classifier Accuracy: 0.8550
              precision    recall  f1-score   support

       World       0.87      0.85      0.86      1900
      Sports       0.90      0.93      0.92      1900
    Business       0.84      0.81      0.82      1900
    Sci/Tech       0.81      0.83      0.82      1900

    accuracy                           0.85      7600
   macro avg       0.85      0.85      0.85      7600
weighted avg       0.85      0.85      0.85      7600

