## Cell 1: Import Libraries
### To set up our text classification project, I import the essential libraries and download the additional NLTK resources needed for data handling, preprocessing, model training, and evaluation.

In [1]:
import os
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK resources needed by the preprocessing step: the stop-word
# lists, the 'punkt' tokenizer data, and the WordNet corpus. Packages that are
# already installed are reported as up to date.
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rosha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rosha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rosha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Cell 2: Load and Preprocess News Data

### Here we are handling the loading and preprocessing of the news data. The dataset is organised into categories, and we iterate through each category to read text files, concatenate their content, and create a DataFrame. Text preprocessing functions, including lowercasing, punctuation removal, tokenization, stopword removal, and lemmatization, are applied to clean the textual data.

In [2]:
# Root folder of the corpus: one sub-folder per category, one text file per article.
# Paths are built from data_dir instead of calling os.chdir(), so this cell can be
# re-run without failing and the kernel's working directory is left untouched.
data_dir = "datasets_coursework1/bbc"

categories = ["business", "entertainment", "politics", "sport", "tech"]

news_data_list = []

for category in categories:
    category_dir = os.path.join(data_dir, category)
    # os.listdir() returns entries in arbitrary order; sorting fixes the row order
    # so the seeded train/val/test split selects the same articles on every machine.
    for text_file in sorted(os.listdir(category_dir)):
        file_path = os.path.join(category_dir, text_file)
        # latin-1 maps every byte to one character, exactly as 'unicode_escape' does,
        # but without treating backslashes in the article text as escape sequences.
        with open(file_path, encoding='latin-1') as file:
            content = ' '.join(file.readlines())
        news_data_list.append({'news': content, 'category': category})

# One row per article: raw text in 'news', folder name in 'category'.
df = pd.DataFrame(news_data_list)

# Resources shared by every preprocess_text() call. They are filled on first use so
# that the stop-word list is loaded once instead of once per document.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached punctuation table, stop-word set and lemmatizer."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['punctuation_table'] = str.maketrans('', '', string.punctuation)
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES


def preprocess_text(raw_text):
    """Normalise one document into a space-separated string of lemmatized tokens.

    The text is lowercased, every character listed in ``string.punctuation`` is
    deleted, the result is tokenized with ``word_tokenize``, English stop words
    are dropped, and each remaining token is passed through the WordNet
    lemmatizer.

    Parameters
    ----------
    raw_text : str
        The document to clean.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    resources = _get_preprocess_resources()
    stop_words = resources['stop_words']
    lemmatizer = resources['lemmatizer']

    # Punctuation is removed (not replaced by a space) before tokenizing,
    # so e.g. hyphenated words collapse into a single token.
    cleaned = raw_text.lower().translate(resources['punctuation_table'])
    tokens = word_tokenize(cleaned)
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(lemmas)

# Overwrite the raw article text with its cleaned form, one document at a time.
df['news'] = df['news'].map(preprocess_text)

## Cell 3: Train-Val-Test Split

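#### The dataset is split into training, validation, and test sets using stratified sampling: 20% of the data is held out for testing, and 20% of the remainder is used for validation (roughly 64% / 16% / 20% overall). The news content and the corresponding categories of each set are then stored in separate variables.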

In [3]:
# Step 1: hold out a stratified 20% of all articles as the test set.
train_val_df, test_df = train_test_split(
    df,
    test_size=0.20,
    random_state=22,
    stratify=df["category"],
)

# Step 2: split the remaining articles again (same seed, same fraction)
# to obtain the validation set.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.2,
    random_state=22,
    stratify=train_val_df["category"],
)

# Separate the text column from the label column for each partition.
train_n, val_n, test_n = (part["news"] for part in (train_df, val_df, test_df))
train_cat, val_cat, test_cat = (part["category"] for part in (train_df, val_df, test_df))

## Cell 4: Feature Extraction and Classification Functions

### This cell defines a single function that handles both feature extraction and classification. It takes the feature type, the data, and the categories as parameters, and adapts its operations to the specified feature type. The features are selected, scaled, and used to train several classifiers. The best-performing classifier on the validation set is then chosen for the final evaluation on the test set.

In [4]:
def _make_vectorizers(feature_type, max_features=3000):
    """Return the unfitted vectorizers that make up ``feature_type``, in column order."""
    single_types = ("absolute_word_freq", "ngram", "tfidf")
    if feature_type == "combined":
        selected = single_types
    elif feature_type in single_types:
        selected = (feature_type,)
    else:
        raise ValueError("Invalid feature_type. Supported values are: 'absolute_word_freq', 'ngram', 'tfidf', 'combined'.")

    vectorizers = []
    for name in selected:
        if name == "absolute_word_freq":
            vectorizers.append(CountVectorizer(max_features=max_features))
        elif name == "ngram":
            vectorizers.append(CountVectorizer(ngram_range=(1, 2), max_features=max_features))
        else:
            vectorizers.append(TfidfVectorizer(max_features=max_features))
    return vectorizers


def feature_and_classify(feature_type, train_data, val_data, test_data, train_category, val_category, test_category,
                         max_features=3000, k_best=200):
    """Extract features, train four classifiers, and report the best one on the test set.

    The pipeline is: vectorize the text, keep the top chi-squared features,
    scale them, fit SVM / Random Forest / Naive Bayes / Logistic Regression on
    the training set, print each model's validation accuracy, then evaluate
    the model with the highest validation accuracy on the test set. All fitted
    transformers are fitted on the training data only.

    Parameters
    ----------
    feature_type : str
        One of 'absolute_word_freq' (unigram counts), 'ngram' (uni- and bigram
        counts), 'tfidf', or 'combined' (the three matrices side by side).
    train_data, val_data, test_data : iterable of str
        Preprocessed documents of each partition.
    train_category, val_category, test_category : array-like
        Labels aligned with the corresponding documents.
    max_features : int, default 3000
        Vocabulary cap applied to every vectorizer.
    k_best : int, default 200
        Number of features kept by the chi-squared selection. It is reduced to
        the number of available features when the vocabulary is smaller.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If ``feature_type`` is not one of the supported values.
    """
    # Feature extraction: each vectorizer learns its vocabulary from the training
    # documents, then the resulting dense matrices are joined column-wise.
    vectorizers = _make_vectorizers(feature_type, max_features)
    train_feat = np.concatenate([vec.fit_transform(train_data).toarray() for vec in vectorizers], axis=1)
    val_feat = np.concatenate([vec.transform(val_data).toarray() for vec in vectorizers], axis=1)
    test_feat = np.concatenate([vec.transform(test_data).toarray() for vec in vectorizers], axis=1)

    # Feature selection; k is clamped because SelectKBest rejects a k larger
    # than the number of input features.
    k_best_selector = SelectKBest(chi2, k=min(k_best, train_feat.shape[1]))
    train_feat = k_best_selector.fit_transform(train_feat, train_category)
    val_feat = k_best_selector.transform(val_feat)
    test_feat = k_best_selector.transform(test_feat)

    # Scale to unit variance without centering (with_mean=False), so the
    # non-negative count / TF-IDF values stay non-negative.
    scaler = StandardScaler(with_mean=False)
    train_feat = scaler.fit_transform(train_feat)
    val_feat = scaler.transform(val_feat)
    test_feat = scaler.transform(test_feat)

    # Initialise classifiers
    classifiers = {
        'SVM': SVC(C=0.1, kernel='rbf'),
        'Random Forest': RandomForestClassifier(random_state=42),
        'Naive Bayes': MultinomialNB(),
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    }

    # Train each classifier and record its validation accuracy once, so the
    # model selection below does not have to predict again.
    val_accuracies = {}
    for name, clf in classifiers.items():
        clf.fit(train_feat, train_category)
        val_accuracies[name] = accuracy_score(val_category, clf.predict(val_feat))
        print(f"\nEvaluation for {name} ({feature_type}):")
        print("Accuracy: {:.2f}%".format(val_accuracies[name] * 100))

    # Select the best classifier based on validation performance
    # (on a tie, the classifier listed first wins).
    best_clf_name = max(val_accuracies, key=val_accuracies.get)
    best_clf = classifiers[best_clf_name]
    print(f"\nBest Classifier based on Validation Accuracy ({feature_type}): {best_clf_name}")

    # Evaluate the best classifier on the test set
    test_preds = best_clf.predict(test_feat)
    test_accuracy = accuracy_score(test_category, test_preds)
    test_classification_rep = classification_report(test_category, test_preds, output_dict=True)

    print(f"\nTest Set Evaluation with the Best Classifier ({feature_type}): {best_clf_name}")
    print("Test Accuracy: {:.2f}%".format(test_accuracy * 100))

    # Macro-averaged precision, recall, and F1-score, printed as percentages
    macro_avg = test_classification_rep['macro avg']
    print("Macro Precision: {:.2f}".format(macro_avg['precision'] * 100))
    print("Macro Recall: {:.2f}".format(macro_avg['recall'] * 100))
    print("Macro F1-score: {:.2f}".format(macro_avg['f1-score'] * 100))

### Cell 5: Model Training and Evaluation

#### Here the function is called once for each choice of features.

In [5]:
# Run the full pipeline once per feature representation, in a fixed order.
for feature_type in ("absolute_word_freq", "ngram", "tfidf", "combined"):
    feature_and_classify(feature_type, train_n, val_n, test_n, train_cat, val_cat, test_cat)


Evaluation for SVM (absolute_word_freq):
Accuracy: 86.80%

Evaluation for Random Forest (absolute_word_freq):
Accuracy: 95.22%

Evaluation for Naive Bayes (absolute_word_freq):
Accuracy: 96.07%

Evaluation for Logistic Regression (absolute_word_freq):
Accuracy: 94.94%

Best Classifier based on Validation Accuracy (absolute_word_freq): Naive Bayes

Test Set Evaluation with the Best Classifier (absolute_word_freq): Naive Bayes
Test Accuracy: 95.28%
Macro Precision: 95.26
Macro Recall: 95.29
Macro F1-score: 95.25

Evaluation for SVM (ngram):
Accuracy: 85.67%

Evaluation for Random Forest (ngram):
Accuracy: 94.94%

Evaluation for Naive Bayes (ngram):
Accuracy: 95.79%

Evaluation for Logistic Regression (ngram):
Accuracy: 95.22%

Best Classifier based on Validation Accuracy (ngram): Naive Bayes

Test Set Evaluation with the Best Classifier (ngram): Naive Bayes
Test Accuracy: 96.18%
Macro Precision: 96.13
Macro Recall: 96.16
Macro F1-score: 96.12

Evaluation for SVM (tfidf):
Accuracy: 87.92