#### Import needed libraries

In [10]:
import pandas as pd
import numpy as np
import spacy
import string
import sklearn
from spacy import displacy
from pathlib import Path
from spacy import displacy
import os
from collections import Counter
import nltk
# Download the stopwords and tokenizer from nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from google.colab import drive
drive.mount('/content/drive')


# Load the English language model
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Downloading class dataset

In [2]:
# stored in my google drive
!unzip -q /content/drive/MyDrive/Colab\ Notebooks/datasets/BBC\ News\ Summary-kaggle-v2.zip

#### Importing data locally to panda frame

In [3]:
classes = os.listdir('/content/BBC News Summary/News Articles')
Articles_dir = '/content/BBC News Summary/News Articles/'

# initializing feature column data types
articles = []
file_arr = []
is_sport = []
is_biz = []
is_ent = []
is_politics = []
is_tech =[]
labels = []
for cls in classes:
    files = os.listdir(Articles_dir + cls)

    for file in files:
        article_file_path = Articles_dir + cls + '/' + file

        # Starting to append values for the different feature columns
        # extracting data from files
        try:
            with open (article_file_path,'r') as f:
                articles.append('.'.join([line.rstrip() for line in f.readlines()]))

            file_arr.append(cls + '/' + file)
            labels.append(cls)
            match cls:
              case "sport":
                is_sport.append(1)
                is_biz.append(0)
                is_ent.append(0)
                is_politics.append(0)
                is_tech.append(0)
              case "tech":
                is_sport.append(0)
                is_biz.append(0)
                is_ent.append(0)
                is_politics.append(0)
                is_tech.append(1)
              case "business":
                is_sport.append(0)
                is_biz.append(1)
                is_ent.append(0)
                is_politics.append(0)
                is_tech.append(0)
              case "entertainment":
                is_sport.append(0)
                is_biz.append(0)
                is_ent.append(1)
                is_politics.append(0)
                is_tech.append(0)
              case "politics":
                is_sport.append(0)
                is_biz.append(0)
                is_ent.append(0)
                is_politics.append(1)
                is_tech.append(0)
        except:
            pass

df = pd.DataFrame({'File_path':file_arr,'Articles text': articles, 'label': labels, 'Business': is_biz, 'Entertainment': is_ent, 'Politics': is_politics, 'Sport': is_sport, 'Tech': is_tech})
df.head()

Unnamed: 0,File_path,Articles text,label,Business,Entertainment,Politics,Sport,Tech
0,sport/487.txt,Officials respond in court row..Australian ten...,sport,0,0,0,1,0
1,sport/257.txt,Spain coach faces racism inquiry..Spain's Foot...,sport,0,0,0,1,0
2,sport/471.txt,Capriati out of Australian Open..Jennifer Capr...,sport,0,0,0,1,0
3,sport/252.txt,Ferguson rues failure to cut gap..Boss Sir Ale...,sport,0,0,0,1,0
4,sport/158.txt,Stam spices up Man Utd encounter..AC Milan def...,sport,0,0,0,1,0


## 2. Data Cleaning and Appending


Preprocessing data is essentially to getting optimal model results. I used a modular approach and created a function to process each "Articles text" entry in the data.  I first began with tokenizing the text which, I noticed, better helped identify sets of words and distinguish puntuation more properly. I followed by lowercasing all the words and removing the following: punctuation, stopwords, and duplicate words.

The process_text function returns the clean data as "processed_text" and gets added as a new data column. I also reused another function called get_token_count, to return and add a count of the tokens of the original Articles text and processed_text. This gave me great initial insight to see how much information was reduced, and to see how much was relevent.

In [47]:
def preprocess_text(text):
    # Convert text to lowercase
    words = word_tokenize(text)

    # Convert words to lowercase
    words = [word.lower() for word in words]

    # Remove punctuation from words
    words = [word for word in words if word.isalnum()]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Remove duplicate words
    unique_words = list(dict.fromkeys(words))

    # Join the words back into a string
    text = ' '.join(unique_words)

    return text

def get_token_count(text):
    words = word_tokenize(text)
    return len(words)


# Apply the function to the DataFrame column
df['processed_text'] = df['Articles text'].apply(preprocess_text)
df['processed_text_token_count'] = df['processed_text'].apply(get_token_count)
df['Articles_text_token_count'] = df['Articles text'].apply(get_token_count)

display(df.head())
print("sample of cleaned text \n", df['processed_text'][0])

Unnamed: 0,File_path,Articles text,label,Business,Entertainment,Politics,Sport,Tech,processed_text,processed_text_token_count,Articles_text_token_count
0,sport/487.txt,Officials respond in court row..Australian ten...,sport,0,0,0,1,0,officials respond court row australian tennis ...,80,247
1,sport/257.txt,Spain coach faces racism inquiry..Spain's Foot...,sport,0,0,0,1,0,spain coach faces racism inquiry football fede...,106,288
2,sport/471.txt,Capriati out of Australian Open..Jennifer Capr...,sport,0,0,0,1,0,capriati australian open jennifer become third...,57,146
3,sport/252.txt,Ferguson rues failure to cut gap..Boss Sir Ale...,sport,0,0,0,1,0,ferguson rues failure cut gap boss sir alex le...,78,220
4,sport/158.txt,Stam spices up Man Utd encounter..AC Milan def...,sport,0,0,0,1,0,stam spices man utd encounter ac milan defende...,146,447


'sample of cleaned text \n'

'officials respond court row australian tennis top official defended open courts melbourne park criticism playing surface lleyton hewitt said gutful trying persuade make faster australia geoff pollard rejected comments ask made last year knowledge spent substantial amount money modifications got past fourth round seven attempts earlier worthy prestigious tournament would play whatever order chance winning starts 17 january speeded since following complaints several players refused accept may bearing performances years proving versatile surfaces one beat roger federer davis cup 2003'

#### Exploratory Data Analysis

In [55]:
doc = nlp(df['processed_text'][0])

# produce spacy ent results
displacy.render(doc, style='ent', minify=True, jupyter=True)

options = {"compact": True, "bg": "#09a3d5","color": "white", "font": "Source Sans Pro", "distance": 70}
displacy.render(doc, style="dep",minify=True,jupyter=True, options=options)


In [17]:
biz_count = df['Business'].value_counts().get(1,0)
ent_count = df['Entertainment'].value_counts().get(1,0)
pol_count = df['Politics'].value_counts().get(1,0)
sport_count = df['Sport'].value_counts().get(1,0)
tech_count = df['Tech'].value_counts().get(1,0)

print(f'Business: {biz_count}')
print(f'Entertainment: {ent_count}')
print(f'Politics: {pol_count}')
print(f'Sport: {sport_count}')
print(f'Tech: {tech_count}')

Business: 510
Entertainment: 386
Politics: 417
Sport: 510
Tech: 401


## Train, test split

In [23]:
X = df['processed_text'].values
y = df[['Business', 'Entertainment', 'Politics', 'Sport', 'Tech']].values

# # Step 1: Convert multi-labels to binary matrix
# mlb = MultiLabelBinarizer()
# y = mlb.fit_transform(y.values)

X = np.array(X).reshape(-1, 1)


# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print("Label distribution in test set:", np.sum(y_test, axis=0))
print("Label distribution in entire dataset:", np.sum(y, axis=0))

display(y_test)

# Reshape to reshape nparrays from 2d to 1d for vectorization
X_train = X_train.reshape(-1)
X_test = X_test.reshape(-1)


Label distribution in test set: [129  92  91 143 101]
Label distribution in entire dataset: [510 386 417 510 401]


array([[0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1]])

#### Apply TF-IDF Vectorization

In [24]:
# Step 3:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

display(X_test_tfidf)

<556x24922 sparse matrix of type '<class 'numpy.float64'>'
	with 80786 stored elements in Compressed Sparse Row format>

#### Apply SVM

OneVsRestClassifier is a simple and effective method for adapting binary classifiers for multi-class or multi-label problems, but it may not always be the most efficient in terms of computation and memory usage, especially when the number of classes is large.

In [25]:
# Step 4: Build and evaluate the Support Vector Machine (SVM) classifier
svm_classifier = SVC(kernel='linear', random_state=42, probability=True)

#
clf = OneVsRestClassifier(svm_classifier)
# Fit the classifier
clf.fit(X_train_tfidf, y_train)
# Predictions on the test set
y_pred_tfidf = clf.predict(X_test_tfidf)

Precision: 0.9866542121150914


## Evaluate Metrics

The predict_proba() function in sklearn.svm.svc works by internally using the decision function to calculate the probabilities. The decision function calculates a score for each class, and the class with the highest score is predicted as the output.

In [40]:
# Evaluate using classification metrics
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
precision_tfidf = precision_score(y_test, y_pred_tfidf, average='weighted')
recall_tfidf = recall_score(y_test, y_pred_tfidf, average='weighted')
f1_score_tfidf = f1_score(y_test, y_pred_tfidf, average='weighted')
roc_auc_tfidf = roc_auc_score(y_test, clf.predict_proba(X_test_tfidf))

# Step 5: Display the classification metrics
print("Results using TF-IDF Vectorization and Support Vector Machine (SVM) Classifier:")
print(f"Accuracy: {accuracy_tfidf:.2}")
print(f"Precision: {precision_tfidf:.2}")
print(f"Recall: {recall_tfidf:.2}")
print(f"F1 Score: {f1_score_tfidf:.2}")
print(f"AUC-ROC Score: {roc_auc_tfidf:.2}")


Results using TF-IDF Vectorization and Support Vector Machine (SVM) Classifier:
Accuracy:0.93
Precision: 0.9866542121150914
Recall: 0.9388489208633094
F1 Score: 0.96169317663468
AUC-ROC Score: 0.9979131111435489


In [38]:
for i, class_label in enumerate(['Business', 'Entertainment', 'Politics', 'Sport', 'Tech']):
    print(f"Confusion matrix for {class_label}:")
    print(confusion_matrix(y_test[:, i], y_pred_tfidf[:, i]))

Confusion matrix for Business:
[[425   2]
 [ 15 114]]
Confusion matrix for Entertainment:
[[463   1]
 [  5  87]]
Confusion matrix for Politics:
[[464   1]
 [  9  82]]
Confusion matrix for Sport:
[[412   1]
 [  0 143]]
Confusion matrix for Tech:
[[453   2]
 [  5  96]]


In [33]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_tfidf, target_names=['Business', 'Entertainment', 'Politics', 'Sport', 'Tech']))

               precision    recall  f1-score   support

     Business       0.98      0.88      0.93       129
Entertainment       0.99      0.95      0.97        92
     Politics       0.99      0.90      0.94        91
        Sport       0.99      1.00      1.00       143
         Tech       0.98      0.95      0.96       101

    micro avg       0.99      0.94      0.96       556
    macro avg       0.99      0.94      0.96       556
 weighted avg       0.99      0.94      0.96       556
  samples avg       0.94      0.94      0.94       556



  _warn_prf(average, modifier, msg_start, len(result))


References:

https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html


#### Create and output visualization (SVGs)

In [None]:
# Running code on entire dataset
options = {"compact": True, "bg": "#09a3d5","color": "white", "font": "Source Sans Pro"}

for ind in tmp_df.index:
  words = tmp_df["claim_wo_stop_words"][ind]
  listToStr = ' '.join([str(elem) for elem in words])
  doc = nlp(listToStr)

  # produce spacy ent results
  svg = displacy.render(doc, style='ent', minify=True, jupyter=False)
  file_name = "claim-" + str(ind) + "-ent.svg"
  dir_path= Path("~/visualResults/ent/")
  output_path = Path("visualResults/ent/" + file_name)
  isExist = os.path.exists(dir_path)
  output_path.open("w", encoding="utf-8").write(svg)

  svg = displacy.render(doc, style="dep",minify=True,jupyter=False, options=options)
  file_name = "claim-" + str(ind) + "-dep.svg"
  dir_path= Path("~/visualResults/dep/")
  output_path = Path("visualResults/dep/" + file_name)
  output_path.open("w", encoding="utf-8").write(svg)



In [None]:
# example of name entity recognition
words = tmp_df["claim"][1534]
doc = nlp(words)
svg = displacy.render(doc, style='ent', jupyter=False)
file_name = "aclaim-solo-test-dep.svg"
dir_path= Path("~/visualResults/ent/")
output_path = Path("visualResults/ent/" + file_name)
output_path.open("w", encoding="utf-8").write(svg)
displacy.render(doc, style='ent', jupyter=True)


In [None]:
# example of Visualize POS Tags
options = {"compact": True, "bg": "#09a3d5","color": "white", "font": "Source Sans Pro"}
print(doc)
svg = displacy.render(doc, style="dep",minify=True,jupyter=False, options=options)
file_name = "aclaim-solo-test-dep.svg"
dir_path= Path("~/visualResults/dep/")
output_path = Path("visualResults/dep/" + file_name)
output_path.open("w", encoding="utf-8").write(svg)
displacy.render(doc, style='dep', jupyter=True)

Over the last decade, heatwaves are five times more likely than if there had been no global warming.


In [None]:
!jupyter nbconvert --to pdf

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr