<a href="https://colab.research.google.com/github/chutki26/dissertation-newscrawler/blob/main/ClassifyAllArticles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# TODO: try mounting Google Drive (drive.mount) for file access - see https://stackoverflow.com/questions/69869534/files-and-folders-in-google-colab

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
import json
from tqdm.auto import tqdm

In [None]:
import requests
import pickle

In [None]:
# Remote data locations. Every JSON file sits under the same branch of the
# dissertation-data repository, so each URL is the shared root plus a file name.
_DATA_ROOT = "https://raw.githubusercontent.com/chutki26/dissertation-data/refs/heads/main"

# training data
training_data_url = f"{_DATA_ROOT}/training_articles.json"

# article files, one per source
ap_news_url = f"{_DATA_ROOT}/ap_within_date.json"
bbc_url = f"{_DATA_ROOT}/bbc_within_date.json"
cnn_url = f"{_DATA_ROOT}/cnn_within_date.json"
dailymail_url = f"{_DATA_ROOT}/dailymail_within_date.json"
fox_news_url = f"{_DATA_ROOT}/foxnews_within_date.json"
guardian_url = f"{_DATA_ROOT}/guardian_within_date.json"
hindustantimes_url = f"{_DATA_ROOT}/hindustantimes_within_date.json"
independent_uk_url = f"{_DATA_ROOT}/independentuk_within_date.json"
india_url = f"{_DATA_ROOT}/india_within_date.json"
indianexpress_url = f"{_DATA_ROOT}/indianexpress_within_date.json"
nbc_url = f"{_DATA_ROOT}/nbc_within_date.json"
news18_url = f"{_DATA_ROOT}/news18_within_date.json"
newsweek_url = f"{_DATA_ROOT}/newsweek_within_date.json"
nypost_url = f"{_DATA_ROOT}/nypost_within_date.json"
usatoday_url = f"{_DATA_ROOT}/usatoday_within_date.json"

# all article sources, in the order they are concatenated later
article_urls = [
    ap_news_url,
    bbc_url,
    cnn_url,
    dailymail_url,
    fox_news_url,
    guardian_url,
    hindustantimes_url,
    independent_uk_url,
    india_url,
    indianexpress_url,
    nbc_url,
    news18_url,
    newsweek_url,
    nypost_url,
    usatoday_url,
]

In [None]:
# Pre-flight check: report every article URL that cannot be fetched.
# Any non-success status is reported (not just 404), and network failures are
# reported too instead of aborting the scan of the remaining URLs.
for url in article_urls:
    try:
        # stream=True defers the body download; only the status line and headers are needed here
        with requests.get(url, stream=True, timeout=30) as response:
            if not response.ok:
                print(f"{url} -> HTTP {response.status_code}")
    except requests.RequestException as error:
        print(f"{url} -> {error}")

In [None]:
# Load the training articles from the remote JSON file into a DataFrame.
df_training = pd.read_json(training_data_url)

In [None]:
# Read every source file and stack the resulting frames under one fresh 0..n-1 index.
df_articles = pd.concat(map(pd.read_json, article_urls), ignore_index=True)

In [None]:
# Build the `full_text` column for the training data.
# Missing values are replaced with "" first so the concatenation below never produces NaN.
for text_column in ('description', 'title', 'text'):
    df_training[text_column] = df_training[text_column].fillna("")

# title, description and body joined by single spaces
df_training['full_text'] = (
    df_training['title']
    + " "
    + df_training['description']
    + " "
    + df_training['text']
)

In [None]:
# convert "true" and "false" to 0 or 1

def convert_to_binary(value):
    """Map a relevance flag to an integer class label.

    Args:
        value: the raw flag. The string "true" (any casing, surrounding
            whitespace ignored) and the boolean ``True`` both count as relevant.

    Returns:
        1 for a relevant flag, 0 for anything else (including ``None``/NaN).
    """
    # A JSON boolean arrives as a bool rather than the string "true"; without
    # this branch every such row would silently be labelled 0.
    if isinstance(value, (bool, np.bool_)):
        return int(value)
    if isinstance(value, str):
        return 1 if value.strip().lower() == "true" else 0
    return 0

# derive the integer `label` column by applying convert_to_binary to each `relevant` value
df_training['label'] = df_training['relevant'].map(convert_to_binary)

In [None]:
# Build the `full_text` column for the articles to classify.
# Missing values are replaced with "" first so the concatenation below never produces NaN.
for text_column in ('description', 'title', 'text'):
    df_articles[text_column] = df_articles[text_column].fillna("")

# title, description and body joined by single spaces
df_articles['full_text'] = (
    df_articles['title']
    + " "
    + df_articles['description']
    + " "
    + df_articles['text']
)

In [None]:
# Write the training and article frames to pickle files in the working directory.
for frame, pickle_path in (
    (df_training, 'df_training.pkl'),
    (df_articles, 'df_articles.pkl'),
):
    frame.to_pickle(pickle_path)

In [None]:
# Cross-validation setup.
# A stratified split keeps the class (im)balance of the full data set in every fold.
# shuffle + a fixed random_state make the fold assignment reproducible.
n_splits = 5
skf = StratifiedKFold(
    shuffle=True,
    random_state=42,
    n_splits=n_splits,
)

In [None]:
def get_tokenizer(model_name="bert-base-uncased"):
    """Load the tokenizer that belongs to a pretrained checkpoint.

    Args:
        model_name: checkpoint name or local path passed to
            ``AutoTokenizer.from_pretrained``. Defaults to "bert-base-uncased".

    Returns:
        The tokenizer returned by ``AutoTokenizer.from_pretrained``.
    """
    return AutoTokenizer.from_pretrained(model_name)


def get_model(model_name="bert-base-uncased", num_labels=2):
    """Load a pretrained checkpoint with a sequence-classification head.

    Args:
        model_name: checkpoint name or local path passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
            Defaults to "bert-base-uncased".
        num_labels: number of output classes. Defaults to 2.

    Returns:
        The model returned by ``AutoModelForSequenceClassification.from_pretrained``.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )

In [None]:
# create dataset class
class NewsDataset(torch.utils.data.Dataset):
    """Dataset that pairs raw texts with labels and tokenizes on access.

    Each item is tokenized when it is requested, padded/truncated to
    512 tokens, and returned as a dict of tensors plus a 'labels' entry.
    """

    def __init__(self, texts, labels, tokenizer):
        """Store the texts, their labels and the tokenizer.

        Args:
            texts: indexable collection of input strings.
            labels: indexable collection of labels, same length as ``texts``.
            tokenizer: callable invoked as
                ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors=...)``.
        """
        assert len(texts) == len(labels), "Texts and labels must have the same length"
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it together with its label."""
        encoded = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )

        # return_tensors="pt" adds a leading batch axis; keep only its first row
        item = {}
        for name, tensor in encoded.items():
            item[name] = tensor[0]
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: a pair ``(predictions, labels)``; ``predictions`` holds
            per-class scores with the classes along axis 1.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    class_scores, labels = eval_pred
    # the predicted class is the index of the highest score in each row
    predictions = np.argmax(class_scores, axis=1)

    # zero_division=0: when no positive is predicted (or present) the affected
    # score is reported as 0 without emitting an "ill-defined" warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels,
        predictions,
        average='binary',
        zero_division=0,
    )
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
# Cross-validation loop
# Trains a fresh model on each fold and keeps the one with the highest validation F1.
best_f1_score = 0
best_model = None

# The tokenizer is the same for every fold, so it is loaded once rather than once per fold.
tokenizer = get_tokenizer()

# Newer transformers releases name this argument `eval_strategy`, older ones
# `evaluation_strategy`; use whichever the installed TrainingArguments declares.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

for fold, (train_idx, val_idx) in enumerate(tqdm(skf.split(df_training, df_training['label']), total=n_splits)):

    # fold 1 (index 0) seemed to be an outlier so it is skipped here.
    if fold == 0:
        continue

    print(f"\nFold {fold+1}/{n_splits}")

    # Split data
    train_df = df_training.iloc[train_idx]
    val_df = df_training.iloc[val_idx]

    # Start every fold from the pretrained checkpoint, not from the previous fold's weights
    model = get_model()

    # Create datasets
    train_dataset = NewsDataset(
        train_df['full_text'].tolist(),
        train_df['label'].tolist(),
        tokenizer
    )

    val_dataset = NewsDataset(
        val_df['full_text'].tolist(),
        val_df['label'].tolist(),
        tokenizer
    )

    # Training arguments: evaluate and checkpoint after every epoch so the
    # epoch with the best validation F1 can be restored at the end of training.
    training_args = TrainingArguments(
        output_dir=f'./results/fold-{fold}',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        logging_steps=10,
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none",
        **{eval_strategy_kwarg: "epoch"},
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Train model
    trainer.train()

    # Evaluate on this fold's validation split
    eval_results = trainer.evaluate()

    # Validation F1 for this fold (the 'f1' from compute_metrics, reported as 'eval_f1')
    eval_f1 = eval_results['eval_f1']

    # Keep the model of the best-scoring fold seen so far
    if eval_f1 > best_f1_score:
        best_f1_score = eval_f1
        best_model = model

  0%|          | 0/5 [00:00<?, ?it/s]


Fold 2/5


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6033,0.595556,0.7,0.0,0.0,0.0
2,0.5388,0.39648,0.9,0.785714,0.916667,0.846154
3,0.2822,0.467916,0.825,1.0,0.416667,0.588235


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Fold 3/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5799,0.594169,0.7,0.0,0.0,0.0
2,0.4295,0.519367,0.675,0.466667,0.583333,0.518519
3,0.2496,0.425575,0.8,0.611111,0.916667,0.733333


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Fold 4/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5961,0.588494,0.7,0.0,0.0,0.0
2,0.5096,0.357185,0.95,1.0,0.833333,0.909091
3,0.2687,0.231449,0.95,1.0,0.833333,0.909091


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Fold 5/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5821,0.597044,0.7,0.0,0.0,0.0
2,0.5954,0.500183,0.9,1.0,0.666667,0.8
3,0.3727,0.263128,0.9,0.9,0.75,0.818182


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
best_f1_score

0.9090909090909091

In [None]:
# Save the best fold's model to the local directory 'model'.
# `from_pt` is an option for loading checkpoints, not for saving them, so it is not passed here.
if best_model is None:
    raise RuntimeError("No model was selected: run the cross-validation loop before saving.")
best_model.save_pretrained('model')

In [None]:
# Reload the classifier from the local 'model' directory written by save_pretrained.
model_to_use = AutoModelForSequenceClassification.from_pretrained('model')

In [None]:
# Sanity check: classify a single article (row 5) before running the full data set.
tokenizer = get_tokenizer()

test_article = df_articles.iloc[5]



# same tokenizer settings as NewsDataset: pad/truncate to 512 tokens, PyTorch tensors
inputs = tokenizer(test_article['full_text'],  padding="max_length", truncation=True, max_length=512, return_tensors="pt")

# inference only, so gradient tracking is switched off
with torch.no_grad():
  logits = model_to_use(**inputs).logits

# index of the highest logit, displayed as the model's label name for that class
predicted_class_id = logits.argmax().item()
model_to_use.config.id2label[predicted_class_id]

'LABEL_1'

In [None]:
# Classify every article with the reloaded model and write the labelled frame to JSON.
tokenizer = get_tokenizer()

predicted_labels = []

# One forward pass per article; tqdm shows progress for this long-running loop.
for full_text in tqdm(df_articles['full_text']):
    inputs = tokenizer(full_text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")

    # inference only, so gradient tracking is switched off
    with torch.no_grad():
        logits = model_to_use(**inputs).logits

    # index of the highest logit, stored under the model's label name for that class
    predicted_class_id = logits.argmax().item()
    predicted_labels.append(model_to_use.config.id2label[predicted_class_id])

df_articles['predicted_label'] = predicted_labels
# DataFrame.to_json takes its target as `path_or_buf`; the keyword `path` raises a TypeError
df_articles.to_json(path_or_buf='classified_articles.json', orient="index")

KeyboardInterrupt: 

In [None]:
# Select the AP News rows. .copy() gives df_apnews its own data, so adding a
# column to it does not trigger pandas' SettingWithCopyWarning.
df_apnews = df_articles[df_articles['source_domain'] == 'apnews.com'].copy()

In [None]:
# Classify the AP News subset only and store the labelled frame as a pickle.
tokenizer = get_tokenizer()
id2label = model_to_use.config.id2label

predicted_labels = []

for full_text in tqdm(df_apnews['full_text']):
    # pad/truncate to 512 tokens and return PyTorch tensors
    inputs = tokenizer(
        full_text,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # inference only, so gradient tracking is switched off
    with torch.no_grad():
        logits = model_to_use(**inputs).logits

    # index of the highest logit, stored under the model's label name for that class
    predicted_class_id = logits.argmax().item()
    predicted_labels.append(id2label[predicted_class_id])

df_apnews['predicted_label'] = predicted_labels
df_apnews.to_pickle("ap_news.pkl")

  0%|          | 0/5953 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_apnews['predicted_label'] = predicted_labels


In [None]:
# Reload the labelled AP News frame from the pickle written by the classification cell.
df_ap_news = pd.read_pickle("ap_news.pkl")

In [None]:
# Share of AP News articles per predicted label (normalize=True gives proportions, not counts).
# LABEL_1 is class 1, i.e. the class convert_to_binary assigns to `relevant == "true"`.
df_ap_news['predicted_label'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
predicted_label,Unnamed: 1_level_1
LABEL_0,0.680329
LABEL_1,0.319671
