## Data loading

In [271]:
import os
import pandas as pd

# Walk up the directory tree until the 'news_data' folder is visible from the
# current working directory.
while 'news_data' not in os.listdir():
    if os.path.dirname(os.getcwd()) == os.getcwd():
        # Already at the filesystem root: fail loudly instead of looping forever.
        raise FileNotFoundError("Could not find a 'news_data' directory in any parent directory")
    os.chdir('..')

# Every entry of news_data is read as a CSV file; its first column becomes the index.
file_names = [os.path.join('news_data', file_name) for file_name in os.listdir('news_data')]
dfs = [pd.read_csv(file_name, index_col=0) for file_name in file_names]

df = pd.concat(dfs)
df.head()

Unnamed: 0,news_headline,news_article,news_category
0,"After Musk tweets 'Use Signal', unrelated stoc...",After Elon Musk endorsed encrypted messaging a...,technology
1,Donald Trump permanently banned from Twitter,Twitter has permanently banned US President Do...,technology
2,Man finds father's image on Google Earth 7 yea...,"A man in Japan found an image of his father, w...",technology
3,Bad sign: Mexican Prez after Trump's social me...,Following the suspension of US President Donal...,technology
4,Signal app registration system crashes after M...,Private encrypted messaging app Signal tweeted...,technology


In [272]:
# Headline and article concatenated into one text field, separated by '. '.
# Column-wise string concatenation gives the same text as the row-wise apply
# for string columns, without one Python call per row.
df['joined'] = df['news_headline'] + '. ' + df['news_article']
df.head()

Unnamed: 0,news_headline,news_article,news_category,joined
0,"After Musk tweets 'Use Signal', unrelated stoc...",After Elon Musk endorsed encrypted messaging a...,technology,"After Musk tweets 'Use Signal', unrelated stoc..."
1,Donald Trump permanently banned from Twitter,Twitter has permanently banned US President Do...,technology,Donald Trump permanently banned from Twitter. ...
2,Man finds father's image on Google Earth 7 yea...,"A man in Japan found an image of his father, w...",technology,Man finds father's image on Google Earth 7 yea...
3,Bad sign: Mexican Prez after Trump's social me...,Following the suspension of US President Donal...,technology,Bad sign: Mexican Prez after Trump's social me...
4,Signal app registration system crashes after M...,Private encrypted messaging app Signal tweeted...,technology,Signal app registration system crashes after M...


In [273]:
from sklearn.preprocessing import LabelEncoder

# Encode the category names as integer ids.
# LabelEncoder.fit collects the distinct classes itself, so np.unique is not needed;
# numpy is only imported further down, which made this cell fail on a fresh kernel.
enc = LabelEncoder().fit(df['news_category'])
labels_num = enc.transform(df['news_category'])

df['news_category_num'] = labels_num

df.head()

Unnamed: 0,news_headline,news_article,news_category,joined,news_category_num
0,"After Musk tweets 'Use Signal', unrelated stoc...",After Elon Musk endorsed encrypted messaging a...,technology,"After Musk tweets 'Use Signal', unrelated stoc...",5
1,Donald Trump permanently banned from Twitter,Twitter has permanently banned US President Do...,technology,Donald Trump permanently banned from Twitter. ...,5
2,Man finds father's image on Google Earth 7 yea...,"A man in Japan found an image of his father, w...",technology,Man finds father's image on Google Earth 7 yea...,5
3,Bad sign: Mexican Prez after Trump's social me...,Following the suspension of US President Donal...,technology,Bad sign: Mexican Prez after Trump's social me...,5
4,Signal app registration system crashes after M...,Private encrypted messaging app Signal tweeted...,technology,Signal app registration system crashes after M...,5


## Data split into train, val & test

In [274]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed (random_state=0) so the order is reproducible.
df_shuffled = shuffle(df, random_state=0)

def train_val_test_split(X, y, val_size=0.2, test_size=0.1, random_state=0):
    """Split X and y into train, validation and test subsets.

    The test subset is split off first; the validation subset is then taken
    from what remains. ``val_size`` is therefore a fraction of the non-test
    data, not of the full dataset.

    Args:
        X: Features to split.
        y: Targets aligned with ``X``.
        val_size: Forwarded as ``test_size`` to the second ``train_test_split``
            call (validation share of the non-test data).
        test_size: Forwarded as ``test_size`` to the first ``train_test_split``
            call (test share of the full data).
        random_state: Seed used for both splits. Defaults to 0, the value that
            was previously hard-coded.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_size, random_state=random_state
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

# The three text columns are the features; the numeric category id is the target.
text_columns = ['news_headline', 'news_article', 'joined']
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
    df_shuffled[text_columns],
    df_shuffled['news_category_num'],
)

## Model initialization (without fine-tuning)

Fine-tuning follows later in this notebook ;)

In [275]:
from transformers import pipeline

# Zero-shot classification pipeline built on the valhalla/distilbart-mnli-12-1 checkpoint.
generator = pipeline(
    task="zero-shot-classification",
    model="valhalla/distilbart-mnli-12-1",
)

## Model inference + testing

Note before running: inference on a single sample takes around 3 seconds on an M1 MacBook with 8 GB RAM.

Feel free to change `k_samples`, the number of test samples to classify!

In [122]:
import numpy as np
from sklearn.metrics import classification_report

k_samples = 10

# Here I'm using just news_article
candidate_labels = list(np.unique(df_shuffled['news_category']))
results = generator(sequences=list(X_test['news_article'].iloc[:k_samples]), candidate_labels=candidate_labels)

# The first entry of each result's 'labels' list is taken as the predicted category name.
y_pred = [result['labels'][0] for result in results]
# y_test holds the integer ids produced by `enc`; decode them so that both sides
# of the report are category names.
y_true = enc.inverse_transform(y_test.iloc[:k_samples])

print(classification_report(y_true, y_pred))

               precision    recall  f1-score   support

entertainment       0.50      1.00      0.67         1
     politics       0.50      1.00      0.67         1
      science       1.00      1.00      1.00         2
       sports       1.00      1.00      1.00         1
   technology       1.00      1.00      1.00         2
        world       1.00      0.33      0.50         3

     accuracy                           0.80        10
    macro avg       0.83      0.89      0.81        10
 weighted avg       0.90      0.80      0.78        10



In [123]:
# Headline and article concatenated with '. ' (column-wise instead of a row-wise apply).
X_test['headline_article'] = X_test['news_headline'] + '. ' + X_test['news_article']

k_samples = 10

# Here the headline and the article are used together
results = generator(sequences=list(X_test['headline_article'].iloc[:k_samples]), candidate_labels=list(np.unique(df_shuffled['news_category'])))

# The first entry of each result's 'labels' list is taken as the predicted category name.
y_pred = [result['labels'][0] for result in results]
# y_test holds the integer ids produced by `enc`; decode them so that both sides
# of the report are category names.
y_true = enc.inverse_transform(y_test.iloc[:k_samples])

print(classification_report(y_true, y_pred))

               precision    recall  f1-score   support

entertainment       1.00      1.00      1.00         1
     politics       1.00      1.00      1.00         1
      science       1.00      1.00      1.00         2
       sports       1.00      1.00      1.00         1
   technology       1.00      0.50      0.67         2
        world       0.75      1.00      0.86         3

     accuracy                           0.90        10
    macro avg       0.96      0.92      0.92        10
 weighted avg       0.93      0.90      0.89        10



## Preprocessing + Model fine-tuning

In [276]:
from transformers import AutoTokenizer

# Tokenizer loaded from the "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [308]:
import torch
from datasets import Dataset


def hf_dataset(X, y):
    """Tokenize texts and bundle them with their labels into a ``Dataset``.

    Args:
        X: Iterable of text strings to tokenize.
        y: Iterable of numeric labels, one per text.

    Returns:
        A ``Dataset`` with a 'label' column plus every column returned by the
        module-level ``tokenizer``.
    """
    dataset = {'label': torch.tensor(list(y))}

    # Padding and truncation are enabled so the tokenizer can return rectangular tensors.
    dataset.update(tokenizer(list(X), padding=True, truncation=True, return_tensors="pt"))

    return Dataset.from_dict(dataset)

# Build one tokenized dataset per split from the 'joined' text column.
train_dataset, val_dataset, test_dataset = (
    hf_dataset(features['joined'], targets)
    for features, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [285]:
# Display the dataset summary (column names and number of rows).
train_dataset

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 8726
})

## BERT initialization

In [286]:
from transformers import AutoModelForSequenceClassification

# One output per category known to the label encoder, instead of a hard-coded count.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=len(enc.classes_))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

## Training BERT

In [287]:
from transformers import TrainingArguments, Trainer

# Outputs go to "test_trainer"; evaluation is configured with the "epoch" strategy.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
)

In [288]:
import numpy as np
import evaluate

# Accuracy metric object; compute_metrics calls its compute() method.
metric = evaluate.load("accuracy")

In [289]:
def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the module-level ``metric``.

    The predicted class of each row is the index of its largest logit along
    the last axis.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [293]:
# Preview of the 100-row selection (indices 0-99); the result is only displayed, not stored.
train_dataset.select(range(100))

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})

In [294]:
# Train and evaluate on 100-example slices only.
train_subset = train_dataset.select(range(100))
eval_subset = val_dataset.select(range(100))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    compute_metrics=compute_metrics,
)

In [296]:
# Run the training loop configured on `trainer`.
trainer.train()

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.753369152545929, 'eval_accuracy': 0.72, 'eval_runtime': 22.8713, 'eval_samples_per_second': 4.372, 'eval_steps_per_second': 0.568, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.753369152545929, 'eval_accuracy': 0.72, 'eval_runtime': 22.5547, 'eval_samples_per_second': 4.434, 'eval_steps_per_second': 0.576, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.753369152545929, 'eval_accuracy': 0.72, 'eval_runtime': 21.2766, 'eval_samples_per_second': 4.7, 'eval_steps_per_second': 0.611, 'epoch': 3.0}
{'train_runtime': 342.8229, 'train_samples_per_second': 0.875, 'train_steps_per_second': 0.114, 'train_loss': 0.7753731165176783, 'epoch': 3.0}


TrainOutput(global_step=39, training_loss=0.7753731165176783, metrics={'train_runtime': 342.8229, 'train_samples_per_second': 0.875, 'train_steps_per_second': 0.114, 'train_loss': 0.7753731165176783, 'epoch': 3.0})

## Inference and mini-testing

In [335]:
def inference_dataset(X):
    """Tokenize the texts in ``X`` with the module-level ``tokenizer``.

    Returns the tokenizer output as PyTorch tensors, with padding and
    truncation enabled.
    """
    texts = list(X)
    return tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

In [336]:
import torch

k_samples = 10

# Tokenize the first k_samples test texts and move them to the model's device.
dict_test_dataset = inference_dataset(X_test['joined'].iloc[:k_samples]).to(model.device)

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    results = model(**dict_test_dataset).logits

# Plain Python ints (not 0-d tensors), so they compare directly with the ids in y_true.
y_pred = results.argmax(dim=-1).tolist()
y_true = y_test.iloc[:k_samples]

# zero_division=0 reports 0.0 for classes that were never predicted instead of
# emitting an undefined-metric warning for each of them.
print(classification_report(y_true, y_pred, zero_division=0))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         1
           3       0.00      0.00      0.00         1
           4       1.00      1.00      1.00         1
           5       0.75      1.00      0.86         3
           6       1.00      1.00      1.00         2

    accuracy                           0.90        10
   macro avg       0.79      0.83      0.81        10
weighted avg       0.82      0.90      0.86        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
