In [1]:
import os
# Set before `transformers` is imported below. Presumably intended to keep
# transformers from loading TensorFlow -- TODO confirm that the installed
# transformers version actually reads this variable.
os.environ["TRANSFORMERS_NO_TF"] = "1"

import transformers
# Rebind the package-level TensorFlow availability check so that
# `transformers.is_tf_available()` reports False for the rest of this session.
# NOTE(review): this only replaces the attribute on the top-level package;
# code that holds its own reference to the original function is presumably
# unaffected -- verify if TensorFlow still gets picked up.
transformers.is_tf_available = lambda: False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# json to csv cell
import json
import csv

# Load the scraped articles. The JSON document is expected to carry a
# top-level "articles" key holding a list of flat dicts (one per article).
with open('cnn_articles.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

articles = data['articles']

# Column set = every key seen in any article, in first-seen order, so a record
# with a missing or extra field cannot shift values into the wrong column.
header = list(dict.fromkeys(key for article in articles for key in article))

# newline='' is required by the csv module (otherwise blank rows appear on
# Windows); utf-8 keeps non-ASCII punctuation intact and matches the default
# encoding pandas uses when the file is read back. The `with` block closes the
# file even if writing a row raises.
with open('articles.csv', 'w', newline='', encoding='utf-8') as data_file:
    if articles:
        # DictWriter places each value under its own column name; fields
        # absent from a given article are written as empty strings.
        csv_writer = csv.DictWriter(data_file, fieldnames=header, restval='')
        csv_writer.writeheader()
        csv_writer.writerows(articles)

In [3]:
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

# Preprocess data for the model: load the article table from the CSV file.
file_path = "articles.csv"
articles_df = pd.read_csv(file_path)

# Fixed label vocabulary. A category's position in this list is the position
# of its flag in the multi-hot label vector.
categories = [
    'Politics',
    'Business',
    'Health',
    'Science',
    'Technology',
    'Sports',
    'Entertainment',
    'World',
    'Society',
    'Everyday',
]
# Lookup table: category name -> its position in `categories`.
category_to_index = dict(zip(categories, range(len(categories))))

def multi_hot_encode(categories_str):
    """Convert a comma-separated category string into a multi-hot vector.

    Args:
        categories_str: Category names separated by commas. Whitespace around
            each name is ignored. Any non-string value yields an all-zero
            vector.

    Returns:
        An integer NumPy array with one entry per item in `categories`; an
        entry is 1 when the matching category name appears in the input.
        Names not present in `category_to_index` are skipped.
    """
    encoded = np.zeros(len(categories), dtype=int)

    # Non-string input: nothing to parse, return the all-zero vector.
    if not isinstance(categories_str, str):
        return encoded

    for name in map(str.strip, categories_str.split(',')):
        position = category_to_index.get(name)
        if position is not None:
            encoded[position] = 1
    return encoded

# Attach a multi-hot label vector to every article, derived from the
# comma-separated string in its 'category' column.
articles_df['multi_hot_labels'] = articles_df['category'].apply(multi_hot_encode)

# Print the frame so the new 'multi_hot_labels' column can be inspected.
print(articles_df)

                                              headline  \
0    The president-elect has railed against the pol...   
1    6 key lines from Trump’s Sunday speech to cons...   
2    Why Ivanka Trump left politics and isn’t comin...   
3    Manchin torches Democrats on the way out the door   
4    A picture is emerging of Resistance 2.0 as lib...   
..                                                 ...   
150  Meet the husband-wife legal team representing ...   
151  Some inmates seeking education behind bars fac...   
152  Costco is pushing back — hard — against the an...   
153  Christmas dinner injury forces golf’s World No...   
154  Why Nefertiti still inspires, 3,300 years afte...   

                                                  link  \
0    https://www.cnn.com/2024/12/22/politics/birthr...   
1    https://www.cnn.com/2024/12/22/politics/trump-...   
2    https://www.cnn.com/2024/12/22/politics/ivanka...   
3    https://www.cnn.com/2024/12/22/politics/joe-ma...   
4    https://

In [4]:
# tokenizer cell 
from transformers import BertTokenizer
import torch
from sklearn.model_selection import train_test_split

# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Fixed sequence length used by tokenize_text: inputs are truncated and padded
# to exactly this many tokens.
max_length = 256

def tokenize_text(text):
    """Tokenize one article body into fixed-length model inputs.

    Args:
        text: The article text. Non-string values (for example the NaN that
            pandas produces for an empty CSV field) are treated as an empty
            string instead of being handed to the tokenizer, which only
            accepts strings. This mirrors how `multi_hot_encode` tolerates
            non-string categories.

    Returns:
        A tuple `(input_ids, attention_mask)` of two tensors, each padded or
        truncated to `max_length` tokens.
    """
    if not isinstance(text, str):
        text = ""

    encoding = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    # With return_tensors='pt' the tokenizer returns batched tensors; take the
    # first (and only) row so each article maps to a single sequence.
    return encoding['input_ids'][0], encoding['attention_mask'][0]

# Tokenize every article body; each cell of the new column holds the
# (input_ids, attention_mask) pair returned by tokenize_text.
articles_df['tokenized'] = articles_df['content'].apply(tokenize_text)
print(articles_df)

# Gather the per-article tensors into batch tensors, one row per article.
token_pairs = list(articles_df['tokenized'])
input_ids = torch.stack([ids for ids, _mask in token_pairs])
attention_masks = torch.stack([mask for _ids, mask in token_pairs])

# Stack the multi-hot vectors into one matrix and convert it to float.
label_matrix = np.stack(articles_df['multi_hot_labels'])
labels = torch.tensor(label_matrix).float()

print(input_ids, attention_masks, labels, sep="\n")

# train_test_split returns a (train, test) pair for each array, in the order
# the arrays were passed: input ids, labels, attention masks.
split_parts = train_test_split(
    input_ids, labels, attention_masks, test_size=0.1, random_state=42
)
(train_inputs, val_inputs,
 train_labels, val_labels,
 train_masks, val_masks) = split_parts

                                              headline  \
0    The president-elect has railed against the pol...   
1    6 key lines from Trump’s Sunday speech to cons...   
2    Why Ivanka Trump left politics and isn’t comin...   
3    Manchin torches Democrats on the way out the door   
4    A picture is emerging of Resistance 2.0 as lib...   
..                                                 ...   
150  Meet the husband-wife legal team representing ...   
151  Some inmates seeking education behind bars fac...   
152  Costco is pushing back — hard — against the an...   
153  Christmas dinner injury forces golf’s World No...   
154  Why Nefertiti still inspires, 3,300 years afte...   

                                                  link  \
0    https://www.cnn.com/2024/12/22/politics/birthr...   
1    https://www.cnn.com/2024/12/22/politics/trump-...   
2    https://www.cnn.com/2024/12/22/politics/ivanka...   
3    https://www.cnn.com/2024/12/22/politics/joe-ma...   
4    https://

In [5]:
from datasets import Dataset

def _build_dataset(ids, masks, targets):
    """Wrap three tensors in a `Dataset` with the column names used for training."""
    columns = {
        "input_ids": ids.numpy(),
        "attention_mask": masks.numpy(),
        "labels": targets.numpy(),
    }
    return Dataset.from_dict(columns)


# Same column layout for both splits; only the source tensors differ.
train_dataset = _build_dataset(train_inputs, train_masks, train_labels)
val_dataset = _build_dataset(val_inputs, val_masks, val_labels)

In [6]:
# load in the model 
from transformers import AutoModelForSequenceClassification

import transformers
# Report which backends transformers considers available.
print("Is PyTorch available:", transformers.is_torch_available())
print("Is TensorFlow available:", transformers.is_tf_available())

# Load 'bert-base-uncased' with a sequence-classification head that has one
# output per entry in `categories`.
# NOTE(review): problem_type="multi_label_classification" presumably makes the
# model use an independent per-label loss -- confirm against the transformers
# documentation for the installed version.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(categories),
    problem_type="multi_label_classification"
)

# Print the framework identifier the loaded model reports.
print("Model framework:", model.framework)

Is PyTorch available: True
Is TensorFlow available: False


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model framework: pt


In [7]:
# metrics stuff 
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

def compute_metrics(pred, threshold=0.3):
    """Compute multi-label evaluation metrics from raw model outputs.

    Args:
        pred: A pair `(logits, labels)`. `logits` holds the raw per-label
            scores and `labels` the multi-hot ground truth.
        threshold: Probability above which a label counts as predicted.
            Defaults to 0.3, the value previously hard-coded here.

    Returns:
        A dict with:
          * "accuracy": the result of `accuracy_score` on the label matrices.
            For multi-label input this is exact-match (subset) accuracy, so a
            sample only counts as correct when every label matches.
          * "f1": F1 averaged over labels, weighted by each label's support.
    """
    logits, labels = pred
    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs > threshold).int().numpy()

    # zero_division=0 scores a label with no true or predicted positives as
    # 0 (the same value the default produces) without emitting a warning on
    # every evaluation of a small validation split.
    f1 = f1_score(labels, preds, average="weighted", zero_division=0)
    accuracy = accuracy_score(labels, preds)

    return {
        "accuracy": accuracy,
        "f1": f1
    }

In [8]:
from transformers import TrainingArguments

print("Model framework:", model.framework)

# Evaluate and checkpoint once per epoch so the best epoch (by validation F1)
# can be restored when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Cap the number of checkpoints kept on disk. Without a limit, saving
    # every epoch for 51 epochs leaves 51 full checkpoints in output_dir.
    # With load_best_model_at_end=True the best checkpoint is still retained.
    save_total_limit=2,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=51,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # Refers to the "f1" key returned by compute_metrics.
    metric_for_best_model="f1"
)


Model framework: pt


In [9]:
# train model cell
from transformers import Trainer

# Wire the model, training configuration, datasets and metric function
# together.
# The tokenizer is passed as `processing_class`: the older `tokenizer=`
# keyword is deprecated in recent transformers releases and triggers a
# warning when the Trainer is constructed.
# NOTE(review): `processing_class` requires a transformers release that
# supports it -- confirm the installed version before running.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Run fine-tuning; returns a TrainOutput summary when finished.
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.616583,0.0,0.266366
2,No log,0.555228,0.0,0.274441
3,No log,0.508953,0.0,0.222407
4,No log,0.473552,0.0,0.197239
5,No log,0.447118,0.0,0.221866
6,No log,0.427085,0.1875,0.167464
7,No log,0.412349,0.1875,0.150649
8,No log,0.402306,0.125,0.209091
9,No log,0.392939,0.125,0.25
10,No log,0.383755,0.1875,0.290909


TrainOutput(global_step=459, training_loss=0.2869148669938896, metrics={'train_runtime': 645.7646, 'train_samples_per_second': 10.978, 'train_steps_per_second': 0.711, 'total_flos': 932664123030528.0, 'train_loss': 0.2869148669938896, 'epoch': 51.0})

In [10]:
# Evaluate on the validation split passed to the Trainer (eval_dataset).
# The returned metrics dict is displayed as the cell output.
trainer.evaluate()

{'eval_loss': 0.29107508063316345,
 'eval_accuracy': 0.4375,
 'eval_f1': 0.5357142857142857,
 'eval_runtime': 0.3576,
 'eval_samples_per_second': 44.747,
 'eval_steps_per_second': 2.797,
 'epoch': 51.0}

In [11]:
# Save the fine-tuned weights and the tokenizer files side by side in one
# directory.
fine_tuned_dir = "./fine_tuned_bert"
model.save_pretrained(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)

('./fine_tuned_bert/tokenizer_config.json',
 './fine_tuned_bert/special_tokens_map.json',
 './fine_tuned_bert/vocab.txt',
 './fine_tuned_bert/added_tokens.json')