In [1]:
# The flag is written to the environment before `transformers` is imported so it
# is already set when the library loads (presumably makes it skip TensorFlow —
# TODO confirm against the installed transformers version).
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

# Replace the library's TensorFlow probe with a stub that always returns False;
# any later call to transformers.is_tf_available() in this kernel reports False.
import transformers
transformers.is_tf_available = lambda: False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np

# Read the mn-ds training split from the Hugging Face Hub. The file is JSON
# Lines, so each line is parsed as one record (lines=True).
df = pd.read_json(
    "hf://datasets/textminr/mn-ds/train.jsonl",
    lines=True,
)

# Plain-text preview of the first rows.
print(df.head())

   data_id                                                 id       date  \
0     1809  abcnews--2019-10-31--Virginia mom charged with... 2019-10-31   
1     1980  abcnews--2019-11-07--2 escaped murder suspects... 2019-11-07   
2     1995  abcnews--2019-11-07--Family turns in escaped b... 2019-11-07   
3     2740  abcnews--2019-12-02--Mother charged with murde... 2019-12-02   
4     7038  ageofautism--2019-04-12--Physician Father and ... 2019-04-12   

        source                                              title  \
0      abcnews  Virginia mom charged with murder in 2-year-old...   
1      abcnews  2 escaped murder suspects arrested at US-Mexic...   
2      abcnews  Family turns in escaped boy, 13, suspected in ...   
3      abcnews  Mother charged with murder in deaths of 2 youn...   
4  ageofautism  Physician, Father and Caretaker of 29 Year Old...   

                                             content         author  \
0  The Virginia woman whose 2-year-old son was fo...     

In [3]:
# Build the label vocabulary from the distinct level-1 categories, then give
# each category a consecutive integer id starting at 0.
labels = df['category_level_1'].unique()
categories = labels.tolist()
print("Categories:", categories)

category_to_index = dict(zip(categories, range(len(categories))))
print("Category to Index Mapping:", category_to_index)

def encode_label(category):
    """Look up the integer class id for a category name.

    Args:
        category: A key of the module-level ``category_to_index`` mapping.

    Returns:
        The value stored under ``category`` in ``category_to_index``.

    Raises:
        KeyError: If ``category`` is not a key of the mapping.
    """
    label_id = category_to_index[category]
    return label_id

# Store the integer class id of every row next to its category name.
df['encoded_label'] = df['category_level_1'].map(encode_label)

# Show names and ids side by side as a quick check of the encoding.
print(df.loc[:, ['category_level_1', 'encoded_label']])


Categories: ['crime, law and justice', 'arts, culture, entertainment and media', 'economy, business and finance', 'disaster, accident and emergency incident', 'environment', 'education', 'health', 'human interest', 'lifestyle and leisure', 'politics', 'labour', 'religion and belief', 'science and technology', 'society', 'sport', 'conflict, war and peace', 'weather']
Category to Index Mapping: {'crime, law and justice': 0, 'arts, culture, entertainment and media': 1, 'economy, business and finance': 2, 'disaster, accident and emergency incident': 3, 'environment': 4, 'education': 5, 'health': 6, 'human interest': 7, 'lifestyle and leisure': 8, 'politics': 9, 'labour': 10, 'religion and belief': 11, 'science and technology': 12, 'society': 13, 'sport': 14, 'conflict, war and peace': 15, 'weather': 16}
              category_level_1  encoded_label
0       crime, law and justice              0
1       crime, law and justice              0
2       crime, law and justice              0
3    

In [4]:
# Inspect the available columns and a sample of the article bodies before cleaning.
print(df.columns)
print(df['content'].head())

# Keep only rows with usable text: drop missing bodies first, then bodies that
# are empty once surrounding whitespace is stripped.
df = df.dropna(subset=['content'])
df = df.loc[df['content'].str.strip().ne('')]

Index(['data_id', 'id', 'date', 'source', 'title', 'content', 'author', 'url',
       'published', 'published_utc', 'collection_utc', 'category_level_1',
       'category_level_2', 'encoded_label'],
      dtype='object')
0    The Virginia woman whose 2-year-old son was fo...
1    Authorities are trying to determine if anyone ...
2    A 13-year-old suspect in a double homicide who...
3    The mother of two young children found hanging...
4    "One family member said Derek “can be violent ...
Name: content, dtype: object


In [5]:
# tokenizer cell 
from transformers import BertTokenizer
import torch
from sklearn.model_selection import train_test_split

# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Sequence length handed to the tokenizer in tokenize_text, which pads with
# padding='max_length' and truncates, so every encoded text has this length.
max_length = 256

def tokenize_text(text):
    """Tokenize one document into fixed-length model inputs.

    Args:
        text: The raw document; must be a string containing at least one
            non-whitespace character.

    Returns:
        An ``(input_ids, attention_mask)`` pair taken from the tokenizer
        output, each padded or truncated to the module-level ``max_length``.

    Raises:
        ValueError: If ``text`` is not a string or is blank.
    """
    is_usable = isinstance(text, str) and text.strip() != ''
    if not is_usable:
        raise ValueError("Input text must be a non-empty string.")

    encoded = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    # return_tensors='pt' yields batched tensors; index 0 selects the single
    # encoded text so callers get one sequence rather than a batch of one.
    ids = encoded['input_ids'][0]
    mask = encoded['attention_mask'][0]
    return ids, mask

# Tokenize every article body; each cell of the new column holds the
# (input_ids, attention_mask) pair returned by tokenize_text.
df['tokenized'] = df['content'].apply(tokenize_text)
print(df)

# Stack the per-row tensors into two tensors with one row per article.
input_ids = torch.stack([ids for ids, _ in df['tokenized']])
attention_masks = torch.stack([mask for _, mask in df['tokenized']])

# Integer class ids, in the same row order as the stacked inputs.
labels = torch.tensor(df['encoded_label'].to_numpy())

print(input_ids)
print(attention_masks)
print(labels)

# Hold out 20% of the rows for validation. train_test_split returns one
# (train, test) pair per input array, in the order the arrays are passed:
# ids first, then labels, then masks.
(
    train_inputs, val_inputs,
    train_labels, val_labels,
    train_masks, val_masks,
) = train_test_split(
    input_ids,
    labels,
    attention_masks,
    test_size=0.2,
    random_state=42,
)

       data_id                                                 id       date  \
0         1809  abcnews--2019-10-31--Virginia mom charged with... 2019-10-31   
1         1980  abcnews--2019-11-07--2 escaped murder suspects... 2019-11-07   
2         1995  abcnews--2019-11-07--Family turns in escaped b... 2019-11-07   
3         2740  abcnews--2019-12-02--Mother charged with murde... 2019-12-02   
4         7038  ageofautism--2019-04-12--Physician Father and ... 2019-04-12   
...        ...                                                ...        ...   
10912   907640  therussophileorg--2019-12-15--Iran to build ov... 2019-12-15   
10913   892720  therussophileorg--2019-10-12--Ukraine has no m... 2019-10-12   
10914   870499  therussophileorg--2019-07-06--Irans contributi... 2019-07-06   
10915   887334  therussophileorg--2019-09-25--Iraqi president ... 2019-09-25   
10916   885988  therussophileorg--2019-09-20--Russia expects t... 2019-09-20   

                 source                

In [6]:
from datasets import Dataset

def _as_dataset(ids, masks, targets):
    """Wrap one split's tensors in a Dataset with input_ids / attention_mask / labels columns."""
    return Dataset.from_dict({
        "input_ids": ids.numpy(),
        "attention_mask": masks.numpy(),
        "labels": targets.numpy(),
    })


train_dataset = _as_dataset(train_inputs, train_masks, train_labels)
val_dataset = _as_dataset(val_inputs, val_masks, val_labels)

In [7]:
# Load the pretrained 'bert-base-uncased' weights with a sequence-classification
# head sized to the number of categories.
from transformers import AutoModelForSequenceClassification

import transformers
print("Is PyTorch available:", transformers.is_torch_available())
print("Is TensorFlow available:", transformers.is_tf_available())

# Record the category names in the model config. Without these mappings the
# saved checkpoint only carries generic LABEL_<n> names, and the meaning of each
# class id would exist solely in this kernel's `category_to_index` dict.
label2id = {str(category): int(idx) for category, idx in category_to_index.items()}
id2label = {idx: category for category, idx in label2id.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(categories),
    id2label=id2label,
    label2id=label2id,
    problem_type="single_label_classification"
)

print("Model framework:", model.framework)

Is PyTorch available: True
Is TensorFlow available: False


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model framework: pt


In [8]:
# Evaluation metrics (accuracy and weighted F1) computed from the model's predictions.
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(pred):
    """Score a batch of predictions with accuracy and weighted F1.

    Args:
        pred: A pair that unpacks to ``(logits, labels)``. ``logits`` must
            support ``argmax(axis=-1)``; the class axis is taken to be the
            last one.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, targets = pred
    # The predicted class is the index of the largest score along the last axis.
    predicted = scores.argmax(axis=-1)

    return {
        "accuracy": accuracy_score(targets, predicted),
        "f1": f1_score(targets, predicted, average="weighted"),
    }


In [9]:
from transformers import Trainer, TrainingArguments

print("Model framework:", model.framework)

# Evaluate and checkpoint once per epoch so that load_best_model_at_end can
# pick the checkpoint with the best "f1" (the key returned by compute_metrics).
training_args = TrainingArguments(
    output_dir="./results2",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated on Trainer (the captured output shows a warning
    # raised at this call); `processing_class=` is its replacement and needs
    # transformers >= 4.46.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


Model framework: pt


In [None]:
# Fine-tune the model with the Trainer configured in the previous cell.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.2648,0.840766,0.759615,0.758857
2,0.6601,0.864545,0.761447,0.761267


In [None]:
# Run evaluation on the trainer's eval_dataset (the validation split) and
# return the resulting metrics.
trainer.evaluate()

In [None]:
# Write the fine-tuned model and the tokenizer into the same directory so both
# can be reloaded from one path.
model.save_pretrained("./fine_tuned_bert2")
tokenizer.save_pretrained("./fine_tuned_bert2")