In [1]:
import pandas as pd
import numpy as np

In [35]:
# Read the training CSV into a DataFrame and preview its first rows.
TRAIN_CSV = 'dataset/train_processed.csv'
train_set = pd.read_csv(TRAIN_CSV)
train_set.head()

Unnamed: 0,Text,label,clean_text,label_num
0,WASHINGTON (Reuters) - The head of a conservat...,Real,washington reuters head conservative republica...,1
1,WASHINGTON (Reuters) - Transgender people will...,Real,washington reuters transgender people allowed ...,1
2,WASHINGTON (Reuters) - The special counsel inv...,Real,washington reuters special counsel investigati...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,Real,washington reuters trump campaign adviser geor...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,Real,seattlewashington reuters president donald tru...,1


In [4]:
# Read the test CSV into a DataFrame and preview its first rows.
TEST_CSV = 'dataset/test_processed.csv'
test_set = pd.read_csv(TEST_CSV)
test_set.head()

Unnamed: 0,Text,label,clean_text,label_num
0,"((In March 30 item, corrects spelling of Kisl...",Real,march 30 item corrects spelling kislyak paragr...,1
1,((Refiles December 15 story to clarify areas ...,Real,refiles december 15 story clarify area control...,1
2,((This Dec. 5 story corrects year in 2nd para...,Real,dec 5 story corrects year 2nd paragraph 2011 2...,1
3,((This Dec. 9 story corrects year in 2nd para...,Real,dec 9 story corrects year 2nd paragraph 2011 2...,1
4,((This December 4 story has been corrected to...,Real,december 4 story corrected change last year 20...,1


In [6]:
import torch

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [30]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
from tqdm import tqdm

In [8]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint;
# the classifier is created with two output labels.
PRETRAINED_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_NAME,
    num_labels=2,
)
# Move the model to the selected device (the returned module is displayed).
model.to(device)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [18]:
class NewsDataset(Dataset):
    """Torch dataset that tokenizes all texts once, up front.

    Rows whose text is missing (``None``/NaN) or blank after stripping are
    dropped *together with their label*, so texts, encodings and labels always
    stay index-aligned.

    Args:
        texts: Sequence exposing ``.tolist()`` (e.g. a pandas Series) of raw texts.
        labels: Sequence exposing ``.tolist()`` of labels, same length as ``texts``.
        tokenizer: Callable accepting a list of strings plus ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword arguments
            and returning a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every sequence is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        raw_texts = texts.tolist()
        raw_labels = labels.tolist()
        if len(raw_texts) != len(raw_labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(raw_texts)} and {len(raw_labels)}"
            )

        kept_texts = []
        kept_labels = []
        for text, label in zip(raw_texts, raw_labels):
            # Missing cells arrive as None or float NaN; str() would turn NaN
            # into the literal text "nan", so skip them explicitly.
            if text is None or (isinstance(text, float) and text != text):
                continue
            text = str(text)
            if text.strip() == '':
                continue
            # Keep the label only when its text is kept, preserving alignment.
            kept_texts.append(text)
            kept_labels.append(label)

        self.texts = kept_texts

        # Tokenizer
        self.encodings = tokenizer(
            self.texts,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )

        self.labels = torch.tensor(kept_labels)

    def __getitem__(self, idx):
        """Return the encoded inputs and label for sample ``idx`` as a dict."""
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.labels[idx]
        }

    def __len__(self):
        """Number of samples kept after filtering."""
        return len(self.labels)

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training frame for validation; the fixed random_state
# makes the split repeatable.
features = train_set["clean_text"]
targets = train_set["label_num"]
train_texts, val_texts, train_labels, val_labels = train_test_split(
    features, targets, test_size=0.1, random_state=42
)

In [20]:
# Show the sizes of the training and validation text splits.
(train_texts.shape, val_texts.shape)

((74628,), (8292,))

In [None]:
# Convert each split to the torch Dataset format (NewsDataset tokenizes on construction).
train_dataset = NewsDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = NewsDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)
test_dataset = NewsDataset(
    texts=test_set["clean_text"],
    labels=test_set["label_num"],
    tokenizer=tokenizer,
)

In [22]:
# One batch size for all splits; only the training loader shuffles.
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [27]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are set explicitly to the
# values the transformers implementation used by default, so the training
# recipe stays the same (torch's own defaults are eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-6,
    weight_decay=0.0,
)
# Cross-entropy over the classifier logits.
loss_fn = torch.nn.CrossEntropyLoss()
# Number of full passes over the training loader.
epochs = 3



In [28]:
# Fine-tune for `epochs` passes; after each pass, score the model on the
# validation loader and print the epoch summary.
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    progress_bar = tqdm(
        train_loader,
        desc=f"Epoch {epoch+1} [Train]",
        leave=False,
    )

    for batch in progress_bar:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)

        loss.backward()
        optimizer.step()

        # Read the scalar once; it feeds both the running total and the bar.
        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix({"loss": f"{batch_loss:.4f}"})

    # Mean loss per batch over this epoch.
    avg_train_loss = total_loss / len(train_loader)

    # ---- validation pass ----
    model.eval()
    val_preds, val_true = [], []

    with torch.no_grad():
        val_bar = tqdm(val_loader, desc=f"Epoch {epoch+1} [Validation]", leave=False)
        for batch in val_bar:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].cpu().numpy()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit per row.
            preds = outputs.logits.argmax(dim=1).cpu().numpy()

            val_preds.extend(preds)
            val_true.extend(labels)

    val_acc = accuracy_score(val_true, val_preds)
    val_f1 = f1_score(val_true, val_preds)

    print(f"\nEpoch {epoch+1}")
    print(f"Train Loss: {avg_train_loss:.4f}")
    print(f"Validation Accuracy: {val_acc:.4f}")
    print(f"Validation F1: {val_f1:.4f}\n")

Epoch 1 [Train]:   0%|          | 0/4665 [00:00<?, ?it/s]

                                                                                 


Epoch 1
Train Loss: 0.0004
Validation Accuracy: 0.9998
Validation F1: 0.9998



                                                                                 


Epoch 2
Train Loss: 0.0000
Validation Accuracy: 1.0000
Validation F1: 1.0000



                                                                                 


Epoch 3
Train Loss: 0.0000
Validation Accuracy: 0.9996
Validation F1: 0.9997





In [None]:
# Run the model over the test loader in eval mode, without gradient tracking,
# collecting predicted and true labels for scoring.
model.eval()
test_preds, test_true = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].cpu().numpy()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit per row.
        preds = outputs.logits.argmax(dim=1).cpu().numpy()

        test_preds.extend(preds)
        test_true.extend(labels)

Testing:   0%|          | 0/2416 [00:00<?, ?it/s]

Testing: 100%|██████████| 2416/2416 [03:04<00:00, 13.11it/s]


Final Test Results:
Accuracy: 0.9960
F1 Score: 0.9963





In [32]:
# Score the collected test predictions against the true labels.
final_acc = accuracy_score(test_true, test_preds)
final_f1 = f1_score(test_true, test_preds)
final_precision = precision_score(test_true, test_preds)
final_recall = recall_score(test_true, test_preds)
# Old misspelled name kept as an alias in case other cells still reference it.
final_pricision = final_precision

print("\nFinal Test Results:")
print(f"Accuracy: {final_acc:.4f}")
print(f"Precision: {final_precision:.4f}")
print(f"Recall: {final_recall:.4f}")
print(f"F1 Score: {final_f1:.4f}")


Final Test Results:
Accuracy: 0.9960
Pricision: 1.0000
Recall: 0.9927
F1 Score: 0.9963
