In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

/kaggle/input/fake-news-detection-dataset/Task3_english_training.csv
/kaggle/input/fake-news-detection-dataset/Task3_english_dev.csv
/kaggle/input/fake-news-detection-dataset/English_data_test_release_with_rating.csv


In [2]:
# Load the datasets
data_train = pd.read_csv('/kaggle/input/fake-news-detection-dataset/Task3_english_training.csv')
data_validation = pd.read_csv('/kaggle/input/fake-news-detection-dataset/Task3_english_dev.csv')
data_test = pd.read_csv('/kaggle/input/fake-news-detection-dataset/English_data_test_release_with_rating.csv')

# Convert 'our rating' column to lowercase
data_train['our rating'] = data_train['our rating'].str.lower()
data_validation['our rating'] = data_validation['our rating'].str.lower()
data_test['our rating'] = data_test['our rating'].str.lower()

def makeLabelsUnique(labels):
    unique_labels = set(labels)
    label_map = {label: idx for idx, label in enumerate(unique_labels)}
    labels = [label_map[label] for label in labels]
    return labels

data_train['our rating'] = makeLabelsUnique(data_train['our rating'].values)
data_validation['our rating'] = makeLabelsUnique(data_validation['our rating'].values)
data_test['our rating'] = makeLabelsUnique(data_test['our rating'].values)

train_df = data_train[['text', 'title', 'our rating']].copy()
val_df = data_validation[['text', 'title', 'our rating']].copy()

train_dataset = train_df.reset_index(drop=True)
val_dataset = val_df.reset_index(drop=True)

print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VAL Dataset: {}".format(val_dataset.shape))

TRAIN Dataset: (900, 3)
VAL Dataset: (364, 3)


In [3]:
from transformers import BertTokenizer, BertModel
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler
from torch import nn
import torch

print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data['our rating']
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

train_dataset = CustomDataset(train_dataset, tokenizer, 512)
val_dataset = CustomDataset(val_dataset, tokenizer, 512)

Loading BERT tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
val_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [5]:
class BertClassifier(nn.Module):
    def __init__(self, bert_model, num_classes):
        super(BertClassifier, self).__init__()
        self.bert = bert_model
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs.pooler_output
        return self.classifier(pooled_output)
    
model = BertClassifier(BertModel.from_pretrained('bert-base-uncased'), num_classes=4)
model.to(device)

def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets)

LEARNING_RATE = 1e-05
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [6]:
def train(epoch):
    model.train()
    for _,data in enumerate(train_dataloader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
EPOCHS = 5        
for epoch in range(EPOCHS):
    train(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  1.4177124500274658
Epoch: 1, Loss:  1.180681824684143
Epoch: 2, Loss:  0.9619386196136475
Epoch: 3, Loss:  1.152745246887207
Epoch: 4, Loss:  0.7387503385543823


In [7]:
from sklearn import metrics

def validation():
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for _, data in enumerate(val_dataloader, 0):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)  # Ensure this is long for CrossEntropyLoss
            outputs = model(ids, mask, token_type_ids)
            outputs = outputs.logits if hasattr(outputs, 'logits') else outputs  # Ensure this line correctly refers to your model's output
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.argmax(outputs, dim=1).cpu().detach().numpy().tolist())  # Use argmax for multi-class

    return fin_outputs, fin_targets

outputs, targets = validation()
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
print(f"Epoch {epoch}: Accuracy Score = {accuracy}")
print(f"Epoch {epoch}: F1 Score (Micro) = {f1_score_micro}")
print(f"Epoch {epoch}: F1 Score (Macro) = {f1_score_macro}")



Epoch 4: Accuracy Score = 0.3626373626373626
Epoch 4: F1 Score (Micro) = 0.3626373626373626
Epoch 4: F1 Score (Macro) = 0.2727676639517356
