In [1]:
!pip install transformers torch pandas



In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from torch.nn import BCEWithLogitsLoss

In [3]:
class BertPreprocessor:
    def __init__(self, model_name='bert-base-uncased', max_length=256):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.max_length = max_length

    def preprocess(self, texts):
        encoding = self.tokenizer.batch_encode_plus(
            texts,
            add_special_tokens=True,
            max_length=self.max_length,
            return_attention_mask=True,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return encoding['input_ids'], encoding['attention_mask']

In [4]:
class BertModel:
    def __init__(self, model_name='bert-base-uncased'):
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def train(self, input_ids, attention_masks, labels, batch_size=32, epochs=4):
        dataset = TensorDataset(input_ids, attention_masks, labels)
        train_size = int(0.9 * len(dataset))
        val_size = len(dataset) - train_size
        train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        optimizer = AdamW(self.model.parameters(), lr=2e-5)
        loss_fct = BCEWithLogitsLoss()

        for epoch in range(epochs):
            self.model.train()
            total_loss = 0
            for batch in train_dataloader:
                b_input_ids, b_input_mask, b_labels = [t.to(self.device) for t in batch]
                self.model.zero_grad()

                outputs = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
                logits = outputs.logits

                b_labels = b_labels.unsqueeze(1)
                b_labels = torch.cat((1 - b_labels, b_labels), dim=1)

                loss = loss_fct(logits, b_labels.float())
                total_loss += loss.item()
                loss.backward()
                optimizer.step()

            avg_train_loss = total_loss / len(train_dataloader)
            print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_train_loss:.2f}")

            self.model.eval()
            total_eval_accuracy = 0
            for batch in val_dataloader:
                b_input_ids, b_input_mask, b_labels = [t.to(self.device) for t in batch]
                with torch.no_grad():
                    outputs = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
                logits = outputs.logits
                logits = logits.detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()
                predictions = np.argmax(logits, axis=1)
                total_eval_accuracy += accuracy_score(label_ids, predictions)

            avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
            print(f"Validation Accuracy: {avg_val_accuracy:.2f}")

    def evaluate(self, input_ids, attention_masks, labels, batch_size=32):
        test_dataset = TensorDataset(input_ids, attention_masks, labels)
        test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        self.model.eval()
        total_eval_accuracy = 0
        predictions, true_labels = [], []

        for batch in test_dataloader:
            b_input_ids, b_input_mask, b_labels = [t.to(self.device) for t in batch]
            with torch.no_grad():
                outputs = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs.logits
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            batch_predictions = np.argmax(logits, axis=1)
            predictions.extend(batch_predictions)
            true_labels.extend(label_ids)

        print("Accuracy:", accuracy_score(true_labels, predictions))
        print("Classification Report:")
        print(classification_report(true_labels, predictions))

In [5]:
from google.colab import files
uploaded = files.upload()

Saving test_data.tsv to test_data.tsv
Saving train_data.tsv to train_data.tsv
Saving Validation_data.tsv to Validation_data.tsv


In [6]:
import pandas as pd

def main():

    df1 = pd.read_csv('train_data.tsv', sep='\t')
    df2 = pd.read_csv('test_data.tsv', sep='\t')
    df3 = pd.read_csv('Validation_data.tsv', sep='\t')

    columns = ['index', 'id', 'label', 'statement', 'subject', 'speaker', 'JobTitle', 'State', 'Party', 'BTC', 'FC', 'HT', 'MT', 'POF', 'context', 'justification']
    df1.columns = columns
    df2.columns = columns
    df3.columns = columns

    df = pd.concat([df1, df2, df3])

    label_mapping = {
        'true': 1, 'half-true': 1, 'mostly-true': 1,
        'false': 0, 'pants-fire': 0, 'barely-true': 0
    }
    df['label'] = df['label'].map(label_mapping)

    print("NaNs after mapping:", df['label'].isnull().sum())

    # Preprocess the text data for BERT model
    bert_preprocessor = BertPreprocessor()
    texts = df['statement'].tolist()
    input_ids, attention_masks = bert_preprocessor.preprocess(texts)

    y_bert = torch.tensor(df['label'].values)

    # Train and evaluate the BERT model
    bert_model = BertModel()
    bert_model.train(input_ids, attention_masks, y_bert)
    bert_model.evaluate(input_ids, attention_masks, y_bert)

main()

NaNs after mapping: 0


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4 - Loss: 0.66
Validation Accuracy: 0.64
Epoch 2/4 - Loss: 0.62
Validation Accuracy: 0.64
Epoch 3/4 - Loss: 0.52
Validation Accuracy: 0.63
Epoch 4/4 - Loss: 0.32
Validation Accuracy: 0.63
Accuracy: 0.9285267438223335
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.89      0.92      5655
           1       0.92      0.96      0.94      7133

    accuracy                           0.93     12788
   macro avg       0.93      0.92      0.93     12788
weighted avg       0.93      0.93      0.93     12788

