In [1]:
!pip install transformers torch spacy sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, MarianMTModel, MarianTokenizer
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from torch.nn import BCEWithLogitsLoss
import spacy
import random
import pandas as pd
import torch.nn as nn

In [3]:
class BertPreprocessor:
    def __init__(self, model_name='bert-base-uncased', max_length=256):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.max_length = max_length

    def preprocess(self, texts):
        encoding = self.tokenizer.batch_encode_plus(
            texts,
            add_special_tokens=True,
            max_length=self.max_length,
            return_attention_mask=True,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return encoding['input_ids'], encoding['attention_mask']

In [4]:
class EntityEmbedder:
    def __init__(self, glove_file_path):
        self.embeddings_index = {}
        self.load_glove_embeddings(glove_file_path)

    def load_glove_embeddings(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                self.embeddings_index[word] = coefs
        print(f"Loaded {len(self.embeddings_index)} word vectors.")

    def get_embedding(self, word):
        return self.embeddings_index.get(word, np.zeros(50))

    def create_embedding_matrix(self, unique_entities):
        embedding_matrix = np.zeros((len(unique_entities), 50))
        for i, entity in enumerate(unique_entities):
            embedding_matrix[i] = self.get_embedding(entity)
        return embedding_matrix

In [5]:
class BackTranslationAugmentor:
    def __init__(self, src_lang="en", target_lang="fr"):
        self.src_lang = src_lang
        self.target_lang = target_lang
        self.src_model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{target_lang}'
        self.target_model_name = f'Helsinki-NLP/opus-mt-{target_lang}-{src_lang}'
        self.src_tokenizer = MarianTokenizer.from_pretrained(self.src_model_name)
        self.target_tokenizer = MarianTokenizer.from_pretrained(self.target_model_name)
        self.src_model = MarianMTModel.from_pretrained(self.src_model_name)
        self.target_model = MarianMTModel.from_pretrained(self.target_model_name)

    def translate(self, texts, model, tokenizer):
        # Ensure texts is a list
        if isinstance(texts, str):
            texts = [texts]
        # Tokenize the texts and handle potential long sequences
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        translated = model.generate(**inputs)
        return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

    def back_translate(self, text):
        translated_text = self.translate(text, self.src_model, self.src_tokenizer)
        back_translated_text = self.translate(translated_text, self.target_model, self.target_tokenizer)
        return back_translated_text[0]

In [6]:
class BertModel:
    def __init__(self, model_name='bert-base-uncased', entity_embedding_matrix=None):
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2, output_hidden_states=True)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        if entity_embedding_matrix is not None:
            self.entity_embedding_matrix = torch.tensor(entity_embedding_matrix, dtype=torch.float).to(self.device)
            self.entity_embedding_layer = nn.Embedding.from_pretrained(self.entity_embedding_matrix)

            bert_output_size = 768
            entity_embedding_size = entity_embedding_matrix.shape[1]
            self.concatenated_layer_size = bert_output_size + entity_embedding_size

            self.classifier = nn.Linear(self.concatenated_layer_size, 2).to(self.device)

    def train(self, input_ids, attention_masks, labels, entity_indices, batch_size=16, epochs=4):
        dataset = TensorDataset(input_ids, attention_masks, labels, entity_indices)
        train_size = int(0.9 * len(dataset))
        val_size = len(dataset) - train_size
        train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        optimizer = AdamW(self.model.parameters(), lr=2e-5)
        loss_fct = BCEWithLogitsLoss()

        for epoch in range(epochs):
            self.model.train()
            total_loss = 0
            for batch in train_dataloader:
                b_input_ids, b_input_mask, b_labels, b_entity_indices = [t.to(self.device) for t in batch]
                self.model.zero_grad()

                outputs = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
                bert_output = outputs.hidden_states[-1][:, 0, :]

                entity_embeddings = self.entity_embedding_layer(b_entity_indices)
                entity_embeddings = torch.max(entity_embeddings, dim=1).values

                concatenated_output = torch.cat((bert_output, entity_embeddings), dim=1)
                logits = self.classifier(concatenated_output)

                b_labels = b_labels.unsqueeze(1)
                b_labels = torch.cat((1 - b_labels, b_labels), dim=1)

                loss = loss_fct(logits, b_labels.float())
                total_loss += loss.item()
                loss.backward()
                optimizer.step()

            avg_train_loss = total_loss / len(train_dataloader)
            print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_train_loss:.2f}")

            self.model.eval()
            total_eval_accuracy = 0
            for batch in val_dataloader:
                b_input_ids, b_input_mask, b_labels, b_entity_indices = [t.to(self.device) for t in batch]
                with torch.no_grad():
                    outputs = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
                    bert_output = outputs.hidden_states[-1][:, 0, :]

                    entity_embeddings = self.entity_embedding_layer(b_entity_indices)
                    entity_embeddings = torch.max(entity_embeddings, dim=1).values

                    concatenated_output = torch.cat((bert_output, entity_embeddings), dim=1)
                    logits = self.classifier(concatenated_output)

                    logits = logits.detach().cpu().numpy()
                    label_ids = b_labels.to('cpu').numpy()

                    predictions = np.argmax(logits, axis=1)
                    total_eval_accuracy += accuracy_score(label_ids, predictions)

            avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
            print(f"Validation Accuracy: {avg_val_accuracy:.2f}")


    def evaluate(self, input_ids, attention_masks, labels, entity_indices, batch_size=16):
        test_dataset = TensorDataset(input_ids, attention_masks, labels, entity_indices)
        test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        self.model.eval()
        total_eval_accuracy = 0
        predictions, true_labels = [], []

        for batch in test_dataloader:
            b_input_ids, b_input_mask, b_labels, b_entity_indices = [t.to(self.device) for t in batch]
            with torch.no_grad():
                outputs = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
                bert_output = outputs.hidden_states[-1][:, 0, :]  # Use the last hidden state of the [CLS] token

                entity_embeddings = self.entity_embedding_layer(b_entity_indices)
                entity_embeddings = torch.max(entity_embeddings, dim=1).values  # Or use another pooling method

                concatenated_output = torch.cat((bert_output, entity_embeddings), dim=1)
                logits = self.classifier(concatenated_output)

                logits = logits.detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()

                batch_predictions = np.argmax(logits, axis=1)
                predictions.extend(batch_predictions)
                true_labels.extend(label_ids)

        print("Accuracy:", accuracy_score(true_labels, predictions))
        print("Classification Report:")
        print(classification_report(true_labels, predictions))

In [7]:
from google.colab import files
uploaded = files.upload()


Saving train_data.tsv to train_data.tsv
Saving glove.6B.50d.txt to glove.6B.50d.txt
Saving test_data.tsv to test_data.tsv
Saving Validation_data.tsv to Validation_data.tsv


In [8]:
def main():
    # Load the datasets
    df_train = pd.read_csv('train_data.tsv', sep='\t')
    df_test = pd.read_csv('test_data.tsv', sep='\t')
    df_validate = pd.read_csv('Validation_data.tsv', sep='\t')

    # Define the column names
    columns = ['index', 'id', 'label', 'statement', 'subject', 'speaker', 'JobTitle', 'State', 'Party', 'BTC', 'FC', 'HT', 'MT', 'POF', 'context', 'justification']
    df_train.columns = columns
    df_test.columns = columns
    df_validate.columns = columns

    # Map labels to binary values for each dataset
    label_mapping = {'true': 1, 'half-true': 1, 'mostly-true': 1, 'false': 0, 'pants-fire': 0, 'barely-true': 0}
    df_train['label'] = df_train['label'].map(label_mapping)
    df_test['label'] = df_test['label'].map(label_mapping)
    df_validate['label'] = df_validate['label'].map(label_mapping)

    # Text augmentation for the training dataset
    augmentor = BackTranslationAugmentor()
    augmentation_indices = random.sample(range(len(df_train)), int(len(df_train) * 0.1))
    augmented_rows = []
    for idx in augmentation_indices:
        original_text = df_train.loc[idx, 'statement']
        augmented_text = augmentor.back_translate(original_text)
        augmented_rows.append({'statement': augmented_text, 'label': df_train.loc[idx, 'label']})

    df_augmented = pd.DataFrame(augmented_rows)
    df_train_augmented = pd.concat([df_train, df_augmented], ignore_index=True)

    # Entity Embedding for all datasets
    entity_embedder = EntityEmbedder('glove.6B.50d.txt')
    unique_entities = set(df_train_augmented['speaker'].tolist() + df_test['speaker'].tolist() + df_validate['speaker'].tolist() +
                          df_train_augmented['JobTitle'].tolist() + df_test['JobTitle'].tolist() + df_validate['JobTitle'].tolist() +
                          df_train_augmented['subject'].tolist() + df_test['subject'].tolist() + df_validate['subject'].tolist() +
                          df_train_augmented['Party'].tolist() + df_test['Party'].tolist() + df_validate['Party'].tolist())
    embedding_matrix = entity_embedder.create_embedding_matrix(list(unique_entities))

    # Generate entity indices for each dataset
    entity_to_index = {entity: i for i, entity in enumerate(unique_entities)}

    def generate_entity_indices(df):
        entity_indices = []
        for _, row in df.iterrows():
            indices = [entity_to_index.get(row[col], 0) for col in ['speaker', 'JobTitle', 'subject', 'Party']]
            entity_indices.append(indices)
        return torch.tensor(entity_indices)

    # Preprocess text data for each dataset using BERT preprocessor
    bert_preprocessor = BertPreprocessor()

    def preprocess_dataset(df):
        texts = df['statement'].tolist()
        input_ids, attention_masks = bert_preprocessor.preprocess(texts)
        y_bert = torch.tensor(df['label'].values)
        entity_indices = generate_entity_indices(df)
        return input_ids, attention_masks, y_bert, entity_indices

    # Prepare datasets
    input_ids_train, attention_masks_train, y_bert_train, entity_indices_train = preprocess_dataset(df_train_augmented)
    input_ids_test, attention_masks_test, y_bert_test, entity_indices_test = preprocess_dataset(df_test)
    input_ids_validate, attention_masks_validate, y_bert_validate, entity_indices_validate = preprocess_dataset(df_validate)

    # Initialize and train the BERT model with entity embeddings
    bert_model = BertModel('bert-base-uncased', embedding_matrix)
    bert_model.train(input_ids_train, attention_masks_train, y_bert_train, entity_indices_train)
    print("Finished Training")

    # Evaluate on Test Dataset
    print("Testing")
    bert_model.evaluate(input_ids_test, attention_masks_test, y_bert_test, entity_indices_test)

    # Evaluate on Validation Dataset
    print("Validation")
    bert_model.evaluate(input_ids_validate, attention_masks_validate, y_bert_validate, entity_indices_validate)

    # Evaluate on Training Dataset
    print("Training Test")
    bert_model.evaluate(input_ids_train, attention_masks_train, y_bert_train, entity_indices_train)

main()


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Loaded 400000 word vectors.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4 - Loss: 0.65
Validation Accuracy: 0.61
Epoch 2/4 - Loss: 0.57
Validation Accuracy: 0.66
Epoch 3/4 - Loss: 0.35
Validation Accuracy: 0.69
Epoch 4/4 - Loss: 0.15
Validation Accuracy: 0.67
Finished Training
Testing
Accuracy: 0.8507109004739336
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.78      0.82       553
           1       0.84      0.91      0.87       713

    accuracy                           0.85      1266
   macro avg       0.85      0.84      0.85      1266
weighted avg       0.85      0.85      0.85      1266

Validation
Accuracy: 0.8456742010911925
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.78      0.83       615
           1       0.82      0.91      0.86       668

    accuracy                           0.85      1283
   macro avg       0.85      0.84      0.84      1283
weighted avg       0.85      0.85      0.84      1283

Training 