In [24]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification

In [26]:
# Fixing the randomness of CUDA.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

np.random.seed(42)
torch.manual_seed(42)

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("PyTorch Version : {}".format(torch.__version__))
print(DEVICE)

PyTorch Version : 2.2.1+cu121
cuda


In [27]:
worksapce = '/content/drive/MyDrive/fasttext/'
model_save = 'TC+TC+CB+FZ.pt'
model_name = 'TC+TC+CB+FZ'
num_epochs = 100
batch_size = 32
learning_rate = 1e-3
num_classes = 6
padding_idx = 0
metadata_each_dim = 10


col = ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context']

label_map = {0: 'pants-fire', 1: 'false', 2: 'barely-true', 3: 'half-true', 4: 'mostly-true', 5: 'true'}
label_convert = {'pants-fire': 0, 'false': 1, 'barely-true': 2, 'half-true': 3, 'mostly-true': 4, 'true':5}


In [28]:
train_data = pd.read_csv(worksapce + 'train.tsv', sep = '\t', names = col)
test_data = pd.read_csv(worksapce + 'test.tsv', sep = '\t', names = col)
val_data = pd.read_csv(worksapce + 'valid.tsv', sep = '\t', names = col)

# Replace NaN values with 'NaN'
train_data[['barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts']] = train_data[['barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts']].fillna(0)
train_data.fillna('NaN', inplace=True)

test_data[['barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts']] = test_data[['barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts']].fillna(0)
test_data.fillna('NaN', inplace=True)

val_data[['barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts']] = val_data[['barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts']].fillna(0)
val_data.fillna('NaN', inplace=True)

In [29]:
# val_data = pd.concat([train_data.iloc[0:1], val_data]).reset_index(drop=True)

In [30]:
val_data[0:1]

Unnamed: 0,id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,12134.json,barely-true,We have less Americans working now than in the...,"economy,jobs",vicky-hartzler,U.S. Representative,Missouri,republican,1,0,1,0,0,an interview with ABC17 News


In [31]:
def textProcess(input_text, max_length = 512):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    if True and max_length == -1:
        tokens = tokenizer(input_text, truncation=True, padding=True)
    else:
        tokens = tokenizer(input_text, truncation=True, padding='max_length', max_length=max_length)
    return tokens

In [32]:
# from transformers import XLNetTokenizer

# def textProcess(input_text, max_length=-1):
#   tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')  # Adjust casing based on your needs
#   if max_length == -1:
#     tokens = tokenizer(input_text, truncation=True, padding=True)
#   else:
#     tokens = tokenizer(input_text, truncation=True, padding='max_length', max_length=max_length)
#   return tokens

In [33]:
# from transformers import XLNetTokenizer

# def textProcess(input_text, max_length=-1):
#   tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')  # Adjust casing based on your needs
#   if max_length == -1:
#     tokens = tokenizer(input_text, truncation=True, padding=True)
#   else:
#     tokens = tokenizer(input_text, truncation=True, padding='max_length', max_length=max_length)
#   return tokens

In [34]:
# Define a custom dataset for loading the data
class LiarDataset(data.Dataset):
    def __init__(self, data_df, statement, label_onehot, label, subject, speaker, job_title, state_info,
                     party_affiliation, barely_true_counts, false_counts, half_true_counts, mostly_true_counts,
                    pants_on_fire_counts, context):
        self.data_df = data_df
        self.statement = statement
        self.label_onehot = label_onehot
        self.label = label
        self.metadata_text = torch.cat((subject.int(), speaker.int(), job_title.int(), state_info.int(), party_affiliation.int(),
                                   context.int()), dim=-1)
        self.metadata_number = torch.cat((torch.tensor(barely_true_counts, dtype=torch.float).unsqueeze(1), torch.tensor(false_counts, dtype=torch.float).unsqueeze(1),
                                   torch.tensor(half_true_counts, dtype=torch.float).unsqueeze(1), torch.tensor(mostly_true_counts, dtype=torch.float).unsqueeze(1),
                                   torch.tensor(pants_on_fire_counts, dtype=torch.float).unsqueeze(1)), dim=-1)



    def __len__(self):
        return len(self.data_df)

    def __getitem__(self, idx):
        statement = self.statement[idx]
        label_onehot = self.label_onehot[idx]
        label = self.label[idx]
        metadata_text = self.metadata_text[idx]
        metadata_number = self.metadata_number[idx]
        return statement, label_onehot, label, metadata_text, metadata_number

In [35]:
# Define the data loaders for training and validation
train_text = torch.tensor(textProcess(train_data['statement'].tolist())['input_ids'])
train_label = torch.nn.functional.one_hot(torch.tensor(train_data['label'].replace(label_convert)), num_classes=6).type(torch.float64)
train_subject = torch.tensor(textProcess(train_data['subject'].tolist(), metadata_each_dim)['input_ids'])
train_speaker = torch.tensor(textProcess(train_data['speaker'].tolist(), metadata_each_dim)['input_ids'])
train_job_title = torch.tensor(textProcess(train_data['job_title'].tolist(), metadata_each_dim)['input_ids'])
train_state_info = torch.tensor(textProcess(train_data['state_info'].tolist(), metadata_each_dim)['input_ids'])
train_party_affiliation = torch.tensor(textProcess(train_data['party_affiliation'].tolist(), metadata_each_dim)['input_ids'])
train_context = torch.tensor(textProcess(train_data['context'].tolist(), metadata_each_dim)['input_ids'])

train_dataset = LiarDataset(train_data, train_text, train_label, torch.tensor(train_data['label'].replace(label_convert)),
                            train_subject, train_speaker, train_job_title,
                            train_state_info, train_party_affiliation,
                            train_data['barely_true_counts'].tolist(), train_data['false_counts'].tolist(),
                            train_data['half_true_counts'].tolist(), train_data['mostly_true_counts'].tolist(),
                            train_data['pants_on_fire_counts'].tolist(), train_context)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_text = torch.tensor(textProcess(val_data['statement'].tolist())['input_ids'])
val_label = torch.nn.functional.one_hot(torch.tensor(val_data['label'].replace(label_convert)), num_classes=6).type(torch.float64)
val_subject = torch.tensor(textProcess(val_data['subject'].tolist(), metadata_each_dim)['input_ids'])
val_speaker = torch.tensor(textProcess(val_data['speaker'].tolist(), metadata_each_dim)['input_ids'])
val_job_title = torch.tensor(textProcess(val_data['job_title'].tolist(), metadata_each_dim)['input_ids'])
val_state_info = torch.tensor(textProcess(val_data['state_info'].tolist(), metadata_each_dim)['input_ids'])
val_party_affiliation = torch.tensor(textProcess(val_data['party_affiliation'].tolist(), metadata_each_dim)['input_ids'])
val_context = torch.tensor(textProcess(val_data['context'].tolist(), metadata_each_dim)['input_ids'])

val_dataset = LiarDataset(val_data, val_text, val_label, torch.tensor(val_data['label'].replace(label_convert)),
                          val_subject, val_speaker, val_job_title,
                          val_state_info, val_party_affiliation,
                          val_data['barely_true_counts'].tolist(), val_data['false_counts'].tolist(),
                          val_data['half_true_counts'].tolist(), val_data['mostly_true_counts'].tolist(),
                          val_data['pants_on_fire_counts'].tolist(), val_context)
val_loader = data.DataLoader(val_dataset, batch_size=batch_size)

test_text = torch.tensor(textProcess(test_data['statement'].tolist())['input_ids'])
test_label = torch.nn.functional.one_hot(torch.tensor(test_data['label'].replace(label_convert)), num_classes=6).type(torch.float64)
test_subject = torch.tensor(textProcess(test_data['subject'].tolist(), metadata_each_dim)['input_ids'])
test_speaker = torch.tensor(textProcess(test_data['speaker'].tolist(), metadata_each_dim)['input_ids'])
test_job_title = torch.tensor(textProcess(test_data['job_title'].tolist(), metadata_each_dim)['input_ids'])
test_state_info = torch.tensor(textProcess(test_data['state_info'].tolist(), metadata_each_dim)['input_ids'])
test_party_affiliation = torch.tensor(textProcess(test_data['party_affiliation'].tolist(), metadata_each_dim)['input_ids'])
test_context = torch.tensor(textProcess(test_data['context'].tolist(), metadata_each_dim)['input_ids'])

test_dataset = LiarDataset(test_data, test_text, test_label, torch.tensor(test_data['label'].replace(label_convert)),
                          test_subject, test_speaker, test_job_title,
                          test_state_info, test_party_affiliation,
                          test_data['barely_true_counts'].tolist(), test_data['false_counts'].tolist(),
                          test_data['half_true_counts'].tolist(), test_data['mostly_true_counts'].tolist(),
                          test_data['pants_on_fire_counts'].tolist(), test_context)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size)

In [36]:
train_data['statement'].to_list()[0]

'Says the Annies List political group supports third-trimester abortions on demand.'

In [37]:
torch.tensor(textProcess(val_data['statement'].tolist())['input_ids']).size()

torch.Size([1284, 512])

In [38]:
torch.tensor(val_data['label'].replace(label_convert))

tensor([2, 0, 1,  ..., 5, 1, 2])

### Forward Forward 2

In [39]:
train_text2 = train_text.float()
test_text2 = test_text.float()
val_text2 = val_text.float()


In [40]:
import torch
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(train_text2, torch.tensor(train_data['label'].replace(label_convert)))
test_dataset = TensorDataset(test_text2, torch.tensor(test_data['label'].replace(label_convert)))
val_dataset = TensorDataset(val_text2, torch.tensor(val_data['label'].replace(label_convert)))

In [41]:
val_text.size()

torch.Size([1284, 512])

In [42]:
import torch
from torchvision.datasets import MNIST
from torchvision.transforms import Compose, ToTensor, Normalize, Lambda
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np

def DATA_loaders(train_batch_size=1000, test_batch_size=10000):

    transform = Compose([
        ToTensor(),
        Normalize((0.1307,), (0.3081,)),
        Lambda(lambda x: torch.flatten(x))])

    train_loader = DataLoader(
        train_dataset,
        batch_size=train_batch_size, shuffle=True)

    eval_train_loader = DataLoader(
        val_dataset,
        batch_size=test_batch_size, shuffle=False)

    eval_test_loader = DataLoader(
        test_dataset,
        batch_size=test_batch_size, shuffle=False)

    return train_loader, eval_train_loader, eval_test_loader

def create_data_pos(images, labels):
    return overlay_labels_on_images(images, labels)

def create_data_neg(images, labels):
    labels_neg = labels.clone()
    for idx, y in enumerate(labels):
        all_labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        all_labels.pop(y.item()) # remove y from labels to generate negative data
        labels_neg[idx] = torch.tensor(np.random.choice(all_labels)).cpu()
    return overlay_labels_on_images(images, labels_neg)

def overlay_labels_on_images(images, labels):
    """Replace the first 10 pixels of images with one-hot-encoded labels
    """
    num_images = images.shape[0]
    data = images.clone()
    data[:, :10] *= 0.0
    data[range(0,num_images), labels] = images.max()
    return data

def visualize_sample(data, name='', idx=0):
    reshaped = data[idx].cpu().reshape(28, 28)
    plt.figure(figsize = (4, 4))
    plt.title(name)
    plt.imshow(reshaped, cmap="gray")
    plt.show()

In [43]:

import torch
import torch.nn as nn
from tqdm import tqdm
from torch.optim import Adam

class FFNet(torch.nn.Module):

    def __init__(self, dims):
        super().__init__()
        self.num_epochs = 20
        self.layers = []
        # self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = 'cpu'
        for d in range(len(dims) - 1):
            self.layers += [FFLayer(dims[d], dims[d + 1]).cpu()]

    """
    There are two approaches for batch training:
    1. Iterate batches for all layers. ---> easy
    2. Iterate batches for each layer. ---> need to create new batches for next layer input
    We use 1 for the following two training methods.
    """

    def train_1(self, data_loader):
        """
        Train method 1: train all layers for each epoch for each batch.
        """
        for batch_i, (x_batch, y_batch) in enumerate(data_loader):
            print("Training Batch (Size:", str(x_batch.size(dim=0)) + ')', '#', batch_i + 1, '/', len(data_loader))
            batch_pos, batch_neg = create_data_pos(x_batch, y_batch), create_data_neg(x_batch, y_batch)
            batch_pos, batch_neg = batch_pos.to(self.device), batch_neg.to(self.device)
            for epoch in tqdm(range(self.num_epochs)):
                h_batch_pos, h_batch_neg = batch_pos, batch_neg
                for layer_i, layer in enumerate(self.layers):
                    h_batch_pos, h_batch_neg, loss = layer.train(h_batch_pos, h_batch_neg)

            print('train error:', str(round((1.0 - self.predict(eval_train_loader)) * 100, 2)) + '%')
            print('test error:', str(round((1.0 - self.predict(eval_test_loader)) * 100, 2)) + '%')


    def train_2(self, data_loader):
        """
        Train method 2: train all epochs for each layer for each batch.
        """
        for batch_i, (x_batch, y_batch) in enumerate(data_loader):
            batch_loss = 0
            print("Training Batch (Size:", str(x_batch.size(dim=0)) + ')', '#', batch_i + 1, '/', len(data_loader))
            h_batch_pos, h_batch_neg = create_data_pos(x_batch, y_batch), create_data_neg(x_batch, y_batch)
            h_batch_pos, h_batch_neg = h_batch_pos.to(self.device), h_batch_neg.to(self.device)
            for layer_i, layer in enumerate(tqdm(self.layers)):
                for epoch in range(self.num_epochs):
                    h_batch_pos_epoch, h_batch_neg_epoch, loss = layer.train(h_batch_pos, h_batch_neg)
                    batch_loss += loss.item()
                h_batch_pos, h_batch_neg = h_batch_pos_epoch, h_batch_neg_epoch

            print('train error:', str(round((1.0 - self.predict(eval_train_loader)) * 100, 2)) + '%')
            print('test error:', str(round((1.0 - self.predict(eval_test_loader)) * 100, 2)) + '%')

            print('batch {} loss: {}'.format(batch_i + 1, batch_loss))

    def train_3(self, data_loader):
        """
        Train method 3: train all layers for each batch for each epoch. [Slow but better?]
        """
        cached_data = []
        for epoch in tqdm(range(self.num_epochs)):
            epoch_loss = 0
            for batch_i, (x_batch, y_batch) in enumerate(data_loader):
                # print("Training Batch (Size:", str(x_batch.size(dim=0)) + ')', '#', batch_i + 1, '/', len(data_loader))
                if (epoch + 1) == 1:
                    h_batch_pos, h_batch_neg = create_data_pos(x_batch, y_batch), create_data_neg(x_batch, y_batch)
                    h_batch_pos, h_batch_neg = h_batch_pos.to(self.device), h_batch_neg.to(self.device)
                    cached_data.append((h_batch_pos, h_batch_neg))
                else:
                    h_batch_pos, h_batch_neg = cached_data[batch_i]
                for layer_i, layer in enumerate(self.layers):
                    h_batch_pos_epoch, h_batch_neg_epoch, loss = layer.train(h_batch_pos, h_batch_neg)
                    epoch_loss += loss.item()
                    h_batch_pos, h_batch_neg = h_batch_pos_epoch, h_batch_neg_epoch

            print('train error:', str(round((1.0 - self.predict(eval_train_loader)) * 100, 2)) + '%')
            print('test error:', str(round((1.0 - self.predict(eval_test_loader)) * 100, 2)) + '%')

            print('   epoch {} loss: {}'.format(epoch + 1, epoch_loss))

    @torch.no_grad()
    def predict(self, data_loader):
        all_predictions = torch.Tensor([])
        all_labels = torch.Tensor([])
        all_predictions, all_labels = all_predictions.to(self.device), all_labels.to(self.device)
        for batch_i, (x_batch, y_batch) in enumerate(data_loader):
            print("Evaluation Batch (Size:", str(x_batch.size(dim=0)) + ')', '#', batch_i + 1, '/', len(data_loader))
            x_batch, y_batch = x_batch.to(self.device), y_batch.to(self.device)
            goodness_per_label_batch = []
            for label in range(10):
                h_batch = overlay_labels_on_images(x_batch, label)
                goodness_batch = []
                for layer in self.layers:
                    h_batch = layer(h_batch)
                    goodness_batch += [h_batch.pow(2).mean(1)]
                goodness_per_label_batch += [sum(goodness_batch).unsqueeze(1)]
            goodness_per_label_batch = torch.cat(goodness_per_label_batch, 1)
            predictions_batch = goodness_per_label_batch.argmax(1)
            all_predictions = torch.cat((all_predictions, predictions_batch), 0)
            all_labels = torch.cat((all_labels, y_batch), 0)
        return all_predictions.eq(all_labels).float().mean().item()


class FFLayer(nn.Linear):
    def __init__(self, in_features, out_features,
                 bias=True, device=None, dtype=None):
        super().__init__(in_features, out_features, bias, device, dtype)
        self.relu = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()
        self.tanh = torch.nn.Tanh()
        self.leakyrelu = torch.nn.LeakyReLU()
        self.rrelu = torch.nn.RReLU()
        self.gelu = torch.nn.GELU()
        self.opt = torch.optim.AdamW(self.parameters(), lr=0.02)
        self.threshold = 2.0

    def forward(self, x):
        x_direction = x / (x.norm(2, 1, keepdim=True) + 1e-4)
        return self.relu(torch.mm(x_direction, self.weight.T) + self.bias.unsqueeze(0))

    def train(self, x_pos, x_neg):
        g_pos = self.forward(x_pos).pow(2).mean(1)
        g_neg = self.forward(x_neg).pow(2).mean(1)
        # The following loss pushes pos (neg) samples to values larger (smaller) than the self.threshold.
        loss = torch.log(1 + torch.exp(torch.cat([-g_pos + self.threshold, g_neg - self.threshold]))).mean()
        self.opt.zero_grad()
        # this backward just compute the derivative and hence is not considered backpropagation.
        loss.backward()
        self.opt.step()
        return self.forward(x_pos).detach(), self.forward(x_neg).detach(), loss.detach()


In [44]:
torch.manual_seed(42)
import time
train_loader, eval_train_loader, eval_test_loader = DATA_loaders()

net = FFNet([512, 2000, 2000, 2000, 2000, 6])

time_training_start = time.time()
net.train_3(train_loader)
# net.train_2(train_loader)
# net.train_1(train_loader)
time_training_end = time.time()
training_time = round(time_training_end - time_training_start, 2)

print(f"Training time: {training_time}s")

print('train error:', str(round((1.0 - net.predict(eval_train_loader)) * 100, 2)) + '%')

print('test error:', str(round((1.0 - net.predict(eval_test_loader)) * 100, 2)) + '%')

  0%|          | 0/20 [00:00<?, ?it/s]

Evaluation Batch (Size: 1284) # 1 / 1
train error: 79.52%
Evaluation Batch (Size: 1267) # 1 / 1


  5%|▌         | 1/20 [00:40<12:57, 40.93s/it]

test error: 80.35%
   epoch 1 loss: 47.97023546695709


  5%|▌         | 1/20 [00:43<13:47, 43.58s/it]


KeyboardInterrupt: 