In [1]:
!pip install torchmetrics
!pip install gdown

Collecting gdown
  Downloading gdown-5.1.0-py3-none-any.whl.metadata (5.7 kB)
Downloading gdown-5.1.0-py3-none-any.whl (17 kB)
Installing collected packages: gdown
Successfully installed gdown-5.1.0


In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig, BertModel, AdamW, get_constant_schedule_with_warmup
import pandas as pd
import numpy as np
import torch.nn as nn
import matplotlib.pyplot as plt
from torchmetrics.functional import f1_score, accuracy
from tqdm import tqdm
import pickle

In [3]:
import gdown

gdown.download("https://drive.google.com/file/d/1k5LMwmYF7PF-BzYQNE2ULBae79nbM268/view?usp=drive_link", "subtaskB_train.jsonl", quiet=False, fuzzy=True)
gdown.download("https://drive.google.com/file/d/1oh9c-d0fo3NtETNySmCNLUc6H1j4dSWE/view?usp=drive_link", "subtaskB_dev.jsonl", quiet=False, fuzzy=True)

Downloading...
From (original): https://drive.google.com/uc?id=1k5LMwmYF7PF-BzYQNE2ULBae79nbM268
From (redirected): https://drive.google.com/uc?id=1k5LMwmYF7PF-BzYQNE2ULBae79nbM268&confirm=t&uuid=28b484ae-a018-426b-92d9-0440783bbb8d
To: /kaggle/working/subtaskB_train.jsonl
100%|██████████| 155M/155M [00:00<00:00, 208MB/s]
Downloading...
From: https://drive.google.com/uc?id=1oh9c-d0fo3NtETNySmCNLUc6H1j4dSWE
To: /kaggle/working/subtaskB_dev.jsonl
100%|██████████| 4.93M/4.93M [00:00<00:00, 124MB/s]


'subtaskB_dev.jsonl'

In [4]:
batch_size = 32
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
max_length = 128
epoch_nums = 3
lr = 1e-4
epsilon = 1e-8
splits = [0.01, 0.05, 0.1, 0.5]

train_path = 'subtaskB_train.jsonl'
val_path = 'subtaskB_dev.jsonl'

discriminator_save_path = 'discriminator.pth'
bert_save_path = 'bert.pth'
report_path = 'report_Bert_adapter.csv'

In [5]:
train_data = pd.read_json(train_path,lines=True)
val_data = pd.read_json(val_path, lines=True)

label_dict = {'chatGPT':0, 'human':1, 'cohere':2, 'davinci':3, 'bloomz':4, 'dolly':5}
label2int = lambda label: label_dict[label]

train_text = list(train_data['text'])
label_train = list(train_data['model'].apply(label2int))
text_val= list(val_data['text'])
label_val = list(val_data['model'].apply(label2int))

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
splits = [0.01, 0.05, 0.1, 0.5]
train_datasets = []
for split in splits:
    labeled_text, _, label, _  = train_test_split(train_text,label_train,test_size=1-split)
    label = torch.LongTensor(label)
    tokenized_labeled_text = tokenizer(labeled_text, max_length=max_length, truncation=True, padding='max_length',return_tensors='pt')

    tokenized_text = {'input_ids':tokenized_labeled_text['input_ids'],
                      'attention_mask': tokenized_labeled_text['attention_mask'],
                      'label': label}

    train_dataset = TensorDataset(tokenized_text['input_ids'],tokenized_text['attention_mask'], tokenized_text['label'])
    train_datasets.append(train_dataset)
    print(f"train dataset for split {split} added.")

with open('train_datasets.pkl','wb') as f:
     pickle.dump(train_datasets,f)

tokenized_text = tokenizer(text_val, max_length=max_length, truncation=True, padding='max_length',return_tensors='pt')
val_dataset = TensorDataset(tokenized_text['input_ids'], tokenized_text['attention_mask'], torch.LongTensor(label_val))
with open('val_dataset.pkl','wb') as f:
     pickle.dump(val_dataset,f)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

train dataset for split 0.01 added.
train dataset for split 0.05 added.
train dataset for split 0.1 added.
train dataset for split 0.5 added.


In [7]:
trainLoaders = []
for train_dataset in train_datasets:
    trainLoaders.append(DataLoader(train_dataset,batch_size=batch_size,shuffle=True))

valLoader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [8]:
class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        self.feat = nn.Sequential(nn.Dropout(p=0.2), nn.Linear(768,768), nn.LeakyReLU(), nn.Dropout(p=0.2))
        self.logit = nn.Linear(768,6)

    def forward(self, x):
        feat = self.feat(x)
        logit = self.logit(feat)
        return feat, logit

class Bert(nn.Module):
    def __init__(self):
        super().__init__()

        self.model = BertModel.from_pretrained('bert-base-uncased')

    def forward(self, input_ids, att_mask):
        return self.model(input_ids, att_mask)[0][:,0,:]

In [9]:
def validation(bert, discriminator, valLoader):
    with torch.no_grad():
        bert.eval()
        discriminator.eval()
        all_prediction = []
        all_targets = []
        for i, batch in tqdm(enumerate(valLoader), total=len(valLoader), desc=f'Validation'):

            input_ids = batch[0].cuda()
            att_mask = batch[1].cuda()
            targets = batch[2].cuda()

            y_bert = bert(input_ids, att_mask)
            logits = discriminator(y_bert)[1]

            preds = logits.max(dim=-1)[1]
            all_prediction.append(preds.cpu())
            all_targets.append(targets.cpu())


    return f1_score(preds, targets, 'multiclass', num_classes=6), accuracy(preds, targets, 'multiclass', num_classes=6)

In [10]:
f1s = []
accs = []

for split, trainLoader in zip(splits,trainLoaders):
    
    discriminator = Discriminator().cuda()
    bert = Bert().cuda()
    
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = AdamW(list(discriminator.parameters()) + list(bert.parameters()), lr=lr)
    
    num_train_steps = int(len(trainLoader) * epoch_nums)
    num_warmup_steps = int(num_train_steps * 0.1)
    scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps = num_warmup_steps)

    for epoch in range(epoch_nums):
        discriminator.train()
        bert.train()

        current_loss = 0.0
        for i, batch in tqdm(enumerate(trainLoader), total=len(trainLoader), desc=f'({split}) epoch {epoch}'):
            
            input_ids = batch[0].cuda()
            att_mask = batch[1].cuda()
            targets = batch[2].cuda()

            y_bert = bert(input_ids, att_mask)
            logits = discriminator(y_bert)[1]

            loss = criterion(logits, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            current_loss += loss.item()

            scheduler.step()

        print(f'Loss: {current_loss / len(trainLoader)}')
        f1, acc = validation(bert, discriminator, valLoader)
        print(f'f1 score: {f1.item()}, accuracy: {acc.item()}')

        torch.save(discriminator.state_dict(), f'split_{split}_'+discriminator_save_path)
        torch.save(bert.state_dict(), f'split_{split}_'+bert_save_path)

    f1s.append(f1.item())
    accs.append(acc.item())

report = pd.DataFrame({"splits": splits, "accuracies": accs, "f1 score": f1s})
report.to_csv(report_path)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

(0.01) epoch 0: 100%|██████████| 23/23 [00:08<00:00,  2.65it/s]


Loss: 1.7804012816885244


Validation: 100%|██████████| 94/94 [00:10<00:00,  8.93it/s]


f1 score: 0.2083333283662796, accuracy: 0.2083333283662796


(0.01) epoch 1: 100%|██████████| 23/23 [00:07<00:00,  2.90it/s]


Loss: 1.559231110241102


Validation: 100%|██████████| 94/94 [00:10<00:00,  8.92it/s]


f1 score: 0.125, accuracy: 0.125


(0.01) epoch 2: 100%|██████████| 23/23 [00:07<00:00,  2.89it/s]


Loss: 1.0457517504692078


Validation: 100%|██████████| 94/94 [00:10<00:00,  8.92it/s]


f1 score: 0.3333333432674408, accuracy: 0.3333333432674408


(0.05) epoch 0: 100%|██████████| 111/111 [00:39<00:00,  2.80it/s]


Loss: 1.4168394342199102


Validation: 100%|██████████| 94/94 [00:10<00:00,  8.88it/s]


f1 score: 0.375, accuracy: 0.375


(0.05) epoch 1: 100%|██████████| 111/111 [00:39<00:00,  2.80it/s]


Loss: 0.755742737301835


Validation: 100%|██████████| 94/94 [00:10<00:00,  8.88it/s]


f1 score: 0.4166666567325592, accuracy: 0.4166666567325592


(0.05) epoch 2: 100%|██████████| 111/111 [00:39<00:00,  2.80it/s]


Loss: 0.40767489306561583


Validation: 100%|██████████| 94/94 [00:10<00:00,  8.89it/s]


f1 score: 0.625, accuracy: 0.625


(0.1) epoch 0: 100%|██████████| 222/222 [01:19<00:00,  2.80it/s]


Loss: 1.2197091727106422


Validation: 100%|██████████| 94/94 [00:10<00:00,  8.90it/s]


f1 score: 0.4583333432674408, accuracy: 0.4583333432674408


(0.1) epoch 1: 100%|██████████| 222/222 [01:19<00:00,  2.80it/s]


Loss: 0.6378584580646979


Validation: 100%|██████████| 94/94 [00:10<00:00,  8.89it/s]


f1 score: 0.625, accuracy: 0.625


(0.1) epoch 2: 100%|██████████| 222/222 [01:19<00:00,  2.80it/s]


Loss: 0.36101661308727284


Validation: 100%|██████████| 94/94 [00:10<00:00,  8.92it/s]


f1 score: 0.5, accuracy: 0.5


(0.5) epoch 0: 100%|██████████| 1110/1110 [06:36<00:00,  2.80it/s]


Loss: 0.8304448619484901


Validation: 100%|██████████| 94/94 [00:10<00:00,  8.91it/s]


f1 score: 0.625, accuracy: 0.625


(0.5) epoch 1: 100%|██████████| 1110/1110 [06:36<00:00,  2.80it/s]


Loss: 0.4311705067787353


Validation: 100%|██████████| 94/94 [00:10<00:00,  8.91it/s]


f1 score: 0.375, accuracy: 0.375


(0.5) epoch 2: 100%|██████████| 1110/1110 [06:36<00:00,  2.80it/s]


Loss: 0.27294546630535577


Validation: 100%|██████████| 94/94 [00:10<00:00,  8.90it/s]


f1 score: 0.25, accuracy: 0.25
