## GPT-2 and Bert finetuning comparisions


Aaron Semones



In [1]:
import numpy 
import os 
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
import io
import transformers
from tqdm import tqdm
from transformers import GPT2Config, GPT2ForSequenceClassification, GPT2Tokenizer, AutoTokenizer,  AutoModelForSequenceClassification



Importing libaries 


In [2]:
 # formatting function for phishing dataset
def remove_doublequotes(file_dir): # 
    raw_file_str = ''
    with open(file_dir, 'r', encoding='utf-8') as f:
        raw_file_str = f.read().replace('""', '"')
    with open(file_dir, 'w', encoding='utf-8') as f:
        f.write(raw_file_str)
        
#partition a dictionary
def split_dict (dict1, index):
    dict1c = dict1
    dict1 = dict(list(dict1c.items())[index:])
    dict2 = dict(list(dict1c.items())[:index])
    return dict1, dict2
        
        


Misc. helper functions

In [3]:
device = torch.device("cuda") #sets your device. Code will run at a snails pace without gpu



class Datasets(Dataset):
    def __init__ (self, testpath=None, Emails = None, size =None, final_data = None , data_processed=False): 
        '''testpath is the path of your csv file, emails is a bool for handling oddities with phising ds
        size is the desired num_entries, final_data is for turning already pre-processed data into a dataset for loading (ie validiation set),
        dataprocessed is the bool controlling that functionalty
        
        
        Members:
        
        length is number of examples
        test_set is a dict of the entire set
        set_labels is a list of the labels
        set_text is a list of the text
        
        
        '''
        if (data_processed):
            self.length = len(final_data)
            self.test_set = final_data
            self.set_labels = [self.test_set[x] for x in self.test_set]
            self.set_text = list (self.test_set.keys())
            return
        
        if Emails == False:
            remove_doublequotes(testpath)
        
        
        if (size):
            self.test_set = pd.read_csv(testpath, nrows = size)
        else: 
            self.test_set = pd.read_csv(testpath)
 
        if (Emails == True):
            self.test_set = self.test_set.set_index('text')['label'].to_dict()
        
        else:
            self.test_set['Unnamed: 0'] = self.test_set ['Unnamed: 0'].apply(lambda x : 2 - x)
            self.test_set =  self.test_set.set_index('text_label')['Unnamed: 0'].to_dict() 
            
        self.length = len (self.test_set)
        
        self.set_labels = [self.test_set[x] for x in self.test_set]
        self.set_text = list (self.test_set.keys())
        
        return
    #homework 3 inspired validation data spilt function
    def split(self, ratio = .8 ):
        index = int(ratio*self.length)
        
        split,self.test_set = split_dict(self.test_set, index)
        
        self.set_labels = self.set_labels[:index]
        self.set_text = self.set_text[:index]
        self.length = len(self.test_set)
        
        return split
    #functions required by pytorch for handling data
    def __len__(self):
        return self.length
    
    def __getitem__(self, index):
        return {'text': self.set_text[index], 'label': self.set_labels[index]}
    


    


Primary dataset classes for handling data and loading it into pytorch. Code written to be reuseable, allowing for easy testing of different models
 


In [4]:
''' tokenizer class to allow easy swapping of tokenizers
params: use_tokenizer is the tokenizer function of choice
max: the maxiumum sequence length. tokenizer truncates based off this'''

class _tokenize(object):
    def __init__(self,  use_tokenizer, max=512):
         self.use_tokenizer = use_tokenizer
         self.max_sequence_len =max
    #basically just calls the tokenizer, returning embeddings  dict    
    def __call__(self, data):
        text= [x['text'] for x in data]
        label = [x ['label'] for x in data]
        
        
        embeddings = self.use_tokenizer(text=text, return_tensors = "pt", padding = True, truncation= True, max_length = self.max_sequence_len)
        embeddings.update({'labels' : torch.tensor(label)})
        return embeddings
#helper to solve for accuracy, true pos, true neg, false pos, false neg
# takes in the actual labels and a series of predictions
#outputs array of stats
def calculate_stats(labels, predictions):
    acc = 0.0
    fp =0.0
    fn = 0.0
    tp =0.0
    tn = 0.0
    size = len(labels)
    counter = 0
    
    for x in labels:
        if x == 1 and predictions[counter] == 1:
            tp+=1
            acc +=1
        elif x == 0 and predictions[counter] == 0:
            tn+=1
            acc+=1
        elif x == 1:
            fp  +=1
        elif x == 0:
            fn +=1 
        counter +=1
            
    return [acc/size, tp/size, tn/size, fp/size, fn/size]
#evaluates the model 
def test (model, data, device, ):
    print ("Evaluating")
    
    total_loss = 0.0
    predictions = []
    labels= []
   
    model.eval()
    for batch in tqdm (data, total=len(data)):
        labels += batch['labels'].numpy().flatten().tolist()

        batch = {i:j.type(torch.long).to(device) for i,j in batch.items()}
        with torch.no_grad():

           
            model_out = model(**batch)
            loss,logits =model_out[:2]
            total_loss += loss.item()
        
            logits = logits.detach().cpu().numpy() 
        
            predictions  += logits.argmax(axis = -1).flatten().tolist()
    total_loss = total_loss/len(data)
    
  
    stats= calculate_stats (labels, predictions)        
       
    return total_loss, stats

Tokenizer class and helpers for our training loop + eval function

In [5]:
'''Training loop  
Params: model, data (your training data), valid (your valid data), optimizer, scheduler, device, epochs'''



def train(model, data, valid, optimizer, scheduler, device, epochs=1):
    
  
    avg_loss_per_epoch = list ()
    acc_t = []
    tp_t = []
    tn_t = []
    fp_t = []
    fn_t = []
    
    acc_v = []
    tp_v = []
    tn_v = []
    fp_v = []
    fn_v = []
    
    v_loss_t = list ()
    for i in range(epochs):
        print ("Training " ,i, " Epoch")
        total_loss = 0
        predictions = []
        labels = []
        v_loss = []
        model.train()
        for batch in tqdm (data, total=len(data)):
            labels += batch['labels'].numpy().flatten().tolist()
            batch = {i:j.type(torch.long).to(device) for i,j in batch.items()}

            model.zero_grad()
        
        
            model_out = model(**batch)
        
            loss_obj, logits = model_out[:2]
    
        
            total_loss += loss_obj.item()
        
            loss_obj.backward()
        
        
        
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
            optimizer.step()
            scheduler.step()
            
            
            logits = logits.detach().cpu().numpy()
        
            predictions  += logits.argmax(axis = -1).flatten().tolist()
  
        stats= calculate_stats (labels, predictions)
        
        avg_loss_per_epoch.append(total_loss/len(data))
        
        print ("training accuracy for epoch ", i ,": ", stats[0])
        print ("training loss for epoch ", i ,": ",total_loss/len(data))
        acc_t.append(stats[0])
        tp_t.append(stats[1])
        tn_t.append(stats[2])
        fp_t.append(stats[3])
        fn_t.append(stats[4])
        
  
        v_loss, stats_v = test(model, valid, device)
        
        
        
        acc_v.append(stats_v[0])
        tp_v.append(stats_v[1])
        tn_v.append(stats_v[2])
        fp_v.append(stats_v[3])
        fn_v.append(stats_v[4])
        
        v_loss_t.append(v_loss)
        
        print ("eval accuracy for epoch ", i ,": ",stats_v[0])
        print ("eval loss for epoch ", i ,": ",v_loss)
      
    t_stats = [avg_loss_per_epoch, acc_t, tp_t, tn_t, fp_t, fn_t]
    v_stats = [v_loss_t, acc_v, tp_v, tn_v, fp_v, fn_v]
    return  t_stats, v_stats

        


    

training loop


In [6]:
model = 'distilgpt2'
config = GPT2Config.from_pretrained(pretrained_model_name_or_path = model, num_labels =2, fp16 = True, num_workers = 4)
tokenzier = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path = model)
tokenzier.padding_side = "left"
tokenzier.pad_token = '50256'
gpt2 = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path = model, config = config )
gpt2.resize_token_embeddings(len(tokenzier))
gpt2.config.pad_token_id = gpt2.config.eos_token_id
#gpt2.to(device)

'''tokenzier = AutoTokenizer.from_pretrained("bert-base-uncased")
gpt2 = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

gpt2.to(device)
print (device,"being_used")
'''

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'tokenzier = AutoTokenizer.from_pretrained("bert-base-uncased")\ngpt2 = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)\n\ngpt2.to(device)\nprint (device,"being_used")\n'

Inializing our first model, gpt2. gpt2 is a decoder based transformer, which makes it superior at text generation, but generally worse at classification tasks. I used distilgpt2, as gpt2-medium was 500mb too large for my gpus memory. Feel free to go up in complexity if you can run it

In [7]:
baby_spam_test = Datasets(testpath='C:/Users/asemo/CSCI 3832/datasets/SpamHam/test.csv' ,Emails=True, size = 2000)
baby_spam_train = Datasets(testpath='C:/Users/asemo/CSCI 3832/datasets/SpamHam/train.csv' ,Emails=True, size = 2000)
baby_spam_valid = Datasets(final_data = baby_spam_train.split(), data_processed=True)
gpt_tokenizer = _tokenize(tokenzier)

baby_spam_train_dataloader = DataLoader(baby_spam_train, batch_size= 8, shuffle=True, collate_fn = gpt_tokenizer)
baby_spam_valid_dataloader = DataLoader(baby_spam_valid, batch_size=8, shuffle=False, collate_fn = gpt_tokenizer)
baby_spam_test_dataloader =  DataLoader(baby_spam_test, batch_size= 8, shuffle=False, collate_fn= gpt_tokenizer)

using a baby dataset for the inital comparisions between the models. Batch size is 'only' 8, as that was the highest I could go without graident checkpointing.

In [8]:
optimizer = torch.optim.AdamW(gpt2.parameters(), lr= 2e-5, eps = 1e-8)
steps = len(baby_spam_train_dataloader)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0 , num_training_steps = steps)

predictions, loss = train(gpt2, baby_spam_train_dataloader, baby_spam_valid_dataloader, optimizer, scheduler, device,3)

v_loss, stats_test = test(gpt2, baby_spam_test_dataloader, device)

Training  0  Epoch


100%|██████████| 200/200 [00:57<00:00,  3.49it/s]


training accuracy for epoch  0 :  0.690625
training loss for epoch  0 :  0.5926605705916882
Evaluating


100%|██████████| 50/50 [00:05<00:00,  9.51it/s]


eval accuracy for epoch  0 :  0.765
eval loss for epoch  0 :  0.5058020314574242
Training  1  Epoch


100%|██████████| 200/200 [00:57<00:00,  3.50it/s]


training accuracy for epoch  1 :  0.766875
training loss for epoch  1 :  0.49623864896595477
Evaluating


100%|██████████| 50/50 [00:05<00:00,  9.48it/s]


eval accuracy for epoch  1 :  0.765
eval loss for epoch  1 :  0.5058020314574242
Training  2  Epoch


100%|██████████| 200/200 [00:58<00:00,  3.42it/s]


training accuracy for epoch  2 :  0.768125
training loss for epoch  2 :  0.49269786052405834
Evaluating


100%|██████████| 50/50 [00:05<00:00,  9.51it/s]


eval accuracy for epoch  2 :  0.765
eval loss for epoch  2 :  0.5058020314574242
Evaluating


100%|██████████| 250/250 [00:25<00:00,  9.70it/s]


training printouts. eval is broken for some reason, only records the first evaluation and loss. Final printouts from the test set are below 

In [9]:
print ("acc:", stats_test[0])
print ("True pos", stats_test[1])
print ("True neg", stats_test[2])
print ("False pos", stats_test[3])
print ("False neg", stats_test[4])

acc: 0.7575
True pos 0.266
True neg 0.4915
False pos 0.2085
False neg 0.034


GPT2 delivers a respectable test set average on the babyset. 75% accuracy with a 20% false positive rate


In [13]:
baby_phish_test = Datasets('C:/Users/asemo/CSCI 3832/datasets/PhishingURLs/test.csv' ,False, size=2000)
baby_phish_train = Datasets('C:/Users/asemo/CSCI 3832/datasets/PhishingURLs/train.csv' ,False, size=2000)
baby_phish_valid = Datasets(final_data = baby_phish_train.split(), data_processed=True)

baby_phish_train_dataloader = DataLoader(baby_phish_train, batch_size= 8, shuffle=True, collate_fn = gpt_tokenizer)
baby_phish_valid_dataloader = DataLoader(baby_phish_valid, batch_size=8, shuffle=False, collate_fn = gpt_tokenizer)
baby_phish_test_dataloader =  DataLoader(baby_phish_test, batch_size=8, shuffle=False, collate_fn= gpt_tokenizer)



phising datasets now

In [11]:
optimizer = torch.optim.AdamW(gpt2.parameters(), lr= 2e-5, eps = 1e-8)
steps = len(baby_phish_train_dataloader)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0 , num_training_steps = steps)

predictions, loss = train(gpt2, baby_phish_train_dataloader, baby_phish_valid_dataloader, optimizer, scheduler, device,3)

v_loss, stats_test = test(gpt2, baby_phish_test_dataloader, device)


Training  0  Epoch


100%|██████████| 200/200 [00:13<00:00, 14.78it/s]


training accuracy for epoch  0 :  0.715
training loss for epoch  0 :  0.5449824963137507
Evaluating


100%|██████████| 50/50 [00:00<00:00, 56.97it/s]


eval accuracy for epoch  0 :  0.8
eval loss for epoch  0 :  0.4013801185786724
Training  1  Epoch


100%|██████████| 200/200 [00:13<00:00, 14.94it/s]


training accuracy for epoch  1 :  0.80125
training loss for epoch  1 :  0.4270831512287259
Evaluating


100%|██████████| 50/50 [00:00<00:00, 59.17it/s]


eval accuracy for epoch  1 :  0.8
eval loss for epoch  1 :  0.4013801185786724
Training  2  Epoch


100%|██████████| 200/200 [00:12<00:00, 15.46it/s]


training accuracy for epoch  2 :  0.80625
training loss for epoch  2 :  0.41511764351278546
Evaluating


100%|██████████| 50/50 [00:00<00:00, 59.17it/s]


eval accuracy for epoch  2 :  0.8
eval loss for epoch  2 :  0.4013801185786724
Evaluating


100%|██████████| 250/250 [00:04<00:00, 59.50it/s]


In [12]:
print ("acc:", stats_test[0])
print ("True pos", stats_test[1])
print ("True neg", stats_test[2])
print ("False pos", stats_test[3])
print ("False neg", stats_test[4])

acc: 0.809
True pos 0.363
True neg 0.446
False pos 0.13
False neg 0.061


Similar preformance on the phishing baby dataset. 80% accuracy with an 13% false postive rate. 

In [8]:

torch.cuda.empty_cache()

berto = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

bert.to(device)

print (device,"being_used")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda being_used


Time for bert

In [9]:

bertoken = _tokenize(berto)

baby_spam_train_dataloader = DataLoader(baby_spam_train, batch_size= 8, shuffle=True, collate_fn = bertoken)
baby_spam_valid_dataloader = DataLoader(baby_spam_valid, batch_size=8, shuffle=False, collate_fn = bertoken)
baby_spam_test_dataloader =  DataLoader(baby_spam_test, batch_size= 8, shuffle=False, collate_fn= bertoken)

In [10]:

optimizer = torch.optim.AdamW(bert.parameters(), lr= 2e-5, eps = 1e-8)
steps = len(baby_spam_train_dataloader)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0 , num_training_steps = steps)

predictions, loss = train(bert, baby_spam_train_dataloader, baby_spam_valid_dataloader, optimizer, scheduler, device,3)

v_loss, stats_test = test(bert, baby_spam_test_dataloader, device)

Training  0  Epoch


100%|██████████| 200/200 [01:33<00:00,  2.14it/s]


training accuracy for epoch  0 :  0.839375
training loss for epoch  0 :  0.36200392998754977
Evaluating


100%|██████████| 50/50 [00:08<00:00,  6.01it/s]


eval accuracy for epoch  0 :  0.9075
eval loss for epoch  0 :  0.23575553093105556
Training  1  Epoch


100%|██████████| 200/200 [01:35<00:00,  2.09it/s]


training accuracy for epoch  1 :  0.956875
training loss for epoch  1 :  0.13603317678906024
Evaluating


100%|██████████| 50/50 [00:08<00:00,  5.84it/s]


eval accuracy for epoch  1 :  0.9075
eval loss for epoch  1 :  0.23575553093105556
Training  2  Epoch


100%|██████████| 200/200 [01:34<00:00,  2.11it/s]


training accuracy for epoch  2 :  0.954375
training loss for epoch  2 :  0.13723268054425716
Evaluating


100%|██████████| 50/50 [00:08<00:00,  6.03it/s]


eval accuracy for epoch  2 :  0.9075
eval loss for epoch  2 :  0.23575553093105556
Evaluating


100%|██████████| 250/250 [00:40<00:00,  6.12it/s]


In [11]:
print ("acc:", stats_test[0])
print ("True pos", stats_test[1])
print ("True neg", stats_test[2])
print ("False pos", stats_test[3])
print ("False neg", stats_test[4])

acc: 0.9255
True pos 0.419
True neg 0.5065
False pos 0.0555
False neg 0.019


bert blows gpt2 out of the water with a 92% acc and a 5% false postive rate.

In [14]:
baby_phish_train_dataloader = DataLoader(baby_phish_train, batch_size= 8, shuffle=True, collate_fn = bertoken)
baby_phish_valid_dataloader = DataLoader(baby_phish_valid, batch_size=8, shuffle=False, collate_fn = bertoken)
baby_phish_test_dataloader =  DataLoader(baby_phish_test, batch_size=8, shuffle=False, collate_fn= bertoken)

Now for phishing

In [17]:
optimizer = torch.optim.AdamW(bert.parameters(), lr= 2e-5, eps = 1e-8)
steps = len(baby_phish_train_dataloader)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0 , num_training_steps = steps)

predictions, loss = train(bert, baby_phish_train_dataloader, baby_spam_valid_dataloader, optimizer, scheduler, device,3)

v_loss, stats_test = test(bert, baby_phish_test_dataloader, device)

Training  0  Epoch


100%|██████████| 200/200 [00:19<00:00, 10.23it/s]


training accuracy for epoch  0 :  0.831875
training loss for epoch  0 :  0.4041268252208829
Evaluating


100%|██████████| 50/50 [00:08<00:00,  5.89it/s]


eval accuracy for epoch  0 :  0.865
eval loss for epoch  0 :  0.34098183184862135
Training  1  Epoch


100%|██████████| 200/200 [00:19<00:00, 10.05it/s]


training accuracy for epoch  1 :  0.905
training loss for epoch  1 :  0.24948312597349287
Evaluating


100%|██████████| 50/50 [00:08<00:00,  5.87it/s]


eval accuracy for epoch  1 :  0.865
eval loss for epoch  1 :  0.34098183184862135
Training  2  Epoch


100%|██████████| 200/200 [00:19<00:00, 10.30it/s]


training accuracy for epoch  2 :  0.916875
training loss for epoch  2 :  0.25234994273632766
Evaluating


100%|██████████| 50/50 [00:08<00:00,  5.87it/s]


eval accuracy for epoch  2 :  0.865
eval loss for epoch  2 :  0.34098183184862135
Evaluating


100%|██████████| 250/250 [00:07<00:00, 33.93it/s]


In [18]:
print ("acc:", stats_test[0])
print ("True pos", stats_test[1])
print ("True neg", stats_test[2])
print ("False pos", stats_test[3])
print ("False neg", stats_test[4])

acc: 0.891
True pos 0.4405
True neg 0.4505
False pos 0.0525
False neg 0.0565


Another strong preformance by bert, with a 89% acc and a 5% false postive. Conclusion: Bert is by far the better model when compared with a similar sized GPT model.