## GPT-2 and Bert finetuning comparisions


Aaron Semones



In [1]:
import numpy 
import os 
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
import io
import transformers
from tqdm import tqdm
from transformers import GPT2Config, GPT2ForSequenceClassification, GPT2Tokenizer, AutoTokenizer,  AutoModelForSequenceClassification



Importing libaries 


In [41]:
 # formatting function for phishing dataset
def remove_doublequotes(file_dir): # 
    raw_file_str = ''
    with open(file_dir, 'r', encoding='utf-8') as f:
        raw_file_str = f.read().replace('""', '"')
    with open(file_dir, 'w', encoding='utf-8') as f:
        f.write(raw_file_str)
        
#partition a dictionary
def split_dict (dict1, index):
    dict1c = dict1
    dict1 = dict(list(dict1c.items())[index:])
    dict2 = dict(list(dict1c.items())[:index])
    return dict1, dict2
        
        


Misc. helper functions

In [42]:
device = torch.device("cuda") #sets your device. Code will run at a snails pace without gpu



class Datasets(Dataset):
    def __init__ (self, testpath=None, Emails = None, size =None, final_data = None , data_processed=False): 
        '''testpath is the path of your csv file, emails is a bool for handling oddities with phising ds
        size is the desired num_entries, final_data is for turning already pre-processed data into a dataset for loading (ie validiation set),
        dataprocessed is the bool controlling that functionalty
        
        
        Members:
        
        length is number of examples
        test_set is a dict of the entire set
        set_labels is a list of the labels
        set_text is a list of the text
        
        
        '''
        if (data_processed):
            self.length = len(final_data)
            self.test_set = final_data
            self.set_labels = [self.test_set[x] for x in self.test_set]
            self.set_text = list (self.test_set.keys())
            return
        
        if Emails == False:
            remove_doublequotes(testpath)
        
        
        if (size):
            self.test_set = pd.read_csv(testpath, nrows = size)
        else: 
            self.test_set = pd.read_csv(testpath)
 
        if (Emails == True):
            self.test_set = self.test_set.set_index('text')['label'].to_dict()
        
        else:
            self.test_set['label'] = self.test_set ['label'].apply(lambda x : 2 - x)
            print(self.test_set.head())
            self.test_set =  self.test_set.set_index('text')['label'].to_dict() 
            
        self.length = len (self.test_set)
        
        self.set_labels = [self.test_set[x] for x in self.test_set]
        self.set_text = list (self.test_set.keys())
        
        return
    #homework 3 inspired validation data spilt function
    def split(self, ratio = .8 ):
        index = int(ratio*self.length)
        
        split,self.test_set = split_dict(self.test_set, index)
        
        self.set_labels = self.set_labels[:index]
        self.set_text = self.set_text[:index]
        self.length = len(self.test_set)
        
        return split
    #functions required by pytorch for handling data
    def __len__(self):
        return self.length
    
    def __getitem__(self, index):
        return {'text': self.set_text[index], 'label': self.set_labels[index]}
    


    


Primary dataset classes for handling data and loading it into pytorch. Code written to be reuseable, allowing for easy testing of different models
 


In [43]:
''' tokenizer class to allow easy swapping of tokenizers
params: use_tokenizer is the tokenizer function of choice
max: the maxiumum sequence length. tokenizer truncates based off this'''

class _tokenize(object):
    def __init__(self,  use_tokenizer, max=512):
         self.use_tokenizer = use_tokenizer
         self.max_sequence_len =max
    #basically just calls the tokenizer, returning embeddings  dict    
    def __call__(self, data):
        text= [x['text'] for x in data]
        label = [x ['label'] for x in data]
        
        
        embeddings = self.use_tokenizer(text=text, return_tensors = "pt", padding = True, truncation= True, max_length = self.max_sequence_len)
        embeddings.update({'labels' : torch.tensor(label)})
        return embeddings
#helper to solve for accuracy, true pos, true neg, false pos, false neg
# takes in the actual labels and a series of predictions
#outputs array of stats
def calculate_stats(labels, predictions):
    acc = 0.0
    fp =0.0
    fn = 0.0
    tp =0.0
    tn = 0.0
    size = len(labels)
    counter = 0
    
    for x in labels:
        if x == 1 and predictions[counter] == 1:
            tp+=1
            acc +=1
        elif x == 0 and predictions[counter] == 0:
            tn+=1
            acc+=1
        elif x == 1:
            fp  +=1
        elif x == 0:
            fn +=1 
        counter +=1
            
    return [acc/size, tp/size, tn/size, fp/size, fn/size]
#evaluates the model 
def test (model, data, device, ):
    print ("Evaluating")
    
    total_loss = 0.0
    predictions = []
    labels= []
   
    model.eval()
    for batch in tqdm (data, total=len(data)):
        labels += batch['labels'].numpy().flatten().tolist()

        batch = {i:j.type(torch.long).to(device) for i,j in batch.items()}
        with torch.no_grad():

           
            model_out = model(**batch)
            loss,logits =model_out[:2]
            total_loss += loss.item()
        
            logits = logits.detach().cpu().numpy() 
        
            predictions  += logits.argmax(axis = -1).flatten().tolist()
    total_loss = total_loss/len(data)
    
  
    stats= calculate_stats (labels, predictions)        
       
    return total_loss, stats

Tokenizer class and helpers for our training loop + eval function

In [44]:
'''Training loop  
Params: model, data (your training data), valid (your valid data), optimizer, scheduler, device, epochs'''



def train(model, data, valid, optimizer, scheduler, device, epochs=1):
    
  
    avg_loss_per_epoch = list ()
    acc_t = []
    tp_t = []
    tn_t = []
    fp_t = []
    fn_t = []
    
    acc_v = []
    tp_v = []
    tn_v = []
    fp_v = []
    fn_v = []
    
    v_loss_t = list ()
    for i in range(epochs):
        print ("Training " ,i, " Epoch")
        total_loss = 0
        predictions = []
        labels = []
        v_loss = []
        model.train()
        for batch in tqdm (data, total=len(data)):
            labels += batch['labels'].numpy().flatten().tolist()
            batch = {i:j.type(torch.long).to(device) for i,j in batch.items()}

            model.zero_grad()
        
        
            model_out = model(**batch)
        
            loss_obj, logits = model_out[:2]
    
        
            total_loss += loss_obj.item()
        
            loss_obj.backward()
        
        
        
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
            optimizer.step()
            scheduler.step()
            
            
            logits = logits.detach().cpu().numpy()
        
            predictions  += logits.argmax(axis = -1).flatten().tolist()
  
        stats= calculate_stats (labels, predictions)
        
        avg_loss_per_epoch.append(total_loss/len(data))
        
        print ("training accuracy for epoch ", i ,": ", stats[0])
        print ("training loss for epoch ", i ,": ",total_loss/len(data))
        acc_t.append(stats[0])
        tp_t.append(stats[1])
        tn_t.append(stats[2])
        fp_t.append(stats[3])
        fn_t.append(stats[4])
        
  
        v_loss, stats_v = test(model, valid, device)
        
        
        
        acc_v.append(stats_v[0])
        tp_v.append(stats_v[1])
        tn_v.append(stats_v[2])
        fp_v.append(stats_v[3])
        fn_v.append(stats_v[4])
        
        v_loss_t.append(v_loss)
        
        print ("eval accuracy for epoch ", i ,": ",stats_v[0])
        print ("eval loss for epoch ", i ,": ",v_loss)
      
    t_stats = [avg_loss_per_epoch, acc_t, tp_t, tn_t, fp_t, fn_t]
    v_stats = [v_loss_t, acc_v, tp_v, tn_v, fp_v, fn_v]
    return  t_stats, v_stats

        


    

training loop


In [9]:
model = 'distilgpt2'
config = GPT2Config.from_pretrained(pretrained_model_name_or_path = model, num_labels =2, fp16 = True, num_workers = 4)
tokenzier = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path = model)
tokenzier.padding_side = "left"
tokenzier.pad_token = '50256'
gpt2 = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path = model, config = config )
gpt2.resize_token_embeddings(len(tokenzier))
gpt2.config.pad_token_id = gpt2.config.eos_token_id
gpt2.to(device)

'''tokenzier = AutoTokenizer.from_pretrained("bert-base-uncased")
gpt2 = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

gpt2.to(device)
print (device,"being_used")
'''

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'tokenzier = AutoTokenizer.from_pretrained("bert-base-uncased")\ngpt2 = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)\n\ngpt2.to(device)\nprint (device,"being_used")\n'

Inializing our first model, gpt2. gpt2 is a decoder based transformer, which makes it superior at text generation, but generally worse at classification tasks. I used distilgpt2, as gpt2-medium was 500mb too large for my gpus memory. Feel free to go up in complexity if you can run it

In [None]:
baby_spam_test = Datasets(testpath='./datasets/SpamHam/test.csv' ,Emails=True, size = 2000)
baby_spam_train = Datasets(testpath='./datasets/SpamHam/train.csv' ,Emails=True, size = 2000)
baby_spam_valid = Datasets(final_data = baby_spam_train.split(), data_processed=True)
gpt_tokenizer = _tokenize(tokenzier)

baby_spam_train_dataloader = DataLoader(baby_spam_train, batch_size= 8, shuffle=True, collate_fn = gpt_tokenizer)
baby_spam_valid_dataloader = DataLoader(baby_spam_valid, batch_size=8, shuffle=False, collate_fn = gpt_tokenizer)
baby_spam_test_dataloader =  DataLoader(baby_spam_test, batch_size= 8, shuffle=False, collate_fn= gpt_tokenizer)

using a baby dataset for the inital comparisions between the models. Batch size is 'only' 8, as that was the highest I could go without graident checkpointing.

In [None]:
optimizer = torch.optim.AdamW(gpt2.parameters(), lr= 2e-5, eps = 1e-8)
steps = len(baby_spam_train_dataloader)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0 , num_training_steps = steps)

predictions, loss = train(gpt2, baby_spam_train_dataloader, baby_spam_valid_dataloader, optimizer, scheduler, device,3)

v_loss, stats_test = test(gpt2, baby_spam_test_dataloader, device)

training printouts. eval is broken for some reason, only records the first evaluation and loss. Final printouts from the test set are below 

In [None]:
print ("acc:", stats_test[0])
print ("True pos", stats_test[1])
print ("True neg", stats_test[2])
print ("False pos", stats_test[3])
print ("False neg", stats_test[4])

GPT2 delivers a respectable test set average on the babyset. 75% accuracy with a 20% false positive rate


In [None]:
baby_phish_test = Datasets('./datasets/PhishingURLs/test.csv' ,False, size=2000)
baby_phish_train = Datasets('./datasets/PhishingURLs/train.csv' ,False, size=2000)
baby_phish_valid = Datasets(final_data = baby_phish_train.split(), data_processed=True)

baby_phish_train_dataloader = DataLoader(baby_phish_train, batch_size= 8, shuffle=True, collate_fn = gpt_tokenizer)
baby_phish_valid_dataloader = DataLoader(baby_phish_valid, batch_size=8, shuffle=False, collate_fn = gpt_tokenizer)
baby_phish_test_dataloader =  DataLoader(baby_phish_test, batch_size=8, shuffle=False, collate_fn= gpt_tokenizer)



   label                                               text
0      0                       http://minsotc.alania.gov.ru
1      0                       http://www.freejavaguide.com
2      1  http://yeneliswa.co.za/moods/bankofamerica/7dd...
3      0  https://victordahdalehfoundation.com/programme...
4      0          http://camphhsi.com/product/list_947.html
   label                                               text
0      0                        https://blog.sockpuppet.us/
1      0                  https://blog.apiki.com/seguranca/
2      1  http://autoecole-lauriston.com/a/T0RVd056QXlNe...
3      1  http://chinpay.site/index.html?hgcFSE@E$Z*DFcG...
4      0  http://www.firstfivenebraska.org/blog/article/...


phising datasets now

In [None]:
optimizer = torch.optim.AdamW(gpt2.parameters(), lr= 2e-5, eps = 1e-8)
steps = len(baby_phish_train_dataloader)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0 , num_training_steps = steps)

predictions, loss = train(gpt2, baby_phish_train_dataloader, baby_phish_valid_dataloader, optimizer, scheduler, device,3)

v_loss, stats_test = test(gpt2, baby_phish_test_dataloader, device)


In [None]:
print ("acc:", stats_test[0])
print ("True pos", stats_test[1])
print ("True neg", stats_test[2])
print ("False pos", stats_test[3])
print ("False neg", stats_test[4])

Similar preformance on the phishing baby dataset. 80% accuracy with an 13% false postive rate. 

In [6]:

torch.cuda.empty_cache()

berto = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = AutoModelForSequenceClassification.from_pretrained("C:/Users/asemo/CSCI 3832/CSCI3832_Project/models")

bert.to(device)

print (device,"being_used")


cuda being_used


Time for bert

In [7]:

bertoken = _tokenize(berto)

baby_spam_train_dataloader = DataLoader(baby_spam_train, batch_size= 8, shuffle=True, collate_fn = bertoken)
baby_spam_valid_dataloader = DataLoader(baby_spam_valid, batch_size=8, shuffle=False, collate_fn = bertoken)
baby_spam_test_dataloader =  DataLoader(baby_spam_test, batch_size= 8, shuffle=False, collate_fn= bertoken)

NameError: name 'baby_spam_train' is not defined

In [None]:

optimizer = torch.optim.AdamW(bert.parameters(), lr= 2e-5, eps = 1e-8)
steps = len(baby_spam_train_dataloader)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0 , num_training_steps = steps)

predictions, loss = train(bert, baby_spam_train_dataloader, baby_spam_valid_dataloader, optimizer, scheduler, device,3)

v_loss, stats_test = test(bert, baby_spam_test_dataloader, device)

In [None]:
print ("acc:", stats_test[0])
print ("True pos", stats_test[1])
print ("True neg", stats_test[2])
print ("False pos", stats_test[3])
print ("False neg", stats_test[4])

bert blows gpt2 out of the water with a 92% acc and a 5% false postive rate.

In [None]:
baby_phish_train_dataloader = DataLoader(baby_phish_train, batch_size= 8, shuffle=True, collate_fn = bertoken)
baby_phish_valid_dataloader = DataLoader(baby_phish_valid, batch_size=8, shuffle=False, collate_fn = bertoken)
baby_phish_test_dataloader =  DataLoader(baby_phish_test, batch_size=8, shuffle=False, collate_fn= bertoken)

Now for phishing

In [None]:
optimizer = torch.optim.AdamW(bert.parameters(), lr= 2e-5, eps = 1e-8)
steps = len(baby_phish_train_dataloader)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0 , num_training_steps = steps)

predictions, loss = train(bert, baby_phish_train_dataloader, baby_spam_valid_dataloader, optimizer, scheduler, device,3)

v_loss, stats_test = test(bert, baby_phish_test_dataloader, device)

In [None]:
print ("acc:", stats_test[0])
print ("True pos", stats_test[1])
print ("True neg", stats_test[2])
print ("False pos", stats_test[3])
print ("False neg", stats_test[4])

Another strong preformance by bert, with a 89% acc and a 5% false postive. Conclusion: Bert is by far the better model when compared with a slighter smaller sized GPT-2 model.

Since we have found the best preforming model, lets try a run with the fulldatasets

In [8]:

spam_test = Datasets(testpath='C:/Users/asemo/CSCI 3832/datasets/SpamHam/test.csv' ,Emails=True)
spam_train = Datasets(testpath='C:/Users/asemo/CSCI 3832/datasets/SpamHam/train.csv' ,Emails=True)
spam_valid = Datasets(final_data = spam_train.split(), data_processed=True)


spam_train_dataloader = DataLoader(spam_train, batch_size= 8, shuffle=True, collate_fn = bertoken)
spam_valid_dataloader = DataLoader(spam_valid, batch_size=8, shuffle=False, collate_fn = bertoken)
spam_test_dataloader =  DataLoader(spam_test, batch_size= 8, shuffle=False, collate_fn= bertoken)

In [None]:
optimizer = torch.optim.AdamW(bert.parameters(), lr= 2e-5, eps = 1e-8)
steps = len(spam_train_dataloader)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0 , num_training_steps = steps)

predictions, loss = train(bert, spam_train_dataloader, spam_valid_dataloader, optimizer, scheduler, device,3)

v_loss, stats_test = test(bert, spam_test_dataloader, device)

In [None]:
print ("acc:", stats_test[0])
print ("True pos", stats_test[1])
print ("True neg", stats_test[2])
print ("False pos", stats_test[3])
print ("False neg", stats_test[4])

In [None]:
bert.save_pretrained('../models/')

In [7]:
bertoken = _tokenize(berto)
phish_test = Datasets('C:/Users/asemo/CSCI 3832/datasets/PhishingURLs/test.csv' ,False)
phish_train = Datasets('C:/Users/asemo/CSCI 3832/datasets/PhishingURLs/train.csv' ,False)
phish_valid = Datasets(final_data = phish_train.split(), data_processed=True)

phish_train_dataloader = DataLoader(phish_train, batch_size= 4, shuffle=True, collate_fn = bertoken)
phish_valid_dataloader = DataLoader(phish_valid, batch_size=4, shuffle=False, collate_fn = bertoken)
phish_test_dataloader =  DataLoader(phish_test, batch_size=4, shuffle=False, collate_fn= bertoken)

In [10]:
bertoken = _tokenize(berto)
homebrew_ds = Datasets("C:/Users/asemo/CSCI 3832/CSCI3832_Project/HomebrewDataset.csv", Emails=True)
homebrew_test = DataLoader(homebrew_ds, batch_size = 1, shuffle = False, collate_fn=bertoken)
optimizer = torch.optim.AdamW(bert.parameters(), lr= 2e-5, eps= 1e-8)
steps = len(homebrew_test)
v_loss, stats_test3 = test(bert, homebrew_test, device)
print ("acc:", stats_test3[0])
print ("True pos", stats_test3[1])
print ("True neg", stats_test3[2])
print ("False pos", stats_test3[3])
print ("False neg", stats_test3[4])

Evaluating


100%|██████████| 30/30 [00:00<00:00, 70.51it/s]

acc: 0.9333333333333333
True pos 0.5
True neg 0.43333333333333335
False pos 0.06666666666666667
False neg 0.0





In [8]:
optimizer = torch.optim.AdamW(bert.parameters(), lr= 2e-5, eps = 1e-8)
steps = len(phish_train_dataloader)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0 , num_training_steps = steps)

predictions, loss = train(bert, phish_train_dataloader,  phish_valid_dataloader, optimizer, scheduler, device,1)

v_loss, stats_test = test(bert, phish_test_dataloader, device)

Training  0  Epoch


100%|██████████| 128000/128000 [2:33:06<00:00, 13.93it/s] 


training accuracy for epoch  0 :  0.9719843202818755
training loss for epoch  0 :  0.13220235812806158
Evaluating


100%|██████████| 32000/32000 [08:43<00:00, 61.11it/s]


eval accuracy for epoch  0 :  0.981859375
eval loss for epoch  0 :  0.09004282037388169
Evaluating


100%|██████████| 40000/40000 [10:52<00:00, 61.26it/s]


In [9]:
print ("acc:", stats_test[0])
print ("True pos", stats_test[1])
print ("True neg", stats_test[2])
print ("False pos", stats_test[3])
print ("False neg", stats_test[4])


for i in range (len(stats_test)):
    stats_test[i] = stats_test[i]*len(phish_test_dataloader)
    
precision = stats_test[2]/(stats_test[2]+ stats_test[4])
recall = stats_test[2]/ (stats_test[2]+ stats_test[3])
fone= (2*precision*recall)/(precision+recall)
print (fone)

acc: 0.98195625
True pos 0.4890125
True neg 0.49294375
False pos 0.0109875
False neg 0.00705625


In [12]:
v_loss, stats_test = test(bert, spam_test_dataloader, device)

Evaluating


100%|██████████| 7266/7266 [20:26<00:00,  5.93it/s]


In [None]:
print ("acc:", stats_test[0])
print ("True pos", stats_test[1])
print ("True neg", stats_test[2])
print ("False pos", stats_test[3])
print ("False neg", stats_test[4])



for i in range (len(stats_test)):
    stats_test[i] = stats_test[i]*len(spam_test_dataloader)
    
precision = stats_test[2]/(stats_test[2]+ stats_test[4])
recall = stats_test[2]/ (stats_test[2]+ stats_test[3])
fone= (2*precision*recall)/(precision+recall)
print (fone)

acc: 0.9882664647993944
True pos 0.4675349253320487
True neg 0.5207315394673456
False pos 0.009152845640355104
False neg 0.002580689560250499
0.9888591217982228
