In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import torch
import numpy as np
import pandas as pd
from scipy import special, stats
from datasets import load_dataset, load_from_disk, Dataset

# Load pre-trained model tokenizers (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
# Reuse the end-of-sequence token for padding; the batched evaluation cell pads
# every example to a fixed length and therefore needs a pad token.
tokenizer.pad_token = tokenizer.eos_token
sentiment_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Load models (weights) and put each of them in evaluation mode
pretrained_model = GPT2LMHeadModel.from_pretrained('gpt2-large', use_cache=False)
pretrained_model.eval()
finetuned_model = GPT2LMHeadModel.from_pretrained('./finetuned_model', use_cache=False)
finetuned_model.eval()
sentiment_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
sentiment_model.eval()

# Load and augment dataframe
train_df = pd.read_csv('./train_data.csv')

# Each result cell later receives a list or array, which only an object-dtype
# column can hold. Every column is initialised from its OWN zeros (the previous
# code derived 'ft_ces' from 'pre_ces' by a copy-paste slip).
for column in ('pre_ces', 'ft_ces', 'sentiment_scores'):
    train_df[column] = np.zeros(len(train_df.index)).astype('object')

In [None]:
# Define methods for extracting cross-entropy and sentiment from the models

def get_tokenized(prompt):
    """Return the text piece of every token in ``prompt``.

    The prompt is encoded with the module-level ``tokenizer`` and each
    resulting id is decoded on its own, so the returned list has one string
    per token, in order.
    """
    return [tokenizer.decode(token_id) for token_id in tokenizer.encode(prompt)]

def get_token_nums(tokens):
    """Encode every string in ``tokens`` separately with the module-level ``tokenizer``.

    The result has one entry per input string, and each entry is whatever
    ``tokenizer.encode`` returns for that string, so this is a list of
    encodings rather than a flat list of ids.

    NOTE(review): when the strings come from ``get_tokenized``, decoding and
    re-encoding is presumably not guaranteed to give back the original id for
    every token - confirm before relying on a one-id-per-entry shape.
    """
    return list(map(tokenizer.encode, tokens))

def get_probs(tokens, model):
    """Return the model's next-token probability distribution at every position.

    Args:
        tokens: Token ids of ONE sequence, either flat (``[id, id, ...]``) or
            nested with one single-element list per token (``[[id], [id], ...]``),
            which is the form the processing cell passes in.
        model: Callable that takes a tensor of ids and returns a sequence whose
            first item holds the logits (batch axis first).

    Returns:
        A list with one numpy array per position. For flat input each array
        has shape ``(vocab,)``; for nested input each array has shape
        ``(1, vocab)`` so existing consumers that read ``probs[i][0]`` keep
        working.
    """
    # convert to tensor variable
    tokens_tensor = torch.tensor([tokens])

    # Nested ids produce a (1, n, 1) tensor. Hugging Face documents input_ids
    # as (batch_size, sequence_length); with the extra trailing axis GPT-2
    # appears to fold the tokens into the batch and score n independent
    # one-token sequences, i.e. every prediction is made WITHOUT its left
    # context. Drop the singleton axis so the model sees one n-token sequence.
    # NOTE(review): confirm against the installed transformers version.
    nested = tokens_tensor.dim() == 3 and tokens_tensor.shape[-1] == 1
    if nested:
        tokens_tensor = tokens_tensor.squeeze(-1)

    # get predictions
    with torch.no_grad():
        outputs = model(tokens_tensor)
        # first output = logits, first batch element = our only sequence
        predictions = outputs[0][0]
        probs = torch.softmax(predictions, -1)

    if nested:
        # restore the per-position singleton axis callers index with [0]
        probs = probs.unsqueeze(1)

    # compile probability distribution outputs
    return [position.detach().numpy() for position in probs]

def cross_entropy(p, q):
    """Cross-entropy of distribution ``q`` relative to ``p``: ``-sum(p * log(q))``.

    ``special.xlogy`` is used for the product so that terms with ``p == 0``
    contribute 0 even where ``q`` is 0.
    """
    weighted_log = special.xlogy(p, q)
    return -weighted_log.sum()

def get_cross_entropy(tokens, probs):
    """Per-token cross-entropy of the actual next token under the predicted distributions.

    For position ``i`` the predicted distribution ``q`` is ``probs[i]`` and the
    reference distribution ``p`` is the one-hot vector of the actual next
    token ``tokens[i + 1]``. With a one-hot ``p`` the cross-entropy
    ``-sum(p * log(q))`` reduces to ``-log(q[next_token])``, so no
    vocabulary-sized one-hot vector has to be built.

    Args:
        tokens: Token ids, one entry per position. An entry may be a plain int
            or a list of ids (e.g. ``[id]``).
        probs: One distribution per position. Each may be a flat
            ``(vocab,)`` row or a ``(1, vocab)`` row (only row 0 is used), as
            lists or numpy arrays.

    Returns:
        A list of ``len(probs) - 1`` values (the last position has no next
        token to score); empty when ``probs`` has fewer than two entries.
    """
    ces = []
    for i in range(len(probs) - 1):
        q = np.asarray(probs[i], dtype=np.float64)
        if q.ndim > 1:
            # (1, vocab)-style row: the distribution is the first (only) row
            q = q[0]
        # a zero probability legitimately yields an infinite cross-entropy
        with np.errstate(divide='ignore'):
            ces.append(-np.sum(np.log(q[tokens[i + 1]])))
    return ces

def get_sentiment_scores(text):
    """Return the sentiment class probabilities of ``text``.

    The text is encoded with ``sentiment_tokenizer`` (at most 512 ids), run
    through ``sentiment_model`` as a batch of one, and the logits of that
    single example are turned into probabilities with a softmax.

    Returns:
        A numpy array with one probability per class of ``sentiment_model``.
    """
    # Let the tokenizer truncate: slicing the encoded ids afterwards would cut
    # off the closing special token of inputs longer than the limit.
    tokens = sentiment_tokenizer.encode(text, truncation=True, max_length=512)
    tokens_tensor = torch.tensor([tokens])

    # inference only - no autograd graph needed
    with torch.no_grad():
        logits = sentiment_model(tokens_tensor)[0][0]

    return special.softmax(logits.numpy())

def label(scores):
    """Name the sentiment class with the highest score.

    Index 0 maps to 'Negative', index 1 to 'Neutral', and any other index to
    'Positive'.
    """
    leading_names = ('Negative', 'Neutral')
    best = np.argmax(scores)
    return leading_names[best] if best < len(leading_names) else 'Positive'

In [None]:
# Process each training example

# Set to True to resume from the checkpoint in ./results.csv, False to start
# over with the dataframe loaded earlier in the notebook.
cont = True

if cont:
    train_df = pd.read_csv('./results.csv')
    # Resume at the first row that has no label yet. When every row is already
    # labelled there is nothing left to do, so start past the end instead of
    # silently reprocessing the whole file from row 0.
    start = next((idx for idx in range(len(train_df.index))
                  if pd.isna(train_df.loc[idx, 'sentiment_labels'])),
                 len(train_df.index))
    print('Starting from:', start)
else:
    start = 0

# Result cells receive lists / arrays / strings. Columns reloaded from CSV can
# come back with a numeric dtype, which cannot hold such values.
for column in ('pre_ces', 'ft_ces', 'sentiment_scores', 'sentiment_labels'):
    if column in train_df.columns:
        train_df[column] = train_df[column].astype('object')

print('Starting processing...')
for i in range(start, len(train_df.index)):

    print(i, '/', len(train_df.index))

    prompt = train_df.loc[i, 'text']

    # Take the ids straight from the tokenizer: decoding each token to text and
    # re-encoding it is not guaranteed to give back the same single id.
    token_ids = tokenizer.encode(prompt)[:1024]
    # One single-element list per token - the layout get_probs and
    # get_cross_entropy are called with in this notebook (probs[i][0]).
    tokens = [[token_id] for token_id in token_ids]

    pretrained_probs = get_probs(tokens, pretrained_model)
    finetuned_probs = get_probs(tokens, finetuned_model)

    train_df.at[i, 'pre_ces'] = get_cross_entropy(tokens, pretrained_probs)
    train_df.at[i, 'ft_ces'] = get_cross_entropy(tokens, finetuned_probs)

    scores = get_sentiment_scores(prompt)
    train_df.at[i, 'sentiment_scores'] = scores
    train_df.at[i, 'sentiment_labels'] = label(scores)

    # Checkpoint after every example so an interrupted run can be resumed.
    train_df.to_csv('./results.csv', index=False)

print('Finished processing!')

In [None]:
# Evaluate training examples in batches

batch_size = 8
max_length = 1024

# Result cells receive lists / arrays / strings, which need object-dtype columns.
for column in ('pre_ces', 'ft_ces', 'sentiment_scores', 'sentiment_labels'):
    if column in train_df.columns:
        train_df[column] = train_df[column].astype('object')

# Define data collator and training arguments.
# These do not depend on the batch, so they are built once instead of once per
# loop iteration.
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer,
                                                mlm = False)

training_args = TrainingArguments(output_dir='.nothing',
                                  optim='adafactor',
                                  per_device_eval_batch_size=1,
                                  num_train_epochs=1,
                                  gradient_checkpointing=True)

print('Training parameters set!')

pretrained_trainer = Trainer(model=pretrained_model,
                             args=training_args,
                             data_collator=data_collator)

finetuned_trainer = Trainer(model=finetuned_model,
                            args=training_args,
                            data_collator=data_collator)


def _prediction_logits(prediction_output):
    """Return the per-example logits from the result of ``Trainer.predict``.

    The first item of the result holds the predictions. The previous code
    indexed it as ``[0][j]`` for one model and ``[0][0][j]`` for the other;
    handling both layouts here removes that inconsistency.
    NOTE(review): presumably the predictions are a tuple only when the model
    returns several outputs, with the logits first - confirm.
    """
    predictions = prediction_output[0]
    return predictions[0] if isinstance(predictions, tuple) else predictions


for i in range(0, len(train_df.index), batch_size):

    batch_df = pd.DataFrame(train_df[i:i+batch_size]['text']).reset_index()

    # Tokenize dataset
    print('Tokenizing dataset...')
    dataset = Dataset.from_pandas(batch_df)
    tokenized_dataset = dataset.map(lambda examples: tokenizer(examples['text'], truncation=True, max_length=max_length, padding='max_length'), batched=True)

    print('Dataset successfully loaded!')

    # Evaluate examples
    pre_logits = _prediction_logits(pretrained_trainer.predict(tokenized_dataset))
    ft_logits = _prediction_logits(finetuned_trainer.predict(tokenized_dataset))

    # The final batch can hold fewer than batch_size rows.
    for j in range(len(batch_df.index)):

        idx = i + j

        text = batch_df.loc[j, 'text']
        # Ids straight from the tokenizer, cut to the same length the padded
        # dataset was truncated to.
        token_ids = tokenizer.encode(text)[:max_length]
        num_tokens = len(token_ids)

        # Keep only the positions of real (non-padding) tokens, then add a
        # singleton axis so each position has the (1, vocab) layout that
        # get_cross_entropy reads via probs[i][0].
        pre_output = torch.softmax(torch.tensor(pre_logits[j][0:num_tokens]), -1).unsqueeze(1).numpy()
        ft_output = torch.softmax(torch.tensor(ft_logits[j][0:num_tokens]), -1).unsqueeze(1).numpy()

        train_df.at[idx, 'pre_ces'] = get_cross_entropy(token_ids, pre_output)
        train_df.at[idx, 'ft_ces'] = get_cross_entropy(token_ids, ft_output)

        scores = get_sentiment_scores(text)
        train_df.at[idx, 'sentiment_scores'] = scores
        # Same column name as the per-example cell ('sentiment_labels'), so the
        # resume logic that looks for unlabelled rows sees these results.
        train_df.at[idx, 'sentiment_labels'] = label(scores)

        train_df.to_csv('./results.csv', index=False)
        print('Done:', idx)