In [1]:
!git clone https://github.com/sosamandara/token_reduction_nlp.git
!pip install transformers datasets pandas

In [60]:
%cd /content/token_reduction_nlp/notebooks

/content/token_reduction_nlp/notebooks


In [61]:
import sys
import os

# Get the current working directory
current_dir = os.getcwd()

# Assuming the notebooks are in the notebooks directory and executed from there
project_root = os.path.abspath(os.path.join(current_dir, '..'))

# Add the models and src directories to the Python path
models_dir = os.path.join(project_root, 'models')
src_dir = os.path.join(project_root, 'src')

sys.path.append(models_dir)
sys.path.append(src_dir)

# Verify that the paths are correctly added
print("Current directory:", current_dir)
print("Project root directory:", project_root)
print("Models directory added to sys.path:", models_dir in sys.path)
print("Src directory added to sys.path:", src_dir in sys.path)

Current directory: /content/token_reduction_nlp/notebooks
Project root directory: /content/token_reduction_nlp
Models directory added to sys.path: True
Src directory added to sys.path: True


In [None]:
!pip install datasets
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from modeling_topK_gpt2 import CustomGPT2LMHeadModel
from collections import defaultdict
import matplotlib.pyplot as plt
from datasets import load_dataset
import torch
from functions import load_custom_model
import numpy as np
from tqdm import tqdm
import pandas as pd

In [22]:
def calculate_reduction(length, mask):
    actual_length = length
    for percentage in mask:
        actual_length = (actual_length - int(actual_length * percentage))
    return actual_length + 1

def evaluate_perplexity_for_query(model, encodings, window_size):
    seq_len = encodings.input_ids.size(1)
    log_probs = []
    device = 'cuda'

    for begin_loc in range(0, seq_len, window_size):
        end_loc = min(begin_loc + window_size, seq_len)
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        if input_ids.size(1) < window_size:
            break

        input_window = input_ids[:, :-1]  # Input window without the target token
        target_token_id = input_ids[0, -1].item()  # The target token to predict
        with torch.no_grad():
            output = model(input_window)
            logits = output.logits[:, -1, :]  # Get logits for the last token in the window
            probs = torch.nn.functional.log_softmax(logits, dim=-1)
            log_prob = probs[0, target_token_id].item()
            log_probs.append(log_prob)

    avg_log_prob = np.mean(log_probs)
    perplexity = np.exp(-avg_log_prob)
    return perplexity

def evaluate_perplexity_for_each_query(model, tokenizer, dataset, window_size):
    perplexities = []
    for text in  tqdm(dataset['text'][:10]):
        encodings = tokenizer(text, return_tensors='pt')
        perplexity = evaluate_perplexity_for_query(model, encodings, window_size)
        perplexities.append(perplexity)
    return perplexities


In [10]:
def calculate_reduction(lenght, mask):
  actual_lenght=lenght
  for percentage in mask:
    actual_lenght = (actual_lenght - int(actual_lenght*percentage))
  return actual_lenght + 1

window_sizes = [50, 100, 200, 300, 500, 1000]
remaining_tokens = []
per_keep = []

mask_30 = [0.0, 0.0, 0.0, 0.0, 0.72, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
mask_50 = [0.0, 0.0, 0.0, 0.0, 0.52, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
mask_70 = [0.0, 0.0, 0.0, 0.0, 0.32, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
mask_90 = [0.0, 0.0, 0.0, 0.0, 0.12, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

masks = [mask_30, mask_50, mask_70, mask_90]
mask_number = ["30","50","70","90"]

for i, mask in enumerate(masks):
  #remaining_tokens = []
  per_keep = []
  for w in window_sizes:
    #remaining_tokens.append(calculate_reduction(w, mask))
    per_keep.append(round(calculate_reduction(w, mask)/w,2))
    #print(remaining_tokens)
  print("-----",mask_number[i],"-----")
  print(per_keep)

----- 30 -----
[0.3, 0.29, 0.28, 0.28, 0.28, 0.28]
----- 50 -----
[0.5, 0.49, 0.48, 0.48, 0.48, 0.48]
----- 70 -----
[0.7, 0.69, 0.69, 0.68, 0.68, 0.68]
----- 90 -----
[0.9, 0.89, 0.89, 0.88, 0.88, 0.88]


In [15]:
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model_gpt = GPT2LMHeadModel.from_pretrained(model_name, config=config).to('cuda')

# Layers to prune
layers_to_prune = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

# Load custom models with different k_percent values
custom_models = {}
for i, mask in enumerate(masks):
    model_id = f"topK_{mask_number[i]}"
    custom_models[model_id] = load_custom_model(model_name, config, mask, selection_method="top_k", layers_to_prune=layers_to_prune)

In [49]:
def evaluate_perplexity_for_query(model, encodings):
    input_ids = encodings.input_ids.to('cuda')
    seq_len = input_ids.size(1)
    log_probs = []
    predictions = []

    for i in range(seq_len - 1):
        with torch.no_grad():
            output = model(input_ids[:, :i + 1])
            logits = output.logits[:, -1, :]  # Get logits for the last token
            probs = torch.nn.functional.log_softmax(logits, dim=-1)
            target_token_id = input_ids[0, i + 1].item()
            log_prob = probs[0, target_token_id].item()
            log_probs.append(log_prob)

            # Get the predicted token
            predicted_token_id = torch.argmax(probs, dim=-1).item()
            predictions.append(predicted_token_id)

    avg_log_prob = np.mean(log_probs)
    perplexity = np.exp(-avg_log_prob)
    predicted_text = tokenizer.decode(predictions)
    return perplexity, predicted_text

def evaluate_perplexity_for_each_query(model, tokenizer, dataset):
    results = []
    for text in tqdm(dataset['text'][:100]):  # Process only the first 10 entries for now
        encodings = tokenizer(text, return_tensors='pt')
        perplexity, predicted_text = evaluate_perplexity_for_query(model, encodings)
        results.append({
            'query': text,
            'perplexity': perplexity,
            'prediction': predicted_text
        })
    return results


In [18]:
imdb_dataset = load_dataset('imdb', split='test')

In [50]:
results = {}
models = [model_gpt] + list(custom_models.values())
model_names = ['gpt2'] + [f'custom_{mask_number[i]}%' for i in range(len(custom_models))]

# Evaluate and save perplexities for the first 10 queries
all_results = []
for model, model_name in zip(models, model_names):
    print(f"Evaluating perplexity for each query using {model_name}")
    model_results = evaluate_perplexity_for_each_query(model, tokenizer, imdb_dataset)
    for result in model_results:
        result['model'] = model_name
    all_results.extend(model_results)

# Convert the results to a DataFrame and display
df_results = pd.DataFrame(all_results)
print(df_results)


# Optionally save to CSV
output_path = 'perplexity_results_imdb_first_10.csv'
df_results.to_csv(output_path, index=False)

print(f"Saved perplexity results to {output_path}")

# Display the first few results for inspection
df_results.head()

Evaluating perplexity for each query using gpt2


100%|██████████| 100/100 [11:27<00:00,  6.88s/it]


Evaluating perplexity for each query using custom_30%


100%|██████████| 100/100 [10:37<00:00,  6.38s/it]


Evaluating perplexity for each query using custom_50%


100%|██████████| 100/100 [11:32<00:00,  6.92s/it]


Evaluating perplexity for each query using custom_70%


100%|██████████| 100/100 [12:46<00:00,  7.66s/it]


Evaluating perplexity for each query using custom_90%


100%|██████████| 100/100 [14:02<00:00,  8.42s/it]

                                                 query  ...       model
0    I love sci-fi and am willing to put up with a ...  ...        gpt2
1    Worth the entertainment value of a rental, esp...  ...        gpt2
2    its a totally average film with a few semi-alr...  ...        gpt2
3    STAR RATING: ***** Saturday Night **** Friday ...  ...        gpt2
4    First off let me say, If you haven't enjoyed a...  ...        gpt2
..                                                 ...  ...         ...
495  This film seems well made, and more efforts sh...  ...  custom_90%
496  It hurt to watch this movie, it really did... ...  ...  custom_90%
497  Rita Hayworth is just stunning at times and, f...  ...  custom_90%
498  Like 'Singin' in the Rain', 'Cover Girl' has a...  ...  custom_90%
499  Rita Hayworth plays a Brooklyn nightclub dance...  ...  custom_90%

[500 rows x 4 columns]
Saved perplexity results to perplexity_results_imdb_first_10.csv





Unnamed: 0,query,perplexity,prediction,model
0,I love sci-fi and am willing to put up with a ...,44.072236,". the-fi, fantasy a to give my with it lot of ...",gpt2
1,"Worth the entertainment value of a rental, esp...",45.943925,. a trouble value of the good car the if you'r...,gpt2
2,its a totally average film with a few semi-alr...,55.653095,", lot different,. a lot exceptions-interesting...",gpt2
3,STAR RATING: ***** Saturday Night **** Friday ...,48.609417,"\nANK: 4*\n, Live* Night ****\n Night *** Satu...",gpt2
4,"First off let me say, If you haven't enjoyed a...",28.90719,",,'s say that I you are't already the goodishi...",gpt2


In [59]:
df_results[df_results["model"]=="gpt2%"]

Unnamed: 0,query,perplexity,prediction,model


In [88]:
df_results = pd.read_csv("perplexity_results_imdb_first_100.csv")

In [76]:
df_results

Unnamed: 0,query,perplexity,prediction,model
0,I love sci-fi and am willing to put up with a ...,44.072236,". the-fi, fantasy a to give my with it lot of ...",gpt2
1,"Worth the entertainment value of a rental, esp...",45.943925,. a trouble value of the good car the if you'r...,gpt2
2,its a totally average film with a few semi-alr...,55.653095,", lot different,. a lot exceptions-interesting...",gpt2
3,STAR RATING: ***** Saturday Night **** Friday ...,48.609417,"\nANK: 4*\n, Live* Night ****\n Night *** Satu...",gpt2
4,"First off let me say, If you haven't enjoyed a...",28.907190,",,'s say that I you are't already the goodishi...",gpt2
...,...,...,...,...
495,"This film seems well made, and more efforts sh...",43.413178,"is is to-, but I than were be made to make it...",custom_90%
496,"It hurt to watch this movie, it really did... ...",56.897496,"is to be the.. but hurt hurt.\n'm to see it, ...",custom_90%
497,"Rita Hayworth is just stunning at times and, f...",38.710037,".,ashi, a one. her. she as the, is best thing ...",custom_90%
498,"Like 'Singin' in the Rain', 'Cover Girl' has a...",26.915310,theTheapore' in the Rain' theThe Up' and been...,custom_90%


In [82]:
# Ensure 'index' column is present
df_results.reset_index(drop=True, inplace=True)

# Create a new DataFrame to hold the results with perplexity ratios
df_ratios = df_results.copy()

# Calculate the perplexity ratio for custom models compared to GPT-2
gpt2_perplexities = df_results.loc[0:99, 'perplexity'].values
mask_number = ["30", "50", "70", "90"]
for i, mask in enumerate(mask_number):
    start = (i + 1) * 100
    end = start + 100
    custom_model_name = f'custom_{mask}%'
    df_ratios.loc[start:end-1, 'perplexity_ratio'] = df_results.loc[start:end-1, 'perplexity'].values / gpt2_perplexities

# Save the updated DataFrame to a new CSV file
output_path = 'perplexity_results_imdb_first_100_with_ratio.csv'
df_ratios.to_csv(output_path, index=False)

print(f"Saved perplexity results to {output_path}")

# Display the first few results for inspection
df_ratios.head(20)

Saved perplexity results to perplexity_results_imdb_first_100_with_ratio.csv


Unnamed: 0,query,perplexity,prediction,model,perplexity_ratio
0,I love sci-fi and am willing to put up with a ...,44.072236,". the-fi, fantasy a to give my with it lot of ...",gpt2,
1,"Worth the entertainment value of a rental, esp...",45.943925,. a trouble value of the good car the if you'r...,gpt2,
2,its a totally average film with a few semi-alr...,55.653095,", lot different,. a lot exceptions-interesting...",gpt2,
3,STAR RATING: ***** Saturday Night **** Friday ...,48.609417,"\nANK: 4*\n, Live* Night ****\n Night *** Satu...",gpt2,
4,"First off let me say, If you haven't enjoyed a...",28.90719,",,'s say that I you are't already the goodishi...",gpt2,
5,I had high hopes for this one until they chang...,55.592222,. to hopes for the game. I started the name to...,gpt2,
6,Isaac Florentine has made some of the best wes...,31.637921,".,ntino, been a interesting the most signings ...",gpt2,
7,"It actually pains me to say it, but this movie...",28.464548,"is is me to say this, but I is is a. so level...",gpt2,
8,"Technically I'am a Van Damme Fan, or I was. th...",85.897566,".,'mM not little Morrisonme,, but a'm a I is i...",gpt2,
9,"Honestly awful film, bad editing, awful lighti...",27.254494,",.. but acting, and sound, andly, badppy dialo...",gpt2,


In [108]:
df_ratios.to_csv("perplexity_results_imdb_first_100_with_ratio.csv", index=False)

In [111]:
# Ensure the DataFrame uses its default integer index
df_ratios.reset_index(drop=True, inplace=True)

# Function to find the row with the highest and lowest ratio for a given model
def find_extremes(df, model_name):
    df_model = df[df['model'] == model_name]
    max_row = df_model.loc[df_model['perplexity_ratio'].idxmax()]
    min_row = df_model.loc[df_model['perplexity_ratio'].idxmin()]
    return max_row, min_row

# Initialize a list to store the results
extreme_rows = []

# Iterate through each custom model to find the extremes
mask_number = ["30", "50", "70", "90"]
for mask in mask_number:
    custom_model_name = f'custom_{mask}%'
    max_row, min_row = find_extremes(df_ratios, custom_model_name)
    extreme_rows.append(('max', custom_model_name, max_row))
    extreme_rows.append(('min', custom_model_name, min_row))

# Create a DataFrame to display the extreme rows
df_extremes = pd.DataFrame([{
    'extreme': extreme,
    'model': model,
    'query': row['query'],
    'perplexity': row['perplexity'],
    'prediction': row['prediction'],
    'perplexity_ratio': row['perplexity_ratio']
} for extreme, model, row in extreme_rows])

print(df_extremes)

# Optionally save to CSV
output_extremes_path = 'perplexity_extremes_imdb.csv'
df_extremes.to_csv(output_extremes_path, index=False)

print(f"Saved extremes results to {output_extremes_path}")

# Display the results
df_extremes.head(20)

  extreme       model  ...                                         prediction  perplexity_ratio
0     max  custom_30%  ...  ...,, aic to to a Heaves,,ir,, is't have sense...          4.655061
1     min  custom_30%  ...  ...'m'm think a comments about the.\n I I is a...          1.144150
2     max  custom_50%  ...  ..gets, a a his his consultantman Heaves,-ir,,...          3.284816
3     min  custom_50%  ...  ,,'m't believe how much could be a a a photo a...          1.070361
4     max  custom_70%  ...   is makes worse worse out it it's Beckhamselse...          1.981920
5     min  custom_70%  ...  , I'm't believe how much could be a a person p...          1.013425
6     max  custom_90%  ...  \n and film, in the 1970glglades\n of, togethe...          1.568961
7     min  custom_90%  ...   is is a of the favorite characters pleasures:...          0.978123

[8 rows x 6 columns]
Saved extremes results to perplexity_extremes_imdb.csv


Unnamed: 0,extreme,model,query,perplexity,prediction,perplexity_ratio
0,max,custom_30%,Widow hires a psychopath as a handyman. Sloppy...,489.718137,"...,, aic to to a Heaves,,ir,, is't have sense...",4.655061
1,min,custom_30%,generally speaking I don't make negative comme...,45.786088,...'m'm think a comments about the.\n I I is a...,1.14415
2,max,custom_50%,Widow hires a psychopath as a handyman. Sloppy...,345.566654,"..gets, a a his his consultantman Heaves,-ir,,...",3.284816
3,min,custom_50%,"Seriously, I can't imagine how anyone could fi...",38.168736,",,'m't believe how much could be a a a photo a...",1.070361
4,max,custom_70%,This only gets bashed because it stars David H...,101.755589,is makes worse worse out it it's Beckhamselse...,1.98192
5,min,custom_70%,"Seriously, I can't imagine how anyone could fi...",36.138432,", I'm't believe how much could be a a person p...",1.013425
6,max,custom_90%,An obscure horror show filmed in the Everglade...,82.949497,"\n and film, in the 1970glglades\n of, togethe...",1.568961
7,min,custom_90%,This film features two of my favorite guilty p...,76.759398,is is a of the favorite characters pleasures:...,0.978123


In [119]:
df_extremes.iloc[0].query

"Widow hires a psychopath as a handyman. Sloppy film noir thriller which doesn't make much of its tension promising set-up. (3/10)"

In [118]:
df_extremes.iloc[2].prediction

"..gets, a a his his consultantman Heaves,-ir,, is't have sense sense a own., piecespieces,\nI)10)"