In [1]:
# prompt: Add Drive

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import random
import numpy as np
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Loading Tokenizer
def load_tokenizer(args):
    tokenizer = GPT2Tokenizer.from_pretrained(args['model_name'])
    special_tokens = ['<question>', '<answer>']
    tokenizer.add_special_tokens({
        'bos_token': '<bos>',
        'additional_special_tokens': special_tokens
    })

    # add new token ids to args
    special_tokens += ['<bos>', '<eos>']
    question, answer, bos_id, eos_id = tokenizer.encode(special_tokens)
    args['question'] = question
    args['answer'] = answer
    args['bos_id'] = bos_id
    args['eos_id'] = eos_id

    return tokenizer

# Loading Model
def load_model(args, tokenizer):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args['device'] = device
    print("--"*50)
    print(f'Using device: {device}')
    print("--"*50)
    model = GPT2LMHeadModel.from_pretrained(args["model_name"]).to(device)
    model.resize_token_embeddings(len(tokenizer))
    return model

seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)


Mounted at /content/drive


In [2]:
import pandas as pd
# Load the TSV dataset
file_path = '/content/drive/MyDrive/GPT2 Model/dialogues_eda.tsv'
df = pd.read_csv(file_path, sep='\t')
# Display the first few rows of the dataframe
df.head()

Unnamed: 0,question,answer
0,"Hi, How are you doing?",I am fine. How about yourself?
1,I am fine. How about yourself?,I am pretty good. Thanks for asking.
2,I am pretty good. Thanks for asking.,No problem. So how have you been?
3,No problem. So how have you been?,I have been great. What about you?
4,I have been great. What about you?,I have been good. I am in school right now.


In [3]:
args = {'model_name': 'gpt2'}
tokenizer = load_tokenizer(args)
# tokenizer.clean_up_tokenization_spaces = False
def encode_data(sentence):
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.encode(tokens)
    return token_ids

df_ids = df.map(encode_data)
df_ids.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Unnamed: 0,question,answer
0,"[17250, 11, 1374, 389, 345, 1804, 30]","[40, 716, 3734, 13, 1374, 546, 3511, 30]"
1,"[40, 716, 3734, 13, 1374, 546, 3511, 30]","[40, 716, 2495, 922, 13, 6930, 329, 4737, 13]"
2,"[40, 716, 2495, 922, 13, 6930, 329, 4737, 13]","[2949, 1917, 13, 1406, 703, 423, 345, 587, 30]"
3,"[2949, 1917, 13, 1406, 703, 423, 345, 587, 30]","[40, 423, 587, 1049, 13, 1867, 546, 345, 30]"
4,"[40, 423, 587, 1049, 13, 1867, 546, 345, 30]","[40, 423, 587, 922, 13, 314, 716, 287, 1524, 8..."


In [4]:
from sklearn.model_selection import train_test_split

# Assuming df_ids is your DataFrame or list
data_list = df_ids.to_dict(orient='records')

# Split data into train and validation sets
train_data, valid_data = train_test_split(data_list, test_size=0.2, random_state=42)

In [5]:
len(data_list), len(train_data), len(valid_data)

(3725, 2980, 745)

In [6]:
for data in train_data:
  input_ids =[args["bos_id"],args["question"]]+data["question"] +[args['answer']]+ data["answer"] +[args["eos_id"]]
  token_type_ids = [0] * (len(data["question"]) + 3) + [1] * (len(data["answer"]) + 1)
  labels = [-100] * (len(data["question"]) + 3) + data["answer"] +[args["eos_id"]]
  print(input_ids)
  print(token_type_ids)
  print(labels)
  break

[50257, 50258, 8128, 340, 468, 1049, 7799, 13, 50259, 2061, 2073, 30, 50256]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, 2061, 2073, 30, 50256]


In [None]:
from torch.utils.data import Dataset
class DialogueDataset(Dataset): # create your customdataset
    def __init__(self,dataset):
        self.input_ids = []
        self.token_type_ids = []
        self.labels = []
        self._prepare_data(dataset)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.token_type_ids[idx], self.labels[idx]

    def _prepare_data(self,dataset):
        print("Loading train Data...")
        for data in dataset:
            input_ids =[args["bos_id"],args["question"]]+data["question"] +[args['answer']]+ data["answer"] +[args["eos_id"]]
            token_type_ids = [0] * (len(data["question"]) + 3) + [1] * (len(data["answer"]) + 1)
            labels = [-100] * (len(data["question"]) + 3) + data["answer"] +[args["eos_id"]]
            self.input_ids.append(input_ids)
            self.token_type_ids.append(token_type_ids)
            self.labels.append(labels)


In [8]:
from torch.nn.utils.rnn import pad_sequence
class CollateFn: # create your custom collate_fn for padding
    def __init__(self):
        self.n = 0
    def __call__(self,batch):
        eos_id = args['eos_id']
        input_ids, token_type_ids, labels = [], [], []

        for idx, seqs in enumerate(batch):
            input_ids.append(torch.LongTensor(seqs[0]))
            token_type_ids.append(torch.LongTensor(seqs[1]))
            labels.append(torch.LongTensor(seqs[2]))

        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=eos_id)
        token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=eos_id)
        labels = pad_sequence(labels, batch_first=True, padding_value=-100)
        return input_ids, token_type_ids, labels

In [9]:
from torch.utils.data import DataLoader
train_dataset = DialogueDataset(train_data)
valid_dataset = DialogueDataset(valid_data)

pad = CollateFn()

train_loader = DataLoader(train_dataset,shuffle=True,batch_size=8, collate_fn=pad, num_workers=0, pin_memory=True)
validation_loader = DataLoader(valid_dataset,shuffle=False,batch_size=8, collate_fn=pad, num_workers=0, pin_memory=True)

Loading train Data...
Loading train Data...


In [10]:
model = load_model(args, tokenizer)

----------------------------------------------------------------------------------------------------
Using device: cuda
----------------------------------------------------------------------------------------------------


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [11]:
from torch.optim import AdamW
from transformers import get_polynomial_decay_schedule_with_warmup
optimizer = AdamW(model.parameters(), lr=5e-5)

args['num_epochs'] = 5
args['warmup_ratio'] = 0.1

num_batches = len(train_loader)
total_train_steps = args['num_epochs'] * num_batches
warmup_steps = int(args['warmup_ratio'] * total_train_steps)

scheduler = get_polynomial_decay_schedule_with_warmup(optimizer,
                                                      num_warmup_steps=warmup_steps,
                                                      num_training_steps=total_train_steps,
                                                      power=2)

In [12]:
from tqdm.auto import tqdm
import math
import numpy as np
import matplotlib.pyplot as plt
import time

train_loss_history = []
train_ppx_history = []
valid_loss_history = []
valid_ppx_history = []

start_time = time.time()

best_loss = float('inf')

for epoch in range(args['num_epochs']):
    # Training
    model.train()
    train_losses = []
    train_perplexity = []
    for i, batch in enumerate(tqdm(train_loader)):
        input_ids, token_type_ids, labels = batch
        input_ids = input_ids.to(args['device'])
        token_type_ids = token_type_ids.to(args['device'])
        labels = labels.to(args['device'])

        outputs = model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            labels=labels
        )
        loss, logits = outputs[0], outputs[1]
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        train_losses.append(loss.detach())
        ppx = torch.exp(loss.detach())
        train_perplexity.append(ppx)

    train_losses = [loss.item() for loss in train_losses]
    train_perplexity = [ppx.item() if not math.isinf(ppx.item()) else 1e+8 for ppx in train_perplexity]
    train_loss = np.mean(train_losses)
    train_ppx = np.mean(train_perplexity)

    train_loss_history.append(train_loss)
    train_ppx_history.append(train_ppx)
    # print(f'Train loss: {train_loss} \nTrain perplexity: {train_ppx}')

    # Validation
    print('Launch validation...')
    model.eval()

    valid_losses = []
    valid_ppxs = []
    with torch.no_grad():
        for i, batch in enumerate(tqdm(validation_loader)):
            input_ids, token_type_ids, labels = batch
            input_ids = input_ids.to(args['device'])
            token_type_ids = token_type_ids.to(args['device'])
            labels = labels.to(args['device'])

            outputs = model(
                    input_ids=input_ids,
                    token_type_ids=token_type_ids,
                    labels=labels
            )

            loss, logits = outputs[0], outputs[1]

            valid_losses.append(loss.detach())
            ppx = torch.exp(loss.detach())
            valid_ppxs.append(ppx)

        valid_losses = [loss.item() for loss in valid_losses]
        valid_ppxs = [ppx.item() if not math.isinf(ppx.item()) else 1e+8 for ppx in valid_ppxs]
        valid_loss = np.mean(valid_losses)
        valid_ppx = np.mean(valid_ppxs)

        valid_loss_history.append(valid_loss)
        valid_ppx_history.append(valid_ppx)

        if math.isnan(valid_ppx):
            valid_ppx = 1e+8
        print(f'Train loss: {train_loss} \nTrain perplexity: {train_ppx}')
        print(f'Validation loss: {valid_loss} \nValidation perplexity: {valid_ppx}')

        if valid_loss < best_loss:
            best_loss = valid_loss
            print(f'Best loss: {best_loss}')
        else:
          print("Overfitting")
          break

end_time = time.time()
print(f'Total time: {end_time - start_time}')
# model.save_pretrained("/content/drive/MyDrive/GPT2 Model/pretrained_gpt2")
# tokenizer.save_pretrained("/content/drive/MyDrive/GPT2 Model/pretrained_gpt2")


  0%|          | 0/373 [00:00<?, ?it/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Launch validation...


  0%|          | 0/94 [00:00<?, ?it/s]

Train loss: 3.1423765762881364 
Train perplexity: 81.63992523060408
Validation loss: 2.4699033105627017 
Validation perplexity: 12.192950451627691
Best loss: 2.4699033105627017


  0%|          | 0/373 [00:00<?, ?it/s]

Launch validation...


  0%|          | 0/94 [00:00<?, ?it/s]

Train loss: 2.308006460800887 
Train perplexity: 10.389591022726679
Validation loss: 2.436806962845173 
Validation perplexity: 11.811529534928342
Best loss: 2.436806962845173


  0%|          | 0/373 [00:00<?, ?it/s]

Launch validation...


  0%|          | 0/94 [00:00<?, ?it/s]

Train loss: 2.030224570000779 
Train perplexity: 7.833478191263235
Validation loss: 2.458754298534799 
Validation perplexity: 12.117310356586538
Overfitting
Total time: 121.83203125


('/content/drive/MyDrive/GPT2 Model/pretrained_gpt2/tokenizer_config.json',
 '/content/drive/MyDrive/GPT2 Model/pretrained_gpt2/special_tokens_map.json',
 '/content/drive/MyDrive/GPT2 Model/pretrained_gpt2/vocab.json',
 '/content/drive/MyDrive/GPT2 Model/pretrained_gpt2/merges.txt',
 '/content/drive/MyDrive/GPT2 Model/pretrained_gpt2/added_tokens.json')

In [13]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd

# Sample data (Replace these with actual values)
# train_loss_history = [2.3, 2.1, 1.8, 1.5, 1.2]
# train_ppx_history = [10, 9, 7, 5, 4]
# valid_loss_history = [2.5, 2.3, 2.0, 1.7, 1.4]
# valid_ppx_history = [11, 10, 8, 6, 5]
epochs = list(range(1, len(train_loss_history) + 1))

# Create Subplots with 1 Row, 2 Columns
fig = make_subplots(rows=1, cols=2, subplot_titles=("Losses", "Perplexity"))

# Loss Graph (Left Column)
fig.add_trace(go.Scatter(x=epochs, y=train_loss_history, mode='lines+markers', name="Train Loss"), row=1, col=1)
fig.add_trace(go.Scatter(x=epochs, y=valid_loss_history, mode='lines+markers', name="Valid Loss"), row=1, col=1)

# Perplexity Graph (Right Column)
fig.add_trace(go.Scatter(x=epochs, y=train_ppx_history, mode='lines+markers', name="Train Perplexity"), row=1, col=2)
fig.add_trace(go.Scatter(x=epochs, y=valid_ppx_history, mode='lines+markers', name="Valid Perplexity"), row=1, col=2)

# Layout settings
fig.update_layout(title_text="Training & Validation Metrics", showlegend=True)
fig.update_xaxes(title_text="Epoch", row=1, col=1)
fig.update_xaxes(title_text="Epoch", row=1, col=2)
fig.update_yaxes(title_text="Loss", row=1, col=1)
fig.update_yaxes(title_text="Perplexity", row=1, col=2)

# Show the plot
fig.show()


## Inference Time

In [None]:
args = {'model_name': '/content/drive/MyDrive/GPT2 Model/pretrained_gpt2'}
tokenizer = load_tokenizer(args)
# args["model_name"]="/content/drive/MyDrive/GPT2 Model/pretrained_gpt2"
model = load_model(args, tokenizer)
args

----------------------------------------------------------------------------------------------------
Using device: cuda
----------------------------------------------------------------------------------------------------


{'model_name': '/content/drive/MyDrive/GPT2 Model/pretrained_gpt2',
 'question': 50258,
 'answer': 50259,
 'bos_id': 50257,
 'eos_id': 50256,
 'device': device(type='cuda')}

In [14]:
import torch.nn.functional as F
def top_k_filter(logits, top_k=0., threshold=-float('Inf'), filter_value=-float('Inf')):
    assert logits.dim() == 1
    top_k = min(top_k, logits.size(-1))

    if top_k > 0:
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    indices_to_remove = logits < threshold
    logits[indices_to_remove] = filter_value

    return logits

In [15]:
args["top_k"] = 10
args["top_p"] = 0.6
args["temp"] = 1
def AskQuestion(sentence):
    input_ids = [args['question']] + tokenizer.encode(sentence)
    input_ids = [args['bos_id']] + input_ids + [args['answer']]
    input_ids = torch.LongTensor(input_ids).unsqueeze(0).to(args['device'])
    token_type_ids = torch.zeros_like(input_ids).to(args['device'])

    responce = ""
    print("Bot:",end="")
    for pos in range(100):
        output = model(input_ids=input_ids, token_type_ids=token_type_ids)[0]
        logits = output[0, -1, :] / args["temp"]
        logits = top_k_filter(logits, top_k=args["top_k"])
        output = F.softmax(logits, dim=-1).unsqueeze(0)
        sorted_probs, sorted_idxs = torch.sort(output, descending=True)
        # sorted_probs, sorted_idxs
        cumsum_probs = torch.cumsum(sorted_probs, dim=-1)
        idx_remove = cumsum_probs > args["top_p"]
        idx_remove[:, 1:] = idx_remove[:, :-1].clone()
        idx_remove[:, 0] = False
        sorted_probs[idx_remove] = 0.0
        sorted_probs /= torch.sum(sorted_probs, dim=-1, keepdim=True)

        probs = torch.zeros(output.shape, device=args['device']).scatter_(-1, sorted_idxs, sorted_probs)
        idx = torch.multinomial(probs, 1)

        idx_item = idx.squeeze(-1).squeeze(-1).item()
        # idx_item

        # if idx_item in output_ids:
        #     continue
        res = tokenizer.decode([idx_item],skip_special_tokens=True)
        print(res,end="")
        responce += res + " "
        # output_ids.append(idx_item)

        if idx_item == args['eos_id']:
            break

        input_ids = torch.cat((input_ids, idx.reshape(1, 1)), dim=-1)
        # print(tokenizer.decode(input_ids[0].cpu().numpy()))
        next_type_id = torch.LongTensor([[args['answer']]]).to(args['device'])
        token_type_ids = torch.cat((token_type_ids, next_type_id), dim=-1)
    return responce
# for i in range(len(output[0])):
#     text = output[0][i].argmax(dim=-1).cpu().numpy()
#     print("_"*50)
#     print(tokenizer.decode(text))

### Evaluation

In [22]:
data_list = df.to_dict(orient='records')
train_data, valid_data = train_test_split(data_list, test_size=0.2, random_state=42)

questions = []
answers = []
for data in valid_data:
  questions.append(data["question"])
  answers.append(data["answer"])




In [23]:
from tqdm import tqdm
import torch

questions = questions
actual_answers = answers
predicted_answers = []

bos_id = args['bos_id']
question_token_id = args['question']
answer_token_id = args['answer']
device = args['device']

model.eval()
for question in tqdm(questions, desc="Predicting Answers"):
    # Tokenize the question normally
    question_ids = tokenizer.encode(question, add_special_tokens=False)

    # Manually add special token IDs
    input_ids = [bos_id, question_token_id] + question_ids + [answer_token_id]
    input_ids = torch.tensor([input_ids], device=device)  # Add batch dim [1, seq_len]

    attention_mask = torch.ones_like(input_ids)  # 2D mask [1, seq_len]

    # Generate answer
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=64,
        pad_token_id=tokenizer.eos_token_id
    )

    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    predicted_answers.append(answer)


Predicting Answers: 100%|██████████| 745/745 [01:03<00:00, 11.69it/s]


In [24]:
# !pip install rouge-score bert-score nltk
# !python -m nltk.downloader punkt

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [27]:
import numpy as np
import string
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
from collections import Counter

In [26]:
# import nltk
# nltk.download('punkt_tab')
# nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [28]:
def normalize_text(text):
    """Lowercase, remove punctuation, and strip whitespace."""
    text = text.strip().lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

def exact_match(pred, true):
    """Compute Exact Match (case and punctuation insensitive)."""
    return int(normalize_text(pred) == normalize_text(true))

def compute_token_f1(pred, true):
    """Compute token-level F1 score with word counts (bag-of-words)."""
    pred_tokens = normalize_text(pred).split()
    true_tokens = normalize_text(true).split()

    pred_counter = Counter(pred_tokens)
    true_counter = Counter(true_tokens)
    common_tokens = pred_counter & true_counter
    num_common = sum(common_tokens.values())

    if num_common == 0:
        return 0.0

    precision = num_common / len(pred_tokens) if pred_tokens else 0.0
    recall = num_common / len(true_tokens) if true_tokens else 0.0

    if (precision + recall) == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)

def compute_rouge_l(pred, true):
    """Compute ROUGE-L score."""
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(true, pred)
    return scores['rougeL'].fmeasure

def compute_meteor(pred, true):
    """Compute METEOR score with tokenization."""
    pred_tokens = word_tokenize(pred.lower())
    true_tokens = word_tokenize(true.lower())
    return meteor_score([true_tokens], pred_tokens)

In [29]:
# Example data (replace with your actual and predicted answers)
# actual_answers = ["The capital of France is Paris.", "Einstein developed the theory of relativity."]
# predicted_answers = ["Paris is the capital of France.", "Einstein's theory of relativity was groundbreaking."]

# Initialize metric lists
em_scores, f1_scores, rouge_scores, meteor_scores = [], [], [], []

for pred, true in zip(predicted_answers, actual_answers):
    # Exact Match
    em_scores.append(exact_match(pred, true))

    # Token F1
    f1_scores.append(compute_token_f1(pred, true))

    # ROUGE-L
    rouge_scores.append(compute_rouge_l(pred, true))

    # METEOR
    meteor_scores.append(compute_meteor(pred, true))

# BERTScore (computes for all pairs at once)
P, R, F1 = bert_score(predicted_answers, actual_answers, lang='en')
bert_score_f1 = np.mean(F1.numpy())  # Convert tensor to numpy array if needed

# Compute averages
metrics = {
    "Exact Match": np.mean(em_scores),
    "Token F1": np.mean(f1_scores),
    "ROUGE-L": np.mean(rouge_scores),
    "BERTScore F1": bert_score_f1,
    "METEOR": np.mean(meteor_scores)
}

# Print results
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Exact Match: 0.0000
Token F1: 0.1220
ROUGE-L: 0.1368
BERTScore F1: 0.8640
METEOR: 0.1687


## Inderence Time

In [32]:
while True:
    question = input("Q: ")
    if question.lower() in ["exit", "quit"]:
        break

    # Tokenize user question
    question_ids = tokenizer.encode(question, add_special_tokens=False)

    # Add special token IDs
    input_ids = [bos_id, question_token_id] + question_ids + [answer_token_id]
    input_ids = torch.tensor([input_ids], device=device)
    attention_mask = torch.ones_like(input_ids)

    # Generate model output
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=64,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the full generated string
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)

    # Postprocess: remove question echo if model does that
    if "<answer>" in decoded:
        answer = decoded.split("<answer>")[-1].strip()
    elif question in decoded:
        answer = decoded.split(question)[-1].strip()
    else:
        answer = decoded.strip()

    print(f"A: {answer}\n")


Q: what is your name?
A: I am not sure.

Q: where so you live
A: Where do you live?

Q: where do you live?
A: I live in a small town.

Q: Are you idot?
A: I am not.

Q: what is your real name
A: I am not sure.

Q: exit


In [16]:
# model.eval()
# while True:
#     sentence = input("Enter your question: ")
#     if sentence.lower() == "bye":
#         break
#     print("User: " + sentence)
#     AskQuestion(sentence)
#     print("\n")

Enter your question: How is Going On
User: How is Going On
Bot:I am going to be there.



KeyboardInterrupt: Interrupted by user

Hi, How are you doing?	I am fine. How about yourself?

1	I am fine. How about yourself?	I am pretty good. Thanks for asking.

2	I am pretty good. Thanks for asking.	No problem. So how have you been?

3	No problem. So how have you been?	I have been great. What about you?

4	I have been great. What about you?	I have been good. I am in school right now


In [None]:
# input_texts = df['question'].tolist()
# target_texts = df['answer'].tolist()

# input_texts, target_texts

# Evoluation

In [None]:
# !pip install rouge-score
#

In [None]:
# import nltk
# nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import torch
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
import tqdm

# Ensure you're moving the model and inputs to the same device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to the selected device (either GPU or CPU)
model = model.to(device)

# Tokenize inputs for evaluation
def tokenize_input(text, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, return_attention_mask=True)
    return inputs

# Tokenize text for BLEU score calculation
def tokenize_for_bleu(text, tokenizer):
    return tokenizer.tokenize(text)

# Tokenize for METEOR score
def tokenize_for_meteor(text, tokenizer):
    return text.split()  # Split on spaces to simulate tokenization

# Compute ROUGE scores
def compute_rouge(pred_texts, target_texts):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = []
    for pred, target in zip(pred_texts, target_texts):
        score = scorer.score(target, pred)
        rouge_scores.append(score)

    avg_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
    avg_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
    avg_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)

    return avg_rouge1, avg_rouge2, avg_rougeL

# Compute METEOR score
def compute_meteor(pred_texts, target_texts, tokenizer):
    meteor_scores = []
    for pred, target in zip(pred_texts, target_texts):
        pred_tokens = tokenize_for_meteor(pred, tokenizer)  # Pre-tokenize prediction
        target_tokens = tokenize_for_meteor(target, tokenizer)  # Pre-tokenize target
        score = meteor_score([target_tokens], pred_tokens)  # List of tokens for target
        meteor_scores.append(score)

    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    return avg_meteor

# Evaluate function to calculate BLEU, ROUGE, and METEOR
def evaluate_model(model, tokenizer, input_texts, target_texts):
    # Set pad_token to eos_token if not already set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model.eval()

    pred_texts = []

    # Generate predictions for each input
    for input_text in tqdm.tqdm(input_texts, desc="Evaluating"):
        inputs = tokenize_input(input_text, tokenizer)

        # Move input tensors to the same device as the model
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Set pad_token_id (this is typically the EOS token for GPT models)
        outputs = model.generate(
            inputs['input_ids'],
            max_length=50,
            attention_mask=inputs['attention_mask'],  # Pass the attention mask
            pad_token_id=tokenizer.pad_token_id  # Use EOS token as pad token id
        )

        pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        pred_texts.append(pred_text)

    # Tokenize target and predicted texts for BLEU score
    tokenized_preds = [tokenize_for_bleu(text, tokenizer) for text in pred_texts]
    tokenized_targets = [tokenize_for_bleu(text, tokenizer) for text in target_texts]

    # Compute BLEU scores
    bleu_scores = [sentence_bleu([target], pred) for target, pred in zip(tokenized_targets, tokenized_preds)]

    # Compute ROUGE scores
    rouge1, rouge2, rougeL = compute_rouge(pred_texts, target_texts)

    # Compute METEOR scores
    meteor = compute_meteor(pred_texts, target_texts, tokenizer)

    return bleu_scores, rouge1, rouge2, rougeL, meteor

# Example usage with already loaded model and tokenizer
input_texts = df['question'].tolist()
target_texts = df['answer'].tolist()

# Get the evaluation metrics
bleu_scores, rouge1, rouge2, rougeL, meteor = evaluate_model(model, tokenizer, input_texts, target_texts)

# Output BLEU scores and average BLEU score
# print(f"BLEU Scores: {bleu_scores}")
average_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
print(f"Average BLEU Score: {average_bleu}")

# Output ROUGE scores
print(f"ROUGE-1: {rouge1}")
print(f"ROUGE-2: {rouge2}")
print(f"ROUGE-L: {rougeL}")

# Output METEOR score
print(f"METEOR Score: {meteor}")


Evaluating: 100%|██████████| 3725/3725 [00:54<00:00, 67.80it/s]


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()



The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()



The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()



Average BLEU Score: 0.006196414756400701
ROUGE-1: 0.13920398060970202
ROUGE-2: 0.0331776341311386
ROUGE-L: 0.12943747566223723
METEOR Score: 0.07486839422145962


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline
from torch.nn.functional import log_softmax
import numpy as np
import tqdm
import pandas as pd

# Assuming `df` is your DataFrame containing 'question' and 'answer' columns
# Example DataFrame (You should already have this)
# df = pd.read_csv("your_file.csv")  # Load your CSV if not already in DataFrame

# Load your custom model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize your GPT2 model and tokenizer
# model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Sentiment Analysis using Hugging Face's pipeline (for simplicity)
sentiment_pipeline = pipeline("sentiment-analysis", device=0 if torch.cuda.is_available() else -1)

# Tokenize inputs for evaluation
def tokenize_input(text, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, return_attention_mask=True)
    return inputs

# Function to calculate perplexity
def calculate_perplexity(model, tokenizer, text):
    inputs = tokenize_input(text, tokenizer)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Compute logits and loss
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs['input_ids'])
        log_likelihood = outputs.loss.item()

    # Perplexity is the exponentiation of the negative log-likelihood
    perplexity = np.exp(log_likelihood)
    return perplexity

# Function to calculate sentiment
def calculate_sentiment(text):
    result = sentiment_pipeline(text)
    return result[0]

# Example usage with the 'question' and 'answer' columns from your DataFrame
input_texts = df['question'].tolist()  # Replace with your actual DataFrame column
target_texts = df['answer'].tolist()   # Replace with your actual DataFrame column

# Initialize variables to store the cumulative values for averages
input_sentiment_scores = []
input_perplexities = []
target_sentiment_scores = []
target_perplexities = []

# Evaluate Sentiment and Perplexity for each text in input_texts with tqdm progress bar
for text in tqdm.tqdm(input_texts, desc="Evaluating Input Texts"):
    sentiment = calculate_sentiment(text)
    perplexity = calculate_perplexity(model, tokenizer, text)
    input_sentiment_scores.append(sentiment['score'])  # Assuming sentiment['score'] is the confidence value
    input_perplexities.append(perplexity)

# Evaluate Sentiment and Perplexity for each text in target_texts with tqdm progress bar
for text in tqdm.tqdm(target_texts, desc="Evaluating Target Texts"):
    sentiment = calculate_sentiment(text)
    perplexity = calculate_perplexity(model, tokenizer, text)
    target_sentiment_scores.append(sentiment['score'])  # Assuming sentiment['score'] is the confidence value
    target_perplexities.append(perplexity)

# Calculate average sentiment and perplexity scores
avg_input_sentiment = sum(input_sentiment_scores) / len(input_sentiment_scores) if input_sentiment_scores else 0
avg_input_perplexity = sum(input_perplexities) / len(input_perplexities) if input_perplexities else 0
avg_target_sentiment = sum(target_sentiment_scores) / len(target_sentiment_scores) if target_sentiment_scores else 0
avg_target_perplexity = sum(target_perplexities) / len(target_perplexities) if target_perplexities else 0

# Print the average results
print(f"\nAverage Sentiment Score for Input Texts: {avg_input_sentiment}")
print(f"Average Perplexity for Input Texts: {avg_input_perplexity}")

print(f"\nAverage Sentiment Score for Target Texts: {avg_target_sentiment}")
print(f"Average Perplexity for Target Texts: {avg_target_perplexity}")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0
Evaluating Input Texts: 100%|██████████| 3725/3725 [01:00<00:00, 61.71it/s]
Evaluating Target Texts: 100%|██████████| 3725/3725 [00:58<00:00, 63.31it/s]


Average Sentiment Score for Input Texts: 0.9635209040673787
Average Perplexity for Input Texts: 90.44970905737047

Average Sentiment Score for Target Texts: 0.9635489976166078
Average Perplexity for Target Texts: 78.06074809598138





In [None]:
import torch
import tqdm
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer, util

# Ensure device compatibility
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load GPT-2 model and tokenizer
gpt2_model =model
gpt2_tokenizer = tokenizer

# Load Sentence Transformer model for semantic similarity
similarity_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").to(device)

# Function to calculate semantic similarity using embeddings
def compute_semantic_similarity(pred_texts, target_texts):
    pred_embeddings = similarity_model.encode(pred_texts, convert_to_tensor=True)
    target_embeddings = similarity_model.encode(target_texts, convert_to_tensor=True)

    similarities = util.pytorch_cos_sim(pred_embeddings, target_embeddings).diagonal().tolist()
    return similarities  # List of similarity scores (0 to 1)

# Function to calculate perplexity
def compute_perplexity(texts):
    perplexities = []

    for text in tqdm.tqdm(texts, desc="Calculating Perplexity"):
        encodings = gpt2_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            outputs = gpt2_model(**encodings, labels=encodings["input_ids"])
            loss = outputs.loss
            perplexity = torch.exp(loss).item()
            perplexities.append(perplexity)

    return perplexities  # List of perplexity scores

# Example usage with dataset
input_texts = df['question'].tolist()
target_texts = df['answer'].tolist()

# Generate predictions (assuming predictions exist)
pred_texts = ["Generated response for: " + q for q in input_texts]  # Replace with actual model predictions

# Compute evaluations
similarity_scores = compute_semantic_similarity(pred_texts, target_texts)
perplexity_scores = compute_perplexity(pred_texts)

# Display final average scores
avg_similarity = np.mean(similarity_scores)
avg_perplexity = np.mean(perplexity_scores)

print(f"Average Semantic Similarity Score: {avg_similarity:.4f} ({avg_similarity * 100:.2f}%)")
print(f"Average Perplexity Score: {avg_perplexity:.4f}")


Calculating Perplexity: 100%|██████████| 3725/3725 [00:44<00:00, 84.05it/s]

Average Semantic Similarity Score: 0.2450 (24.50%)
Average Perplexity Score: 286.4858





In [None]:
# Generate predictions (assuming predictions exist)
pred_texts = [q for q in input_texts]  # Replace with actual model predictions

# Compute evaluations
similarity_scores = compute_semantic_similarity(pred_texts, target_texts)
perplexity_scores = compute_perplexity(pred_texts)

# Display final average scores
avg_similarity = np.mean(similarity_scores)
avg_perplexity = np.mean(perplexity_scores)

print(f"Average Semantic Similarity Score: {avg_similarity:.4f} ({avg_similarity * 100:.2f}%)")
print(f"Average Perplexity Score: {avg_perplexity:.4f}")

Calculating Perplexity: 100%|██████████| 3725/3725 [00:48<00:00, 76.90it/s]

Average Semantic Similarity Score: 0.3066 (30.66%)
Average Perplexity Score: 90.4497



