In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from config import Config_eval
import pandas as pd
from transformers import pipeline
from cloze_util import process_cloze_questions
from eval_utils import eval_rouge_recall, eval_cosine_similarity, get_probs

  from .autonotebook import tqdm as notebook_tqdm


Token ids for Llama 3.1

- <|begin_of_text|> - 128000
- <|eot_id|> - 128009
- <|finetune_right_pad_id|> - 128004
- <|start_header_id|> - 128006
- <|end_header_id|> - 128007

In [18]:
LLAMA3_CHAT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

In [19]:
cfg = Config_eval()

In [20]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [21]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
tokenizer.pad_token = tokenizer.eos_token

In [22]:
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", torch_dtype=torch.bfloat16, device_map = device)

Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.23s/it]


### mcq eval

In [None]:
mcq_data = pd.read_csv('/home/praveen/theoden/emnlp_25/dataset/mcq_data.csv')

In [8]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_id, token = cfg.access_token)
model = AutoModelForCausalLM.from_pretrained(cfg.model_id, 
                                             device_map = 'auto',
                                             torch_dtype = torch.bfloat16, 
                                             token=cfg.access_token,)

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.28s/it]


In [31]:
options = mcq_data['mcq_raw_options'][12]
choices = ''.join(options).replace('[','').replace(']','').replace('\'','').replace(',','')
questions = mcq_data['mcq_question'][12]
instruction = f"Choose the correct answer from the options below. Answer with the letter of the correct choice. Question:{questions} Options:{choices} "
prompt = LLAMA3_CHAT_TEMPLATE.format(instruction=instruction)

In [32]:
choices

'A) Golden Globe Award B) BAFTA Award C) AFI Life Achievement Award D) Grammy Award'

In [33]:
prompt

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nChoose the correct answer from the options below. Answer with the letter of the correct choice. Question:Which prestigious award has Robert De Niro NOT received? Options:A) Golden Globe Award B) BAFTA Award C) AFI Life Achievement Award D) Grammy Award <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [34]:
inputs = tokenizer(prompt, return_tensors='pt')
outputs = model.generate(**inputs, max_new_tokens=5)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
answer

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'system\n\nYou are a helpful assistantuser\n\nChoose the correct answer from the options below. Answer with the letter of the correct choice. Question:Which prestigious award has Robert De Niro NOT received? Options:A) Golden Globe Award B) BAFTA Award C) AFI Life Achievement Award D) Grammy Award assistant\n\nThe correct answer is:\n\n'

In [8]:
from transformers import pipeline


pipe = pipeline('text-generation', model=cfg.model_id, device_map = "auto",
                    model_kwargs={"torch_dtype": torch.bfloat16})


Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.30s/it]
Device set to use cuda:0


In [23]:
options = mcq_data['mcq_raw_options'][12]
choices = ''.join(options).replace('[','').replace(']','').replace('\'','').replace(',','')
questions = mcq_data['mcq_question'][12]

In [24]:
instruct = f"Choose the correct answer from the options below. Answer with a single letter of the correct choice either A, B, C or D. Question:{questions} Options:{choices}"
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": instruct},
]

In [25]:
outputs = pipe(messages, max_new_tokens=2)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [32]:
print(outputs[0]['generated_text'][-1]['content'])

D


In [31]:
ans = outputs[0]['generated_text'][-1]['content']
ans

'D'

### eval on mcqs

In [18]:
dob_questions = mcq_data.loc[mcq_data['standardized_section'] == 'Basic Info']
retain_questions = mcq_data.loc[mcq_data['standardized_section'] != 'Basic Info']

In [19]:
dob_questions['ul_answers'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dob_questions['ul_answers'] = ''


In [35]:
for i,row in dob_questions.iterrows():
    options = row['mcq_raw_options']
    choices = ''.join(options).replace('[','').replace(']','').replace('\'','').replace(',','')
    questions = row['mcq_question']
    instruct = f"Choose the correct answer from the options below. Answer with a single letter of the correct choice either A, B, C or D. Question:{questions} Options:{choices}"
    messages = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": instruct},
    ]
    outputs = pipe(messages, max_new_tokens=1)
    dob_questions.at[i,'ul_answers'] = outputs[0]['generated_text'][-1]['content']


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [40]:
accuracy = round((dob_questions['ul_answers'] == 'D').mean()*100, 2)
accuracy

np.float64(43.56)

### cloze test eval

In [None]:
cloze = pd.read_csv('/home/praveen/theoden/emnlp_25/dataset/cloze_data.csv')
cloze_2 = cloze[:10]

In [11]:
pipe = pipeline('text-generation', model=cfg.model_id, device_map = "auto",
                    model_kwargs={"torch_dtype": torch.bfloat16})

Loading checkpoint shards: 100%|██████████| 4/4 [00:12<00:00,  3.06s/it]
Device set to use cuda:0


In [12]:
cloze_ans = process_cloze_questions(cloze_2, pipe)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cloze_answers'] = ''
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [13]:
import pandas as pd
import re
from difflib import SequenceMatcher
from nltk.translate.bleu_score import sentence_bleu

In [None]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_id)

cloze['tokenized_answer'] = cloze['answer'].apply(lambda x: len(tokenizer(x)['input_ids']))
cloze['tokenized_answer'].describe()

max_t = cloze.loc[cloze['tokenized_answer'] >= 17]
max_t.shape

In [14]:
questions = cloze['question'][0]
answer = cloze['answer'][0]
instruct = f"Please fill in the blanks with the correct answers. Only provide the answer, do not write any explanation. Question:{questions}"
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": instruct},
]

In [15]:
outputs = pipe(messages, max_new_tokens=17)
outputs

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': [{'role': 'system',
    'content': 'You are a helpful assistant'},
   {'role': 'user',
    'content': 'Please fill in the blanks with the correct answers. Only provide the answer, do not write any explanation. Question:Robert De Niro was born on ___ in New York City, New York.'},
   {'role': 'assistant', 'content': 'August 17, 1943'}]}]

In [16]:
pred_answers = outputs[0]['generated_text'][-1]['content']  
pred_answers

'August 17, 1943'

In [27]:
question = ["August 17, 1943"]
answer = [["August 17, 1943", "1943, August 17", "17th August 1943" ]]
pred_answer = ["August 17"]

In [None]:
from evaluate import load
sari = load("sari")
sari_score = sari.compute(sources=question, predictions=pred_answer, references=answer)
print(sari_score)