In [1]:
#!pip install transformers==4.41.2

In [2]:
import transformers
# Show the installed transformers version so it can be compared with the
# 4.41.2 pin in the (commented-out) install line of the first cell.
print(transformers.__version__)

4.41.2


In [3]:
import os
import json
import time
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
# GPU selection: order devices by PCI bus ID and expose only the device with
# index 1 to this process. These variables take effect when CUDA is first
# initialized, so they have to be set before the first torch.cuda call below.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


cuda


In [4]:
# Load the tokenizer and model for the THUDM/chatglm2-6b-32k checkpoint.
# NOTE(review): trust_remote_code=True allows Python code shipped with the
# checkpoint to run locally — confirm the repository is trusted.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b-32k", trust_remote_code=True)
# Weights are loaded as float16 and moved to the device selected earlier.
model = AutoModel.from_pretrained("THUDM/chatglm2-6b-32k", trust_remote_code=True, torch_dtype=torch.float16).to(device)
# Switch the module to evaluation mode for inference.
model = model.eval()
# Monkey-patch the tokenizer's _pad method to handle the padding_side argument
# The wrapper accepts any extra keyword arguments (such as padding_side),
# discards them, and forwards only the five arguments below, positionally,
# to the tokenizer's original bound _pad.
original_pad = tokenizer._pad
def new_pad(self, encoded_inputs, max_length=None, padding_strategy="longest", pad_to_multiple_of=None, return_attention_mask=None, **kwargs):
    return original_pad(encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask)
# Bind new_pad to this tokenizer instance so `self` is supplied automatically.
tokenizer._pad = new_pad.__get__(tokenizer, type(tokenizer))

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [5]:
import re
import string
from collections import Counter
def normalize_answer(s):
    """Canonicalize an answer string before token-level comparison.

    Steps, in order: lowercase the text, drop every character found in
    ``string.punctuation``, replace the standalone words "a", "an" and "the"
    with a space, then collapse all whitespace runs into single spaces
    (which also trims both ends).
    """
    punctuation = set(string.punctuation)
    lowered = s.lower()
    without_punctuation = "".join(ch for ch in lowered if ch not in punctuation)
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation)
    return " ".join(without_articles.split())

def f1_score(prediction, ground_truth):
    """Return the token-overlap F1 between two token sequences.

    The overlap is the multiset intersection of the two sequences. When no
    token is shared the integer 0 is returned; otherwise the result is the
    harmonic mean of precision (overlap / len(prediction)) and recall
    (overlap / len(ground_truth)).
    """
    shared = Counter(prediction) & Counter(ground_truth)
    overlap = sum(shared.values())
    if not overlap:
        return 0
    precision = 1.0 * overlap / len(prediction)
    recall = 1.0 * overlap / len(ground_truth)
    return (2 * precision * recall) / (precision + recall)

def qa_f1_score(prediction, ground_truth):
    """Score a predicted answer string against one reference answer string.

    Both strings go through `normalize_answer`, are split on whitespace, and
    the resulting token lists are compared with `f1_score`.
    """
    prediction_tokens, ground_truth_tokens = (
        normalize_answer(text).split() for text in (prediction, ground_truth)
    )
    return f1_score(prediction_tokens, ground_truth_tokens)

In [6]:
def evaluate_multifield(df):
  """Answer every MultiFieldQA row with the notebook-level `model` and collect per-row metrics.

  Relies on the globals `model`, `tokenizer` and `qa_f1_score` defined in earlier cells.

  Args:
    df: DataFrame providing the columns `_id`, `summary` (inserted as the prompt
      context), `input` (the question) and `answers` (reference answers, either a
      sequence or the text form of one, as produced by a CSV round trip).

  Returns:
    dict of lists with one entry per row of `df`, keyed by `_id`, `f1_scores`,
    `latencies` (seconds), `memory_usages` and `peak_memory_usages` (MB; None when
    CUDA is unavailable) and `responses`. A row that raises keeps its `_id` and gets
    None in every other list, so the lists always stay aligned.
  """
  import ast  # local import: only needed to decode CSV-serialized answer lists

  def first_reference(raw):
    """Return the first reference answer from a sequence or from its text form."""
    if not isinstance(raw, str):
      return raw[0]
    # A cell loaded with pd.read_csv is plain text, so raw[0] would be its first
    # character (e.g. "["). Decode a serialized list before indexing it.
    # NOTE(review): a numpy-style repr without commas ("['a' 'b']") parses as a
    # single concatenated string — confirm how the CSV was written.
    try:
      parsed = ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
      return raw  # not a Python literal: the cell itself is the answer text
    if isinstance(parsed, (list, tuple)):
      return str(parsed[0])
    return raw

  metrics = {'_id':[], 'f1_scores':[], 'latencies':[], 'memory_usages':[], 'peak_memory_usages':[], 'responses':[]}
  # CUDA memory statistics can only be queried when a GPU is present.
  cuda_ok = torch.cuda.is_available()

  for index, row in df.iterrows():
    print(f"Processing {index}")
    row_id = row["_id"]
    # Replaced only when the whole row succeeds; every list then receives exactly
    # one value per row no matter where a failure happens.
    result = {'f1_scores': None, 'latencies': None, 'memory_usages': None, 'peak_memory_usages': None, 'responses': None}
    try:
      prompt = "Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:".format(context=row['summary'], input=row['input'])
      if cuda_ok:
        torch.cuda.reset_peak_memory_stats()
      start_time = time.time()
      response, _ = model.chat(tokenizer, prompt)
      total_time = time.time() - start_time
      print(f"Time taken {total_time}")

      memory = peak_memory = None
      if cuda_ok:
        # Currently allocated and peak allocated memory since the reset above, in MB
        memory = torch.cuda.memory_allocated() / (1024 ** 2)
        peak_memory = torch.cuda.max_memory_allocated() / (1024 ** 2)

      f1 = qa_f1_score(response, first_reference(row['answers']))
      result = {'f1_scores': f1, 'latencies': total_time, 'memory_usages': memory, 'peak_memory_usages': peak_memory, 'responses': response}
    except Exception as e:
      print("Failed with error: ", e)

    metrics["_id"].append(row_id)
    for key, value in result.items():
      metrics[key].append(value)
  return metrics


In [7]:
def evaluate_hotpot(df):
  """Answer every HotpotQA row with the notebook-level `model` and collect per-row metrics.

  Relies on the globals `model`, `tokenizer` and `qa_f1_score` defined in earlier cells.

  Args:
    df: DataFrame providing the columns `_id`, `summary` (inserted as the given
      passages), `input` (the question) and `answers` (reference answers, either a
      sequence or the text form of one, as produced by a CSV round trip).

  Returns:
    dict of lists with one entry per row of `df`, keyed by `_id`, `f1_scores`,
    `latencies` (seconds), `memory_usages` and `peak_memory_usages` (MB; None when
    CUDA is unavailable) and `responses`. A row that raises keeps its `_id` and gets
    None in every other list, so the lists always stay aligned.
  """
  import ast  # local import: only needed to decode CSV-serialized answer lists

  def first_reference(raw):
    """Return the first reference answer from a sequence or from its text form."""
    if not isinstance(raw, str):
      return raw[0]
    # A cell loaded with pd.read_csv is plain text, so raw[0] would be its first
    # character (e.g. "["). Decode a serialized list before indexing it.
    # NOTE(review): a numpy-style repr without commas ("['a' 'b']") parses as a
    # single concatenated string — confirm how the CSV was written.
    try:
      parsed = ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
      return raw  # not a Python literal: the cell itself is the answer text
    if isinstance(parsed, (list, tuple)):
      return str(parsed[0])
    return raw

  metrics = {'_id':[], 'f1_scores':[], 'latencies':[], 'memory_usages':[], 'peak_memory_usages':[], 'responses':[]}
  # CUDA memory statistics can only be queried when a GPU is present.
  cuda_ok = torch.cuda.is_available()

  for index, row in df.iterrows():
    print(f"Processing {index}")
    row_id = row["_id"]
    # Replaced only when the whole row succeeds; every list then receives exactly
    # one value per row no matter where a failure happens.
    result = {'f1_scores': None, 'latencies': None, 'memory_usages': None, 'peak_memory_usages': None, 'responses': None}
    try:
      prompt = "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:".format(context=row['summary'], input=row['input'])
      if cuda_ok:
        torch.cuda.reset_peak_memory_stats()
      start_time = time.time()
      response, _ = model.chat(tokenizer, prompt)
      total_time = time.time() - start_time
      print(f"Time taken {total_time}")

      memory = peak_memory = None
      if cuda_ok:
        # Currently allocated and peak allocated memory since the reset above, in MB
        memory = torch.cuda.memory_allocated() / (1024 ** 2)
        peak_memory = torch.cuda.max_memory_allocated() / (1024 ** 2)

      f1 = qa_f1_score(response, first_reference(row['answers']))
      result = {'f1_scores': f1, 'latencies': total_time, 'memory_usages': memory, 'peak_memory_usages': peak_memory, 'responses': response}
    except Exception as e:
      print("Failed with error: ", e)

    metrics["_id"].append(row_id)
    for key, value in result.items():
      metrics[key].append(value)
  return metrics


In [8]:
def evaluate_trivia(df):
  """Answer every TriviaQA row with the notebook-level `model` and collect per-row metrics.

  Relies on the globals `model`, `tokenizer` and `qa_f1_score` defined in earlier cells.

  Args:
    df: DataFrame providing the columns `_id`, `summary` (inserted as the examples
      section of the prompt), `input` (appended after it) and `answers` (reference
      answers, either a sequence or the text form of one, as produced by a CSV
      round trip).

  Returns:
    dict of lists with one entry per row of `df`, keyed by `_id`, `f1_scores`,
    `latencies` (seconds), `memory_usages` and `peak_memory_usages` (MB; None when
    CUDA is unavailable) and `responses`. A row that raises keeps its `_id` and gets
    None in every other list, so the lists always stay aligned.
  """
  import ast  # local import: only needed to decode CSV-serialized answer lists

  def first_reference(raw):
    """Return the first reference answer from a sequence or from its text form."""
    if not isinstance(raw, str):
      return raw[0]
    # A cell loaded with pd.read_csv is plain text, so raw[0] would be its first
    # character (e.g. "["). Decode a serialized list before indexing it.
    # NOTE(review): a numpy-style repr without commas ("['a' 'b']") parses as a
    # single concatenated string — confirm how the CSV was written.
    try:
      parsed = ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
      return raw  # not a Python literal: the cell itself is the answer text
    if isinstance(parsed, (list, tuple)):
      return str(parsed[0])
    return raw

  metrics = {'_id':[], 'f1_scores':[], 'latencies':[], 'memory_usages':[], 'peak_memory_usages':[], 'responses':[]}
  # CUDA memory statistics can only be queried when a GPU is present.
  cuda_ok = torch.cuda.is_available()

  for index, row in df.iterrows():
    print(f"Processing {index}")
    row_id = row["_id"]
    # Replaced only when the whole row succeeds; every list then receives exactly
    # one value per row no matter where a failure happens.
    result = {'f1_scores': None, 'latencies': None, 'memory_usages': None, 'peak_memory_usages': None, 'responses': None}
    try:
      prompt = "Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}".format(context=row['summary'], input=row['input'])
      if cuda_ok:
        torch.cuda.reset_peak_memory_stats()
      start_time = time.time()
      response, _ = model.chat(tokenizer, prompt)
      total_time = time.time() - start_time
      print(f"Time taken {total_time}")

      memory = peak_memory = None
      if cuda_ok:
        # Currently allocated and peak allocated memory since the reset above, in MB
        memory = torch.cuda.memory_allocated() / (1024 ** 2)
        peak_memory = torch.cuda.max_memory_allocated() / (1024 ** 2)

      f1 = qa_f1_score(response, first_reference(row['answers']))
      result = {'f1_scores': f1, 'latencies': total_time, 'memory_usages': memory, 'peak_memory_usages': peak_memory, 'responses': response}
    except Exception as e:
      print("Failed with error: ", e)

    metrics["_id"].append(row_id)
    for key, value in result.items():
      metrics[key].append(value)
  return metrics


In [9]:
def evaluate_narrative(df):
  """Answer every NarrativeQA row with the notebook-level `model` and collect per-row metrics.

  Relies on the globals `model`, `tokenizer` and `qa_f1_score` defined in earlier cells.

  Args:
    df: DataFrame providing the columns `_id`, `summary` (inserted as the story),
      `input` (the question) and `answers` (reference answers, either a sequence
      or the text form of one, as produced by a CSV round trip).

  Returns:
    dict of lists with one entry per row of `df`, keyed by `_id`, `f1_scores`,
    `latencies` (seconds), `memory_usages` and `peak_memory_usages` (MB; None when
    CUDA is unavailable) and `responses`. A row that raises keeps its `_id` and gets
    None in every other list, so the lists always stay aligned.
  """
  import ast  # local import: only needed to decode CSV-serialized answer lists

  def first_reference(raw):
    """Return the first reference answer from a sequence or from its text form."""
    if not isinstance(raw, str):
      return raw[0]
    # A cell loaded with pd.read_csv is plain text, so raw[0] would be its first
    # character (e.g. "["). Decode a serialized list before indexing it.
    # NOTE(review): a numpy-style repr without commas ("['a' 'b']") parses as a
    # single concatenated string — confirm how the CSV was written.
    try:
      parsed = ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
      return raw  # not a Python literal: the cell itself is the answer text
    if isinstance(parsed, (list, tuple)):
      return str(parsed[0])
    return raw

  metrics = {'_id':[], 'f1_scores':[], 'latencies':[], 'memory_usages':[], 'peak_memory_usages':[], 'responses':[]}
  # CUDA memory statistics can only be queried when a GPU is present.
  cuda_ok = torch.cuda.is_available()

  for index, row in df.iterrows():
    print(f"Processing {index}")
    row_id = row["_id"]
    # Replaced only when the whole row succeeds; every list then receives exactly
    # one value per row no matter where a failure happens.
    result = {'f1_scores': None, 'latencies': None, 'memory_usages': None, 'peak_memory_usages': None, 'responses': None}
    try:
      prompt = "You are given a story, which can be either a novel or a movie script, and a question. Answer the question asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:".format(context=row['summary'], input=row['input'])
      if cuda_ok:
        torch.cuda.reset_peak_memory_stats()
      start_time = time.time()
      response, _ = model.chat(tokenizer, prompt)
      total_time = time.time() - start_time
      print(f"Time taken {total_time}")

      memory = peak_memory = None
      if cuda_ok:
        # Currently allocated and peak allocated memory since the reset above, in MB
        memory = torch.cuda.memory_allocated() / (1024 ** 2)
        peak_memory = torch.cuda.max_memory_allocated() / (1024 ** 2)

      f1 = qa_f1_score(response, first_reference(row['answers']))
      result = {'f1_scores': f1, 'latencies': total_time, 'memory_usages': memory, 'peak_memory_usages': peak_memory, 'responses': response}
    except Exception as e:
      print("Failed with error: ", e)

    metrics["_id"].append(row_id)
    for key, value in result.items():
      metrics[key].append(value)
  return metrics


In [10]:
# Dataset identifiers. The evaluation cells pick one by index and use it as the
# directory name and file-name stem of the summary CSVs and metrics JSON files.
dataset_name_list = ['multifieldqa', 'hotpotqa', 'triviaqa', 'narrativeqa']

In [11]:
import pandas as pd
import json
import torch
import time

# Evaluate the MultiFieldQA summaries at each ratio and write one metrics JSON per ratio.
ratios = [0.25, 0.5, 0.75]
dataset_name = dataset_name_list[0]  # 'multifieldqa'
for i in ratios:
    percent = int(i * 100)  # the ratio as the integer used in the file names
    summary_csv = f'{dataset_name}/summarize_{dataset_name}_{percent}.csv'
    metrics_json = f'{dataset_name}_metrics_{percent}.json'
    try:
        df = pd.read_csv(summary_csv)
        metrics = evaluate_multifield(df)
        with open(metrics_json, 'w') as f:
            json.dump(metrics, f)
    except Exception as e:
        # Best effort: report the failure and continue with the next ratio.
        print("Failed with error: ", e)

Processing 0
Time taken 1.4531891345977783
Processing 1
Time taken 0.8736612796783447
Processing 2
Time taken 1.2118747234344482
Processing 3
Time taken 1.1313531398773193
Processing 4
Time taken 1.3239362239837646
Processing 5
Time taken 1.1014785766601562
Processing 6
Time taken 0.5796425342559814
Processing 7
Time taken 0.18744754791259766
Processing 8
Time taken 0.29257988929748535
Processing 9
Time taken 0.7246441841125488
Processing 10
Time taken 1.2328569889068604
Processing 11
Time taken 0.4434385299682617
Processing 12
Time taken 0.6720201969146729
Processing 13
Time taken 0.633056640625
Processing 14
Time taken 0.8250296115875244
Processing 15
Time taken 0.24315404891967773
Processing 16
Time taken 0.38458871841430664
Processing 17
Time taken 1.0456411838531494
Processing 18
Time taken 0.5807092189788818
Processing 19
Time taken 0.2324965000152588
Processing 20
Time taken 0.2625463008880615
Processing 21
Time taken 0.47435498237609863
Processing 22
Time taken 0.95555853843688

In [12]:
import pandas as pd
import json
import torch
import time

# Evaluate the HotpotQA summaries at each ratio and write one metrics JSON per ratio.
ratios = [0.25, 0.5, 0.75]
dataset_name = dataset_name_list[1]  # 'hotpotqa'
for i in ratios:
    percent = int(i * 100)  # the ratio as the integer used in the file names
    summary_csv = f'{dataset_name}/summarize_{dataset_name}_{percent}.csv'
    metrics_json = f'{dataset_name}_metrics_{percent}.json'
    try:
        df = pd.read_csv(summary_csv)
        metrics = evaluate_hotpot(df)
        with open(metrics_json, 'w') as f:
            json.dump(metrics, f)
    except Exception as e:
        # Best effort: report the failure and continue with the next ratio.
        print("Failed with error: ", e)

Processing 0
Time taken 0.6511623859405518
Processing 1
Time taken 0.8640518188476562
Processing 2
Time taken 0.8670048713684082
Processing 3
Time taken 0.8105552196502686
Processing 4
Time taken 0.7205681800842285
Processing 5
Time taken 0.7838115692138672
Processing 6
Time taken 0.7894818782806396
Processing 7
Time taken 0.2238001823425293
Processing 8
Time taken 0.6705532073974609
Processing 9
Time taken 0.5024318695068359
Processing 10
Time taken 1.04695463180542
Processing 11
Time taken 0.818793773651123
Processing 12
Time taken 0.8221344947814941
Processing 13
Time taken 0.47844386100769043
Processing 14
Time taken 0.9917526245117188
Processing 15
Time taken 0.6893248558044434
Processing 16
Time taken 0.4423379898071289
Processing 17
Time taken 0.7803754806518555
Processing 18
Time taken 0.8340470790863037
Processing 19
Time taken 0.7686879634857178
Processing 20
Time taken 0.5914397239685059
Processing 21
Time taken 0.7937800884246826
Processing 22
Time taken 0.2827930450439453


In [13]:
import pandas as pd
import json
import torch
import time

# Evaluate the TriviaQA summaries at each ratio and write one metrics JSON per ratio.
ratios = [0.25, 0.5, 0.75]
dataset_name = dataset_name_list[2]  # 'triviaqa'
for i in ratios:
    percent = int(i * 100)  # the ratio as the integer used in the file names
    summary_csv = f'{dataset_name}/summarize_{dataset_name}_{percent}.csv'
    metrics_json = f'{dataset_name}_metrics_{percent}.json'
    try:
        df = pd.read_csv(summary_csv)
        metrics = evaluate_trivia(df)
        with open(metrics_json, 'w') as f:
            json.dump(metrics, f)
    except Exception as e:
        # Best effort: report the failure and continue with the next ratio.
        print("Failed with error: ", e)

Processing 0
Time taken 0.2795112133026123
Processing 1
Time taken 1.047398328781128
Processing 2
Time taken 0.6987898349761963
Processing 3
Time taken 1.1434462070465088
Processing 4
Time taken 0.7824344635009766
Processing 5
Time taken 0.7599151134490967
Processing 6
Time taken 0.5941810607910156
Processing 7
Time taken 0.30580615997314453
Processing 8
Time taken 0.4257078170776367
Processing 9
Time taken 0.6409378051757812
Processing 10
Time taken 1.0865490436553955
Processing 11
Time taken 0.6006135940551758
Processing 12
Time taken 0.3444385528564453
Processing 13
Time taken 0.8535211086273193
Processing 14
Time taken 0.3938422203063965
Processing 15
Time taken 0.46035265922546387
Processing 16
Time taken 0.898949384689331
Processing 17
Time taken 0.22241425514221191
Processing 18
Time taken 0.8796632289886475
Processing 19
Time taken 0.8445568084716797
Processing 20
Time taken 0.8147344589233398
Processing 21
Time taken 0.4385213851928711
Processing 22
Time taken 0.62356996536254

In [14]:
import pandas as pd
import json
import torch
import time

# Evaluate the NarrativeQA summaries at each ratio and write one metrics JSON per ratio.
ratios = [0.25, 0.5, 0.75]
dataset_name = dataset_name_list[3]  # 'narrativeqa'
for i in ratios:
    percent = int(i * 100)  # the ratio as the integer used in the file names
    summary_csv = f'{dataset_name}/summarize_{dataset_name}_{percent}.csv'
    metrics_json = f'{dataset_name}_metrics_{percent}.json'
    try:
        df = pd.read_csv(summary_csv)
        metrics = evaluate_narrative(df)
        with open(metrics_json, 'w') as f:
            json.dump(metrics, f)
    except Exception as e:
        # Best effort: report the failure and continue with the next ratio.
        print("Failed with error: ", e)

Processing 0
Time taken 1.6991956233978271
Processing 1
Time taken 1.9323482513427734
Processing 2
Time taken 0.5165257453918457
Processing 3
Time taken 0.6742467880249023
Processing 4
Time taken 2.332346200942993
Processing 5
Time taken 0.7260270118713379
Processing 6
Time taken 1.5438194274902344
Processing 7
Time taken 1.9648559093475342
Processing 8
Time taken 1.3404452800750732
Processing 9
Time taken 1.0115773677825928
Processing 10
Time taken 2.529669761657715
Processing 11
Time taken 1.0242137908935547
Processing 12
Time taken 5.895442008972168
Processing 13
Time taken 0.5545692443847656
Processing 14
Time taken 0.6025290489196777
Processing 15
Time taken 2.117722272872925
Processing 16
Time taken 1.231825590133667
Processing 17
Time taken 2.6803884506225586
Processing 18
Time taken 1.687713623046875
Processing 19
Time taken 1.7755730152130127
Processing 20
Time taken 2.7800498008728027
Processing 21
Time taken 1.9235246181488037
Processing 22
Time taken 2.991563081741333
Proce