## CPSC 477 Final Project Part 3: Evaluation

In [1]:
import os
import evaluate
from util import *
rouge = evaluate.load("rouge")

def extract_statistic(s, ignore_single_digit = False):
    """Turn a single token into a ``(value, unit)`` statistic, if it looks like one.

    Commas are removed before parsing. Numeric validity is decided by
    ``is_float``. Three token shapes are accepted:

    * a plain number that is below 2015 or above 2025 -> ``(value, None)``
    * ``$<number>``                                    -> ``(value, "$")``
    * ``<number>%``                                    -> ``(value, "%")``

    Args:
        s: The token to inspect.
        ignore_single_digit: When true, a plain number smaller than 10 is
            rejected outright.

    Returns:
        A ``(float, unit)`` tuple, or ``None`` when the token is empty or
        matches none of the shapes above.
    """
    token = s.replace(",", "")  # thousands separators would break float()
    if not token:
        return None

    if is_float(token):
        value = float(token)
        if ignore_single_digit and value < 10:
            return None
        # Plain numbers in the 2015-2025 window are treated as years and skipped.
        if value < 2015 or value > 2025:
            return (value, None)

    # Dollar amount: "$" followed by a valid number.
    if token.startswith("$") and is_float(token[1:]):
        return (float(token[1:]), "$")

    # Percentage: a valid number followed by "%".
    if token.endswith("%") and is_float(token[:-1]):
        return (float(token[:-1]), "%")

    return None

def extract_numbers(text, ignore_single_digit = False):
    """Collect the distinct statistics mentioned in ``text``.

    The text is split on newlines and single spaces; empty tokens are dropped
    and one trailing period is stripped from each token before it is handed to
    ``extract_statistic``.

    Args:
        text: Text to scan.
        ignore_single_digit: Forwarded to ``extract_statistic``.

    Returns:
        A list of the unique non-``None`` results of ``extract_statistic``
        (order is not significant). Empty when nothing qualifies.
    """
    words = [w for line in text.split("\n") for w in line.split(" ") if w]
    # A number that ends a sentence carries the full stop with it ("12%.").
    words = [w[:-1] if w.endswith(".") else w for w in words]
    numbers = {extract_statistic(w, ignore_single_digit) for w in words}
    # discard, not remove: None is absent when the text is empty or when every
    # token parsed as a statistic, and remove() would raise KeyError then.
    numbers.discard(None)
    return list(numbers)

def extract_key_points(ect, summary, ignore_single_digit = False):
    """Keep the summary lines whose numbers all appear in the transcript.

    If ``ect`` is longer than ``max_ect_length``, only its first and last
    halves (by that length) are scanned for numbers.

    Args:
        ect: Transcript text the summary is checked against.
        summary: Summary text, examined one line at a time.
        ignore_single_digit: Forwarded to ``extract_numbers``.

    Returns:
        A ``(key_points, numbers)`` tuple: the retained lines, each terminated
        by a newline and concatenated, and a de-duplicated list of the numbers
        found on those lines.
    """
    if len(ect) > max_ect_length:
        head = ect[:max_ect_length // 2]
        tail = ect[-max_ect_length // 2:]
        searchable = f"{head}\n{tail}"
    else:
        searchable = ect
    ect_numbers = extract_numbers(searchable, ignore_single_digit)

    kept_lines = []
    kept_numbers = []
    for line in summary.split("\n"):
        line_numbers = extract_numbers(line, ignore_single_digit)
        # A line qualifies only if it has at least one number and every one
        # of its numbers is also present in the transcript.
        if line_numbers and all(n in ect_numbers for n in line_numbers):
            kept_lines.append(line + "\n")
            kept_numbers.extend(line_numbers)
    return ("".join(kept_lines), list(set(kept_numbers)))

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Load the base model's inference outputs on the test dataset, together with
# the matching Gemini reference summary and transcript for every file.
model = "base_mistral_final"

# os.listdir returns entries in arbitrary order; sort so that re-running the
# notebook always builds the lists in the same order.
test_filenames = sorted(os.listdir(f"inference/{model}"))
if ".ipynb_checkpoints" in test_filenames:
    test_filenames.remove(".ipynb_checkpoints")

candidates = []
references = []
ects = []
for filename in test_filenames:
    # Read all three files before appending so the lists stay index-aligned
    # even if one of the files is missing. Explicit UTF-8 avoids depending on
    # the platform's default encoding.
    with open(f"inference/{model}/{filename}", encoding="utf-8") as f:
        base_inference = f.read()
    with open(f"dataset/test/gemini_summaries/{filename}", encoding="utf-8") as f:
        gemini_summary = f.read()
    with open(f"dataset/test/ects/{filename}", encoding="utf-8") as f:
        ect = f.read()
    candidates.append(base_inference)
    references.append(gemini_summary)
    ects.append(ect)

print(len(candidates), len(references))

# Mean length in characters of the model outputs and of the references.
candidate_lengths = list(map(len, candidates))
print(sum(candidate_lengths) / len(candidate_lengths))

reference_lengths = list(map(len, references))
print(sum(reference_lengths) / len(reference_lengths))

50 50
1180.48
934.38


In [84]:
# ROUGE of whatever is currently held in `candidates`, scored against the
# reference summaries in `references`.
results = rouge.compute(
    references=references,
    predictions=candidates,
)
print(f"baseline: {results}")

baseline: {'rouge1': 0.44835118142896613, 'rouge2': 0.17846536649122435, 'rougeL': 0.26059977608446355, 'rougeLsum': 0.28815962106935133}


In [4]:
# Load the fine-tuned model's inference outputs on the test dataset, together
# with the matching Gemini reference summary and transcript for every file.
# Took 20 min on V100 (Google Colab)

model = "finetuned_mistral_final"

# os.listdir returns entries in arbitrary order; sort so that re-running the
# notebook always builds the lists in the same order.
test_filenames = sorted(os.listdir(f"inference/{model}"))
if ".ipynb_checkpoints" in test_filenames:
    test_filenames.remove(".ipynb_checkpoints")

candidates = []
references = []
ects = []
for filename in test_filenames:
    # Read all three files before appending so the lists stay index-aligned
    # even if one of the files is missing. Explicit UTF-8 avoids depending on
    # the platform's default encoding.
    # NOTE(review): `base_inference` holds the fine-tuned output here; the
    # name is kept only because it is a notebook-level variable.
    with open(f"inference/{model}/{filename}", encoding="utf-8") as f:
        base_inference = f.read()
    with open(f"dataset/test/gemini_summaries/{filename}", encoding="utf-8") as f:
        gemini_summary = f.read()
    with open(f"dataset/test/ects/{filename}", encoding="utf-8") as f:
        ect = f.read()
    candidates.append(base_inference)
    references.append(gemini_summary)
    ects.append(ect)

print(len(candidates), len(references))

# Mean length in characters of the model outputs and of the references.
candidate_lengths = list(map(len, candidates))
print(sum(candidate_lengths) / len(candidate_lengths))

reference_lengths = list(map(len, references))
print(sum(reference_lengths) / len(reference_lengths))

50 50
1167.94
934.38


In [5]:
# ROUGE of whatever is currently held in `candidates`, scored against the
# reference summaries in `references`.
results = rouge.compute(
    references=references,
    predictions=candidates,
)
print(f"fine-tuned: {results}")

fine-tuned: {'rouge1': 0.462058842316896, 'rouge2': 0.19530156234345203, 'rougeL': 0.2730591893592447, 'rougeLsum': 0.29365523442752167}


In [8]:
# Number-level overlap metrics, accumulated over every test file.
# Each accumulator is a [matched, total] pair.
gemini_recall = [0, 0]
ect_recall = [0, 0]
precision = [0, 0]
transcript_recall = [0, 0]  # NOTE(review): never updated in this cell
for i, filename in enumerate(test_filenames):
    ect_numbers = extract_numbers(ects[i], True)
    candidate_numbers = extract_numbers(candidates[i], True)
    reference_numbers = extract_numbers(references[i], True)
    # NOTE(review): ectsum_numbers is computed but not used below.
    _, ectsum_numbers = extract_key_points(ects[i], references[i], True)

    # Recall against the reference summary: reference numbers the candidate reproduces.
    gemini_recall[0] += sum(1 for number in reference_numbers if number in candidate_numbers)
    gemini_recall[1] += len(reference_numbers)

    # Recall against the transcript: transcript numbers the candidate reproduces.
    ect_recall[0] += sum(1 for number in ect_numbers if number in candidate_numbers)
    ect_recall[1] += len(ect_numbers)

    # Precision: candidate numbers that can be found in the transcript.
    precision[0] += sum(1 for number in candidate_numbers if number in ect_numbers)
    precision[1] += len(candidate_numbers)

print(model)
print("Gemini recall:", gemini_recall, gemini_recall[0] / gemini_recall[1])
print("ECT recall", ect_recall, ect_recall[0] / ect_recall[1])
print("Precision:", precision, precision[0] / precision[1])

base_1
Gemini recall: [162, 380] 0.4263157894736842
ECT recall [387, 1958] 0.1976506639427988
Precision: [387, 394] 0.9822335025380711


In [None]:
# Generate summaries with GPT-3.5
# NOTE(review): `OpenAI` is not imported in any visible cell. Presumably it is
# re-exported by `from util import *`; if not, add `from openai import OpenAI`
# to the import cell.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# NOTE(review): this prompt describes a clothing-item extraction task and is
# not used by the request below. It looks like a leftover from another
# project; confirm and delete.
prompt = "Take this scraped input for a clothing item and list the following in the following format. For fields that you do not find, write N/A.\nFormat:\nMaterial: [X]% [Cotton or Organic Cotton or Polyester or Lyocell or Elastane or Polyamide or Elastomultiester], [X]% [Cotton or Organic Cotton or Polyester or Lyocell or Elastane or Polyamide or Elastomultiester or Other]…\nRecycled Material: [X]%\nCountry of Origin: [United States or Laos or Vietnam…]\nCompany: [SHEIN or Amazon or GAP…]\nClothing Item: [string]"

system_message = "You are a financial advisor tasked with creating a short summary of an earnings call transcript. You only want to summarize or re-iterate points that would be relevant, critical, or informational to someone who wants to skim over the important details of a long transcript."

# The trailing "{}" is the slot for the transcript text.
user_template = "Below is an earnings call transcript. Please summarize this transcript in exactly one paragraph using complete sentences. Keep the summary below 300 words. It is very important that you do not use any titles in the summary. Include relevant information and statistics from the Earnings Call Transcript in your summary.\n\nEarnings Call Transcript:\n{}"

# The original cell never filled the "{}" slot, so the model would have been
# sent a literal "{}" instead of a transcript.
# NOTE(review): summarises the first loaded test transcript only; loop over
# `ects` to cover the whole test set.
transcript = ects[0]

response = client.chat.completions.create(
    model = "gpt-3.5-turbo",
    messages = [
        {"role": "system", "content": system_message},
        # The "content" key was missing here, which made the cell a SyntaxError.
        {"role": "user", "content": user_template.format(transcript)},
    ]
)
print(response)