In [32]:
import pickle
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    BartTokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
)

In [33]:
def gen_t5_squad2(chunks):
    """Generate one question per chunk with a T5 question-generation model.

    Uses the "allenai/t5-small-squad2-question-generation" checkpoint.

    Args:
        chunks: Iterable of dicts, each with a "text" key holding the passage
            to generate a question from.

    Returns:
        Dict mapping each chunk's text to the first decoded question. Chunks
        with identical text share a single entry (the last one wins).
    """
    chunks = list(chunks)
    if not chunks:
        # Nothing to generate: avoid downloading/loading the model for no work.
        return {}

    chunks_questions = {}

    model_name = "allenai/t5-small-squad2-question-generation"
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    for chunk in chunks:
        # truncation=True caps the input at the tokenizer's configured maximum
        # length, consistent with gen_bart_discord, so an unusually long chunk
        # cannot exceed what the model was set up to handle.
        input_ids = tokenizer.encode(
            chunk["text"], return_tensors="pt", truncation=True
        )
        res = model.generate(input_ids)
        generated_questions = tokenizer.batch_decode(res, skip_special_tokens=True)
        chunks_questions[chunk["text"]] = generated_questions[0]

    return chunks_questions

In [34]:
def gen_bart_discord(chunks):
    """Generate one "What ..." question per chunk with Salesforce/discord_qg.

    The decoder is primed with the word "What", so every generated question
    starts from that prompt.

    Args:
        chunks: Iterable of dicts, each with a "text" key holding the passage
            to generate a question from.

    Returns:
        Dict mapping each chunk's text to the first decoded question. Chunks
        with identical text share a single entry (the last one wins).
    """
    chunks = list(chunks)
    if not chunks:
        # Nothing to generate: avoid downloading/loading the model for no work.
        return {}

    chunks_questions = {}

    qg_tokenizer = AutoTokenizer.from_pretrained("Salesforce/discord_qg")
    qg_model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/discord_qg")

    # The decoder prompt is the same for every chunk, so tokenize it once.
    # [:, :-1] drops the last token id (presumably the end-of-sequence marker
    # added by add_special_tokens) so generation continues after "What".
    decoder_input_ids = qg_tokenizer(
        ["What"],
        add_special_tokens=True,
        return_tensors="pt",
    )["input_ids"][:, :-1]

    for chunk in chunks:
        encoder_ids = qg_tokenizer(
            [chunk["text"]],
            add_special_tokens=True,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        model_output = qg_model.generate(
            **encoder_ids,
            decoder_input_ids=decoder_input_ids
        )
        generated_questions = qg_tokenizer.batch_decode(
            model_output, skip_special_tokens=True
        )

        chunks_questions[chunk["text"]] = generated_questions[0]

    return chunks_questions

In [35]:
def gen_bart_nq(chunks):
    """Generate one question per chunk with McGill-NLP/bart-qg-nq-checkpoint.

    Args:
        chunks: Iterable of dicts, each with a "text" key holding the passage
            to generate a question from.

    Returns:
        Dict mapping each chunk's text to the first decoded question. Chunks
        with identical text share a single entry (the last one wins).
    """
    chunks = list(chunks)
    if not chunks:
        # Nothing to generate: avoid downloading/loading the model for no work.
        return {}

    chunks_questions = {}

    # NOTE(review): the tokenizer is loaded from "facebook/bart-base" while the
    # weights come from the McGill-NLP checkpoint -- confirm the vocabularies match.
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    model = AutoModelForSeq2SeqLM.from_pretrained("McGill-NLP/bart-qg-nq-checkpoint")

    for chunk in chunks:
        # truncation=True caps the input at the tokenizer's configured maximum
        # length, consistent with gen_bart_discord, so a long chunk cannot
        # overflow the model's input limit.
        inputs = tokenizer([chunk["text"]], return_tensors="pt", truncation=True)
        question_ids = model.generate(inputs["input_ids"])
        generated_questions = tokenizer.batch_decode(
            question_ids, skip_special_tokens=True
        )

        chunks_questions[chunk["text"]] = generated_questions[0]

    return chunks_questions

In [36]:
def gen_bart_eqg(chunks):
    """Generate one question per chunk with voidful/bart-eqg-question-generator.

    Args:
        chunks: Iterable of dicts, each with a "text" key holding the passage
            to generate a question from.

    Returns:
        Dict mapping each chunk's text to the first decoded question. Chunks
        with identical text share a single entry (the last one wins).
    """
    chunks = list(chunks)
    if not chunks:
        # Nothing to generate: avoid downloading/loading the model for no work.
        return {}

    chunks_questions = {}

    model_name = "voidful/bart-eqg-question-generator"
    qg_tokenizer = AutoTokenizer.from_pretrained(model_name)
    qg_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    for chunk in chunks:
        # truncation=True caps the input at the tokenizer's configured maximum
        # length, consistent with gen_bart_discord, so a long chunk cannot
        # overflow the model's input limit.
        inputs = qg_tokenizer([chunk["text"]], return_tensors="pt", truncation=True)

        question_ids = qg_model.generate(inputs["input_ids"])
        generated_questions = qg_tokenizer.batch_decode(
            question_ids, skip_special_tokens=True
        )

        chunks_questions[chunk["text"]] = generated_questions[0]

    return chunks_questions

In [37]:
def gen_bart_unknown(chunks):
    """Generate one question per chunk with voidful/context-only-question-generator.

    Args:
        chunks: Iterable of dicts, each with a "text" key holding the passage
            to generate a question from.

    Returns:
        Dict mapping each chunk's text to the first decoded question. Chunks
        with identical text share a single entry (the last one wins).
    """
    chunks = list(chunks)
    if not chunks:
        # Nothing to generate: avoid downloading/loading the model for no work.
        return {}

    chunks_questions = {}

    model_name = "voidful/context-only-question-generator"
    qg_tokenizer = AutoTokenizer.from_pretrained(model_name)
    qg_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    for chunk in chunks:
        # truncation=True caps the input at the tokenizer's configured maximum
        # length, consistent with gen_bart_discord, so a long chunk cannot
        # overflow the model's input limit.
        inputs = qg_tokenizer([chunk["text"]], return_tensors="pt", truncation=True)

        question_ids = qg_model.generate(inputs["input_ids"])
        generated_questions = qg_tokenizer.batch_decode(
            question_ids, skip_special_tokens=True
        )

        chunks_questions[chunk["text"]] = generated_questions[0]

    return chunks_questions

In [38]:
# Reference questions, one per time chunk and in chunk order: gen_gpt3_5 pairs
# entry i with the i-th chunk it is given, so this list is only valid for the
# exact chunking it was written against.
# NOTE(review): presumably produced offline with GPT-3.5 (going by the name of
# gen_gpt3_5) -- confirm provenance, and regenerate if the chunking changes.
questions_list = [
    "What is the main topic of the lecture?",
    "What is the speaker's intention?",
    "Can you estimate the level of detail the lecture will cover regarding natural language processing?",
    "What is the speaker asking the audience to do?",
    "What is the primary focus of the discussion?",
    "What does the speaker inquire about regarding the estimate?",
    "What is the connection between smartphones and embedded AIs?",
    "What are the various subsystems mentioned that utilize AI in smartphones?",
    "Why is language considered a key element in interaction?",
    "What distinction is made between natural languages and formal languages like Python or C++?",
    "What fields contribute to natural language processing at its core?",
    "What are the two main subfields mentioned within NLP?",
    "What role does natural language understanding play in NLP?",
    "What does the speaker present as the motivation for natural language understanding?",
    "What types of insights can be gained from analyzing large amounts of text data?",
    "What are some common applications of natural language generation systems?",
    "What does the speaker present as the evolution of NLP systems over time?",
    "What are the main types of early NLP systems mentioned?",
    "What role did classical machine learning algorithms play in the development of NLP?",
    "What are the advancements mentioned that characterize the current state of NLP research?",
    "What are some common subfields of NLP?",
    "What is the significance of considering documents and corpora in the context of NLP?",
    "What are the primary steps in the NLP pre-processing pipeline?",
    "How does annotation contribute to the enrichment of tokens?",
    "What is the distinction between lemmatization and stemming in terms of processing words?",
    "Why is filtering important in the NLP pre-processing pipeline?",
    "What are n-grams, and why might they be valuable in NLP?",
    "What paper is recommended for reading?",
    "How does spelling correction in Microsoft Word relate to NLP concepts?",
    "What is the difference between lemmatization and stemming?",
    "How does the time-saving aspect influence the choice between lemmatization and stemming?",
    "How does filtering contribute to the efficiency of NLP models?",
    "How is the process of data analysis influenced by the need to save time in NLP?",
    "What is the role of dictionaries in lemmatization?",
    "Where can students find information about upcoming lab sessions and coursework on Moodle?",
]

def gen_gpt3_5(chunks):
    """Pair each chunk with the pre-written question at the same position.

    No model is run: the question for the i-th chunk is questions_list[i].

    Args:
        chunks: Iterable of dicts, each with a "text" key.

    Returns:
        Dict mapping each chunk's text to its positional reference question.

    Raises:
        IndexError: If there are more chunks than entries in questions_list.
    """
    # Look up the question before reading the chunk text, one chunk at a time.
    question_text_pairs = (
        (questions_list[position], entry["text"])
        for position, entry in enumerate(chunks)
    )
    return {text: question for question, text in question_text_pairs}

In [39]:
# Regroup the transcript's short timestamped segments into longer passages:
# segments are concatenated until at least `min_duration` has elapsed AND the
# latest segment ends a sentence, so each passage stops on a sentence boundary.

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load transcripts that you produced yourself or otherwise trust.
with open("comp3074_lecture_2.pkl", "rb") as file:
    chunks = pickle.load(file)

time_chunks = []
current_sentence = ""
start_timestamp = None
# Minimum span of a grouped passage, in the timestamps' unit (presumably seconds).
min_duration = 60

# A final segment without an end timestamp gets a zero-length span so the
# arithmetic below (and the trailing flush) always has a number to work with.
if chunks["chunks"] and chunks["chunks"][-1]["timestamp"][1] is None:
    chunks["chunks"][-1]["timestamp"] = (
        chunks["chunks"][-1]["timestamp"][0],
        chunks["chunks"][-1]["timestamp"][0],
    )

for chunk in chunks["chunks"]:
    text = chunk["text"]
    timestamp = chunk["timestamp"]
    # Any other segment lacking an end timestamp falls back to its own start
    # instead of raising TypeError in the subtraction below.
    end_timestamp = timestamp[1] if timestamp[1] is not None else timestamp[0]

    if start_timestamp is None:
        # Start a new sentence
        start_timestamp = timestamp[0]

    current_sentence += text

    # endswith() is False for empty/whitespace-only text, where indexing the
    # last character would raise IndexError.
    sentence_completed = text.rstrip().endswith((".", "!", "?"))
    time_elapsed = (end_timestamp - start_timestamp) >= min_duration

    # TODO: tokenize to ensure below token limit for qg model
    # TODO: add overlap between chunks
    if sentence_completed and time_elapsed:
        time_chunks.append(
            {
                "timestamp": (start_timestamp, end_timestamp),
                "text": current_sentence.strip(),
            }
        )
        current_sentence = ""
        start_timestamp = None

# Flush whatever is left over, even if it is shorter than min_duration.
if current_sentence.strip():
    time_chunks.append(
        {
            "timestamp": (start_timestamp, chunks["chunks"][-1]["timestamp"][1]),
            "text": current_sentence.strip(),
        }
    )

time_chunks

[{'timestamp': (0.0, 66.0),
  'text': "So in this lecture we're going to still be quite general. I'm going to talk about natural language processing and I'm going to talk in the first part I'm going to talk about what it is and in the second part I am going to oh good point in a second part I am going to talk about to introduce the concept of an NLP pipeline which is something that we'll be dealing with in the lab a little bit tomorrow and then from next Friday onwards. So first a quick overview of natural English processing. This is a topic which is usually reserved for like graduate courses in themselves so we're only even scrape really the surface of it. First a quick warm-up question and just take a minute and think can someone give me a quick estimate of how many AIs are in this room? Anyone throws out a guess?"},
 {'timestamp': (66.0, 128.0),
  'text': "I guess. Do you want individual phones or do you want like the types of hands? Well, that's the question, isn't it? Just as I ha

In [40]:
# Run every question generator over the same passages. Dict order is the order
# in which the generators execute and in which models appear in the results.
question_generators = {
    "bart_unknown": gen_bart_unknown,
    "t5_squad2": gen_t5_squad2,
    "bart_discord": gen_bart_discord,
    "bart_nq": gen_bart_nq,
    "bart_eqg": gen_bart_eqg,
    "gpt3": gen_gpt3_5,
}
models_questions = {
    name: generate(time_chunks) for name, generate in question_generators.items()
}

# Pivot {model: {passage: question}} into {passage: {model: question}} so all
# questions for one passage sit side by side.
chunks_by_model = {}

for model, chunk_questions in models_questions.items():
    for chunk, question in chunk_questions.items():
        chunks_by_model.setdefault(chunk, {})[model] = question

chunks_by_model



{"So in this lecture we're going to still be quite general. I'm going to talk about natural language processing and I'm going to talk in the first part I'm going to talk about what it is and in the second part I am going to oh good point in a second part I am going to talk about to introduce the concept of an NLP pipeline which is something that we'll be dealing with in the lab a little bit tomorrow and then from next Friday onwards. So first a quick overview of natural English processing. This is a topic which is usually reserved for like graduate courses in themselves so we're only even scrape really the surface of it. First a quick warm-up question and just take a minute and think can someone give me a quick estimate of how many AIs are in this room? Anyone throws out a guess?": {'bart_unknown': 'How many AIs are in the room?',
  't5_squad2': 'What is the topic that is usually reserved for like graduate courses?',
  'bart_discord': 'What is natural language processing?',
  'bart_nq'

In [41]:
def evaluate_contexts_all_metrics(
    chunks,
    reference_model="gpt3",
    model_type="microsoft/deberta-xlarge-mnli",
):
    """Score each model's question against the reference question with BERTScore.

    Args:
        chunks: Dict mapping a context (passage text) to a dict of
            {model name: generated question}. Every inner dict must contain
            an entry for `reference_model`.
        reference_model: Key of the model whose question is used as the
            reference; it is not scored against itself.
        model_type: Model identifier passed to the BERTScore metric.

    Returns:
        Dict mapping each context to {model name: result of bertscore.compute}
        for every model other than the reference. One compute call is made per
        (context, model) pair, each with a single prediction.

    Raises:
        KeyError: If a context has no entry for `reference_model`.
    """
    if not chunks:
        # Nothing to score: avoid loading the metric for no work.
        return {}

    bertscore = evaluate.load("bertscore")
    output = {}

    for context, models in chunks.items():
        references = [models[reference_model]]
        output[context] = {}

        for model, question in models.items():
            if model == reference_model:
                continue
            output[context][model] = bertscore.compute(
                references=references,
                predictions=[question],
                model_type=model_type,
            )

    return output


# Score every non-"gpt3" model's question against the "gpt3" question for each
# passage; one bertscore.compute call is made per (passage, model) pair.
eval_scores = evaluate_contexts_all_metrics(chunks_by_model)
eval_scores

{"So in this lecture we're going to still be quite general. I'm going to talk about natural language processing and I'm going to talk in the first part I'm going to talk about what it is and in the second part I am going to oh good point in a second part I am going to talk about to introduce the concept of an NLP pipeline which is something that we'll be dealing with in the lab a little bit tomorrow and then from next Friday onwards. So first a quick overview of natural English processing. This is a topic which is usually reserved for like graduate courses in themselves so we're only even scrape really the surface of it. First a quick warm-up question and just take a minute and think can someone give me a quick estimate of how many AIs are in this room? Anyone throws out a guess?": {'bart_unknown': {'precision': [0.6726589202880859],
   'recall': [0.6717042922973633],
   'f1': [0.672181248664856],
   'hashcode': 'microsoft/deberta-xlarge-mnli_L40_no-idf_version=0.3.12(hug_trans=4.35.2)

In [42]:
from collections import defaultdict
import statistics

def calculate_avg_std_evaluation(chunks):
    """Aggregate per-context metric scores into a mean and std per model.

    Args:
        chunks: Dict mapping a context to {model name: metrics}, where metrics
            has "precision", "recall" and "f1" keys, each a sequence whose
            first element is the score for that context.

    Returns:
        defaultdict(dict) mapping each model name to a dict with keys
        avg_precision, std_precision, avg_recall, std_recall, avg_f1, std_f1
        (in that order). The std is the sample standard deviation; it is NaN
        when a model has only one score, because the sample standard deviation
        is undefined for a single data point.
    """
    metric_names = ("precision", "recall", "f1")

    # model -> metric -> scores collected across all contexts
    scores_by_model = defaultdict(lambda: defaultdict(list))
    for model_results in chunks.values():
        for model, metrics in model_results.items():
            for metric in metric_names:
                scores_by_model[model][metric].append(metrics[metric][0])

    stats_by_model = defaultdict(dict)
    for model, scores_by_metric in scores_by_model.items():
        for metric in metric_names:
            scores = scores_by_metric[metric]
            stats_by_model[model][f"avg_{metric}"] = statistics.fmean(scores)
            # statistics.stdev raises StatisticsError for fewer than two values.
            stats_by_model[model][f"std_{metric}"] = (
                statistics.stdev(scores) if len(scores) > 1 else float("nan")
            )

    return stats_by_model

# Collapse the per-passage scores into one mean/std summary per model.
average_eval_scores = calculate_avg_std_evaluation(eval_scores)
average_eval_scores

defaultdict(dict,
            {'bart_unknown': {'avg_precision': 0.5910171142646244,
              'std_precision': 0.09137459813844907,
              'avg_recall': 0.6055950360638754,
              'std_recall': 0.05155032826576651,
              'avg_f1': 0.5959542640617915,
              'std_f1': 0.06651286231223734},
             't5_squad2': {'avg_precision': 0.6487540134361812,
              'std_precision': 0.07849524324425095,
              'avg_recall': 0.6351044586726597,
              'std_recall': 0.0606487204462402,
              'avg_f1': 0.6397578375680106,
              'std_f1': 0.05948630015059276},
             'bart_discord': {'avg_precision': 0.6681201645306178,
              'std_precision': 0.0547331236798575,
              'avg_recall': 0.630831411906651,
              'std_recall': 0.049025394193604174,
              'avg_f1': 0.6472644856997899,
              'std_f1': 0.03902768149467393},
             'bart_nq': {'avg_precision': 0.6185752289635794,
       

In [44]:
# Rank models from best to worst average F1 and print a short report for each.
sorted_models = sorted(
    average_eval_scores.items(),
    key=lambda entry: entry[1]['avg_f1'],
    reverse=True,
)

for model, stats in sorted_models:
    report_lines = [
        f"Model: {model}",
        f"F1 Score - Average and standard deviation: ({stats['avg_f1']:.3f} ± {stats['std_f1']:.3f})",
        f"Precision - Average and standard deviation: ({stats['avg_precision']:.3f} ± {stats['std_precision']:.3f})",
        f"Recall - Average and standard deviation: ({stats['avg_recall']:.3f} ± {stats['std_recall']:.3f})",
        "-" * 60,
    ]
    print("\n".join(report_lines))

Model: bart_discord
F1 Score - Average and standard deviation: (0.647 ± 0.039)
Precision - Average and standard deviation: (0.668 ± 0.055)
Recall - Average and standard deviation: (0.631 ± 0.049)
------------------------------------------------------------
Model: bart_eqg
F1 Score - Average and standard deviation: (0.647 ± 0.041)
Precision - Average and standard deviation: (0.657 ± 0.051)
Recall - Average and standard deviation: (0.640 ± 0.055)
------------------------------------------------------------
Model: t5_squad2
F1 Score - Average and standard deviation: (0.640 ± 0.059)
Precision - Average and standard deviation: (0.649 ± 0.078)
Recall - Average and standard deviation: (0.635 ± 0.061)
------------------------------------------------------------
Model: bart_nq
F1 Score - Average and standard deviation: (0.617 ± 0.059)
Precision - Average and standard deviation: (0.619 ± 0.072)
Recall - Average and standard deviation: (0.617 ± 0.054)
---------------------------------------------