In [169]:
import pickle
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    BartTokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
)

## Models

### [t5-small-squad2-question-generation](https://huggingface.co/allenai/t5-small-squad2-question-generation)

In [170]:
def gen_t5_squad2(chunks):
    """Generate one question per chunk with `allenai/t5-small-squad2-question-generation`.

    Generation is primed with the decoder prompt "What", so the decoded
    question is a continuation of that word.

    Args:
        chunks: Iterable of dicts; only the ``"text"`` entry of each dict is read.

    Returns:
        dict mapping each chunk's text to the first decoded question. Chunks
        with identical text share a single entry (the later one overwrites).
    """
    model_name = "allenai/t5-small-squad2-question-generation"
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # The decoder prompt is the same for every chunk, so tokenize it once
    # instead of once per loop iteration.
    # The last token id is sliced off; presumably this is the end-of-sequence
    # marker added by add_special_tokens -- TODO confirm.
    decoder_input_ids = tokenizer(
        ["What"],
        add_special_tokens=True,
        return_tensors="pt",
    )["input_ids"][:, :-1]

    chunks_questions = {}
    for chunk in chunks:
        # Calling the tokenizer directly replaces the deprecated
        # `batch_encode_plus`; a one-element list keeps the batch dimension.
        encoder_ids = tokenizer(
            [chunk["text"]],
            add_special_tokens=True,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        model_output = model.generate(
            **encoder_ids,
            decoder_input_ids=decoder_input_ids,
        )
        generated_questions = tokenizer.batch_decode(
            model_output, skip_special_tokens=True
        )

        # The batch holds exactly one chunk, so keep its single decoded question.
        chunks_questions[chunk["text"]] = generated_questions[0]

    return chunks_questions

### [Salesforce/discord_qg](https://huggingface.co/Salesforce/discord_qg)

> Uses the start word 'What' for generated questions

In [171]:
def gen_bart_discord(chunks):
    """Generate one question per chunk with the `Salesforce/discord_qg` model.

    Generation is primed with the decoder prompt "What", so the decoded
    question is a continuation of that word.

    Args:
        chunks: Iterable of dicts; only the ``"text"`` entry of each dict is read.

    Returns:
        dict mapping each chunk's text to the first decoded question. Chunks
        with identical text share a single entry (the later one overwrites).
    """
    qg_tokenizer = AutoTokenizer.from_pretrained("Salesforce/discord_qg")
    qg_model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/discord_qg")

    # Loop-invariant: the "What" prompt is tokenized once, not once per chunk.
    # The last token id is sliced off; presumably this is the end-of-sequence
    # marker added by add_special_tokens -- TODO confirm.
    decoder_input_ids = qg_tokenizer(
        ["What"],
        add_special_tokens=True,
        return_tensors="pt",
    )["input_ids"][:, :-1]

    chunks_questions = {}
    for chunk in chunks:
        # Calling the tokenizer directly replaces the deprecated
        # `batch_encode_plus`; a one-element list keeps the batch dimension.
        encoder_ids = qg_tokenizer(
            [chunk["text"]],
            add_special_tokens=True,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        model_output = qg_model.generate(
            **encoder_ids,
            decoder_input_ids=decoder_input_ids,
        )
        generated_questions = qg_tokenizer.batch_decode(
            model_output, skip_special_tokens=True
        )

        # The batch holds exactly one chunk, so keep its single decoded question.
        chunks_questions[chunk["text"]] = generated_questions[0]

    return chunks_questions

### [bart-qg-nq-checkpoint](https://huggingface.co/McGill-NLP/bart-qg-nq-checkpoint)

In [172]:
def gen_bart_nq(chunks):
    """Generate one question per chunk with `McGill-NLP/bart-qg-nq-checkpoint`.

    The tokenizer is loaded from `facebook/bart-base` while the weights come
    from the McGill-NLP checkpoint. Generation is primed with the decoder
    prompt "What", so the decoded question is a continuation of that word.

    Args:
        chunks: Iterable of dicts; only the ``"text"`` entry of each dict is read.

    Returns:
        dict mapping each chunk's text to the first decoded question. Chunks
        with identical text share a single entry (the later one overwrites).
    """
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    model = AutoModelForSeq2SeqLM.from_pretrained("McGill-NLP/bart-qg-nq-checkpoint")

    # Loop-invariant: the "What" prompt is tokenized once, not once per chunk.
    # The last token id is sliced off; presumably this is the end-of-sequence
    # marker added by add_special_tokens -- TODO confirm.
    decoder_input_ids = tokenizer(
        ["What"],
        add_special_tokens=True,
        return_tensors="pt",
    )["input_ids"][:, :-1]

    chunks_questions = {}
    for chunk in chunks:
        # Calling the tokenizer directly replaces the deprecated
        # `batch_encode_plus`; a one-element list keeps the batch dimension.
        encoder_ids = tokenizer(
            [chunk["text"]],
            add_special_tokens=True,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        model_output = model.generate(
            **encoder_ids,
            decoder_input_ids=decoder_input_ids,
        )
        generated_questions = tokenizer.batch_decode(
            model_output, skip_special_tokens=True
        )

        # The batch holds exactly one chunk, so keep its single decoded question.
        chunks_questions[chunk["text"]] = generated_questions[0]

    return chunks_questions

### [bart-eqg-question-generator](https://huggingface.co/voidful/bart-eqg-question-generator)

In [173]:
def gen_bart_eqg(chunks):
    """Generate one question per chunk with `voidful/bart-eqg-question-generator`.

    Generation is primed with the decoder prompt "What", so the decoded
    question is a continuation of that word.

    Args:
        chunks: Iterable of dicts; only the ``"text"`` entry of each dict is read.

    Returns:
        dict mapping each chunk's text to the first decoded question. Chunks
        with identical text share a single entry (the later one overwrites).
    """
    qg_tokenizer = AutoTokenizer.from_pretrained("voidful/bart-eqg-question-generator")
    qg_model = AutoModelForSeq2SeqLM.from_pretrained(
        "voidful/bart-eqg-question-generator"
    )

    # Loop-invariant: the "What" prompt is tokenized once, not once per chunk.
    # The last token id is sliced off; presumably this is the end-of-sequence
    # marker added by add_special_tokens -- TODO confirm.
    decoder_input_ids = qg_tokenizer(
        ["What"],
        add_special_tokens=True,
        return_tensors="pt",
    )["input_ids"][:, :-1]

    chunks_questions = {}
    for chunk in chunks:
        # Calling the tokenizer directly replaces the deprecated
        # `batch_encode_plus`; a one-element list keeps the batch dimension.
        encoder_ids = qg_tokenizer(
            [chunk["text"]],
            add_special_tokens=True,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        model_output = qg_model.generate(
            **encoder_ids,
            decoder_input_ids=decoder_input_ids,
        )
        generated_questions = qg_tokenizer.batch_decode(
            model_output, skip_special_tokens=True
        )

        # The batch holds exactly one chunk, so keep its single decoded question.
        chunks_questions[chunk["text"]] = generated_questions[0]

    return chunks_questions

### [context-only-question-generator](https://huggingface.co/voidful/context-only-question-generator)

> Based on pretrained `bart-base` model

In [174]:
def gen_bart_unknown(chunks):
    """Generate one question per chunk with `voidful/context-only-question-generator`.

    Generation is primed with the decoder prompt "What", so the decoded
    question is a continuation of that word.

    Args:
        chunks: Iterable of dicts; only the ``"text"`` entry of each dict is read.

    Returns:
        dict mapping each chunk's text to the first decoded question. Chunks
        with identical text share a single entry (the later one overwrites).
    """
    qg_tokenizer = AutoTokenizer.from_pretrained(
        "voidful/context-only-question-generator"
    )
    qg_model = AutoModelForSeq2SeqLM.from_pretrained(
        "voidful/context-only-question-generator"
    )

    # Loop-invariant: the "What" prompt is tokenized once, not once per chunk.
    # The last token id is sliced off; presumably this is the end-of-sequence
    # marker added by add_special_tokens -- TODO confirm.
    decoder_input_ids = qg_tokenizer(
        ["What"],
        add_special_tokens=True,
        return_tensors="pt",
    )["input_ids"][:, :-1]

    chunks_questions = {}
    for chunk in chunks:
        # Calling the tokenizer directly replaces the deprecated
        # `batch_encode_plus`; a one-element list keeps the batch dimension.
        encoder_ids = qg_tokenizer(
            [chunk["text"]],
            add_special_tokens=True,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        model_output = qg_model.generate(
            **encoder_ids,
            decoder_input_ids=decoder_input_ids,
        )
        generated_questions = qg_tokenizer.batch_decode(
            model_output, skip_special_tokens=True
        )

        # The batch holds exactly one chunk, so keep its single decoded question.
        chunks_questions[chunk["text"]] = generated_questions[0]

    return chunks_questions

### Human generated questions

> Used as the reference questions for evaluation

In [175]:
# Human-written reference questions, one entry per chunk position (the
# trailing comment on each row is that position). An empty list means no
# question could be written for that chunk.
questions_list = [
    [], # 0
    [], # 1
    ["What do most smartphones have inside them nowadays?"], # 2
    ["What is the main method that humans use to interact with technology?"], # 3
    ["What does natural language mean?"], # 4
    ["What are the two main goals of natural language processing?"], # 5
    ["What is the term used to describe how people feel about a movie or product?"], # 6
    [], # 7
    [], # 8
    ["What implementation system did early NLP systems use?"], # 9
    ["Why do classical machine learning algorithms have a good trade-off?"], # 10
    [], # 11
    ["What is textual entailment?"], # 12
    ["What are the most common uses of summarization?"], # 13
    ["What common NLP systems do you interact with?"], # 14
    ["What are usages of NLP?"], # 15
    ["How does Dali generate images?"], # 16
    ["What is the general overview of NFP?"], # 17
    ["What is a document?"], # 18
    ["What are the key steps in the pre-processing pipeline for natural language processing?"], # 19
    ["Why is tokenization considered a mandatory step in the NLP pre-processing pipeline?"], # 20
    ["How does part-of-speech annotation contribute to language understanding?"], # 21
    ["Why is context important?"], # 22
    ["Why do we use lemmatization or stemming?"], # 23
    ["What is the advantage of stemming?"], # 24
    ["What are stop words?"], # 25
    ["Why do we filter words by frequency?"], # 26
    ["Why is the Eliza paper important?"], # 27
    [], # 28
    [], # 29
    [], # 30
    [], # 31
    ["How does stemming work?"], # 32
    [], # 33
    [], # 34
]


def gen_reference_questions(chunks):
    """Pair every chunk's text with the reference questions at the same position.

    Args:
        chunks: Sequence of dicts; only the ``"text"`` entry of each dict is read.

    Returns:
        dict mapping chunk text to the matching entry of ``questions_list``
        (the very same list object, possibly empty).

    Raises:
        IndexError: if ``chunks`` has more items than ``questions_list``.
    """
    return {
        chunk["text"]: questions_list[position]
        for position, chunk in enumerate(chunks)
    }

## Chunking

In [176]:
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# open transcripts that come from a trusted source.
with open("comp3074_lecture_2.pkl", "rb") as file:
    chunks = pickle.load(file)

# The file is only needed for loading, so the merging below runs outside the
# `with` block and the handle is released immediately.
segments = chunks["chunks"]
time_chunks = []
current_sentence = ""
start_timestamp = None
# Minimum span of a merged chunk, in the unit of the segment timestamps
# (presumably seconds -- TODO confirm).
min_duration = 60

# If the final segment has a missing (or zero) end time, reuse its start time
# so the duration arithmetic below has a number to work with. The `segments`
# guard keeps an empty transcript from raising IndexError.
if segments and not segments[-1]["timestamp"][1]:
    last_start = segments[-1]["timestamp"][0]
    segments[-1]["timestamp"] = (last_start, last_start)

for chunk in segments:
    text = chunk["text"]
    timestamp = chunk["timestamp"]

    if start_timestamp is None:
        # Start a new sentence
        start_timestamp = timestamp[0]

    current_sentence += text

    # `endswith` (instead of indexing `[-1]`) treats an empty or
    # whitespace-only segment as "sentence not finished" rather than crashing.
    sentence_completed = text.rstrip().endswith((".", "!", "?"))
    time_elapsed = (timestamp[1] - start_timestamp) >= min_duration

    # TODO: add overlap between chunks
    if sentence_completed and time_elapsed:
        time_chunks.append(
            {
                "timestamp": (start_timestamp, timestamp[1]),
                "text": current_sentence.strip(),
            }
        )
        current_sentence = ""
        start_timestamp = None

# Whatever is left after the last completed chunk becomes a final, shorter chunk.
if current_sentence.strip():
    time_chunks.append(
        {
            "timestamp": (start_timestamp, segments[-1]["timestamp"][1]),
            "text": current_sentence.strip(),
        }
    )

time_chunks

[{'timestamp': (0.0, 66.0),
  'text': "So in this lecture we're going to still be quite general. I'm going to talk about natural language processing and I'm going to talk in the first part I'm going to talk about what it is and in the second part I am going to oh good point in a second part I am going to talk about to introduce the concept of an NLP pipeline which is something that we'll be dealing with in the lab a little bit tomorrow and then from next Friday onwards. So first a quick overview of natural English processing. This is a topic which is usually reserved for like graduate courses in themselves so we're only even scrape really the surface of it. First a quick warm-up question and just take a minute and think can someone give me a quick estimate of how many AIs are in this room? Anyone throws out a guess?"},
 {'timestamp': (66.0, 128.0),
  'text': "I guess. Do you want individual phones or do you want like the types of hands? Well, that's the question, isn't it? Just as I ha

In [177]:
# One {chunk text: generated question} dict per question-generation model.
models_questions = {
    "bart_unknown": gen_bart_unknown(time_chunks),
    # "t5_squad2": gen_t5_squad2(time_chunks),
    "bart_discord": gen_bart_discord(time_chunks),
    "bart_nq": gen_bart_nq(time_chunks),
    "bart_eqg": gen_bart_eqg(time_chunks),
}

# Pivot to {chunk text: {model: question}} so every chunk lists all models' questions.
chunks_by_model = {}

for model, chunk_questions in models_questions.items():
    for chunk, question in chunk_questions.items():
        chunks_by_model.setdefault(chunk, {})[model] = question

chunks_by_model

{"So in this lecture we're going to still be quite general. I'm going to talk about natural language processing and I'm going to talk in the first part I'm going to talk about what it is and in the second part I am going to oh good point in a second part I am going to talk about to introduce the concept of an NLP pipeline which is something that we'll be dealing with in the lab a little bit tomorrow and then from next Friday onwards. So first a quick overview of natural English processing. This is a topic which is usually reserved for like graduate courses in themselves so we're only even scrape really the surface of it. First a quick warm-up question and just take a minute and think can someone give me a quick estimate of how many AIs are in this room? Anyone throws out a guess?": {'bart_unknown': 'What is the main topic of this lecture?',
  'bart_discord': 'What is natural language processing?',
  'bart_nq': 'What is the purpose of the natural language processing lecture',
  'bart_eq

## Evaluation

> Using BERTScore

In [178]:
def evaluate_contexts_all_metrics(
    chunks,
    reference_chunks=None,
    model_type="microsoft/deberta-xlarge-mnli",
):
    """Score every model's generated question against the human reference(s) with BERTScore.

    Args:
        chunks: dict ``{context text: {model name: generated question}}``.
        reference_chunks: Optional dict ``{context text: list of reference
            questions}``. When omitted it is built with
            ``gen_reference_questions(time_chunks)``, i.e. from the
            notebook-level ``time_chunks``.
        model_type: Value forwarded as ``model_type`` to ``bertscore.compute``.

    Returns:
        dict ``{context text: {model name: result}}`` where ``result`` is
        whatever ``bertscore.compute`` returned for that single prediction.
        Contexts with no reference question are left out.

    Raises:
        KeyError: if a context in ``chunks`` is missing from the references.
    """
    bertscore = evaluate.load("bertscore")
    if reference_chunks is None:
        reference_chunks = gen_reference_questions(time_chunks)

    output = {}
    for context, models in chunks.items():
        references = reference_chunks[context]
        if not references:
            # No human question for this context, so there is nothing to compare against.
            continue

        output[context] = {
            model: bertscore.compute(
                # One prediction paired with the full list of its references:
                # the nested list keeps prediction and reference counts equal
                # even when a context has several reference questions.
                predictions=[prediction],
                references=[references],
                model_type=model_type,
            )
            for model, prediction in models.items()
        }

    return output


# Score every model's question for every chunk; the bare name on the last line displays the result.
eval_scores = evaluate_contexts_all_metrics(chunks_by_model)
eval_scores

{"Basically the number of students. You say a bit more than the number of students. Anybody else? Joe. Right. I mean you're getting to the point that I was trying to get at basically both of you which is that nowadays right most smartphones have actually quite a bit of embedded AI's inside them right and you mentioned Siri that's an excellent point but it's not the only thing. So if you have that one language engine which is like what you consider me Siri but that thing contains its own kind of subsystems to protest speech and then when you take a picture your camera has some AI inside it to correct pictures and then when you store that into your photo gallery app then there is some AI recognizing images right if you go into your photo app you can see like oh I want to search for cats and then he's gonna get you pictures of cats well the phone doesn't know what a cat is right there is some system specialized on that thing to deal with it. If you use things like Google translate then yo

In [179]:
from collections import defaultdict
import statistics

def calculate_avg_std_evaluation(chunks):
    """Aggregate per-context metric results into a per-model mean and standard deviation.

    Args:
        chunks: dict ``{context: {model: metrics}}`` where ``metrics`` holds
            ``'precision'``, ``'recall'`` and ``'f1'`` sequences; only the
            first element of each sequence is used.

    Returns:
        ``defaultdict(dict)`` mapping each model to a dict with the keys
        ``avg_precision``, ``std_precision``, ``avg_recall``, ``std_recall``,
        ``avg_f1`` and ``std_f1``. The standard deviation is the sample
        standard deviation (``statistics.stdev``); it is NaN for a model with
        a single score, where ``statistics.stdev`` would raise.
    """
    metric_names = ("precision", "recall", "f1")

    # scores_by_model[model][metric] -> list of per-context scores
    scores_by_model = defaultdict(lambda: {name: [] for name in metric_names})
    for models in chunks.values():
        for model, metrics in models.items():
            for name in metric_names:
                scores_by_model[model][name].append(metrics[name][0])

    stats_by_model = defaultdict(dict)
    for model, scores in scores_by_model.items():
        summary = {}
        for name in metric_names:
            values = scores[name]
            summary[f"avg_{name}"] = sum(values) / len(values)
            # statistics.stdev needs at least two data points.
            summary[f"std_{name}"] = (
                statistics.stdev(values) if len(values) > 1 else float("nan")
            )
        stats_by_model[model] = summary

    return stats_by_model

# Collapse the per-chunk scores into one mean/std summary per model; the bare name displays it.
average_eval_scores = calculate_avg_std_evaluation(eval_scores)
average_eval_scores

defaultdict(dict,
            {'bart_unknown': {'avg_precision': 0.6704072753588358,
              'std_precision': 0.09123015142290129,
              'avg_recall': 0.6581563018262386,
              'std_recall': 0.10482218957264981,
              'avg_f1': 0.6622983440756798,
              'std_f1': 0.09231322315710999},
             'bart_discord': {'avg_precision': 0.7442718942960104,
              'std_precision': 0.11588179673883213,
              'avg_recall': 0.7237076982855797,
              'std_recall': 0.11921076847362633,
              'avg_f1': 0.7324004024267197,
              'std_f1': 0.11381237276421909},
             'bart_nq': {'avg_precision': 0.6297566716869673,
              'std_precision': 0.08921470427139838,
              'avg_recall': 0.6543075169126192,
              'std_recall': 0.08897197711891967,
              'avg_f1': 0.6398111867407957,
              'std_f1': 0.08082166144996884},
             'bart_eqg': {'avg_precision': 0.6997241452336311,
      

In [180]:
# Rank the models from best to worst average F1.
sorted_models = sorted(
    average_eval_scores.items(),
    key=lambda entry: entry[1]['avg_f1'],
    reverse=True,
)

for model, stats in sorted_models:
    # Build the whole report for one model, then emit it with a single print.
    report_lines = [
        f"Model: {model}",
        f"F1 Score - Average and standard deviation: ({stats['avg_f1']:.3f} ± {stats['std_f1']:.3f})",
        f"Precision - Average and standard deviation: ({stats['avg_precision']:.3f} ± {stats['std_precision']:.3f})",
        f"Recall - Average and standard deviation: ({stats['avg_recall']:.3f} ± {stats['std_recall']:.3f})",
        "-" * 60,
    ]
    print("\n".join(report_lines))

Model: bart_discord
F1 Score - Average and standard deviation: (0.732 ± 0.114)
Precision - Average and standard deviation: (0.744 ± 0.116)
Recall - Average and standard deviation: (0.724 ± 0.119)
------------------------------------------------------------
Model: bart_eqg
F1 Score - Average and standard deviation: (0.694 ± 0.098)
Precision - Average and standard deviation: (0.700 ± 0.098)
Recall - Average and standard deviation: (0.691 ± 0.108)
------------------------------------------------------------
Model: bart_unknown
F1 Score - Average and standard deviation: (0.662 ± 0.092)
Precision - Average and standard deviation: (0.670 ± 0.091)
Recall - Average and standard deviation: (0.658 ± 0.105)
------------------------------------------------------------
Model: bart_nq
F1 Score - Average and standard deviation: (0.640 ± 0.081)
Precision - Average and standard deviation: (0.630 ± 0.089)
Recall - Average and standard deviation: (0.654 ± 0.089)
------------------------------------------

In [183]:
# Hard-coded chunk positions whose generated questions are left out of the listing below.
indexes_to_remove = [1, 2, 3, 8, 9, 12, 14, 15, 16, 17, 18, 28, 29, 30, 31, 33, 34]
bart_discord_questions = [
    question
    for position, question in enumerate(models_questions['bart_discord'].values())
    if position not in indexes_to_remove
]
bart_discord_questions

['What is natural language processing?',
 'What did the field of natural language processing try to merge insights from?',
 'What does NLP do?',
 'What are the stats from two years ago?',
 'What does natural language generis and systems do?',
 'What were the classical machine learning algorithms?',
 'What did Kaggle do?',
 'What are the most common uses of summarization?',
 'What does the pre-processing pipeline look like?',
 'What does tokenization do?',
 'What does part of speech mean?',
 'What does the grammatical context do?',
 'What is the goal of standardization?',
 'What does leimatization do?',
 'What is the last step?',
 'What do we filter words by?',
 'What was the name of the paper?',
 'What will happen when the process uses the engineering form?']

In [186]:
# Hard-coded chunk positions whose generated questions are left out of the listing below.
indexes_to_remove = [1, 2, 3, 8, 9, 12, 14, 15, 16, 17, 18, 28, 29, 30, 31, 33, 34]
# NOTE(review): this holds the 'bart_nq' questions even though the variable is
# still called `bart_discord_questions`; the name is kept so any later cell
# that reads it keeps working -- consider renaming it.
bart_discord_questions = [
    question
    for position, question in enumerate(models_questions['bart_nq'].values())
    if position not in indexes_to_remove
]
bart_discord_questions

['What is the purpose of the natural language processing lecture',
 'What is the difference between natural language and artificial language',
 'What is the meaning of natural language understanding',
 'What is the purpose of english wikipedia',
 'What is the purpose of a chatbot',
 'What is the difference between classical and classical machine learning',
 'What is the difference between kaggle and deep learning',
 'What is the meaning of question answering in siri',
 'What is the purpose of the pre-processing pipeline',
 'What is the meaning of tokenization in poetry',
 'What is the difference between dore and cat check',
 'What is the meaning of dogs in speech',
 'What does it mean to add inflection to a sentence',
 'What is the difference between stemming and stemming',
 'What are the stop words in nfp model',
 'What is the difference between big ben and empire state building',
 'What is the meaning of unigrams in math',
 'What is the purpose of stemming in java']