## Import and Download

In [None]:
import os
from google.colab import drive

# Mount Google Drive so that files stored there are reachable under /content/drive
drive.mount('/content/drive')

# Move to the directory that holds the notebooks and the data set, so the relative
# file names used later resolve against it
# (note: you may need to change this path according to your directory structure)
os.chdir('/content/drive/MyDrive/')

# Check the current working directory (last expression, so the notebook displays it)
os.getcwd()

Mounted at /content/drive


'/content/drive/MyDrive'

In [None]:
# Install the libraries used by the rest of the notebook. The lines are unpinned; in the
# recorded run they resolved to datasets 2.12.0, evaluate 0.4.0, transformers 4.29.2
# and accelerate 0.19.0.
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.19.0


## Read the Dataset

In [None]:
import json
def load_data(file_path):
    """Load a SQuAD-style JSON file and return the content of its 'data' field.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The value stored under the top-level 'data' key of the parsed JSON.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If the parsed object has no 'data' key.
    """
    # Read as UTF-8 explicitly so non-ASCII text (e.g. "Beyoncé") is decoded the
    # same way regardless of the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data['data']

# Read the train and dev splits (SQuAD v2.0, going by the file names); the relative
# paths resolve against the current working directory.
train_raw = load_data('train-v2.0.json')

valid_raw = load_data('dev-v2.0.json')

In [None]:
want_unanswerable = False # True: unanswerable questions are kept (SQuAD 2 setting), False: they are dropped (SQuAD 1 setting)

In [None]:
def extract_train_dataset(data, unanswerable):
    """Flatten SQuAD articles into parallel context / question / answer lists.

    The context is repeated for every question of a paragraph, so index i of the
    three returned lists always describes one (context, question, answer) triple.

    Args:
        data: Iterable of articles, each with a 'paragraphs' list whose entries
            hold a 'context' string and a 'qas' list.
        unanswerable: When truthy, questions flagged 'is_impossible' are kept and
            given the single-space answer " "; otherwise they are skipped.

    Returns:
        A tuple (contexts, questions, answers) of equally long lists. For
        answerable questions only the first listed answer text is used.
    """
    contexts = []
    questions = []
    answers = []

    for article in data:
        for paragraph in article['paragraphs']:
            for qas in paragraph['qas']:
                # Records without an 'is_impossible' flag (e.g. SQuAD 1.x-format files)
                # are treated as answerable instead of raising KeyError.
                if not qas.get('is_impossible', False):
                    target = qas['answers'][0]['text']
                elif unanswerable:
                    target = " "
                else:
                    continue

                # Duplicate the context so that each question gets its own copy.
                contexts.append(paragraph['context'])
                questions.append(qas['question'])
                answers.append(target)

    return contexts, questions, answers

# Build the parallel training lists; unanswerable questions are included only when want_unanswerable is True.
train_dataset_contexts, train_dataset_questions, train_dataset_answers = extract_train_dataset(train_raw, want_unanswerable)
# The validation split is not flattened this way: extract_valid_dataset builds it and keeps every reference answer of a question.

In [None]:
# Show the first training triple (context, question, answer), one item per line.
print(train_dataset_contexts[0], train_dataset_questions[0], train_dataset_answers[0], sep="\n")

Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
When did Beyonce start becoming popular?
in the late 1990s


In [None]:
def extract_valid_dataset(data, unanswerable):
    """Flatten SQuAD articles into one record per question, keeping all reference answers.

    Args:
        data: Iterable of articles, each with a 'paragraphs' list whose entries
            hold a 'context' string and a 'qas' list.
        unanswerable: When truthy, questions flagged 'is_impossible' are kept with
            a placeholder reference (one empty answer text at offset 0);
            otherwise they are skipped.

    Returns:
        A list of dicts with the keys 'id', 'context', 'question' and 'answers',
        where 'answers' is {'text': [...], 'answer_start': [...]}.
    """
    dataset = []

    for article in data:
        for paragraph in article['paragraphs']:
            for qas in paragraph['qas']:
                # Records without an 'is_impossible' flag (e.g. SQuAD 1.x-format files)
                # are treated as answerable instead of raising KeyError.
                if not qas.get('is_impossible', False):
                    references = {
                        'text': [a['text'] for a in qas['answers']],
                        'answer_start': [a['answer_start'] for a in qas['answers']],
                    }
                elif unanswerable:
                    # Placeholder so that every kept record has exactly one reference.
                    references = {'text': [''], 'answer_start': [0]}
                else:
                    continue

                dataset.append({
                    'id': qas['id'],
                    'context': paragraph['context'],
                    'question': qas['question'],
                    'answers': references,
                })

    return dataset

# Build the validation records from the dev split, honouring the same want_unanswerable switch as the training data.
valid_dataset = extract_valid_dataset(valid_raw, want_unanswerable)

In [None]:
# Inspect the first validation record (id, context, question and its reference answers).
valid_dataset[0]

{'id': '56ddde6b9a695914005b9628',
 'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'question': 'In what country is Normandy located?',
 'answers': {'text': ['France', 'France', 'France', 'France'],
  'answer_start': [159, 159, 159, 159]}}

## Preprocessing

In [None]:
# Tokenization settings read by ContextQuestionDataset.__getitem__
max_length = 512  # padding / truncation length passed to the tokenizer
stride = 256  # value passed as `stride` to the tokenizer call
# Optimisation settings forwarded to TrainingArguments
num_train_epochs = 3
learning_rate = 2e-5

# NOTE(review): n_best and max_answer_length are not referenced anywhere else in the
# visible notebook (generation uses max_length=1000) - confirm whether they are still needed.
n_best = 20
max_answer_length = 50

In [None]:
import torch
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
from torch.utils.data import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer#, AdamW

from tqdm import tqdm

# Load the pretrained tokenizer and language model for the selected checkpoint name
# (alternative checkpoint: "gpt2-medium")
#model_name = "gpt2-medium"
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as padding token so that the padding="max_length"
# request made in ContextQuestionDataset has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/353M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
class ContextQuestionDataset(Dataset):
    """Dataset turning (context, question, answer) triples into language-model examples.

    Each item tokenizes "<context> <question>" as the first sequence and the answer
    as the second one, padded/truncated to the notebook-level ``max_length``.
    The module-level names ``max_length`` and ``stride`` must be defined before
    items are accessed.
    """

    def __init__(self, contexts, questions, answers, tokenizer):
        """Store the parallel lists and the tokenizer.

        Args:
            contexts: Sequence of context strings.
            questions: Sequence of question strings, aligned with ``contexts``.
            answers: Sequence of answer strings, aligned with ``contexts``.
            tokenizer: Callable tokenizer accepting the keyword arguments used in
                ``__getitem__`` and returning a mapping with "input_ids" and
                "attention_mask" tensors whose first dimension indexes rows.
        """
        self.contexts = contexts
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of examples (one per stored context)."""
        return len(self.contexts)

    def __getitem__(self, index):
        """Tokenize one triple and return 1-D input_ids, attention_mask and labels.

        Args:
            index: Position of the example in the stored lists.

        Returns:
            Dict with the keys "input_ids", "attention_mask" and "labels"; the
            labels are a copy of the input ids.
        """
        context = self.contexts[index]
        question = self.questions[index]
        answer = self.answers[index]

        # Combine context and question
        input_text = f"{context} {question}"

        # Tokenize the input text and answer; only the first sequence
        # (context + question) may be truncated, never the answer.
        encoding = self.tokenizer(
            input_text,
            answer,
            add_special_tokens=True,
            max_length=max_length,
            truncation="only_first",
            padding="max_length",
            return_tensors="pt",
            return_overflowing_tokens=True,
            stride=stride
        )

        # Take row 0 explicitly instead of squeeze(): with return_overflowing_tokens=True
        # a tokenizer may return several rows (one per overflow window), and squeeze()
        # would then leave a 2-D tensor that cannot be stacked with 1-D items in a batch.
        input_ids = encoding["input_ids"][0]
        attention_mask = encoding["attention_mask"][0]
        # NOTE(review): padded positions are not masked out (e.g. with -100), so they
        # presumably contribute to the LM loss - confirm this is intended.
        labels = input_ids.clone()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

# Wrap the parallel training lists in a dataset that tokenizes one example per index
train_dataset = ContextQuestionDataset(train_dataset_contexts, train_dataset_questions, train_dataset_answers, tokenizer)
# valid_dataset is not wrapped: it stays the list of raw records built by extract_valid_dataset, whose 'id', 'context', 'question' and 'answers' fields are read directly by the evaluation cells.

## Training

In [None]:
from transformers import Trainer, TrainingArguments, default_data_collator

# Training configuration: no evaluation during training, one checkpoint saved per epoch.
training_args = TrainingArguments(
    # First positional argument: output directory of the run
    # (alternative suffix kept for reference: '_c-q_a_without_unanswerable').
    model_name+'_prova',
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    # fp16=True is currently disabled; enable it only when training on a CUDA GPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # ContextQuestionDataset yielding tokenized examples
    # NOTE(review): valid_dataset is a list of raw, un-tokenized records; evaluation_strategy="no"
    # means it is not used during training, but it presumably cannot be fed to the model as is - confirm.
    eval_dataset=valid_dataset,
    data_collator=default_data_collator,  # batches the per-item tensors returned by the dataset
    tokenizer=tokenizer,  # handed to the Trainer together with the model
)
# Uncomment the next line to train (a GPU is recommended). fp16=True is already commented out in TrainingArguments above.
#trainer.train()

## Evaluation

In [None]:
# Load the fine-tuned model from a checkpoint directory and move it to the selected device.
# NOTE(review): the path is absolute and points to a 'distilgpt2_c-q_a_without_unanswerable' run,
# whereas the training cell writes to model_name+'_prova' - adjust it to the checkpoint to evaluate.
import torch

model = GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/distilgpt2_c-q_a_without_unanswerable/checkpoint-32559')
# Last expression of the cell, so the notebook also displays the module structure.
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
context = "Natural Language Processing (NLP) concerns the computational analysis, interpretation, and production of natural language in either written or spoken form. It is an interdisciplinary research field, interesting from both theoretical and practical perspectives. Decades of research have resulted in a vast collection of symbolic, stochastic, and deep-learning based models. Such models have enable the development of applications in a vast array of fields, such as human-machine interaction and chatbots, search and question answering, translation and multilingual systems, multimodal and captioning systems, speech analysis, voice interaction and personal assistants, sentiment analysis, etc, etc. This course will provide an introduction to the important problems, models and applications in NLP. The history of NLP involves many successes and many failures, demonstrating the complexity of the topic. Initially popular symbolic models turned out to be unable to capture the intrinsic complexity of natural language. Statistical techniques such as vector-space representations and linear classifiers (e.g. Support Vector Machines) enabled important applications such as web search spam detection. Word embedding techniques then became popular and improved performance on all aspects of NLP: from morphology to semantics and dialogue. More recently sequence-to-sequence modeling with deep learning techniques have greatly improved performance on hard NLP problems such machine translation and dialog generation."
# Alternative questions to try (uncomment exactly one):
#question = "Who is the subject of the context?"
question = "Which statistical techniques are used?"
#question = "What is NLP?"
#question = "What is Natural Language Processing about?"
# Same prompt layout as in ContextQuestionDataset: context, one space, question.
input_text = f"{context} {question}"

input_ids = tokenizer.encode(input_text, add_special_tokens=True, return_tensors="pt").to(device)
attention_mask = torch.ones_like(input_ids)  # no padding in a single prompt: attend to every token

output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=1000,
    do_sample=True,
    num_return_sequences=1,
    # Explicitly pass what generate() would otherwise pick by itself (with a warning).
    pad_token_id=tokenizer.eos_token_id,
)
# The returned sequence starts with the prompt tokens. Decode only the tokens after
# them instead of slicing the decoded string by len(input_text), which breaks
# whenever decoding does not reproduce the prompt character by character.
generated_ids = output[0][input_ids.shape[1]:]
answer = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(answer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


synthetic models and linear classifiers


### Metrics

Metrics **without** sampling

In [None]:
import evaluate
import collections
import numpy as np

# Greedy decoding (do_sample=False) over the whole validation set, scored with the SQuAD metric.
predicted_answers = []
real_answers = []
for valid_group in valid_dataset:
    # set current id (this is necessary for the function that will compute the metrics)
    curr_id = valid_group['id']
    # put context and question together (same layout as in ContextQuestionDataset)
    input_text = f"{valid_group['context']} {valid_group['question']}"
    # prepare the input
    input_ids = tokenizer.encode(input_text, add_special_tokens=True, return_tensors="pt").to(device)
    attention_mask = torch.ones_like(input_ids)  # single un-padded prompt: attend to every token
    # get the generated answer
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=1000,
        do_sample=False,
        num_return_sequences=1,
        # Explicit value avoids one "Setting `pad_token_id`..." warning per example.
        pad_token_id=tokenizer.eos_token_id,
    )
    # The returned sequence starts with the prompt tokens. Decode only the tokens after
    # them instead of slicing the decoded string by len(input_text), which breaks
    # whenever decoding does not reproduce the prompt character by character.
    generated_ids = output[0][input_ids.shape[1]:]
    output_string = tokenizer.decode(generated_ids, skip_special_tokens=True)
    # save result
    predicted_answers.append({'id' : curr_id, 'prediction_text' : output_string})
    real_answers.append({'id' : curr_id, 'answers' : valid_group['answers']})

metric = evaluate.load("squad")
metric.compute(predictions=predicted_answers, references=real_answers)

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

{'exact_match': 34.446693657219974, 'f1': 46.136498474852935}

Metrics **with** sampling

In [None]:
import evaluate
import collections
import numpy as np

# Sampling-based decoding (do_sample=True) over the whole validation set, scored with the SQuAD metric.
# Fix the torch RNG so that the sampled answers, and therefore the scores, are reproducible.
torch.manual_seed(42)

predicted_answers = []
real_answers = []
for valid_group in valid_dataset:
    # set current id (this is necessary for the function that will compute the metrics)
    curr_id = valid_group['id']
    # put context and question together (same layout as in ContextQuestionDataset).
    # NOTE: this cell previously built the prompt from the question alone, so scores
    # recorded before this change were obtained without the context.
    input_text = f"{valid_group['context']} {valid_group['question']}"
    # prepare the input
    input_ids = tokenizer.encode(input_text, add_special_tokens=True, return_tensors="pt").to(device)
    attention_mask = torch.ones_like(input_ids)  # single un-padded prompt: attend to every token
    # get the generated answer
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=1000,
        do_sample=True,
        num_return_sequences=1,
        # Explicit value avoids one "Setting `pad_token_id`..." warning per example.
        pad_token_id=tokenizer.eos_token_id,
    )
    # The returned sequence starts with the prompt tokens. Decode only the tokens after
    # them instead of slicing the decoded string by len(input_text), which breaks
    # whenever decoding does not reproduce the prompt character by character.
    generated_ids = output[0][input_ids.shape[1]:]
    output_string = tokenizer.decode(generated_ids, skip_special_tokens=True)
    # save result
    predicted_answers.append({'id' : curr_id, 'prediction_text' : output_string})
    real_answers.append({'id' : curr_id, 'answers' : valid_group['answers']})

metric = evaluate.load("squad")
metric.compute(predictions=predicted_answers, references=real_answers)

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'exact_match': 0.45546558704453444, 'f1': 4.021009345486661}

In [None]:
# Print a heading line followed by the module tree of the loaded model.
print("Architure: ", model, sep="\n")

Architure: 
GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [None]:
# Add up the element counts of all parameter tensors of the model.
total_params = sum(map(lambda tensor: tensor.numel(), model.parameters()))
print("Total number of parameters:", total_params)

Total number of parameters: 81912576
