In [1]:
!pip install transformers[setencepiece]
!pip install datasets
!pip install evaluate
!pip install accelerate

Collecting transformers[setencepiece]
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[setencepiece])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[setencepiece])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[setencepiece])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/

In [2]:
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, TrainingArguments, Trainer, pipeline, DefaultDataCollator, AutoModelForQuestionAnswering

import tqdm
import json
from datasets import Dataset, load_dataset, DatasetDict

from google.colab import drive
drive.mount('/content/drive')
from huggingface_hub import notebook_login

Mounted at /content/drive


In [26]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Load and Concatenate Datasets
squad_df = pd.concat([pd.read_json('/content/drive/MyDrive/MSAAI - 520 Natural Language Processing/Team 4/Practice/Se\'Lina/train-v1.1.json'), pd.read_json('/content/drive/MyDrive/MSAAI - 520 Natural Language Processing/Team 4/Practice/Se\'Lina/dev-v1.1.json')]).drop(columns=["version"]).reset_index()

squad_df.head()

Unnamed: 0,index,data
0,0,"{'title': 'University_of_Notre_Dame', 'paragra..."
1,1,"{'title': 'Beyoncé', 'paragraphs': [{'context'..."
2,2,"{'title': 'Montana', 'paragraphs': [{'context'..."
3,3,"{'title': 'Genocide', 'paragraphs': [{'context..."
4,4,"{'title': 'Antibiotics', 'paragraphs': [{'cont..."


# Data Pre-Processing

In [5]:
# Separate the Data into "TEXT", "QUESTION", "ANSWER"
text = list()
question = list()
answer = list()
answer_start = list()
answer_end = list()
ids = list()

for element in squad_df['data']:
  for paragraph in element['paragraphs']:
    context = paragraph['context']
    for qa in paragraph['qas']:
      quest = qa['question']
      id = qa['id']
      for ans in qa['answers']:
        text.append(context.strip())
        question.append(quest.strip())
        answer.append(ans['text'].strip())
        answer_start.append(ans['answer_start'])
        answer_end.append(ans['answer_start'] + len(ans['text']) - 1)
        ids.append(id)

qa_df = pd.DataFrame(zip(text, question, answer, answer_start, answer_end, ids), columns=['TEXT', 'QUESTION', 'ANSWER', 'BOS', 'EOS', 'ID'])
qa_df.head()

Unnamed: 0,TEXT,QUESTION,ANSWER,BOS,EOS,ID
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,540,5733be284776f41900661182
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,188,212,5733be284776f4190066117f
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,279,295,5733be284776f41900661180
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,381,419,5733be284776f41900661181
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,92,125,5733be284776f4190066117e


In [6]:
print(f'Original Total Entries: {len(qa_df)}')
print(f'Pruned Indentical Entries New Total {len(qa_df.drop_duplicates())}')

# Drop Duplicates
original_qa_df = qa_df.drop_duplicates()
original_qa_df.head()

Original Total Entries: 122325
Pruned Indentical Entries New Total 105815


Unnamed: 0,TEXT,QUESTION,ANSWER,BOS,EOS,ID
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,540,5733be284776f41900661182
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,188,212,5733be284776f4190066117f
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,279,295,5733be284776f41900661180
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,381,419,5733be284776f41900661181
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,92,125,5733be284776f4190066117e


In [7]:
ds = Dataset.from_dict({"context": original_qa_df['TEXT'].to_list(), "question": original_qa_df["QUESTION"].to_list(), "answer_start": original_qa_df["BOS"].to_list(), "answer_end": original_qa_df["EOS"].to_list(), "answer": original_qa_df['ANSWER'].to_list(), "id":original_qa_df['ID'].to_list()})

ds.to_csv('qa.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/106 [00:00<?, ?ba/s]

93059269

In [8]:
raw_dataset = load_dataset('csv', data_files='/content/qa.csv', split="train[:10000]")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [9]:
raw_dataset

Dataset({
    features: ['context', 'question', 'answer_start', 'answer_end', 'answer', 'id'],
    num_rows: 10000
})

In [10]:
# Split the dataset into train and test sets
initial_dataset = raw_dataset.train_test_split(
    test_size=0.2,
    shuffle=True,
    seed=42
)


# Split the train dataset into train and validation sets
final_dataset = initial_dataset['train'].train_test_split(
    test_size=0.25,
    shuffle=True,
    seed=42
)

# Create a dictionary to store the datasets with keys
dataset_dict = {
    "train": final_dataset['train'],
    "test": initial_dataset['test'],
    "validation": final_dataset['test']
}

# Convert the dictionary to a Dataset object
train_test_split_data = DatasetDict(dataset_dict)

train_test_split_data

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer_start', 'answer_end', 'answer', 'id'],
        num_rows: 6000
    })
    test: Dataset({
        features: ['context', 'question', 'answer_start', 'answer_end', 'answer', 'id'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['context', 'question', 'answer_start', 'answer_end', 'answer', 'id'],
        num_rows: 2000
    })
})

# Model Training

In [11]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")# ("bert-base-cased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
example = raw_dataset[0]

inputs = tokenizer(
    raw_dataset[:10]['question'],
    raw_dataset[:10]['context'],
    max_length=100,
    truncation="only_second",
    stride=40,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)

In [13]:
print(f"The 10 examples gave {len(inputs['input_ids'])} features.")
print(f"Here is where each comes from: {inputs['overflow_to_sample_mapping']}.")

The 10 examples gave 41 features.
Here is where each comes from: [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9].


In [14]:
for ids in inputs["input_ids"]:
    print(tokenizer.decode(ids))

[CLS] to whom did the virgin mary allegedly appear in 1858 in lourdes france? [SEP] architecturally, the school has a catholic character. atop the main building's gold dome is a golden statue of the virgin mary. immediately in front of the main building and facing it, is a copper statue of christ with arms upraised with the legend " venite ad me omnes ". next to the main building is the basilica of the sacred heart. immediately behind the basilica is the gr [SEP]
[CLS] to whom did the virgin mary allegedly appear in 1858 in lourdes france? [SEP] of christ with arms upraised with the legend " venite ad me omnes ". next to the main building is the basilica of the sacred heart. immediately behind the basilica is the grotto, a marian place of prayer and reflection. it is a replica of the grotto at lourdes, france where the virgin mary reputedly appeared to saint bernadette soubirous in 1858. [SEP]
[CLS] to whom did the virgin mary allegedly appear in 1858 in lourdes france? [SEP] a marian 

In [15]:
answers = raw_dataset[:10]
start_positions = []
end_positions = []

for i, offset in enumerate(inputs["offset_mapping"]):
    sample_idx = inputs["overflow_to_sample_mapping"][i]
    start_char = answers["answer_start"][sample_idx]
    end_char = answers["answer_end"][sample_idx]
    sequence_ids = inputs.sequence_ids(i)

    # Find the start and end of the context
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label is (0, 0)
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Otherwise it's the start and end token positions
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

print(start_positions, end_positions)

[0, 88, 46, 52, 0, 0, 81, 39, 0, 0, 47, 0, 33, 0, 0, 63, 17, 0, 0, 0, 98, 53, 0, 0, 0, 0, 77, 31, 0, 0, 39, 0, 0, 0, 0, 0, 0, 98, 56, 0, 0] [0, 95, 53, 56, 0, 0, 83, 41, 0, 0, 53, 0, 39, 0, 0, 64, 18, 0, 0, 0, 98, 53, 0, 0, 0, 0, 78, 32, 0, 0, 39, 0, 0, 0, 0, 0, 0, 98, 56, 0, 0]


In [16]:
idx = 0
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers["answer"][sample_idx]

start = start_positions[idx]
end = end_positions[idx]
labeled_answer = tokenizer.decode(inputs["input_ids"][idx][start : end + 1])

print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")

Theoretical answer: Saint Bernadette Soubirous, labels give: [CLS]


In [17]:
idx = 2
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers["answer"][sample_idx]

start = start_positions[idx]
end = end_positions[idx]
labeled_answer = tokenizer.decode(inputs["input_ids"][idx][start : end + 1])

print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")

Theoretical answer: Saint Bernadette Soubirous, labels give: saint bernadette soubirous


Following Adapted From: https://huggingface.co/docs/transformers/tasks/question_answering

In [18]:
def preprocess_function(examples):
    inputs = tokenizer(
        examples['question'],
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = examples['answer'][i]
        start_char = examples['answer_start'][i]
        end_char = examples['answer_end'][i]
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [19]:
train_dataset = train_test_split_data["train"].map(
    preprocess_function,
    batched=True,
    remove_columns=train_test_split_data["train"].column_names,
)
len(train_test_split_data["train"]), len(train_dataset)


Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

(6000, 6000)

In [20]:
validation_dataset = train_test_split_data["validation"].map(
    preprocess_function,
    batched=True,
    remove_columns=train_test_split_data["validation"].column_names,
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [21]:
data_collator = DefaultDataCollator()

In [22]:
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
torch.cuda.empty_cache()

In [28]:
training_args = TrainingArguments(
    output_dir="my_awesome_qa_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.577929
2,1.747900,1.475724
3,1.131400,1.509644


TrainOutput(global_step=1125, training_loss=1.3887537163628472, metrics={'train_runtime': 783.229, 'train_samples_per_second': 22.982, 'train_steps_per_second': 1.436, 'total_flos': 1763816850432000.0, 'train_loss': 1.3887537163628472, 'epoch': 3.0})

In [29]:
trainer.push_to_hub()

pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

'https://huggingface.co/sglasher/my_awesome_qa_model/tree/main/'