In [None]:
# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git



In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load COVID-QA dataset

In [None]:
import json

# Replace 'COVID-QA.json' with the path to your actual JSON file
file_path = 'COVID-QA.json'

# Load the JSON data. The encoding is passed explicitly because the file holds
# non-ASCII characters (accented author names, en dashes), so relying on the
# platform's default encoding could fail or garble the text.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Show every top-level key together with the type of its value.
for key in data.keys():
    print(f"Key: {key}, Type of value: {type(data[key])}")

Key: data, Type of value: <class 'list'>


In [None]:
def print_structure(element, depth=0):
    """Print an indented outline of a nested dict/list structure.

    Dictionaries list their keys and are descended into key by key; lists
    report their length and only their first element is descended into; any
    other value is reported by its type. Each nesting level adds two spaces.
    """
    pad = "  " * depth

    # Leaf value: report the type and stop.
    if not isinstance(element, (dict, list)):
        print(f"{pad}Value of type: {type(element)}")
        return

    # List: report the size, then outline the first element only (if any).
    if not isinstance(element, dict):
        print(f"{pad}List with {len(element)} elements. Structure of first element:")
        if len(element) == 0:
            return
        print_structure(element[0], depth + 1)
        return

    # Dictionary: report the keys, then outline each value one level deeper.
    print(f"{pad}Dictionary with keys: {list(element.keys())}")
    for name, child in element.items():
        print(f"{pad}Key: '{name}' structure:")
        print_structure(child, depth + 1)

# Use the function on your loaded data
print_structure(data)

Dictionary with keys: ['data']
Key: 'data' structure:
  List with 147 elements. Structure of first element:
    Dictionary with keys: ['paragraphs']
    Key: 'paragraphs' structure:
      List with 1 elements. Structure of first element:
        Dictionary with keys: ['qas', 'context', 'document_id']
        Key: 'qas' structure:
          List with 11 elements. Structure of first element:
            Dictionary with keys: ['question', 'id', 'answers', 'is_impossible']
            Key: 'question' structure:
              Value of type: <class 'str'>
            Key: 'id' structure:
              Value of type: <class 'int'>
            Key: 'answers' structure:
              List with 1 elements. Structure of first element:
                Dictionary with keys: ['text', 'answer_start']
                Key: 'text' structure:
                  Value of type: <class 'str'>
                Key: 'answer_start' structure:
                  Value of type: <class 'int'>
            Key: 'is_im

In [None]:
import json

# Replace 'COVID-QA.json' with the path to your actual JSON file
file_path = 'COVID-QA.json'

# Function to load the dataset and display the first 3 entries
def display_first_entries(file_path, num_entries=3):
    """Print the first few documents of a SQuAD-style JSON file.

    For each of the first ``num_entries`` items under the top-level ``data``
    key, prints the title (or ``No Title``), the first 1000 characters of each
    paragraph's context, and every question with its first answer and that
    answer's start offset.

    Args:
        file_path: Path to the JSON file to read.
        num_entries: Number of items from ``data`` to display.
    """
    # Read as UTF-8 explicitly so non-ASCII text in the contexts is decoded
    # the same way regardless of the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Assuming the structure is similar to SQuAD
    for item in data['data'][:num_entries]:  # Only go through the first 'num_entries' items
        title = item.get('title', 'No Title')  # Some datasets might not have a title
        print(f"Title: {title}")
        for paragraph in item['paragraphs']:
            context = paragraph['context']
            print(f"Context: {context[:1000]}...")  # Displaying a part of the context for brevity
            for qa in paragraph['qas']:
                print(f"  Q: {qa['question']}")
                # A question may carry an empty (or missing) answers list;
                # report that instead of failing on answers[0].
                answers = qa.get('answers') or []
                if not answers:
                    print("  A: <no answer>")
                    continue
                first_answer = answers[0]
                print(f"  A: {first_answer['text']} (Starts at: {first_answer['answer_start']})")
            print("\n---\n")

# Call the function with your file path
display_first_entries(file_path)

Title: No Title
Context: Functional Genetic Variants in DC-SIGNR Are Associated with Mother-to-Child Transmission of HIV-1

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2752805/

Boily-Larouche, Geneviève; Iscache, Anne-Laure; Zijenah, Lynn S.; Humphrey, Jean H.; Mouland, Andrew J.; Ward, Brian J.; Roger, Michel
2009-10-07
DOI:10.1371/journal.pone.0007211
License:cc-by

Abstract: BACKGROUND: Mother-to-child transmission (MTCT) is the main cause of HIV-1 infection in children worldwide. Given that the C-type lectin receptor, dendritic cell-specific ICAM-grabbing non-integrin-related (DC-SIGNR, also known as CD209L or liver/lymph node–specific ICAM-grabbing non-integrin (L-SIGN)), can interact with pathogens including HIV-1 and is expressed at the maternal-fetal interface, we hypothesized that it could influence MTCT of HIV-1. METHODS AND FINDINGS: To investigate the potential role of DC-SIGNR in MTCT of HIV-1, we carried out a genetic association study of DC-SIGNR in a well-characterize

In [None]:
from collections import OrderedDict
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset, Features, Value, Sequence

def trim_context(context, answer, answer_start, window_size=500):
    """Cut a context down to a window of characters around the answer.

    Args:
        context: Full context string.
        answer: Answer text.
        answer_start: Character offset of the answer within ``context``.
        window_size: Number of characters to keep on each side of the answer.

    Returns:
        A ``(new_context, new_answer_start)`` tuple, where ``new_answer_start``
        is the offset of the answer inside ``new_context``. If the answer text
        is not found at the annotated offset, the first occurrence in the
        window is used, or ``-1`` when it does not occur at all.
    """
    start_pos = max(0, answer_start - window_size)
    end_pos = min(len(context), answer_start + len(answer) + window_size)
    new_context = context[start_pos:end_pos]

    # Shift the annotated offset by the amount trimmed from the front. A plain
    # text search would return the first occurrence, which is the wrong one
    # whenever the answer text also appears earlier inside the window.
    shifted_start = answer_start - start_pos
    if shifted_start >= 0 and new_context[shifted_start:shifted_start + len(answer)] == answer:
        return new_context, shifted_start

    # The annotation does not line up with the text: fall back to searching.
    return new_context, new_context.find(answer)

def load_covid_qa_dataset(file_path):
    """Flatten a SQuAD-style JSON file into one record per question.

    Each question's context is trimmed around its first answer with
    ``trim_context``, and the answer offset is expressed relative to the
    trimmed context.

    Args:
        file_path: Path to the JSON file to read.

    Returns:
        A list of ``OrderedDict`` records with the keys ``answers`` (a dict of
        one-element ``text`` and ``answer_start`` lists), ``context``, ``id``,
        ``question`` and ``title``. Questions without any answer are skipped.
    """
    # Read as UTF-8 explicitly so non-ASCII text in the contexts is decoded
    # the same way regardless of the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        covid_qa_data = json.load(f)

    structured_data = []

    for item in covid_qa_data['data']:
        title = item.get("title", "N/A")
        for paragraph in item['paragraphs']:
            original_context = paragraph['context']

            for qa in paragraph['qas']:
                # There is nothing to centre the window on (and no label to
                # emit) when a question has no answers, so skip it instead of
                # failing on answers[0].
                answers = qa.get('answers') or []
                if not answers:
                    continue

                first_answer = answers[0]
                answer_text = first_answer['text']
                trimmed_context, new_answer_start = trim_context(
                    original_context, answer_text, first_answer['answer_start']
                )

                # Using OrderedDict to maintain order
                qa_dict = OrderedDict([
                    ("answers", {
                        "text": [answer_text],
                        "answer_start": [new_answer_start]
                    }),
                    ("context", trimmed_context),
                    ("id", str(qa.get("id", "N/A"))),
                    ("question", qa['question']),
                    ("title", title)
                ])

                structured_data.append(qa_dict)

    return structured_data

covid_qa_dataset = load_covid_qa_dataset('COVID-QA.json')

# Assuming you're using the Hugging Face 'datasets' library to create the dataset

# Explicit schema for the flattened records produced above.
features = Features({
    'answers': Sequence(feature={
        'answer_start': Value('int32'),
        'text': Value('string')
    }),
    'context': Value('string'),
    'id': Value('string'),
    'question': Value('string'),
    'title': Value('string')
})

hf_dataset = Dataset.from_list(covid_qa_dataset, features=features)

# A fixed seed makes the 80/20 split identical on every run, so the train and
# validation sets (and everything computed from them) are reproducible.
# NOTE: this variable shadows the `train_test_split` function imported from
# sklearn at the top of this cell; the name is kept because later cells index it.
train_test_split = hf_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']


In [None]:
train_test_split

DatasetDict({
    train: Dataset({
        features: ['answers', 'context', 'id', 'question', 'title'],
        num_rows: 1615
    })
    test: Dataset({
        features: ['answers', 'context', 'id', 'question', 'title'],
        num_rows: 404
    })
})

In [None]:
val_dataset= train_test_split['test']


In [None]:
train_dataset[0]

{'answers': {'answer_start': [500],
  'text': ['YFV (YF-17D strain) was the first to be licensed for use in humans, where the cDNAs encoding the envelope proteins of YFV were replaced with the corresponding genes of an attenuated Japanese encephalitis virus strain, SA14-14-2']},
 'context': 'gainst the rabies virus in wildlife) and RNA viruses [such as Newcastle disease virus-based vaccines to be used in poultry or yellow fever virus (YFV)-based vaccines to be used in horses against West Nile virus] (Draper & Heeney, 2010) . Based on the safety record in the veterinary field, many viruses have been studied for human use as a vector in vaccine development (Beukema et al., 2006; Esteban, 2009; Schirrmacher & Fournier, 2009; Stoyanov et al., 2010; Weli & Tryland, 2011) . Amongst them, YFV (YF-17D strain) was the first to be licensed for use in humans, where the cDNAs encoding the envelope proteins of YFV were replaced with the corresponding genes of an attenuated Japanese encephalitis vir

## Preprocess

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def preprocess_covid_qa_function(examples):
    """Tokenize a batch of question/context pairs and attach answer token labels.

    Args:
        examples: A batch with "question", "context" and "answers" columns,
            where each answers entry holds parallel "text" and "answer_start"
            lists (only the first answer is used).

    Returns:
        The tokenizer output (offset mapping removed) with two extra entries,
        "start_positions" and "end_positions", holding the token indices of
        the answer span. Examples whose answer is missing or not fully
        contained in the (possibly truncated) context are labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # No answer recorded for this example: label it (0, 0).
        if not answer["answer_start"] or not answer["text"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        num_tokens = len(sequence_ids)

        # Find the start and end of the context (tokens with sequence id 1).
        idx = 0
        while idx < num_tokens and sequence_ids[idx] != 1:
            idx += 1
        if idx == num_tokens:
            # No context tokens at all, so there is nothing to point at.
            start_positions.append(0)
            end_positions.append(0)
            continue
        context_start = idx
        while idx < num_tokens and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # The span must start at or after the first context character and end
        # at or before the last one. Checking only for "no overlap at all"
        # would let a truncated answer through and place its end label on the
        # separator token after the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize both splits in batches; the raw question/context/answers columns are
# removed from the mapped output.
tokenized_train_dataset = train_dataset.map(
    preprocess_covid_qa_function,
    batched=True,
    remove_columns=['question', 'context', 'answers'],
)

tokenized_val_dataset = val_dataset.map(
    preprocess_covid_qa_function,
    batched=True,
    remove_columns=['question', 'context', 'answers'],
)

Map:   0%|          | 0/1615 [00:00<?, ? examples/s]

Map:   0%|          | 0/404 [00:00<?, ? examples/s]

In [None]:
train_dataset

In [None]:
train_dataset['question'][0]

In [None]:
train_dataset['context'][0]

In [None]:
train_dataset['answers'][0]

In [None]:
val_dataset

In [None]:
tokenized_train_dataset

In [None]:
tokenized_val_dataset

In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")

## Train

In [None]:
from transformers import create_optimizer

# Training hyper-parameters (a batch size of 16 took too long).
num_epochs = 2
batch_size = 8

# Number of optimizer steps for the whole run: full batches per epoch times epochs.
total_train_steps = num_epochs * (len(tokenized_train_dataset) // batch_size)

optimizer, schedule = create_optimizer(
    num_train_steps=total_train_steps,
    num_warmup_steps=0,
    init_lr=2e-5,
)

In [None]:
from transformers import TFAutoModelForQuestionAnswering

model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it

In [None]:
# Both pipelines reuse `batch_size` from the optimizer cell so the batch size
# stays in step with the value used to compute `total_train_steps`
# (a batch size of 16 took too long).
tf_train_set = model.prepare_tf_dataset(
    tokenized_train_dataset,
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_val_dataset,
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [None]:
tf_train_set

In [None]:
import tensorflow as tf

model.compile(optimizer=optimizer)

In [None]:
from transformers.keras_callbacks import PushToHubCallback

callback = PushToHubCallback(
    output_dir="my_awesome_qa_model",
    tokenizer=tokenizer,
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/rezabarati/my_awesome_qa_model into local empty directory.


Download file tf_model.h5:   0%|          | 8.00k/253M [00:00<?, ?B/s]

Clean file tf_model.h5:   0%|          | 1.00k/253M [00:00<?, ?B/s]

In [None]:
tf_train_set, tf_validation_set

In [None]:
# initially epochs was set to 3
# Reuse `num_epochs` from the optimizer cell: the learning-rate schedule length
# (`total_train_steps`) was computed from it, so the two must stay in agreement.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=[callback], verbose=1)

Epoch 1/2


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/2


<tf_keras.src.callbacks.History at 0x787f19577b80>

## Inference

In [None]:
question = "What is the main cause of HIV-1 infection in children?"
context = """
Functional Genetic Variants in DC-SIGNR Are Associated with Mother-to-Child Transmission of HIV-1

Abstract: BACKGROUND: Mother-to-child transmission (MTCT) is the main cause of HIV-1 infection in children worldwide. Given that the C-type lectin receptor, dendritic cell-specific ICAM-grabbing non-integrin-related (DC-SIGNR, also known as CD209L or liver/lymph node–specific ICAM-grabbing non-integrin (L-SIGN)), can interact with pathogens including HIV-1 and is expressed at the maternal-fetal interface, we hypothesized that it could influence MTCT of HIV-1. METHODS AND FINDINGS: To investigate the potential role of DC-SIGNR in MTCT of HIV-1, we carried out a genetic association study of DC-SIGNR in a well-characterized cohort of 197 HIV-infected mothers and their infants recruited in Harare, Zimbabwe. Infants harbouring two copies of DC-SIGNR H1 and/or H3 haplotypes (H1-H1, H1-H3, H3-H3) had a 3.6-fold increased risk of in utero (IU) HIV-1 infection and a 5.7-fold increased risk of intrapartum (IP) HIV-1 infection after adjusting for a number of maternal factors. The implicated H1 and H3 haplotypes share two single nucleotide polymorphisms (SNPs) in promoter region (p-198A) and intron 2 (int2-180A) that were associated with increased risk of both IU and IP HIV-1 infection. The promoter variant reduced transcriptional activity in vitro. In homozygous H1 infants bearing both the p-198A and int2-180A mutations, we observed a 4-fold decrease in the level of placental DC-SIGNR transcripts, disproportionately affecting the expression of membrane-bound isoforms compared to infant noncarriers (P = 0.011).
"""

In [None]:
from transformers import pipeline

# NOTE(review): this loads the checkpoint named "Covid-QA", while the training
# callback in this notebook uses output_dir="my_awesome_qa_model" — confirm that
# "Covid-QA" is the intended location of the model trained above.
question_answerer = pipeline("question-answering", model="Covid-QA")
# Run the pipeline on the example question and context defined in the previous cell.
question_answerer(question=question, context=context)

Some layers from the model checkpoint at Covid-QA were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at Covid-QA and are newly initialized: ['dropout_79']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'score': 0.04487999528646469,
 'start': 1207,
 'end': 1216,
 'answer': 'int2-180A'}

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Covid-QA")
inputs = tokenizer(question, context, return_tensors="tf")

In [None]:
from transformers import TFAutoModelForQuestionAnswering

model = TFAutoModelForQuestionAnswering.from_pretrained("Covid-QA")
outputs = model(**inputs)

In [None]:
# Take the highest-scoring start and end token positions for the first sequence
# in the batch. `tf` is the TensorFlow module imported in the training section.
answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])

In [None]:
# Slice the input ids from the predicted start token through the predicted end
# token (inclusive), then decode that slice back into text.
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)