## Installation Reqs (Linux Ubuntu)
1. <code> python3 -m pip install playwright
playwright install </code> 
2. <code> pip install datasets <code>
3. <code> pip install transformers <code>
4. <code> wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin
sudo mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
wget https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda-repo-ubuntu2204-12-4-local_12.4.0-550.54.14-1_amd64.deb
sudo dpkg -i cuda-repo-ubuntu2204-12-4-local_12.4.0-550.54.14-1_amd64.deb
sudo cp /var/cuda-repo-ubuntu2204-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/
sudo apt-get update
sudo apt-get -y install cuda-toolkit-12-4 <code>

## Webscraping 

In [2]:
## WEB SCRAPING CELL: 
from playwright.async_api import async_playwright
import asyncio

async def process_locator(locator):
    count = await locator.count()
    if count > 1: #We have many elements and must resolve each inner text
        texts = ""
        for i in range(count):
            element = locator.nth(i)
            if await element.is_visible():
                inner_text = await element.inner_text()
                texts = texts + "," + inner_text
                return texts
    else:
        if await locator.is_visible():
            return await locator.inner_text()
        else:
            return "NA"
    

async def main():
   async with async_playwright() as pw:
       browser = await pw.chromium.launch(
           ##We'll employ the use of chromium for this webscraper
           ##Using a proxy creates HTTP errors.
          headless=False
      )

       #Beginning page: 
       page = await browser.new_page()
       await page.goto('https://world.openfoodfacts.org/')
       await page.wait_for_timeout(5000)
       result = []
       food_urls = []
       food_list = await page.query_selector_all('.list_product_a')
       for food in food_list:
           food_urls.append(await food.get_attribute('href'))
           
       for food_url in food_urls:
            food_info = {}
            await page.goto(food_url)
            #Title: 
            title = page.locator(".title-1")
            food_info['title'] = await process_locator(title)
            #Common Name:
            common_name = page.locator("#field_generic_name_value")
            food_info['common_name'] = await process_locator(common_name)
            #Quantity:
            quantity = page.locator("#field_quantity_value")
            food_info['quantity'] = await process_locator(quantity)
            #Packaging: 
            packaging = page.locator("#field_packaging_value")
            food_info['packaging'] = await process_locator(packaging)
            #Brands:
            brand = page.locator("#field_brands_value")
            food_info['brand'] = await process_locator(brand)
            #Categories:
            categories = page.locator("#field_categories_value")
            food_info['categories'] = await process_locator(categories)
            #Certifications:
            certifications = page.locator("#field_labels_value")
            food_info['certifications'] = await process_locator(certifications)
            #Origin:
            origin = page.locator("#field_origin_value")
            food_info['origin'] = await process_locator(origin)
            #origin of ingredients:
            origin_of_ingredients = page.locator("#field_origins_value")
            food_info['origin_of_ingredients'] = await process_locator(origin_of_ingredients)
            #Places of manufacturing:
            places_manufactured = page.locator("#field_manufacturing_places_value")
            food_info['places_manufactured'] = await process_locator(places_manufactured)
            #Stores:
            stores = page.locator("#field_stores_value")
            food_info['stores'] = await process_locator(stores)
            #Countries where Sold:
            countries_sold = page.locator("#field_countries_value")
            food_info['countries_sold'] = await process_locator(countries_sold)
           
            #HEALTH SECTION
            #Notice, because of the increasing complexity of the DOM elements in this area the CSS selectors don't follow a similarly nice pattern
            #Ingredients: 
            ingredients = page.locator("#panel_ingredients_content .panel_text")
            food_info['ingredients'] = await process_locator(ingredients)
            #NOVA score:
            nova_score = page.locator("ul#panel_nova li.accordion-navigation h4")
            food_info['nova_score'] = await process_locator(nova_score)
            # #Palm Status:
            # palm_status = page.locator(".accordion-navigation active .content panel_content active .panel_text")
            # food_info['palm_status'] = await process_locator(palm_status)
            # #Vegan Status:
            # vegan_status = page.locator("#panel_ingredients_analysis_en-vegan_content .panel_text")
            # food_info['vegan_status'] = await process_locator(vegan_status)
            # #Vegetarian Status:
            # vegetarian_status = page.locator("#panel_ingredients_analysis_en-vegetarian_content .panel_text")
            # food_info['vegetarian_status'] = await process_locator(vegetarian_status)
            #Nutrition grade:
            nutrition_grade = page.locator(".accordion-navigation .grade_a_title")
            food_info['nutrition_grade'] = await process_locator(nutrition_grade)

            # #NUTITRION FACTS
            # #
            # table_rows = await page.query_selector_all("#panel_nutrition_facts_table_content")
            # nutrition_facts = {}
            # for row in table_rows:
            #     columns = await row.query_selector_all('td')
            #     name = await process_locator(columns[0])
            #     value_per_100g = await process_locator(columns[1])
            #     nutrition_facts[name] = {
            #         "100g/100ml": value_per_100g
            #     }
                    
                    
            # food_info['nutrition_table'] = nutrition_facts
            result.append(food_info)
            


       
       

       
           
           
           
       await browser.close()
       return result
if __name__ == '__main__':
   result = await main()

#Problems & Changes:
#

#CITATIONs: 
#Code cited from OxyLabs: https://github.com/oxylabs/playwright-web-scraping?tab=readme-ov-file
#,https://playwright.dev/python/docs/locators

In [3]:
result[10]

{'title': "Huile d'olive vierge extra - Domaine de Bournissac - 500\xa0ml",
 'common_name': "Huile d'olive",
 'quantity': '500 ml',
 'packaging': 'NA',
 'brand': 'Domaine de Bournissac',
 'categories': 'Plant-based foods and beverages, Plant-based foods, Fats, Vegetable fats, Olive tree products, Vegetable oils, Olive oils, Extra-virgin olive oils, Virgin olive oils',
 'certifications': 'Organic, EU Organic, FR-BIO-10\n',
 'origin': 'NA',
 'origin_of_ingredients': 'NA',
 'places_manufactured': 'NA',
 'stores': 'NA',
 'countries_sold': 'France',
 'ingredients': 'NA',
 'nova_score': 'Processed culinary ingredients',
 'nutrition_grade': 'NA'}

## Preprocessing data

In [4]:
#PREPROCESSING CELL
from datasets import DatasetDict, Dataset, Features
##DATA FORMAT:
# Context: 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'
# Question: 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'
# Answer: {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}

#Defining a function to create the context, in the context part of a QA data format
def create_context(food_data):
    context = (
        f"{food_data['title']} is commonly known as {food_data['common_name']}. "
        f"The ingredients are {food_data['ingredients']}. "
        f"The packaging includes {food_data['packaging']}. "
        f"The brand is {food_data['brand']}. "
        f"It falls under the categories {food_data['categories']}. "
        f"It has certifications like {food_data['certifications']}. "
        f"It originates from {food_data['origin']} and the origin of ingredients is {food_data['origin_of_ingredients']}. "
        f"It is manufactured in {food_data['places_manufactured']}. "
        f"It is sold in countries like {food_data['countries_sold']}. "
        f"The nutrition grade is {food_data['nutrition_grade']}. "
        f"The NOVA score is {food_data['nova_score']}. "
        f"It can be found in stores such as {food_data['stores']}. "
    )
    return context



def create_question_answer(food_data, context):
    qa_pairs = []

    #Generate question about common name 
    question = "What is the common name of " + food_data['title'] + "?"
    answer = {'text': [food_data['common_name']], 'answer_start':[context.index(food_data['common_name'])]}
    qa_pairs.append({'question':question, 'answer':answer})
    #Generate question about ingredients
    question = "What are some ingredients in " + food_data['title'] + "?"
    answer = {'text':[food_data['ingredients']], 'answer_start':[context.index(food_data['ingredients'])]}
    qa_pairs.append({'question':question, 'answer':answer})
    #Generate question 
    question = "What is the packaging of " + food_data['title'] + "?"
    answer = {'text':[food_data['packaging']], 'answer_start':[context.index(food_data['packaging'])]}
    qa_pairs.append({'question':question, 'answer':answer})
    #Generate Question
    question = "What is the brand of " + food_data['title'] + "?"
    answer = {'text':[food_data['brand']], 'answer_start':[context.index(food_data['brand'])]}
    qa_pairs.append({'question':question, 'answer':answer})
    #Generate Question
    
    return qa_pairs

def create_qac_dataset(food_data_list):
    qac_dataset = []
    for food_data in food_data_list:
        context = create_context(food_data)
        qa_pairs = create_question_answer(food_data, context)
        for qa in qa_pairs:
            current_dict = {"context": context, "question": qa['question'], "answer": qa['answer']}
            qac_dataset.append(current_dict)
    return qac_dataset
        
   #As of the current function, it is a deterministic split, this is done for debugging purposes
def split_dataset(dataset, split_ratio=0.8):
    split_index = int(len(dataset) * split_ratio)
    
    training_data = dataset[:split_index]
    validation_data = dataset[split_index:]
    
    return {"train": training_data, "validation": validation_data}



#This is the format found on the Hugging Face tutorial, following this contruction for simplicity
def convert_to_dataset_dict(training_set, validation_set):
    features = Features({
        "id": "string",
        "title": "string",
        "context": "string",
        "question": "string",
        "answers": "string",
    })

    # Create Dataset objects
    train_dataset = Dataset.from_pandas(training_set)
    validation_dataset = Dataset.from_pandas(validation_set)

    # Create DatasetDict
    dataset_dict = DatasetDict({
        "train": train_dataset,
        "validation": validation_dataset
    })

    return dataset_dict



  from .autonotebook import tqdm as notebook_tqdm


In [5]:
#Create our question, answer, context dictionary
qac_result = create_qac_dataset(result)

In [6]:
#The documentation on Hugging face suggests that this is the best format
idx = 0
for qa in qac_result:
    qa["id"] = idx + 1  # Adding 1 to start id from 1
    qa["title"] = f"Title {idx + 1}"  # Assuming title follows a pattern, adjust as needed

    # Move id and title to the beginning of the dictionary
    qa.update({"id": qa["id"], "title": qa["title"]})
    idx += 1

In [7]:
qac_result[0]

{'context': "Eau de Source - Cristaline - 1,5\xa0L is commonly known as Spring water. The ingredients are water. The packaging includes Aluminium-can, HdpeFilm-packet, PpFilm-wrapper, Ldpe-film. The brand is Cristaline. It falls under the categories Beverages, Waters, Spring waters. It has certifications like Triman\n. It originates from Embouteillée à 24610 Saint-Martin de Gurson France and the origin of ingredients is France, fr:Saint-Martin de Gurson. It is manufactured in Saint-Martin de Gurson, France, 24610. It is sold in countries like Belgium, Côte d'Ivoire, France, Germany, Guadeloupe, Italy, Luxembourg, Mali, Martinique, New Caledonia, Switzerland, United Kingdom. The nutrition grade is Very good nutritional quality. The NOVA score is Unprocessed or minimally processed foods. It can be found in stores such as Carrefour, Leclerc, Auchan, Intermarché, Super U, E.Leclerc. ",
 'question': 'What is the common name of Eau de Source - Cristaline - 1,5\xa0L?',
 'answer': {'text': ['S

In [8]:
import pandas as pd
#Let us split the dataset into training, and validation
qac_dataset = split_dataset(qac_result, split_ratio=0.8)
training_list = qac_dataset["train"]
validation_list = qac_dataset["validation"]
df_training = pd.DataFrame(training_list)
df_validation = pd.DataFrame(validation_list)
df_training

Unnamed: 0,context,question,answer,id,title
0,"Eau de Source - Cristaline - 1,5 L is commonly...",What is the common name of Eau de Source - Cri...,"{'text': ['Spring water'], 'answer_start': [56]}",1,Title 1
1,"Eau de Source - Cristaline - 1,5 L is commonly...",What are some ingredients in Eau de Source - C...,"{'text': ['water'], 'answer_start': [63]}",2,Title 2
2,"Eau de Source - Cristaline - 1,5 L is commonly...",What is the packaging of Eau de Source - Crist...,"{'text': ['Aluminium-can, HdpeFilm-packet, PpF...",3,Title 3
3,"Eau de Source - Cristaline - 1,5 L is commonly...",What is the brand of Eau de Source - Cristalin...,"{'text': ['Cristaline'], 'answer_start': [16]}",4,Title 4
4,Prince Chocolat biscuits au blé complet - Lu -...,What is the common name of Prince Chocolat bis...,{'text': ['BISCUITS FOURRÉS (35%) PARFUM CHOCO...,5,Title 5
...,...,...,...,...,...
315,Levure de bière - Gerblé - 150 g is commonly k...,What is the brand of Levure de bière - Gerblé ...,"{'text': ['Gerblé'], 'answer_start': [18]}",316,Title 316
316,Granola - LU - 200 g e is commonly known as Bi...,What is the common name of Granola - LU - 200 ...,{'text': ['Biscuits sablés nappés de chocolat ...,317,Title 317
317,Granola - LU - 200 g e is commonly known as Bi...,What are some ingredients in Granola - LU - 20...,"{'text': [',wheat flour 48%, milk chocolate 27...",318,Title 318
318,Granola - LU - 200 g e is commonly known as Bi...,What is the packaging of Granola - LU - 200 g e?,"{'text': ['fr:sachet plastique, fr:étui carton...",319,Title 319


In [9]:
dataset = convert_to_dataset_dict(df_training, df_validation)

In [11]:
#Based on the tutorial: https://huggingface.co/learn/nlp-course/en/chapter7/7 we'll use the bert-base-cased model
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

#Based on the tutorial we want to insert tokens create a sentence of this form: 
#[CLS] question [SEP] context [SEP]

## Begin Testing preproccess

In [12]:
#Test tokenizer format
#See the format of splitting the context
context = dataset["train"][0]['context']
question = dataset["train"][0]['question']

inputs = tokenizer(question, 
                   context, 
                   max_length = 100, 
                   truncation="only_second", 
                   stride = 50, 
                   return_overflowing_tokens=True,)
for ids in inputs["input_ids"]:
    print(tokenizer.decode(ids))

[CLS] What is the common name of Eau de Source - Cristaline - 1, 5 L? [SEP] Eau de Source - Cristaline - 1, 5 L is commonly known as Spring water. The ingredients are water. The packaging includes Aluminium - can, HdpeFilm - packet, PpFilm - wrapper, Ldpe - film. The brand is Cristaline. It falls under the categories Beverages, [SEP]
[CLS] What is the common name of Eau de Source - Cristaline - 1, 5 L? [SEP] The packaging includes Aluminium - can, HdpeFilm - packet, PpFilm - wrapper, Ldpe - film. The brand is Cristaline. It falls under the categories Beverages, Waters, Spring waters. It has certifications like Triman. It originates from Embouteillée à 24610 Saint - [SEP]
[CLS] What is the common name of Eau de Source - Cristaline - 1, 5 L? [SEP], Ldpe - film. The brand is Cristaline. It falls under the categories Beverages, Waters, Spring waters. It has certifications like Triman. It originates from Embouteillée à 24610 Saint - Martin de Gurson France and the origin of ingredients is F

In [13]:
inputs = tokenizer(
    question,
    context,
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

In [14]:
inputs = tokenizer(
    dataset["train"][2:6]["question"],
    dataset["train"][2:6]["context"],
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)

print(f"The 4 examples gave {len(inputs['input_ids'])} features.")
print(f"Here is where each comes from: {inputs['overflow_to_sample_mapping']}.")

The 4 examples gave 53 features.
Here is where each comes from: [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3].


In [15]:
answers = dataset["train"][2:6]["answer"]
start_positions = []
end_positions = []

for i, offset in enumerate(inputs["offset_mapping"]):
    sample_idx = inputs["overflow_to_sample_mapping"][i]
    answer = answers[sample_idx]
    start_char = answer["answer_start"][0]
    end_char = answer["answer_start"][0] + len(answer["text"][0])
    sequence_ids = inputs.sequence_ids(i)

    # Find the start and end of the context
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label is (0, 0)
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Otherwise it's the start and end token positions
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

start_positions, end_positions

([51,
  24,
  0,
  0,
  0,
  0,
  0,
  0,
  27,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  51,
  30,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [79,
  52,
  0,
  0,
  0,
  0,
  0,
  0,
  30,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  73,
  52,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0])

In [16]:
idx = 0
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]

start = start_positions[idx]
end = end_positions[idx]
labeled_answer = tokenizer.decode(inputs["input_ids"][idx][start : end + 1])

print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")

Theoretical answer: Aluminium-can, HdpeFilm-packet, PpFilm-wrapper, Ldpe-film, labels give: Aluminium - can, HdpeFilm - packet, PpFilm - wrapper, Ldpe - film


In [17]:
idx = 4
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]

decoded_example = tokenizer.decode(inputs["input_ids"][idx])
print(f"Theoretical answer: {answer}, decoded example: {decoded_example}")

Theoretical answer: Aluminium-can, HdpeFilm-packet, PpFilm-wrapper, Ldpe-film, decoded example: [CLS] What is the packaging of Eau de Source - Cristaline - 1, 5 L? [SEP] and the origin of ingredients is France, fr : Saint - Martin de Gurson. It is manufactured in Saint - Martin de Gurson, France, 24610. It is sold in countries like Belgium, Côte d'Ivoire, France, Germany, Guadeloupe, Italy, Luxembourg, Mali, Martinique, New Caledonia, Switzerland, United Kingdom. [SEP]


## End Testing Preprocessing

In [18]:
#Function cited from: https://huggingface.co/learn/nlp-course/en/chapter7/7
#Processing the training data
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answer"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [19]:
train_dataset = dataset["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
len(dataset["train"]), len(train_dataset)

Map: 100%|███████████████████████████████████████████████| 320/320 [00:00<00:00, 2177.75 examples/s]


(320, 392)

In [20]:
#Function cited directly from:https://huggingface.co/learn/nlp-course/en/chapter7/7
#Processing the validation Data
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [22]:
validation_dataset = dataset["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=dataset["validation"].column_names,
)
len(dataset["validation"]), len(validation_dataset)

Map: 100%|█████████████████████████████████████████████████| 80/80 [00:00<00:00, 3385.13 examples/s]


(80, 116)

## Fine-Tuning the Model

In [23]:
small_eval_set = dataset["validation"].select(range(40))
trained_checkpoint = "distilbert-base-cased-distilled-squad"

tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=dataset["validation"].column_names,
)

Map: 100%|█████████████████████████████████████████████████| 40/40 [00:00<00:00, 3587.48 examples/s]


In [24]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [28]:
import torch
from transformers import AutoModelForQuestionAnswering

eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(
    device
)

with torch.no_grad():
    outputs = trained_model(**batch)

ModuleNotFoundError: No module named 'torch'