# Load Original Data

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.29.0-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m87.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.19.0


In [None]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import re
# from bs4 import BeautifulSoup

from datasets import DatasetDict, Dataset, load_dataset
from accelerate import Accelerator
from pydrive.auth import GoogleAuth
from google.colab import drive
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# We can see all columns in df.head() / and .tail()
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000) 
pd.set_option('display.max_colwidth', 1000)

In [None]:
# Authenticate the Colab user and build a PyDrive client from the default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive`, shadowing the `google.colab.drive` module imported earlier.
drive = GoogleDrive(gauth)

# Google Drive file ids of the original train / validation / test JSON files.
train_dataid = '1c206RpN1YCecrL6Hvjl52a3R16Lf3cO1'
val_dataid = '1X0RXXyTaaSUqJgbiuNzTJkB0ryZhkdwo'
test_dataid = '1Xp7zTJOVV3ZeEaqEcfl1MziJY7JcxoJh'

# Fetch each file into the working directory under the name the conversion code expects.
for file_id, local_name in (
    (train_dataid, 'train_og.json'),
    (val_dataid, 'val_og.json'),
    (test_dataid, 'test_og.json'),
):
    download = drive.CreateFile({'id': file_id})
    download.GetContentFile(local_name)

# Convert original data to desired format for QA

In [None]:
#create two dataframes, textual and visual df
def create_text_visual_df(dataset_type):
  """Load ``<dataset_type>_og.json`` and split it into two DataFrames.

  Args:
    dataset_type: prefix of the JSON file to read from the working directory
      (the notebook calls this with 'train', 'val' and 'test').

  Returns:
    (textual_df, visual_df): rows of ``data`` whose ``task`` is
    'textual_cloze' (columns recipe_id, context, choice_list, answer, question)
    and rows whose ``task`` is 'visual_coherence' (columns recipe_id, context).

  Raises:
    FileNotFoundError: if the JSON file does not exist.
    KeyError: if an expected key or column is missing.
  """
  # The context manager closes the handle even if JSON parsing fails.
  with open('{dataset}_og.json'.format(dataset=dataset_type), 'r', encoding='utf-8') as f:
    read_data = json.load(f)

  records = read_data['data']
  textual_json = [x for x in records if x['task'] == 'textual_cloze']
  visual_json = [x for x in records if x['task'] == 'visual_coherence']

  textual_keep_col = ['recipe_id', 'context', 'choice_list', 'answer', 'question']
  visual_keep_col = ['recipe_id', 'context']

  # .copy() hands back independent frames so callers can add columns to them safely.
  textual_df = pd.DataFrame(textual_json)[textual_keep_col].copy()
  visual_df = pd.DataFrame(visual_json)[visual_keep_col].copy()

  return textual_df, visual_df

#combine all steps into a list
def combine_all_steps(row):
  """Return the 'title' of every entry in ``row.context``, in order (no step numbers)."""
  return [entry['title'] for entry in row.context]

#combine textual and visual data to create combined data for full information
def combine_text_visual_df(dataset_type):
  """Join the textual-cloze rows with the full list of step titles per recipe.

  The visual rows contribute an ``all_steps`` column (titles of every step);
  they are inner-joined to the textual rows on ``recipe_id``. The textual
  ``context`` is kept, and every '@placeholder' entry in ``question`` becomes '_'.
  """
  textual_df, visual_df = create_text_visual_df(dataset_type)

  visual_with_steps = visual_df.assign(
      all_steps=visual_df.apply(combine_all_steps, axis=1)
  )
  merged = textual_df.merge(visual_with_steps, how='inner', on=['recipe_id'])

  # After the merge the textual context is 'context_x'; keep it under its original name.
  wanted_columns = ['recipe_id', 'context', 'choice_list', 'answer', 'question', 'all_steps']
  combined_data = merged.rename(columns={'context_x': 'context'})[wanted_columns]

  def mark_blank(tokens):
    return ['_' if token == '@placeholder' else token for token in tokens]

  return combined_data.assign(question=combined_data.question.apply(mark_blank))

#generate questions
def generate_questions(row):
  """Turn a cloze question (step titles with one '_' blank) into a natural-language question.

  Reads ``row['question']`` (titles with a '_' marking the missing step) and
  ``row['all_steps']`` (every step title of the recipe) and returns one of
  "What is the first step?", "What is the last step?" or
  "What is the step after <title> ?".

  Raises:
    ValueError: if '_' is not in the question, or a neighbouring title is not in all_steps.
  """
  question_titles = row['question']
  all_steps = row['all_steps']
  blank_pos = question_titles.index('_')  # position of the missing step in the question

  if blank_pos == 0:
    # Nothing precedes the blank in the question, so locate it via the step that follows it.
    following_pos = all_steps.index(question_titles[1])
    if following_pos == 1:
      return "What is the first step?"
    return "What is the step after " + all_steps[following_pos - 2] + " ?"

  preceding_title = question_titles[blank_pos - 1]
  # A blank in slot 3 whose predecessor is the second-to-last recipe step is the last step.
  if blank_pos == 3 and all_steps.index(preceding_title) == (len(all_steps) - 2):
    return "What is the last step?"
  return "What is the step after " + preceding_title + " ?"

#generate context
def generate_full_instruction(row):
  """Build the QA context paragraph from a recipe's step titles and bodies.

  For each index ``i`` of ``row['all_steps']`` a sentence
  "<lead-in> <title>: <body>. " is emitted, where the body is
  ``row['context'][i]['body']`` and the lead-in is "The first step is",
  "The last step is" or "After the previous step is". All whitespace runs
  (including newlines) are collapsed to single spaces, the text is stripped,
  and its final character (the closing period) is dropped.

  Returns:
    The cleaned instruction string; '' when there are no steps.
  """
  context = row["context"]
  steps = row["all_steps"]
  last_step = len(steps) - 1

  sentences = []
  for step, title in enumerate(steps):
    # The first-step check wins when a recipe has a single step.
    if step == 0:
      lead_in = "The first step is "
    elif step == last_step:
      lead_in = "The last step is "
    else:
      lead_in = "After the previous step is "
    sentences.append(lead_in + str(title) + ": " + context[step]['body'] + ". ")

  # Raw string: '\s' in a plain literal is an invalid escape sequence.
  # \s+ also matches newlines, so no separate newline replacement is needed.
  full_instruction = re.sub(r'\s+', ' ', "".join(sentences)).strip()
  return full_instruction[0:-1]

#generate answers
def generate_answer_and_index(row):
  """Build the SQuAD-style answer dict for one row.

  The answer text is ``row['choice_list'][row.answer]``. Its start offset is
  looked up in ``row.full_instruction``. Step titles appear there as
  "... step is <title>: <body>", so the title occurrence is preferred: a plain
  ``find`` would return an earlier position whenever the same words also occur
  inside a previous step's body. If the title pattern is absent, the first
  plain occurrence is used.

  Returns:
    {"text": [answer], "answer_start": [index]}; index is -1 when the answer
    does not occur in the instruction at all.
  """
  answer = row["choice_list"][row.answer]
  full_instruction = row.full_instruction

  title_marker = "step is "
  anchored_idx = full_instruction.find(title_marker + answer + ": ")
  if anchored_idx != -1:
    # Skip past the marker so the offset points at the title itself.
    answer_start = anchored_idx + len(title_marker)
  else:
    answer_start = full_instruction.find(answer)

  return {"text": [answer], "answer_start": [answer_start]}

#combine all functions to make final data
#parameter: dataset_type: this function applies to all train/val/test set
def make_final_data(dataset_type):
  """Produce the SQuAD-style frame (id, title, context, question, answers) for one split.

  Derives the context paragraph, the natural-language question and the answer
  dict for every combined row, keeps the first row of each
  (recipe_id, full_instruction, new_question) group, and renames the columns.
  """
  combine_data = combine_text_visual_df(dataset_type)

  # Order matters: the answer lookup reads the 'full_instruction' column.
  combine_data['full_instruction'] = combine_data.apply(generate_full_instruction, axis=1)
  combine_data['new_question'] = combine_data.apply(generate_questions, axis=1)
  combine_data['actual_answer'] = combine_data.apply(generate_answer_and_index, axis=1)

  unique_rows = combine_data.drop_duplicates(
      subset=['recipe_id', 'full_instruction', 'new_question']
  ).reset_index(drop=True)

  # reset_index() exposes the fresh 0..n-1 index as an 'index' column, which becomes 'id'.
  selected = unique_rows[['recipe_id', 'full_instruction', 'new_question', 'actual_answer']].reset_index()
  final_data = selected.rename(columns={'index':'id', 'recipe_id':'title', 'full_instruction':'context', 'new_question':'question', 'actual_answer':'answers'})
  return final_data

In [None]:
# Build the QA-formatted DataFrame for each split; each call reads '<split>_og.json'
# from the working directory.
train_df = make_final_data('train')
val_df = make_final_data('val')
test_df = make_final_data('test')

# TODO: consider a class that applies all the conversion steps to the train/val/test splits

In [None]:
train_df.head(5)
train_df.shape

(5259, 5)

In [None]:
context = "The first step is Veggie Prep: Tip #1 Remove seeds from jalapeños to keep it more on a medium spicy side. First prepare your veggies. Dice up tomatoes, carrots, cilantro, and romaine lettuce, set aside. Next slice up red onion and a jalapeño in half, place onto greased heated skillet and grill until softened.. After the previous step is Sauce Prep: Tip #2 Sharpen your chef’s knife before each use. Once grilled through, remove veggies from skillet and place into food processor. While you begin to make the sauce, place veggie burger onto already heated skillet, mince it up using a spatula, and grill for about five minutes until hot. Now back to the sauce…add in about a tablespoon of cilantro, 2 tablespoons of ranch, and a squeeze of lemon into the processor and blend until well incorporated. For added spice throw in a dash of Sriracha!. After the previous step is Assemble..: Tip #3 With a good sauce taco dinners go from boring to wow! Once veggie burger is cooked remove from skillet and set aside. Now place a tortilla on the skillet and grill until lightly brown on each side. Lets assemble! Place tortilla onto plate, next add on minced burger, lettuce, diced tomato, carrots, and garnish with the Kickin’ Ranch Sauce.. The last step is Suggestions!: Roll it up and serve!Suggestions: Serve with a side of quinoa for a filling side. Use the Kickin’ Ranch Sauce on any Mexican entree! Serve with a skinny margarita to complete your healthy Mexican meal ;) ***For this recipe and more like this please visit my food blog at Everythingbutfish.tumblr.com"
answer = 'Suggestions!'

print(context.find(answer))

1252


In [None]:
for i in train_df.columns:
  print(train_df[i][5250])

5250
chili-bean-rollup-with-a-kickin-ranch-sauce
The first step is Veggie Prep: Tip #1 Remove seeds from jalapeños to keep it more on a medium spicy side. First prepare your veggies. Dice up tomatoes, carrots, cilantro, and romaine lettuce, set aside. Next slice up red onion and a jalapeño in half, place onto greased heated skillet and grill until softened.. After the previous step is Sauce Prep: Tip #2 Sharpen your chef’s knife before each use. Once grilled through, remove veggies from skillet and place into food processor. While you begin to make the sauce, place veggie burger onto already heated skillet, mince it up using a spatula, and grill for about five minutes until hot. Now back to the sauce…add in about a tablespoon of cilantro, 2 tablespoons of ranch, and a squeeze of lemon into the processor and blend until well incorporated. For added spice throw in a dash of Sriracha!. After the previous step is Assemble..: Tip #3 With a good sauce taco dinners go from boring to wow! Once

In [None]:
# #try finding number of answer
# train_df['num_answer'] = train_df.answers.apply(lambda x: len(x['answer_start']))
# x = train_df.query('num_answer > 1')
# x.shape #(389,6) answer with more than 1 index in training, could def affect the performance... training should only have 1 answer

In [None]:
# train_df['num_answer'] = train_df.answers.apply(lambda x: len(x['answer_start']))
# x = train_df.query('num_answer > 1') #after applying code, no more answer with more than 1 possible answer

In [None]:
# Wrap each pandas split as a Dataset and group them under the split names
# used by the later cells ('train', 'val', 'test').
train = Dataset.from_pandas(train_df)
val = Dataset.from_pandas(val_df)
test = Dataset.from_pandas(test_df) 

full_dataset = DatasetDict({'train': train, 'val': val, 'test': test})

# START PREPROCESSING

In [None]:
from transformers import AutoTokenizer

# checkpoint = 'bert-large-uncased'
checkpoint = 'distilbert-base-uncased'
# checkpoint = 'xlm-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
max_length = 512
stride = 64 #reduce stride by half

def preprocess_training_examples(examples):
    """Tokenize a batch of question/context pairs and label answer token positions.

    Uses the module-level ``tokenizer``, ``max_length`` and ``stride``. Only the
    context is truncated and overflowing tokens are returned, so one example may
    yield several features. For each feature, ``start_positions`` and
    ``end_positions`` hold the token indices spanning the first answer
    (``answers[...]["answer_start"][0]`` / ``["text"][0]``), or (0, 0) when the
    answer is not fully inside that feature's context span.

    Args:
        examples: batch dict with "question", "context" and "answers" columns.

    Returns:
        The tokenizer output with "start_positions" and "end_positions" added,
        and "offset_mapping" / "overflow_to_sample_mapping" removed.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        # Map the feature back to the example it was cut from.
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
train_dataset = full_dataset['train'].map(preprocess_training_examples, batched=True, remove_columns=full_dataset['train'].column_names)
len(full_dataset['train']), len(train_dataset)

Map:   0%|          | 0/5259 [00:00<?, ? examples/s]

1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
259
259


(5259, 10362)

In [None]:
def preprocess_validation_examples(examples):
    """Tokenize a batch for evaluation, keeping what is needed to decode predictions.

    Uses the module-level ``tokenizer``, ``max_length`` and ``stride`` with the
    same tokenizer settings as the training preprocessing. Each resulting
    feature gets an "example_id" (the "id" of the example it came from), and its
    "offset_mapping" is rewritten so positions whose sequence id is not 1 hold None.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    feature_count = len(inputs["input_ids"])

    # One source-example id per feature.
    example_ids = [examples["id"][sample_map[pos]] for pos in range(feature_count)]

    for pos in range(feature_count):
        seq_ids = inputs.sequence_ids(pos)
        feature_offsets = inputs["offset_mapping"][pos]
        # Keep character offsets only for context tokens.
        inputs["offset_mapping"][pos] = [
            span if seq_ids[token_pos] == 1 else None
            for token_pos, span in enumerate(feature_offsets)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [None]:
validation_dataset = full_dataset["val"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=full_dataset["val"].column_names,
)
len(full_dataset["val"]), len(validation_dataset)

Map:   0%|          | 0/612 [00:00<?, ? examples/s]

(612, 1250)

# SMALL EVAL, no need to run

In [None]:
small_eval_set = full_dataset["val"].select(range(100))
# trained_checkpoint = "distilbert-base-uncased-distilled-squad"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=full_dataset["val"].column_names,
)

# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# eval_set = small_eval_set.map(
#     preprocess_validation_examples,
#     batched=True,
#     remove_columns=full_dataset["val"].column_names,
# )

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering, DistilBertForQuestionAnswering

eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

# accelerator = Accelerator()

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}
# trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(
#     device
# )

# device = accelerator.device
# batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}
# trained_model = AutoModelForQuestionAnswering.from_pretrained(checkpoint).to(
#     device
# )

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}
trained_model = AutoModelForQuestionAnswering.from_pretrained(checkpoint).to(
    device
)

# torch.cuda.max_split_size_mb(2**28)

with torch.no_grad():
    outputs = trained_model(**batch)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [None]:
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

In [None]:
import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature["example_id"]].append(idx)

import evaluate

metric = evaluate.load("squad")

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [None]:
# added this section

In [None]:
import numpy as np

n_best = 20
max_answer_length = 30
predicted_answers = []

# Read the offsets column once instead of once per feature inside the loop.
all_offsets = eval_set["offset_mapping"]

for example in small_eval_set:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = all_offsets[feature_index]

        # Indices of the n_best highest logits, best first.
        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    # max() raises ValueError on an empty list; fall back to an empty prediction,
    # as compute_metrics does.
    if answers:
        best_answer = max(answers, key=lambda x: x["logit_score"])
        predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})
    else:
        predicted_answers.append({"id": example_id, "prediction_text": ""})

In [None]:
# theoretical_answers = [
#     {"id": ex["id"], "answers": ex["answers"]} for ex in small_eval_set
# ]

theoretical_answers = []
for ex in small_eval_set:
    theoretical_answers.append({
        'id': ex['id'],
        'answers': ex['answers']
    })

In [None]:
print(predicted_answers[0])
print(theoretical_answers[0])

{'id': 0, 'prediction_text': 'optional) My mom loves bacon and jelly, so when I make them for her I insert jelly. You also need..'}
{'id': 0, 'answers': {'answer_start': [1919], 'text': ['Deep Frying']}}


# METRIC FUNCTION

In [None]:
from tqdm.auto import tqdm

n_best = 10
max_answer_length = 10
predicted_answers = []

def compute_metrics(start_logits, end_logits, features, examples):
    """Decode the best answer span per example and score it with the module-level ``metric``.

    Args:
        start_logits, end_logits: per-feature logit arrays, indexed like ``features``.
        features: tokenized features carrying "example_id" and "offset_mapping".
        examples: original examples carrying "id", "context" and "answers".

    Returns:
        The result of ``metric.compute`` on the predictions and references.
        Also prints the raw prediction and reference lists.

    Reads the module-level ``n_best`` and ``max_answer_length``.
    """
    # Group feature positions by the example they were cut from.
    feature_positions = collections.defaultdict(list)
    for position, feature in enumerate(features):
        feature_positions[feature["example_id"]].append(position)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        candidates = []

        for position in feature_positions[example_id]:
            start_scores = start_logits[position]
            end_scores = end_logits[position]
            offsets = features[position]["offset_mapping"]

            top_starts = np.argsort(start_scores)[-1 : -n_best - 1 : -1].tolist()
            top_ends = np.argsort(end_scores)[-1 : -n_best - 1 : -1].tolist()
            for start_index in top_starts:
                # A start outside the context rules out every pairing with it.
                if offsets[start_index] is None:
                    continue
                for end_index in top_ends:
                    if offsets[end_index] is None:
                        continue
                    span_length = end_index - start_index + 1
                    # Reject reversed spans and spans longer than max_answer_length.
                    if span_length < 1 or span_length > max_answer_length:
                        continue
                    candidates.append(
                        {
                            "text": context[offsets[start_index][0] : offsets[end_index][1]],
                            "logit_score": start_scores[start_index] + end_scores[end_index],
                        }
                    )

        # Highest combined logit wins; no candidates means an empty prediction.
        best_text = ""
        if len(candidates) > 0:
            best_text = max(candidates, key=lambda item: item["logit_score"])["text"]
        predicted_answers.append({"id": example_id, "prediction_text": best_text})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    # The metric is given string ids and one {text, answer_start} dict per gold answer.
    predictions = []
    references = []
    for prediction, reference in zip(predicted_answers, theoretical_answers):
        predictions.append(
            {'id': str(prediction['id']), 'prediction_text': prediction['prediction_text']}
        )
        gold = reference['answers']
        references.append(
            {
                'id': str(reference['id']),
                'answers': [
                    {'text': ans, 'answer_start': start}
                    for ans, start in zip(gold['text'], gold['answer_start'])
                ],
            }
        )

    print(predicted_answers)
    print(theoretical_answers)
    return metric.compute(predictions=predictions, references=references)

In [None]:
compute_metrics(start_logits, end_logits, eval_set, small_eval_set)

  0%|          | 0/100 [00:00<?, ?it/s]

[{'id': 0, 'prediction_text': 'that'}, {'id': 1, 'prediction_text': 'black'}, {'id': 2, 'prediction_text': 'necessary.When no'}, {'id': 3, 'prediction_text': 'necessary.When no'}, {'id': 4, 'prediction_text': 'necessary.When no'}, {'id': 5, 'prediction_text': 'with non-stick cooking spray (alternatively'}, {'id': 6, 'prediction_text': 'inside posts. I could see the "grind"'}, {'id': 7, 'prediction_text': 'important'}, {'id': 8, 'prediction_text': 'egg mixture while stirring them together quickly. This is'}, {'id': 9, 'prediction_text': 'grill at a 45*'}, {'id': 10, 'prediction_text': 'days to gauge how'}, {'id': 11, 'prediction_text': 'in the Whip Cream..'}, {'id': 12, 'prediction_text': 'in the Whip Cream..'}, {'id': 13, 'prediction_text': 'in the Whip Cream..'}, {'id': 14, 'prediction_text': 'later..'}, {'id': 15, 'prediction_text': 'Until Your Arm'}, {'id': 16, 'prediction_text': 'Until Your Arm'}, {'id': 17, 'prediction_text': 'dinner. Secondly,'}, {'id': 18, 'prediction_text': 'en

{'exact_match': 1.0, 'f1': 4.128715728715728}

# HF training loop with accelerator!



In [None]:
#preparing dataset for training 

from torch.utils.data import DataLoader
from transformers import default_data_collator
import torch
from transformers import AutoModelForQuestionAnswering, DistilBertForQuestionAnswering

train_dataset.set_format("torch")
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=8, #can we play around w this batch_size here?
)
eval_dataloader = DataLoader(
    validation_set, collate_fn=default_data_collator, batch_size=8
)

In [None]:
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [None]:
print(checkpoint)

distilbert-base-uncased


In [None]:
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler


optimizer = AdamW(model.parameters(), lr=2e-5)

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import Repository, get_full_repo_name

model_checkpoint_name = "distilbert-base-finetuned-recipe-modified"
repo_name = get_full_repo_name(model_checkpoint_name)
print(repo_name)
output_dir = "distilbert-base-finetuned-recipe-modified"
# repo = Repository(output_dir, clone_from=repo_name)

saumyasinha0510/distilbert-base-finetuned-recipe-modified


In [None]:
output_dir = "distilbert-base-finetuned-recipe-modified"
repo = Repository(output_dir, clone_from=repo_name)

/content/distilbert-base-finetuned-recipe-modified is already a clone of https://huggingface.co/saumyasinha0510/distilbert-base-finetuned-recipe-modified. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
from tqdm.auto import tqdm
import torch
import evaluate
import collections

progress_bar = tqdm(range(num_training_steps))

metric = evaluate.load("squad")

# One pass per epoch: train on every batch, evaluate on the validation features,
# then save the model/tokenizer and push the output directory to the Hub repo.
for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator rather than loss.backward()
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    # Rebound each epoch: lists of per-batch arrays first, concatenated arrays below
    start_logits = []
    end_logits = []
    accelerator.print("Evaluation!")
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        # Collect logits via the accelerator and move them to CPU numpy arrays
        start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
        end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    # Keep at most one row per validation feature
    # NOTE(review): presumably trims extra rows introduced by gathering — confirm
    start_logits = start_logits[: len(validation_dataset)]
    end_logits = end_logits[: len(validation_dataset)]

    # validation_dataset still has example_id/offset_mapping; full_dataset["val"] has the raw text
    metrics = compute_metrics(
        start_logits, end_logits, validation_dataset, full_dataset["val"]
    )
    print(f"epoch {epoch}:", metrics)

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process writes the tokenizer and pushes to the Hub
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # blocking=False: the call does not wait for the push to finish
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/3888 [00:00<?, ?it/s]

Evaluation!


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/612 [00:00<?, ?it/s]

[{'id': 0, 'prediction_text': 'Deep Frying'}, {'id': 1, 'prediction_text': 'Bake'}, {'id': 2, 'prediction_text': 'Bake Your Cookies'}, {'id': 3, 'prediction_text': 'Bake Your Cookies'}, {'id': 4, 'prediction_text': 'Frosting'}, {'id': 5, 'prediction_text': 'Serving'}, {'id': 6, 'prediction_text': 'Gently Pop Up'}, {'id': 7, 'prediction_text': 'Start Cooking'}, {'id': 8, 'prediction_text': 'Enjoy Your Delicious Meal!'}, {'id': 9, 'prediction_text': 'Grill / Smoke or Bake'}, {'id': 10, 'prediction_text': 'Gather Whey'}, {'id': 11, 'prediction_text': 'Add Sprinkles, or Not.'}, {'id': 12, 'prediction_text': 'Ingredients and Utensils'}, {'id': 13, 'prediction_text': 'Juice Your Lemons...'}, {'id': 14, 'prediction_text': 'Oil the Steamer...'}, {'id': 15, 'prediction_text': 'Baking! Finally!'}, {'id': 16, 'prediction_text': 'Baking! Finally!'}, {'id': 17, 'prediction_text': 'Snack 1'}, {'id': 18, 'prediction_text': 'Preparing the Food for Canning'}, {'id': 19, 'prediction_text': 'Materials'},

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/612 [00:00<?, ?it/s]

[{'id': 0, 'prediction_text': 'Deep Frying'}, {'id': 1, 'prediction_text': 'Bake'}, {'id': 2, 'prediction_text': 'Bake Your Cookies'}, {'id': 3, 'prediction_text': 'Bake Your Cookies'}, {'id': 4, 'prediction_text': 'Frosting'}, {'id': 5, 'prediction_text': 'Serving'}, {'id': 6, 'prediction_text': 'Gently Pop Up'}, {'id': 7, 'prediction_text': 'Start Cooking'}, {'id': 8, 'prediction_text': 'Enjoy Your Delicious Meal!'}, {'id': 9, 'prediction_text': 'Jerk Paste'}, {'id': 10, 'prediction_text': 'Gather Whey'}, {'id': 11, 'prediction_text': 'Add Sprinkles, or Not.'}, {'id': 12, 'prediction_text': 'Ingredients and Utensils'}, {'id': 13, 'prediction_text': 'Juice Your Lemons...'}, {'id': 14, 'prediction_text': 'Oil the Steamer...'}, {'id': 15, 'prediction_text': 'Baking! Finally!'}, {'id': 16, 'prediction_text': 'Baking! Finally!'}, {'id': 17, 'prediction_text': 'Snack 1'}, {'id': 18, 'prediction_text': 'Preparing the Food for Canning'}, {'id': 19, 'prediction_text': 'Materials'}, {'id': 20,

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/612 [00:00<?, ?it/s]

[{'id': 0, 'prediction_text': 'Deep Frying'}, {'id': 1, 'prediction_text': 'Bake'}, {'id': 2, 'prediction_text': 'Bake Your Cookies'}, {'id': 3, 'prediction_text': 'Bake Your Cookies'}, {'id': 4, 'prediction_text': 'Frosting'}, {'id': 5, 'prediction_text': 'Serving'}, {'id': 6, 'prediction_text': 'Gently Pop Up'}, {'id': 7, 'prediction_text': 'Start Cooking'}, {'id': 8, 'prediction_text': 'Enjoy Your Delicious Meal!'}, {'id': 9, 'prediction_text': 'Jerk Paste'}, {'id': 10, 'prediction_text': 'Gather Whey'}, {'id': 11, 'prediction_text': 'Add Sprinkles, or Not.'}, {'id': 12, 'prediction_text': 'Ingredients and Utensils'}, {'id': 13, 'prediction_text': 'Juice Your Lemons...'}, {'id': 14, 'prediction_text': 'Oil the Steamer...'}, {'id': 15, 'prediction_text': 'Baking! Finally!'}, {'id': 16, 'prediction_text': 'Baking! Finally!'}, {'id': 17, 'prediction_text': 'Snack 1'}, {'id': 18, 'prediction_text': 'Preparing the Food for Canning'}, {'id': 19, 'prediction_text': 'Materials'}, {'id': 20,

In [None]:
tokenizer.model_max_length

512

In [None]:

for i in test_df.columns:
  print(i, test[4][i])

id 4
title holy-guacachurri
context The first step is Get All Your Ingredients and Utensils Together: Strawberry Balsamic DressingServings: 4Ingredients: 3/4 pack of fresh strawberries, sliced 1/2 cup of Balsamic vinegar 1/2 cup of Olive Oil 1 Tablespoon of Dijon mustard1 teaspoon of garlic, minced Arugula Green grapes Salt & pepper to tasteUtensils: Knife Large cookie pan Mixer Spoon to taste:)GuacaChurriYes, we came up with this name as it is the result of merging Chimichurri recipe with guacamole ingredients.Servings: 41 pack of cilantro, washed 1/4 pack of parsley, washed 1 avocado, peeled 1 tablespoon of Sriracha, why not?1/4 cup of olive oil 1Tablespoon of garlic, it can be minced or whole cloves... Salt & Pepper to tasteUtensils:Knife Mixer Spoon to tasteBalsamic Vinegar "caviar" 1 cup of olive oil, refrigerated for 30 minutes 1/2 cup of balsamic vinegar 1 teaspoon of Agar Agar (link)Utensils:1 small pot 1 whisker1 pipette (link)bowl strainerGrilled Lemon SalmonServings: 41 fill

In [None]:
from transformers import pipeline

# Checkpoint identifier used for inference; swap in your own checkpoint here.
model_checkpoint = "saumyasinha0510/distilbert-base-finetuned-recipe-modified"

# Build a question-answering pipeline backed by that checkpoint.
question_answerer = pipeline(task="question-answering", model=model_checkpoint)

context = """
First, Get All Your Ingredients and Utensils Together: Strawberry Balsamic DressingServings: 4Ingredients: 3/4 pack of fresh strawberries, sliced 1/2 cup of Balsamic vinegar 1/2 cup of Olive Oil 1 Tablespoon of Dijon mustard1 teaspoon of garlic, minced Arugula Green grapes Salt & pepper to tasteUtensils: Knife Large cookie pan Mixer Spoon to taste:)GuacaChurriYes, we came up with this name as it is the result of merging Chimichurri recipe with guacamole ingredients.Servings: 41 pack of cilantro, washed 1/4 pack of parsley, washed 1 avocado, peeled 1 tablespoon of Sriracha, why not?1/4 cup of olive oil 1Tablespoon of garlic, it can be minced or whole cloves... Salt & Pepper to tasteUtensils:Knife Mixer Spoon to tasteBalsamic Vinegar "caviar" 1 cup of olive oil, refrigerated for 30 minutes 1/2 cup of balsamic vinegar 1 teaspoon of Agar Agar (link)Utensils:1 small pot 1 whisker1 pipette (link)bowl strainerGrilled Lemon SalmonServings: 41 fillet of fresh salmon 1/2 cup of leaves of basil, minced 1 tablespoon of garlic, minced 1 teaspoon of olive oil1 lemon, sliced Salt & pepper to tasteUtensils:1 sheet of aluminum foil, enough to wrap your salmonGrill. ours is a propane grill, but it is your choice. After Get All Your Ingredients and Utensils Together is Caramelize the Strawberries: For a better and stronger taste on the salad dressing, we like to caramelize the strawberries by slicing them and placing then in a cookie sheet for 20 min in the oven at 425 F.Take the sheet out of the oven, let it cool down (don't burn yourself), then scrap the strawberries with a knife. This step is very important to enhance the flavor of this amazing dressing. . After Caramelize the Strawberries is Mix Mix Mix (Strawberry Dressing): Place dijon mustard, mint leaves, salt & pepper, garlic and caramelized strawberries in the mixer bowl. Then start up your mixer and slowly add the balsamic vinegar and then the olive oil. Add a few fresh strawberries for that citric hint. 
Stop mixing and save it in the refrigerator until halting time. . After Mix Mix Mix (Strawberry Dressing) is Prepare and Grill Your Salmon: Once you have all your ingredients together, cut the salmon in 2 inch wide slices, this will make plating easier, prettier, nicer. Place the salmon in aluminum foil and add basil, garlic, salt & pepper, oiled oil and lemons on top. Wrap the salmon completely and place it the top of your grill for 20 min in medium/low heat. After 20 min place it on a side, but don't unwrapped to keep it warm. No one likes cold grilled salmon, right? :P. After Prepare and Grill Your Salmon is Mix, Mix, Mix "The Amazing Guacachurri": Add all the ingredients for the Guacachurri (I just like to say this word :P), in the mixer, except olive oil. Adding the olive oil have to be during mixing, and in a slow pace, so all the ingredients merge nicely into a creamy sauce. . After Mix, Mix, Mix "The Amazing Guacachurri" is Lets Make Some Magic (Balsamic "caviar"): This is one of my favorite parts, because the result is super cool and taste amazing!In here we will need to place the balsamic vinegar in a medium/low heat (our electric stove works on 4), until it start a slow boil when that happens remove the pot from the stove and add the Agar Agar and start mixing immediately with a whisker until you find a heavier texture like in picture number 5, you will be able to see the heavier balsamic in the bottom of the pot. Then pull the olive oil cup from the freezer and with the help of the pipette (we bought ours in amazon) start dropping the agar agar and balsamic mix, you will see the "bubbles" of balsamic falling down into the olive oil. When done, separate the balsamic bubbles from the olive oil with the help of a bowl and a strainer. Save your balsamic magic apart. We are ready to plate!!!. After Lets Make Some Magic (Balsamic "caviar") is Let's Plate!!! or Impress Your Guests...: Impress your guests with a professional look meal. 
We used a long serving plate to add the linear result. It will provide a smooth transition from every item on the plate without smooching it in a round regular plate. On this final step, you can play with the dishes to add an artistic composition. We started with the arugula, strawberry, green grapes, avocado salad, topping it with the fantastic Strawberry Dressing, then we cut a hole on a couple of strawberries because the Magic balsamic "caviar" tastes delicious with them, they are homies!!! Then, the Guacachurri to make a creamy bed for the Salmon. Oh, wait! The Salmon have to be unwrapped and it is your choice to leave the lemon slices or take them out, or just leave one as we did, because the flavor is already there. Now, you have a beautiful and full of flavor plate ideal for this hot Texan Summer. . Last, Destroy the Plate and Chill: Now it is time to eat and sit on the backyard to watch the stars and relax!!! I have added an extra picture to show another way to cook the Salmon and Guacachurri together in the oven, which turned out super good!!! Just enjoy cooking and eating delicious food with the people you love. We (the Bear and I), always enjoy cooking and enjoying life, you should do too! Thanks, and don't forget to vote for us on the contest above!!! Wink ;)


"""
# Ask which step follows the first one in the recipe text stored in `context`.
question = "What is the step after Get All Your Ingredients and Utensils Together?"
# Kept as the cell's last expression so the notebook displays the pipeline's result.
question_answerer(question=question, context=context)

Downloading (…)lve/main/config.json:   0%|          | 0.00/561 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

{'score': 0.012190074659883976,
 'start': 1272,
 'end': 1299,
 'answer': 'Caramelize the Strawberries'}

# Visualisation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "darkgrid" look to all subsequent plots.
sns.set(style="darkgrid")

# Enlarge the default figure to 16x8 inches.
# Font scaling stays at seaborn's default (a font_scale=1.5 override is not applied).
plt.rcParams.update({"figure.figsize": (16, 8)})

# Learning Ray Tune

In [None]:
pip install "ray[tune]" transformers datasets scipy sklearn torch