In [1]:
# Install the Hugging Face libraries this notebook depends on.
# NOTE(review): versions are unpinned. The recorded output below resolved
# datasets 2.12.0, evaluate 0.4.0 and transformers 4.28.1 -- consider pinning
# them so a fresh run reproduces the same environment.
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-c

In [2]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
# from bs4 import BeautifulSoup

from datasets import DatasetDict, Dataset, load_dataset
from sklearn.model_selection import train_test_split
from pydrive.auth import GoogleAuth
from google.colab import drive
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Display settings for rendered DataFrames: no limit on the number of columns
# shown, up to 1000 rows per display, and no truncation of long cell text.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000) 
pd.set_option('display.max_colwidth', None)

In [3]:
# Authenticate the Colab user, then hand the default application credentials
# to PyDrive so files can be fetched by Drive file ID.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, which was imported above as the
# `google.colab.drive` module; from here on `drive` is the PyDrive client.
drive = GoogleDrive(gauth)

# Google Drive file IDs of the original RecipeQA train / val / test JSON files.
train_dataid = '1c206RpN1YCecrL6Hvjl52a3R16Lf3cO1'
val_dataid = '1X0RXXyTaaSUqJgbiuNzTJkB0ryZhkdwo'
test_dataid = '1Xp7zTJOVV3ZeEaqEcfl1MziJY7JcxoJh'

# Save each split into the working directory as '<split>_og.json'; these are
# the file names make_text_visual_df() opens later.
download = drive.CreateFile({'id': train_dataid})
download.GetContentFile('train_og.json')

download = drive.CreateFile({'id': val_dataid})
download.GetContentFile('val_og.json')

download = drive.CreateFile({'id': test_dataid})
download.GetContentFile('test_og.json')

# Convert original data to desired format for QA

In [4]:
def make_text_visual_df(dataset_type):
  """Load one RecipeQA split and separate its textual and visual examples.

  Reads '<dataset_type>_og.json' from the current working directory. The file
  is expected to hold a top-level 'data' list with one dict per example, each
  carrying a 'task' field.

  Args:
    dataset_type: Split prefix used in the file name, e.g. 'train'.

  Returns:
    A (textual_df, visual_df) tuple of DataFrames. textual_df contains the
    'textual_cloze' examples with columns recipe_id, context, choice_list,
    answer and question; visual_df contains the 'visual_coherence' examples
    with columns recipe_id and context.

  Raises:
    FileNotFoundError: if the split file has not been downloaded.
    KeyError: if 'data'/'task' is missing, or a kept column is absent
      (which also happens when a task has no examples at all).
  """
  path = '{dataset}_og.json'.format(dataset=dataset_type)
  # The context manager closes the handle even if parsing fails; the file is
  # decoded explicitly as UTF-8 instead of relying on the platform default.
  with open(path, 'r', encoding='utf-8') as f:
    read_data = json.load(f)

  examples = read_data['data']
  textual_json = [x for x in examples if x['task'] == 'textual_cloze']
  visual_json = [x for x in examples if x['task'] == 'visual_coherence']

  textual_keep_col = ['recipe_id', 'context', 'choice_list', 'answer', 'question']
  textual_df = pd.DataFrame(textual_json)[textual_keep_col]

  visual_keep_col = ['recipe_id', 'context']
  visual_df = pd.DataFrame(visual_json)[visual_keep_col]

  return textual_df, visual_df

def combine_all_steps(row):
  """Return the numbered step titles of one recipe.

  Args:
    row: A record whose `context` attribute is a sequence of step dicts, each
      with a 'title' entry.

  Returns:
    A list of strings of the form 'Step <n>: <title>', numbered from 1 in the
    order the steps appear in `row.context`.
  """
  return [
      "Step " + str(number) + ": " + entry['title']
      for number, entry in enumerate(row.context, start=1)
  ]

def combine_text_visual_df(dataset_type):
  """Join a split's textual-cloze examples with their recipe's step titles.

  The step titles come from the 'visual_coherence' examples of the same
  recipe (matched on recipe_id with an inner join), so textual examples whose
  recipe has no visual example are dropped.

  Args:
    dataset_type: Split prefix passed through to make_text_visual_df(),
      e.g. 'train'.

  Returns:
    A DataFrame with columns recipe_id, context, choice_list, answer,
    question and all_steps. In `question`, every '@placeholder' token is
    replaced by '_'; `context` is the textual example's context.
  """
  textual_df, visual_df = make_text_visual_df(dataset_type)

  # Build the step-title column with assign() rather than writing into the
  # column-sliced frame, and keep only the columns the merge needs. Leaving
  # the visual 'context' out means the textual 'context' keeps its name and
  # no context_x/context_y rename is required afterwards.
  step_titles = visual_df.assign(
      all_steps=visual_df.apply(combine_all_steps, axis=1)
  )[['recipe_id', 'all_steps']]

  merged = pd.merge(textual_df, step_titles, how='inner', on=['recipe_id'])
  merged = merged[['recipe_id', 'context', 'choice_list', 'answer', 'question', 'all_steps']]

  # assign() returns a fresh frame, so the question column is replaced without
  # setting a value on a slice of another DataFrame.
  return merged.assign(
      question=merged['question'].apply(
          lambda tokens: ['_' if token == '@placeholder' else token for token in tokens]
      )
  )

def combine_body_with_step(row):
  """Build the full instruction text of one recipe.

  Args:
    row: A record with 'all_steps' (the numbered step titles) and 'context'
      (a sequence of step dicts with a 'body' entry), aligned by position.

  Returns:
    One string made of '<step title>: <body>.' sentences joined by single
    spaces, or an empty string when there are no steps.
  """
  step_dicts = row["context"]
  sentences = [
      str(label) + ": " + step_dicts[position]['body'] + "."
      for position, label in enumerate(row["all_steps"])
  ]
  # Joining with a space is the same as appending ". " after every step and
  # then trimming the final trailing space.
  return " ".join(sentences)

def generate_questions(row):
  """Turn a cloze-style step list into a natural-language question.

  Args:
    row: A record whose 'question' entry is a list of step titles in which
      the missing step is marked with '_'.

  Returns:
    'What is the step before <next title>?' when the blank is at position 0
    or 2, otherwise 'What is the step after <previous title>?'.

  Raises:
    ValueError: if the question contains no '_' token.
  """
  tokens = row['question']
  blank_at = tokens.index('_')
  # There is more than one reasonable phrasing. An alternative that was
  # considered asks "after X and before Y" for interior blanks and uses the
  # single-neighbour form only at the two ends. Suggestions are welcome.
  if blank_at in (0, 2):
    return "What is the step before " + tokens[blank_at + 1] + "?"
  return "What is the step after " + tokens[blank_at - 1] + "?"

def generate_answer_and_index(row):
  """Build the answer record for one example.

  Args:
    row: A record with 'choice_list' (candidate answers), `answer` (index of
      the correct candidate) and `full_instruction` (the context string).

  Returns:
    A dict with 'text' holding the correct answer in a one-element list and
    'answer_start' holding, in a one-element list, the offset of its first
    occurrence in the context (-1 when the answer text does not occur).
  """
  correct_choice = row["choice_list"][row.answer]
  start_offset = row.full_instruction.find(correct_choice)
  return {"text": [correct_choice], "answer_start": [start_offset]}

def make_final_data(dataset_type):
  """Produce the question-answering table for one split.

  Args:
    dataset_type: Split prefix passed through to combine_text_visual_df(),
      e.g. 'train'.

  Returns:
    A DataFrame with columns id, title, context, question and answers, where
    id is the row position after duplicates are removed, title is the recipe
    id, context is the full instruction text, question is the generated
    question and answers is the dict built by generate_answer_and_index().
  """
  frame = combine_text_visual_df(dataset_type)

  # Derived columns; the answer record needs full_instruction, so order matters.
  frame['full_instruction'] = frame.apply(combine_body_with_step, axis=1)
  frame['new_question'] = frame.apply(generate_questions, axis=1)
  frame['actual_answer'] = frame.apply(generate_answer_and_index, axis=1)

  # Keep the first of any rows sharing recipe, context, question and answer index.
  is_repeat = frame[['recipe_id', 'full_instruction', 'new_question', 'answer']].duplicated()
  unique_rows = frame[~is_repeat].reset_index(drop=True)

  # reset_index() materialises the fresh positional index as an 'index'
  # column, which becomes the example id.
  selected = unique_rows[['recipe_id', 'full_instruction', 'new_question', 'actual_answer']].reset_index()
  return selected.rename(columns={
      'index': 'id',
      'recipe_id': 'title',
      'full_instruction': 'context',
      'new_question': 'question',
      'actual_answer': 'answers',
  })

In [5]:
# Build the QA-formatted DataFrame for each split from its '<split>_og.json' file.
train_df = make_final_data('train')
val_df = make_final_data('val')
test_df = make_final_data('test')

In [6]:
# Convert each split's DataFrame into a `datasets.Dataset`.
train = Dataset.from_pandas(train_df)
val = Dataset.from_pandas(val_df)
test = Dataset.from_pandas(test_df) 

# Bundle the three splits under the keys 'train', 'val' and 'test'.
full_dataset = DatasetDict({'train': train, 'val': val, 'test': test})

In [7]:
# Display each split's feature names and row count.
full_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5597
    })
    val: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 645
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 686
    })
})

In [None]:
# Everyone can make a copy of this notebook and continue from here to experiment with QA modeling.