In [21]:
import pandas as pd
from utils import augment_string
from random import shuffle
from tqdm import tqdm
# Hook tqdm into pandas; the `progress_apply` calls further down rely on this.
tqdm.pandas()

In [2]:
# Load the three question/answer sources that are later concatenated.
# NOTE(review): the second and third paths are identical, so `palm2_p2` is a
# second copy of `palm2` and every row from that file ends up duplicated in
# the combined frame. This looks like a copy-paste slip -- confirm the
# intended filename for the second PaLM 2 batch.
cohere, palm2, palm2_p2 = (
    pd.read_feather(feather_path)
    for feather_path in (
        "datasets/movie_datasets/imdb/movies_qa_cohere.feather",
        "datasets/movie_datasets/imdb/movie_qa.feather",
        "datasets/movie_datasets/imdb/movie_qa.feather",
    )
)

In [3]:
# Check that all three sources expose the same columns before concatenating.
for source_frame in (cohere, palm2, palm2_p2):
    print(source_frame.columns)

Index(['tconst', 'originalTitle', 'data', 'question', 'answer'], dtype='object')
Index(['tconst', 'originalTitle', 'data', 'question', 'answer'], dtype='object')
Index(['tconst', 'originalTitle', 'data', 'question', 'answer'], dtype='object')


In [4]:
# Stack the three sources row-wise and renumber the index from 0.
complete_movie_qa = pd.concat(
    (cohere, palm2, palm2_p2),
    axis="index",
    ignore_index=True,
)

In [5]:
# Preview the first rows of the combined frame.
complete_movie_qa.head()

Unnamed: 0,tconst,originalTitle,data,question,answer
0,tt10024036,Satyarthi,Description: Kailash Satyarthi has spent his l...,What is the rating of the movie Satyarthi?,8.3
1,tt10024036,Satyarthi,Description: Kailash Satyarthi has spent his l...,What is the release year of the movie Satyarthi?,2019
2,tt10024036,Satyarthi,Description: Kailash Satyarthi has spent his l...,What is the genre of the movie Satyarthi?,Documentary
3,tt10024036,Satyarthi,Description: Kailash Satyarthi has spent his l...,Who is the movie Satyarthi based on?,Kailash Satyarthi
4,tt10024036,Satyarthi,Description: Kailash Satyarthi has spent his l...,What is the vote count of the movie Satyarthi?,6.0


In [6]:
# Both templates share the same preamble and field layout. The training
# template additionally carries the expected answer after the response marker,
# while the test template stops at the marker so the model must complete it.
_PROMPT_LINES = (
    "Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.",
    "###Instruction: {question}",
    "###Input: {description}",
    "###Response:",
)
test_prompt = "\n".join(_PROMPT_LINES)
train_prompt = test_prompt + " {answer}"

In [7]:
# Grab a single row (position 10) to try the prompt template on.
sample = complete_movie_qa.iloc[10]

In [9]:
# Render one fully filled training prompt as a sanity check of the template.
train_prompt.format(
    question=sample["question"].strip(),
    description=sample["data"].strip(),
    answer=sample["answer"].strip(),
)

'Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.\n###Instruction: What award was the movie Satyarthi nominated for?\n###Input: Description: Kailash Satyarthi has spent his lifetime rescuing kids from slavery, and was awarded the Nobel Peace Prize for his work in 2014. The film looks at his journey and goes behind the scenes to witness first hand how he has changed lives of thousands of children. \nRelease Year: 2019 \nRuntime(in minutes): 52 \nGenre: Documentary \nRating: 8.3 \nVotes: 6.0\n###Response: The Nobel Peace Prize'

In [10]:
# Shuffle all rows so the three sources are interleaved. A fixed random_state
# keeps the row order (and the copy of this frame written to disk later)
# identical across kernel restarts.
complete_movie_qa = complete_movie_qa.sample(frac=1, ignore_index=True, random_state=42)

In [11]:
# Preview the first rows again, now that the frame has been shuffled.
complete_movie_qa.head()

Unnamed: 0,tconst,originalTitle,data,question,answer
0,tt10177712,"Alexis Viera, una historia de superación","Description: Alexis Viera, una historia de sup...",Who is the director of the movie?,Santiago Perez Bernal
1,tt11736782,The Mill,Description: The 2020 deadline to close the Bo...,What is the name of the director of the movie?,Ken MacDonald
2,tt11078522,Psycho-Pass: Sinners of the System Case.3 - On...,Description: Following the incident in the Sou...,"How many votes does ""Psycho-Pass: Sinners of t...",819.0
3,tt10278386,Witches of the Water,Description: A young man falls in love with a ...,What is the genre of the movie?,"Drama, Horror"
4,tt28378149,Yami Douga 18,"Description: 18th installment in the ""Yami Dou...",In what year was the movie released?,2018


In [12]:
# (rows, columns) of the combined frame.
complete_movie_qa.shape

(73245, 5)

In [13]:
# Draw 500 distinct titles to hold out of training entirely.
# The unique titles are sorted first so the draw does not depend on the
# current row order of the frame, and random_state pins the selection so the
# held-out set is the same on every run.
unseen_titles = (
    pd.Series(complete_movie_qa["originalTitle"].unique())
    .sort_values(ignore_index=True)
    .sample(500, random_state=42)
    .to_list()
)

In [14]:
# Permute the held-out titles before they are cut into validation and test
# halves. A seeded pandas permutation replaces the unseeded global-RNG
# `shuffle`, so the validation/test assignment is repeatable across runs.
unseen_titles = pd.Series(unseen_titles).sample(frac=1, random_state=42).to_list()

In [15]:
# First 250 held-out titles go to validation, the remainder to test.
val_titles, test_titles = unseen_titles[:250], unseen_titles[250:]

In [16]:
# Partition rows by title: anything whose title was held out goes to
# validation or test, everything else to training, so no held-out title
# appears in the training rows. Each split is reshuffled with a fixed
# random_state so the saved files are identical across kernel restarts.
train_ds = complete_movie_qa[~(complete_movie_qa["originalTitle"].isin(unseen_titles))].sample(frac=1, ignore_index=True, random_state=42)
val_ds = complete_movie_qa[complete_movie_qa["originalTitle"].isin(val_titles)].sample(frac=1, ignore_index=True, random_state=42)
test_ds = complete_movie_qa[complete_movie_qa["originalTitle"].isin(test_titles)].sample(frac=1, ignore_index=True, random_state=42)

In [17]:
# Report (rows, columns) for each split, in train / val / test order.
for split_frame in (train_ds, val_ds, test_ds):
    print(split_frame.shape)

(63921, 5)
(4564, 5)
(4760, 5)


In [18]:
# Select 20% of the training rows whose questions will be perturbed.
# random_state pins which rows are chosen so the augmented training set can be
# regenerated exactly.
noisy_samples = train_ds.sample(frac=.20, random_state=42)

In [19]:
# Perturb the question of every selected row, then write the results back
# into the training frame at the same index labels in a single assignment.
augmented_questions = [
    augment_string(question_text)
    for question_text in tqdm(noisy_samples["question"], total=noisy_samples.shape[0])
]
train_ds.loc[noisy_samples.index, "question"] = augmented_questions

100%|██████████████████████████████████████████████████████████████████████████| 12784/12784 [00:11<00:00, 1065.70it/s]


In [22]:
def _render_prompt(row, template, with_answer=True):
    """Fill `template` from one row's stripped question, data and (optionally) answer.

    Args:
        row: A row exposing "question", "data" and "answer" string fields.
        template: Format string with {question}, {description} and, when
            `with_answer` is true, {answer} placeholders.
        with_answer: When false the row's answer is not read at all.

    Returns:
        The formatted prompt string.
    """
    fields = {
        "question": row["question"].strip(),
        "description": row["data"].strip(),
    }
    if with_answer:
        fields["answer"] = row["answer"].strip()
    return template.format(**fields)


# Train and validation prompts include the answer; test prompts end at the
# response marker.
train_ds["prompt"] = train_ds.progress_apply(lambda row: _render_prompt(row, train_prompt), axis=1)
val_ds["prompt"] = val_ds.progress_apply(lambda row: _render_prompt(row, train_prompt), axis=1)
test_ds["prompt"] = test_ds.progress_apply(lambda row: _render_prompt(row, test_prompt, with_answer=False), axis=1)

100%|█████████████████████████████████████████████████████████████████████████| 63921/63921 [00:00<00:00, 96227.45it/s]
100%|███████████████████████████████████████████████████████████████████████████| 4564/4564 [00:00<00:00, 84505.99it/s]
100%|██████████████████████████████████████████████████████████████████████████| 4760/4760 [00:00<00:00, 123470.22it/s]


In [24]:
# Persist the three splits plus the full shuffled frame as feather files.
for frame_to_save, destination in (
    (train_ds, "datasets/movie_datasets/imdb/train_llm_ds_v1.feather"),
    (val_ds, "datasets/movie_datasets/imdb/val_llm_ds_v1.feather"),
    (test_ds, "datasets/movie_datasets/imdb/test_llm_ds_v1.feather"),
    (complete_movie_qa, "datasets/movie_datasets/imdb/movies_ds_v1.feather"),
):
    frame_to_save.to_feather(destination)