In [30]:
import pandas as pd
import regex as re
from random import choice, shuffle
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Load the question/answer frames that are merged into a single dataset below.
cohere = pd.read_feather("datasets/movie_datasets/imdb/movies_qa_cohere.feather")
palm2 = pd.read_feather("datasets/movie_datasets/imdb/movie_qa.feather")
# NOTE(review): this reads exactly the same file as `palm2`, so each of its rows
# ends up in the concatenated dataset twice -- confirm whether a separate
# "part 2" file was meant to be loaded here.
palm2_p2 = pd.read_feather("datasets/movie_datasets/imdb/movie_qa.feather")

In [3]:
# Scratch check: draws one column name at random; the result is not assigned.
choice(['tconst', 'originalTitle', 'data', 'question', 'answer'])

'answer'

In [4]:
# Show the columns of each source so they can be compared before concatenation.
for source_frame in (cohere, palm2, palm2_p2):
    print(source_frame.columns)

Index(['tconst', 'originalTitle', 'data', 'question', 'answer'], dtype='object')
Index(['tconst', 'originalTitle', 'data', 'question', 'answer'], dtype='object')
Index(['tconst', 'originalTitle', 'data', 'question', 'answer'], dtype='object')


In [5]:
# Stack the three sources row-wise under a fresh 0..n-1 index.
source_frames = [cohere, palm2, palm2_p2]
complete_movie_qa = pd.concat(source_frames, ignore_index=True)

In [6]:
# Drop questions that ask for the movie's name: once the title is injected into
# the question text they would contain their own answer.
# The original second prefix was misspelled "flim", so questions starting with
# "what is the name of the film" slipped through. Both correct spellings are
# matched now; the misspelt variant is kept so nothing previously removed comes back.
question_lower = complete_movie_qa["question"].str.lower()
# `str.match` anchors at the start of the string, i.e. it is a prefix test.
asks_for_title = question_lower.str.match(r"what is the name of the (?:movie|film|flim)")
complete_movie_qa = complete_movie_qa[~asks_for_title].reset_index(drop=True)

In [7]:
# Flag rows whose question already mentions the movie title (case-insensitive,
# plain substring comparison).
complete_movie_qa["question_has_title"] = complete_movie_qa.progress_apply(
    lambda row: row["originalTitle"].lower() in row["question"].lower(),
    axis=1,
)

100%|█████████████████████████████████████████████████████████████████████████| 70679/70679 [00:01<00:00, 40769.57it/s]


In [8]:
# Partition the dataset by whether the question already names its movie.
has_title_mask = complete_movie_qa["question_has_title"]
q_w_title = complete_movie_qa.loc[has_title_mask].reset_index(drop=True)
q_wo_title = complete_movie_qa.loc[~has_title_mask].reset_index(drop=True)

In [9]:
# (rows, columns) of the questions that do not mention their movie's title.
q_wo_title.shape

(37877, 6)

In [10]:
# Rewrite title-less questions that refer to "the movie" / "the film" so that
# the reference is followed by the actual title, e.g.
#   "What genre is the movie?" -> "What genre is the movie 'Jasmine Star'?"
#
# Fixes over the previous two-step `re.sub` version:
#   * the title was used as a regex replacement *template*, so a backslash in a
#     title was interpreted as an escape/group reference (or raised an error);
#     a callable replacement inserts the text verbatim;
#   * "the movie" and "the film" are now handled in a single pass, so a title
#     that itself contains "the film" is no longer expanded a second time.
generic_reference = re.compile(r"the (?:movie|film)")
needs_title = q_wo_title["question"].str.contains(generic_reference.pattern)

for i, row in tqdm(q_wo_title[needs_title].iterrows(), total=int(needs_title.sum())):
    # Randomly keep or lowercase the title to vary its casing.
    title = choice([row["originalTitle"], row["originalTitle"].lower()])
    # The plain form is listed twice so it is drawn twice as often as each decorated form.
    title_replacements = [f"*{title}*", f"`{title}`", f"'{title}'", f"{title}", f"{title}"]
    replacement_title = choice(title_replacements)
    updated_question = generic_reference.sub(
        lambda match, inserted=replacement_title: f"{match.group(0)} {inserted}",
        row["question"],
    )
    q_wo_title.at[i, "question"] = updated_question
    q_wo_title.at[i, "question_has_title"] = True

24255it [00:01, 13201.19it/s]


In [11]:
# Inspect the rows that were flagged as containing a title by the rewrite loop.
q_wo_title[q_wo_title["question_has_title"]]

Unnamed: 0,tconst,originalTitle,data,question,answer,question_has_title
0,tt11230106,Jasmine Star,Description: A determined 16-year-old with alb...,What genre is the movie 'Jasmine Star'?,"Drama, Family",True
1,tt11230106,Jasmine Star,Description: A determined 16-year-old with alb...,What is the release year of the movie Jasmine ...,2019,True
2,tt11230106,Jasmine Star,Description: A determined 16-year-old with alb...,What is the runtime of the movie `Jasmine Star`?,58 minutes,True
3,tt11230106,Jasmine Star,Description: A determined 16-year-old with alb...,What is the description of the movie jasmine s...,A determined 16-year-old with albinism makes i...,True
4,tt11230106,Jasmine Star,Description: A determined 16-year-old with alb...,Who are the main characters in the movie jasmi...,"Jasmine, her father, and her brother",True
...,...,...,...,...,...,...
37869,tt13893782,En Cenizas,Description: This film is based on the story o...,Who wrote the film `en cenizas`?,Nicolás Guerrero,True
37870,tt13893782,En Cenizas,Description: This film is based on the story o...,Who starred in the film `En Cenizas`?,"Juan Pablo Urrego, Adriana Silva, Santiago Mou...",True
37871,tt13893782,En Cenizas,Description: This film is based on the story o...,What awards did the film 'En Cenizas' win?,,True
37872,tt13893782,En Cenizas,Description: This film is based on the story o...,What was the critical reception of the film `e...,Positive,True


In [12]:
# Append the rewritten questions to the ones that named their movie from the start.
# Re-running this cell appends the same rows again.
newly_titled = q_wo_title.loc[q_wo_title["question_has_title"]]
q_w_title = pd.concat([q_w_title, newly_titled], ignore_index=True)

In [13]:
# Persist the combined set of questions that contain their movie title.
q_w_title.to_feather("datasets/movie_datasets/imdb/complete_movie_qa_ds.feather")

## Dataset Augmentation

In [14]:
import nlpaug.augmenter.char as nac

In [15]:
# Question text of the row at position 15, used as the example input for the
# augmentation demo below.
sample = q_w_title.iloc[15].question

In [16]:
# Demo: apply one randomly chosen character-level augmenter to the example question.
candidate_augmenters = [
    nac.KeyboardAug(),
    nac.RandomCharAug(action="insert"),
    nac.RandomCharAug(action="substitute"),
    nac.RandomCharAug(action="swap"),
    nac.RandomCharAug(action="delete"),
]
augs = choice(candidate_augmenters)
augmented_texts = augs.augment(sample, n=1)
print("Original:", sample, "Augmented Texts:", augmented_texts, sep="\n")

Original:
What is the rating of the movie `Cha Ma`?
Augmented Texts:
['Wath is the artign of the omive ` Cha Ma `?']


In [17]:
def augment_string(string):
    """Return a noisy copy of ``string`` produced by one random character augmenter.

    One of five nlpaug character-level augmenters (keyboard typos, random
    insert / substitute / swap / delete) is picked uniformly at random and
    applied once.

    Args:
        string: Text to corrupt.

    Returns:
        The augmented text. An empty (or ``None``) input is returned unchanged.
    """
    # Nothing to corrupt; also avoids indexing into an empty augmentation result.
    if not string:
        return string
    # (class, constructor kwargs) pairs, so only the chosen augmenter is instantiated.
    augmenter_specs = [
        (nac.KeyboardAug, {}),
        (nac.RandomCharAug, {"action": "insert"}),
        (nac.RandomCharAug, {"action": "substitute"}),
        (nac.RandomCharAug, {"action": "swap"}),
        (nac.RandomCharAug, {"action": "delete"}),
    ]
    augmenter_cls, augmenter_kwargs = choice(augmenter_specs)
    # Bug fix: augment the argument. The previous version augmented the global
    # `sample`, so every call returned a variant of the same demo question.
    return augmenter_cls(**augmenter_kwargs).augment(string, n=1)[0]

## Sentence Transformer Dataset

In [18]:
# Positive pairs: every (title, question) row that exists in the data gets label 1.
sbert_ds = (
    q_w_title[["originalTitle", "question"]]
    .reset_index(drop=True)
    .assign(label=1)
)

In [19]:
# Build negative pairs: keep each row's title but swap in a question taken from
# a randomly shuffled row. Two independent rounds are generated.
neg_samples = []
for _ in range(2):
    sbert_ds_neg = sbert_ds.copy()
    # Bundle each question with the title it really belongs to, so the true
    # title travels with the question through the shuffle.
    sbert_ds_neg["mapping"] = sbert_ds.apply(lambda row: (row["originalTitle"], row["question"]), axis=1)
    # `ignore_index=True` is what makes this a shuffle: the sampled Series is
    # relabelled 0..n-1, so the index-aligned assignment places values in their
    # shuffled order instead of realigning them to their original rows.
    sbert_ds_neg["mapping"] = sbert_ds_neg["mapping"].sample(frac=1, ignore_index=True)
    sbert_ds_neg["question"] = sbert_ds_neg["mapping"].apply(lambda mapping: mapping[1])
    # Label is 1 when the shuffled-in question still belongs to this row's title ...
    sbert_ds_neg["label"] = sbert_ds_neg.apply(lambda row: int(row["originalTitle"]==row["mapping"][0]), axis=1)
    # ... and those accidental positives are dropped, leaving only true negatives.
    sbert_ds_neg = sbert_ds_neg[sbert_ds_neg["label"]==0]
    neg_samples.append(sbert_ds_neg[["originalTitle", "question", "label"]])

In [20]:
# Combine the positive pairs with both rounds of negatives.
# Re-running this cell appends the negatives again.
sbert_ds = pd.concat([sbert_ds] + neg_samples, ignore_index=True)

In [21]:
# Shuffle all rows (positives and negatives together) and renumber the index.
sbert_ds = sbert_ds.sample(frac=1, ignore_index=True)

In [29]:
# Randomly pick 200 distinct titles to hold out of training; they are split
# between validation and test below.
unseen_titles = pd.Series(sbert_ds["originalTitle"].unique()).sample(200).to_list()

In [31]:
# In-place shuffle of the held-out titles (the list already comes from a random `sample`).
shuffle(unseen_titles)

In [32]:
# First 100 held-out titles go to validation, the remainder to test.
val_titles, test_titles = unseen_titles[:100], unseen_titles[100:]

In [34]:
# Split by the `originalTitle` column so held-out titles never appear as a
# title in training; each split is shuffled and re-indexed.
# NOTE(review): only the title column is filtered. Negative rows pair a title
# with a question from another movie, so training negatives may still carry
# questions about held-out titles -- confirm this is acceptable.
title_column = sbert_ds["originalTitle"]
train_ds = sbert_ds[~title_column.isin(unseen_titles)].sample(frac=1, ignore_index=True)
val_ds = sbert_ds[title_column.isin(val_titles)].sample(frac=1, ignore_index=True)
test_ds = sbert_ds[title_column.isin(test_titles)].sample(frac=1, ignore_index=True)

In [38]:
# Report (rows, columns) for the train, validation and test splits, in that order.
for split_frame in (train_ds, val_ds, test_ds):
    print(split_frame.shape)

(163098, 3)
(4032, 3)
(4007, 3)


In [40]:
# Randomly select 20% of the training rows; their questions are replaced with
# augmented text in the next cell.
noisy_samples = train_ds.sample(frac=.20)

In [41]:
# Overwrite the question of every selected training row with its augmented version.
noisy_questions = noisy_samples["question"]
for i, original_question in tqdm(noisy_questions.items(), total=len(noisy_samples)):
    train_ds.at[i, "question"] = augment_string(original_question)

100%|██████████████████████████████████████████████████████████████████████████| 32620/32620 [00:28<00:00, 1149.23it/s]


In [42]:
# Write the three splits and the full (unsplit) dataset to disk with a common column order.
export_columns = ["question", "originalTitle", "label"]
export_targets = (
    (train_ds, "datasets/movie_datasets/imdb/train_sbert_ds_v1.feather"),
    (val_ds, "datasets/movie_datasets/imdb/val_sbert_ds_v1.feather"),
    (test_ds, "datasets/movie_datasets/imdb/test_sbert_ds_v1.feather"),
    (sbert_ds, "datasets/movie_datasets/imdb/retreiver_ds_v1.feather"),
)
for export_frame, export_path in export_targets:
    export_frame[export_columns].to_feather(export_path)