In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from utils import augment_string
from random import shuffle
from tqdm import tqdm
tqdm.pandas()

In [None]:
cohere = pd.read_feather("datasets/movie_datasets/imdb/movies_qa_cohere.feather")
palm2 = pd.read_feather("datasets/movie_datasets/imdb/movie_qa.feather")
palm2_p2 = pd.read_feather("datasets/movie_datasets/imdb/movie_qa.feather")

In [None]:
print(cohere.columns)
print(palm2.columns)
print(palm2_p2.columns)

Index(['tconst', 'originalTitle', 'data', 'question', 'answer'], dtype='object')
Index(['tconst', 'originalTitle', 'data', 'question', 'answer'], dtype='object')
Index(['tconst', 'originalTitle', 'data', 'question', 'answer'], dtype='object')


In [None]:
complete_movie_qa = pd.concat([cohere, palm2, palm2_p2], axis=0, ignore_index=True)

In [None]:
complete_movie_qa.head()

Unnamed: 0,tconst,originalTitle,data,question,answer
0,tt10024036,Satyarthi,Description: Kailash Satyarthi has spent his l...,What is the rating of the movie Satyarthi?,8.3
1,tt10024036,Satyarthi,Description: Kailash Satyarthi has spent his l...,What is the release year of the movie Satyarthi?,2019
2,tt10024036,Satyarthi,Description: Kailash Satyarthi has spent his l...,What is the genre of the movie Satyarthi?,Documentary
3,tt10024036,Satyarthi,Description: Kailash Satyarthi has spent his l...,Who is the movie Satyarthi based on?,Kailash Satyarthi
4,tt10024036,Satyarthi,Description: Kailash Satyarthi has spent his l...,What is the vote count of the movie Satyarthi?,6.0


In [None]:
train_prompt="""
Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.
###Instruction: {question}
###Input: {description}
###Response: {answer}
""".strip()
test_prompt="""
Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.
###Instruction: {question}
###Input: {description}
###Response:
""".strip()

In [None]:
sample = complete_movie_qa.iloc[10]

In [None]:
train_prompt.format(question=sample["question"].strip(), description=sample["data"].strip(), answer=sample["answer"].strip())

'Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.\n###Instruction: What award was the movie Satyarthi nominated for?\n###Input: Description: Kailash Satyarthi has spent his lifetime rescuing kids from slavery, and was awarded the Nobel Peace Prize for his work in 2014. The film looks at his journey and goes behind the scenes to witness first hand how he has changed lives of thousands of children. \nRelease Year: 2019 \nRuntime(in minutes): 52 \nGenre: Documentary \nRating: 8.3 \nVotes: 6.0\n###Response: The Nobel Peace Prize'

In [None]:
complete_movie_qa = complete_movie_qa.sample(frac=1, ignore_index=True)

In [None]:
complete_movie_qa.head()

Unnamed: 0,tconst,originalTitle,data,question,answer
0,tt10177712,"Alexis Viera, una historia de superación","Description: Alexis Viera, una historia de sup...",Who is the director of the movie?,Santiago Perez Bernal
1,tt11736782,The Mill,Description: The 2020 deadline to close the Bo...,What is the name of the director of the movie?,Ken MacDonald
2,tt11078522,Psycho-Pass: Sinners of the System Case.3 - On...,Description: Following the incident in the Sou...,"How many votes does ""Psycho-Pass: Sinners of t...",819.0
3,tt10278386,Witches of the Water,Description: A young man falls in love with a ...,What is the genre of the movie?,"Drama, Horror"
4,tt28378149,Yami Douga 18,"Description: 18th installment in the ""Yami Dou...",In what year was the movie released?,2018


In [None]:
complete_movie_qa.shape

(73245, 5)

In [None]:
unseen_titles = pd.Series(complete_movie_qa["originalTitle"].unique()).sample(500).to_list()

In [None]:
shuffle(unseen_titles)

In [None]:
val_titles = unseen_titles[:250]
test_titles = unseen_titles[250:]

In [None]:
train_ds = complete_movie_qa[~(complete_movie_qa["originalTitle"].isin(unseen_titles))].sample(frac=1, ignore_index=True)
val_ds = complete_movie_qa[complete_movie_qa["originalTitle"].isin(val_titles)].sample(frac=1, ignore_index=True)
test_ds = complete_movie_qa[complete_movie_qa["originalTitle"].isin(test_titles)].sample(frac=1, ignore_index=True)

In [None]:
print(train_ds.shape)
print(val_ds.shape)
print(test_ds.shape)

(63921, 5)
(4564, 5)
(4760, 5)


In [None]:
noisy_samples = train_ds.sample(frac=.20)

In [None]:
for i, row in tqdm(noisy_samples.iterrows(), total=noisy_samples.shape[0]):
    train_ds.at[i, "question"] = augment_string(row["question"])

100%|██████████████████████████████████████████████████████████████████████████| 12784/12784 [00:11<00:00, 1065.70it/s]


In [None]:
train_ds["prompt"] = train_ds.progress_apply(lambda row: train_prompt.format(question=row["question"].strip(), description=row["data"].strip(), answer=row["answer"].strip()), axis=1)
val_ds["prompt"] = val_ds.progress_apply(lambda row: train_prompt.format(question=row["question"].strip(), description=row["data"].strip(), answer=row["answer"].strip()), axis=1)
test_ds["prompt"] = test_ds.progress_apply(lambda row: test_prompt.format(question=row["question"].strip(), description=row["data"].strip()), axis=1)

100%|█████████████████████████████████████████████████████████████████████████| 63921/63921 [00:00<00:00, 96227.45it/s]
100%|███████████████████████████████████████████████████████████████████████████| 4564/4564 [00:00<00:00, 84505.99it/s]
100%|██████████████████████████████████████████████████████████████████████████| 4760/4760 [00:00<00:00, 123470.22it/s]


In [None]:
train_ds.to_feather("datasets/movie_datasets/imdb/train_llm_ds_v1.feather")
val_ds.to_feather("datasets/movie_datasets/imdb/val_llm_ds_v1.feather")
test_ds.to_feather("datasets/movie_datasets/imdb/test_llm_ds_v1.feather")
complete_movie_qa.to_feather("datasets/movie_datasets/imdb/movies_ds_v1.feather")

# Reco and QA data

In [1]:
import pandas as pd
import numpy as np
from random import choice, shuffle, sample
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
tqdm.pandas()

In [2]:
model = SentenceTransformer("reco_output/sentence-transformers-all-MiniLM-L6-v2/")

In [3]:
movies_df = pd.read_csv("./datasets/movie_datasets/imdb/movie_complete_clean.csv")

In [4]:
movies_df.head()

Unnamed: 0,tconst,originalTitle,startYear,genres,runtimeMinutes,averageRating,numVotes
0,tt0000009,Miss Jerry,1894,Romance,45,5.3,207.0
1,tt0000147,The Corbett-Fitzsimmons Fight,1897,"Documentary,News,Sport",100,5.3,484.0
2,tt0000574,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",70,6.0,855.0
3,tt0000591,L'enfant prodigue,1907,Drama,90,5.0,21.0
4,tt0000615,Robbery Under Arms,1907,Drama,\N,4.3,25.0


In [5]:
def generate_embeddings(string, show_progress_bar=True):
    return model.encode(string, convert_to_tensor=True, normalize_embeddings=True, show_progress_bar=show_progress_bar)

In [6]:
search_text = lambda details: f'The movie {details["originalTitle"]} was released in the year {details["startYear"]} is a {details["genres"]} with a  runtime of {details["runtimeMinutes"]} minutes' 
movie_text = lambda details: f'{details["originalTitle"]}  ({details["startYear"]})' 

In [7]:
movies_df["search_text"] = movies_df.apply(search_text, axis=1) 
movies_df["movie_text"] = movies_df.apply(movie_text, axis=1) 

In [8]:
recommendation_array = np.load("./datasets/movie_datasets/imdb/recommendation_array.npy", allow_pickle=True)

In [9]:
recommendation_array[:3]

array([list(['tt2582802', 'tt1285016', 'tt1504320', 'tt0947798', 'tt2562232', 'tt2278388', 'tt3783958', 'tt5726616']),
       list(['tt1457767', 'tt3065204', 'tt5814060', 'tt3322940', 'tt5140878']),
       list(['tt0293849', 'tt0065359', 'tt5712474', 'tt2106565', 'tt8224882', 'tt2208192'])],
      dtype=object)

In [10]:
movie_and_recommendation = []
for recommendation in recommendation_array:
    data  = {}
    data["movie"] = choice(recommendation)
    recommendation.remove(data["movie"])
    data["recommendation"] = recommendation
    movie_and_recommendation.append(data)

In [11]:
tconst_movies = movies_df.set_index('tconst').copy()

In [12]:
# movies_reco_embeddings = generate_embeddings(movies_df["search_text"].to_list())

In [13]:
# np.save("datasets/movie_datasets/imdb/movie_reco_embeddings.npy", movies_reco_embeddings.detach().cpu().numpy())

In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [15]:
movies_reco_embeddings = np.load("datasets/movie_datasets/imdb/movie_reco_embeddings.npy")
movies_reco_embeddings = torch.from_numpy(movies_reco_embeddings).to(device)

In [16]:
shuffle(movie_and_recommendation)

In [18]:
train_rec = movie_and_recommendation[:15_000]
val_rec = movie_and_recommendation[15_000:17_000]
test_rec = movie_and_recommendation[17_000:]


In [40]:
def get_top_k_movies(anchor_tconst):
    anchor_movie_search_text = tconst_movies.loc[anchor_tconst]["search_text"]
    anchor_movie_emb = generate_embeddings(anchor_movie_search_text, show_progress_bar=False)
    top_k =  [movies_df["tconst"].iloc[score["corpus_id"]] for score in util.semantic_search(anchor_movie_emb, movies_reco_embeddings, top_k=25)[0]]
    try:
        top_k.remove(anchor_tconst)
    except ValueError:
        pass
    return top_k[:20]
        # print(movies_df["originalTitle"].iloc[score["corpus_id"]], movies_df["tconst"].iloc[score["corpus_id"]])

In [41]:
train_recommendation_prompt="""
Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.
###Instruction: Given the movie {movie_title}, recommend {count} similar movies from the input movies
###Input: {input_titles}
###Response: {recommended_movies}
""".strip()
test_recommendation_prompt="""
Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.
###Instruction: Given the movie in {movie_title}, recommend {count} similar movies for the input
###Input: {input_titles}
###Response:
""".strip()

In [42]:
def generate_prompt_from_tconst(movie_tconst, input_tconsts, reco_count, reco_tconsts=None):
    movie_name = tconst_movies["movie_text"].loc[movie_tconst]
    input_movie_names  = "\n".join([tconst_movies["movie_text"].loc[input_tconst] for input_tconst in input_tconsts])
    if reco_tconsts:
        reco_movie_names  = "\n".join([tconst_movies["movie_text"].loc[reco_tconst] for reco_tconst in reco_tconsts])
        return train_recommendation_prompt.format(movie_title=movie_name, count=reco_count, input_titles= input_movie_names, recommended_movies=reco_movie_names)
    else:
        return test_recommendation_prompt.format(movie_title=movie_name, count=reco_count, input_titles= input_movie_names)


In [43]:
def generate_prompt(dataset, is_test=False):
    for mr in  tqdm(dataset):
        try:
            reco_count = choice(range(1, len(mr["recommendation"])+1))
            reco_movies = sample(mr["recommendation"], reco_count)
            top_k_rec = get_top_k_movies(mr["movie"])
            dissimilar_movies = list(set(top_k_rec).symmetric_difference(reco_movies))
            top_k_list = sample(dissimilar_movies, 20 - len(reco_movies)) + reco_movies
            shuffle(top_k_list)
            if is_test:
                mr["prompt"] = generate_prompt_from_tconst(mr["movie"], top_k_list, reco_count)
            else:
                mr["prompt"] = generate_prompt_from_tconst(mr["movie"], top_k_list, reco_count, reco_movies)
        except Exception:
            pass
    return dataset

In [44]:
train_rec_ds = generate_prompt(train_rec)
val_rec_ds = generate_prompt(val_rec)

100%|██████████| 15000/15000 [07:00<00:00, 35.65it/s]
100%|██████████| 2000/2000 [00:55<00:00, 35.82it/s]


In [45]:
test_rec_ds = generate_prompt(test_rec, is_test=True)

100%|██████████| 2149/2149 [01:00<00:00, 35.60it/s]


In [46]:
print(train_rec_ds[0]["prompt"])

Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.
###Instruction: Given the movie Coraline  (2009), recommend 3 similar movies from the input movies
###Input: Despicable Me  (2010)
Ratatouille  (2007)
California Dreamers  (2013)
Far from Home: The Adventures of Yellow Dog  (1995)
Rock Dog  (2016)
Wonder Park  (2019)
Freddy's Dead: The Final Nightmare  (1991)
Louisa: An Amazing Adventure  (2021)
Inside Out  (2015)
Polar Adventure  (2015)
A Nightmare on Elm Street: The Dream Child  (1989)
The Frogville  (2014)
The Enchanted Mountain  (2008)
The Velveteen Rabbit  (2009)
A Nightmare on Elm Street: The Dream Child  (1989)
The Nightmare Emporium  (2017)
Kubo and the Two Strings  (2016)
Freddy's Dead: The Final Nightmare  (1991)
Star Paws  (2016)
Days of Solitude  (2018)
###Response: The Nightmare Emporium  (2017)
Freddy's Dead: The Final Nightmare  (1991)
A Nightmare on Elm Street: The

In [32]:
print(test_rec_ds[0]["prompt"])

Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.
###Instruction: Given the movie in Helmet  (2021), recommend 4 similar movies for the input
###Input: Alagogo Ide  (2021)
Mo jing  (2015)
Pouding chômeurs  (2015)
A sírásó  (2010)
Territory of Succes  (2017)
Helmet  (2021)
The Breadwinner  (2017)
Eumlangwaoe  (2016)
The Eceti Ranch with James Gilliland  (2015)
After the Murder of Albert Lima  (2019)
Chaco  (2020)
Tomoko: Mottomo kiken'na on'na  (2000)
The Shoplifters  (2019)
Guang  (2018)
Minari  (2020)
The Farewell  (2019)
The Breadwinner  (2017)
From Ashes  (2015)
Lunga  (2023)
Tengo Miedo Torero  (2020)
###Response:


In [98]:
# pd.DataFrame.from_dict(train_rec_ds).to_csv("./datasets/movie_datasets/imdb/train_rec_llm.csv", index=False)
# pd.DataFrame.from_dict(val_rec_ds).to_csv("./datasets/movie_datasets/imdb/val_rec_llm.csv", index=False)
# pd.DataFrame.from_dict(test_rec_ds).to_csv("./datasets/movie_datasets/imdb/test_rec_llm.csv", index=False)

In [47]:
from transformers import AutoTokenizer

In [48]:
tokenizer = AutoTokenizer.from_pretrained(
            "HuggingFaceH4/zephyr-7b-beta",
            token="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",
        )

In [55]:
train_rec_ds[0]

{'movie': 'tt0327597',
 'recommendation': ['tt8248878',
  'tt0107688',
  'tt0087800',
  'tt0089686',
  'tt0093629',
  'tt0095742',
  'tt0097981',
  'tt0101917',
  'tt0329101'],
 'prompt': "Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.\n###Instruction: Given the movie Coraline  (2009), recommend 3 similar movies from the input movies\n###Input: Despicable Me  (2010)\nRatatouille  (2007)\nCalifornia Dreamers  (2013)\nFar from Home: The Adventures of Yellow Dog  (1995)\nRock Dog  (2016)\nWonder Park  (2019)\nFreddy's Dead: The Final Nightmare  (1991)\nLouisa: An Amazing Adventure  (2021)\nInside Out  (2015)\nPolar Adventure  (2015)\nA Nightmare on Elm Street: The Dream Child  (1989)\nThe Frogville  (2014)\nThe Enchanted Mountain  (2008)\nThe Velveteen Rabbit  (2009)\nA Nightmare on Elm Street: The Dream Child  (1989)\nThe Nightmare Emporium  (2017)\nKubo and the Two Strings  (20

In [59]:
train_rec_ds[0]["prompt"]

"Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.\n###Instruction: Given the movie Coraline  (2009), recommend 3 similar movies from the input movies\n###Input: Despicable Me  (2010)\nRatatouille  (2007)\nCalifornia Dreamers  (2013)\nFar from Home: The Adventures of Yellow Dog  (1995)\nRock Dog  (2016)\nWonder Park  (2019)\nFreddy's Dead: The Final Nightmare  (1991)\nLouisa: An Amazing Adventure  (2021)\nInside Out  (2015)\nPolar Adventure  (2015)\nA Nightmare on Elm Street: The Dream Child  (1989)\nThe Frogville  (2014)\nThe Enchanted Mountain  (2008)\nThe Velveteen Rabbit  (2009)\nA Nightmare on Elm Street: The Dream Child  (1989)\nThe Nightmare Emporium  (2017)\nKubo and the Two Strings  (2016)\nFreddy's Dead: The Final Nightmare  (1991)\nStar Paws  (2016)\nDays of Solitude  (2018)\n###Response: The Nightmare Emporium  (2017)\nFreddy's Dead: The Final Nightmare  (1991)\nA Nig

In [60]:
max(map(len, tokenizer([record.get("prompt", "") for record in train_rec_ds])["input_ids"]))

815

In [61]:
qa_df = pd.read_csv("./datasets/movie_datasets/imdb/palm2_prompt3_clean_qa.csv")

In [62]:
qa_df.head()

Unnamed: 0,tconst,originalTitle,data,question,answer
0,tt14270944,"Da lontano, più forte","Description: Images, sounds, pictures from the...",Who are the voice-overs in the movie “Da lonta...,"The voice-overs in the movie “Da lontano, più ..."
1,tt14766668,Michelle Obama: Hope Becomes Change,Description: When Michelle Obama became First ...,In the movie Michelle Obama: Hope Becomes Chan...,In the movie Michelle Obama: Hope Becomes Chan...
2,tt10146204,Doberman,Description: A nap afternoon in the outskirts ...,Doberman is a 2019 drama that tells the story ...,How long is the 2019 drama movie Doberman?
3,tt10133680,Kasaai,Description: A mother struggles to get justice...,What is the setting of the movie Kasaai?,The setting of the movie Kasaai is a village i...
4,tt10740556,Kapit,Description: KAPIT (2019) is a psychological a...,What are the names of the 3 high school studen...,The 3 high school students who battle with the...


In [63]:
# qa_df["description"] = qa_df["data"].str.split("\n").apply(lambda x: x[0].replace("Description:", "")).str.strip().str.lower()

In [64]:
# qa_df[qa_df["description"]==qa_df["answer"].str.lower()]

In [65]:
# qa_df["description_unique_words"] = qa_df["description"].apply(lambda desc: set(map(lambda word: word.strip(), desc.split(" "))))
# qa_df["answer_unique_words"] = qa_df["answer"].apply(lambda ans: set(map(lambda word: word.strip(), ans.split(" "))))

In [66]:
# qa_df["overlap"] = qa_df.apply(lambda row: len(row["description_unique_words"].intersection(row["answer_unique_words"]))/len((row["description_unique_words"])), axis=1)

In [67]:
qa_df.shape

(11344, 5)

In [68]:
train_qa_prompt="""
Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.
###Instruction: {question}
###Input: {description}
###Response: {answer}
""".strip()
test_qa_prompt="""
Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.
###Instruction: {question}
###Input: {description}
###Response:
""".strip()

In [69]:
qa_df = qa_df.sample(frac=1).reset_index(drop=True)

In [70]:
train_qa = qa_df.iloc[:6000].reset_index(drop=True)
val_qa = qa_df.iloc[6000:8000].reset_index(drop=True)
test_qa = qa_df.iloc[8000:].reset_index(drop=True)

In [71]:
train_qa["prompt"] = train_qa.apply(lambda row: train_qa_prompt.format(question=row["question"], description=row["data"], answer=row["answer"]), axis=1)
val_qa["prompt"] = val_qa.apply(lambda row: train_qa_prompt.format(question=row["question"], description=row["data"], answer=row["answer"]), axis=1)
test_qa["prompt"] = train_qa.apply(lambda row: test_qa_prompt.format(question=row["question"], description=row["data"]), axis=1)

In [72]:
print(train_qa["prompt"].iloc[10])

Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.
###Instruction: Who are the runts of the litter in the movie Runts?
###Input: Description: Cyrus Price and his friends are the runts of the litter, and always have been. Bella (known as Bluebell to her friends) has an unpleasant past and most of her family is in jail, Avery's a killer dancer but can't catch a ball to save his life and is teased for it daily, Abcd'e (affectionately nicknamed Alphabet, even though it's ironic) is quite intelligent, but being severely dyslexic, she can't read a thing. Alexander's adopted and while he's got a great adoptive family with plenty of other adopted siblings, sometimes his anger at his biological parents threatens to take over. Cyrus has already got an alcoholic mom, but when his brother Ben suddenly can't be found one morning, his journal shoved under his bed reveals that he may have been k

In [73]:
print(test_qa["prompt"].iloc[10])

Below is a question regarding movies and shows paired with an input that provides further context. Write a response that appropriately completes the request.
###Instruction: Who are the runts of the litter in the movie Runts?
###Input: Description: Cyrus Price and his friends are the runts of the litter, and always have been. Bella (known as Bluebell to her friends) has an unpleasant past and most of her family is in jail, Avery's a killer dancer but can't catch a ball to save his life and is teased for it daily, Abcd'e (affectionately nicknamed Alphabet, even though it's ironic) is quite intelligent, but being severely dyslexic, she can't read a thing. Alexander's adopted and while he's got a great adoptive family with plenty of other adopted siblings, sometimes his anger at his biological parents threatens to take over. Cyrus has already got an alcoholic mom, but when his brother Ben suddenly can't be found one morning, his journal shoved under his bed reveals that he may have been k

In [75]:
train_ds = train_qa["prompt"].to_list() + [record.get("prompt", "") for record in train_rec_ds]
val_ds = val_qa["prompt"].to_list() + [record.get("prompt", "") for record in val_rec_ds]
test_ds = test_qa["prompt"].to_list() + [record.get("prompt", "") for record in test_rec_ds]

In [96]:
# train_qa.to_csv("./datasets/movie_datasets/imdb/train_qa.csv", index=False)
# val_qa.to_csv("./datasets/movie_datasets/imdb/val_qa.csv", index=False)
# test_qa.to_csv("./datasets/movie_datasets/imdb/test_qa.csv", index=False)

In [77]:
shuffle(train_ds)
shuffle(val_ds)
shuffle(test_ds)

In [78]:
train_ds_vsm = train_ds[:2000]
train_ds_sm = train_ds[:5000]
train_ds_md = train_ds[:10_000]
train_ds_lg = train_ds

In [87]:
pd.DataFrame({"prompt": train_ds_vsm}).to_csv("./datasets/movie_datasets/imdb/train_llm_ds_vsm_v2.csv", index=False)
pd.DataFrame({"prompt": train_ds_sm}).to_csv("./datasets/movie_datasets/imdb/train_llm_ds_sm_v2.csv", index=False)
pd.DataFrame({"prompt": train_ds_md}).to_csv("./datasets/movie_datasets/imdb/train_llm_ds_md_v2.csv", index=False)
pd.DataFrame({"prompt": train_ds_lg}).to_csv("./datasets/movie_datasets/imdb/train_llm_ds_lg_v2.csv", index=False)

In [88]:
len(val_ds)

4000

In [89]:
val_ds_vsm = val_ds[:500]
val_ds_sm = val_ds[:1000]
val_ds_md = val_ds[:2_000]
val_ds_lg = val_ds

In [90]:
pd.DataFrame({"prompt": val_ds_vsm}).to_csv("./datasets/movie_datasets/imdb/val_llm_ds_vsm_v2.csv", index=False)
pd.DataFrame({"prompt": val_ds_sm}).to_csv("./datasets/movie_datasets/imdb/val_llm_ds_sm_v2.csv", index=False)
pd.DataFrame({"prompt": val_ds_md}).to_csv("./datasets/movie_datasets/imdb/val_llm_ds_md_v2.csv", index=False)
pd.DataFrame({"prompt": val_ds_lg}).to_csv("./datasets/movie_datasets/imdb/val_llm_ds_lg_v2.csv", index=False)

In [91]:
len(test_ds)

5493

In [92]:
test_ds_vsm = test_ds[:500]
test_ds_sm = test_ds[:1000]
test_ds_md = test_ds[:2_000]
test_ds_lg = test_ds

In [93]:
pd.DataFrame({"prompt": test_ds_vsm}).to_csv("./datasets/movie_datasets/imdb/test_llm_ds_vsm_v2.csv", index=False)
pd.DataFrame({"prompt": test_ds_sm}).to_csv("./datasets/movie_datasets/imdb/test_llm_ds_sm_v2.csv", index=False)
pd.DataFrame({"prompt": test_ds_md}).to_csv("./datasets/movie_datasets/imdb/test_llm_ds_md_v2.csv", index=False)
pd.DataFrame({"prompt": test_ds_lg}).to_csv("./datasets/movie_datasets/imdb/test_llm_ds_lg_v2.csv", index=False)