In [None]:
!pip install milvus_model
!pip install pymilvus

In [2]:
import kagglehub
import pandas as pd
from pymilvus import model, MilvusClient
from numpy import dot
from numpy.linalg import norm
from multiprocessing import Pool
import os
from functools import partial
from random import sample
import numpy as np
from openai import OpenAI
from tqdm.auto import tqdm

# Data:

In [3]:
# Fetch the Wikipedia movie-plots dataset from Kaggle; `path` is the local
# directory that kagglehub reports for the downloaded files.
DATASET_HANDLE = "jrobischon/wikipedia-movie-plots"
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/jrobischon/wikipedia-movie-plots/versions/1


In [4]:
# Examine the data.
# Build the CSV location from the directory returned by the download step
# instead of a machine-specific absolute path, so the cell also works outside
# the original environment and for other dataset versions.
csv_path = os.path.join(path, 'wiki_movie_plots_deduped.csv')
df = pd.read_csv(csv_path)
print("length of the data: ",len(df))
df.head()

length of the data:  34886


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [5]:
# Keep only the first 300 movies and, for simplicity, only their title and plot.
# Each entry becomes one text: "movie title: <Title> movie plot: <Plot>".
first_movies = df.iloc[0:300]
title_part = "movie title: " + first_movies["Title"]
plot_part = "movie plot: " + first_movies["Plot"]
dataset = title_part + " " + plot_part
print(dataset)

0      movie title: Kansas Saloon Smashers movie plot...
1      movie title: Love by the Light of the Moon mov...
2      movie title: The Martyred Presidents movie plo...
3      movie title: Terrible Teddy, the Grizzly King ...
4      movie title: Jack and the Beanstalk movie plot...
                             ...                        
295    movie title: A Society Exile movie plot: Based...
296    movie title: Soldiers of Fortune movie plot: R...
297    movie title: Sunnyside movie plot: Charlie wor...
298    movie title: The Test of Honor movie plot: Mar...
299    movie title: True Heart Susie movie plot: As d...
Length: 300, dtype: object


# Model:

In [6]:
# Embedding model and chat client.
# Read the API key from the OPENAI_API_KEY environment variable so that the
# secret is never saved in the notebook; the placeholder is only a fallback
# that keeps the original "edit this line" workflow working.
openai_key = os.environ.get("OPENAI_API_KEY", 'replace with yours')
ef = model.dense.OpenAIEmbeddingFunction(
   model_name="text-embedding-3-small",
   api_key= openai_key)
client = OpenAI(api_key=openai_key)
# Role names / message field name used when building chat messages.
USER_STR = "user"
SYSTEM_STR = "system"
MSG_STR = "content"


In [7]:
def prompt2embedding(data, model):
  """Embed every item of ``data`` with ``model``, one call per item.

  Items are visited in iteration order rather than looked up as ``data[i]``,
  so a pandas Series whose index does not run 0..n-1 (e.g. a slice taken from
  the middle of a DataFrame) is handled as well as a plain list.

  Args:
    data: iterable of texts (list, pandas Series, ...).
    model: object exposing ``encode_documents(list_of_texts)``.

  Returns:
    A list with one entry per item; each entry is whatever
    ``model.encode_documents`` returned for the single-item batch ``[item]``.
  """
  return [model.encode_documents([text]) for text in data]

# Embed every movie description. Each per-item result comes from a one-element
# batch, so the stacked array carries a length-1 axis at position 1 that is
# squeezed away to leave one row per movie.
embeddings = np.array(prompt2embedding(dataset, ef)).squeeze(1)
embeddings.shape

(300, 1536)

In [10]:
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    If either vector has zero norm the similarity is undefined; 0.0 is
    returned instead of NaN, because NaN scores are sorted last by
    ``np.argsort`` and would therefore be picked as the "best" matches.
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        return 0.0
    return dot(a, b) / denominator

def get_chat_history(shots, dataset, dataset_embeddings, message, model):
    """Build the chat context: the system prompt plus the ``shots`` most similar movies.

    The user message is embedded with ``model`` and compared (cosine
    similarity) against every row of ``dataset_embeddings``. The texts of the
    best-scoring rows are appended, best first, as extra system messages.

    The similarity is computed with one vectorised numpy expression instead of
    a ``multiprocessing.Pool``: starting worker processes for a few hundred
    dot products is far slower than doing the work in-process, and pools do
    not work reliably with functions defined in an interactive session.

    Args:
        shots: number of retrieved movies to include; ``<= 0`` returns only
            the system prompt (no embedding call is made).
        dataset: texts aligned positionally with ``dataset_embeddings``
            (list or pandas Series; a Series is read by position).
        dataset_embeddings: 2-D array-like, one embedding per row.
        message: the user's request text.
        model: object exposing ``encode_documents(list_of_texts)``.

    Returns:
        list of ``{"role": ..., "content": ...}`` message dicts.
    """
    chat_history = [{
        "role": "system",
        "content": """You are a content-based recommendation system. Your task is to process a short text description of a user's movie preferences
                  and compare it against a dataset of movies (each with a title and a plot summary or keywords). Based on this comparison, you will
                  return the top 3–5 movies that best match the user's input. return only the title of the movie"""
    }]
    if shots <= 0:
        return chat_history

    doc_matrix = np.asarray(dataset_embeddings, dtype=float)
    if doc_matrix.size == 0:
        # Nothing to retrieve from.
        return chat_history

    # Convert the user prompt to a single 1-D embedding vector.
    query_vec = np.asarray(model.encode_documents([message])[0], dtype=float).ravel()

    # Cosine similarity of the prompt against every dataset row; rows (or a
    # prompt) with zero norm keep a score of 0 instead of producing NaN.
    denominators = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vec)
    scores = np.zeros(doc_matrix.shape[0])
    np.divide(doc_matrix @ query_vec, denominators, out=scores, where=denominators > 0)

    # Indices of the `shots` highest scores, best first.
    top_indices = np.argsort(scores)[-shots:][::-1]

    # Positional access, so a Series with a non-default index still lines up
    # with the rows of `dataset_embeddings`.
    rows = dataset.iloc if hasattr(dataset, "iloc") else dataset

    # Append the selected shots as additional system messages.
    for idx in top_indices:
        chat_history.append({
            "role": "system",
            "content": f"{rows[idx]}"
        })

    return chat_history

In [11]:
# Preview the context that would be sent for a sample request (2 retrieved movies).
sample_request = "I love thrilling action movies set in space, with a comedic twist."
get_chat_history(shots=2, dataset=dataset, dataset_embeddings=embeddings, message=sample_request, model=ef)

[{'role': 'system',
  'content': "You are a content-based recommendation system. Your task is to process a short text description of a user's movie preferences \n                  and compare it against a dataset of movies (each with a title and a plot summary or keywords). Based on this comparison, you will \n                  return the top 3–5 movies that best match the user's input. return only the title of the movie"},
 {'role': 'system',
  'content': 'movie title: Love, Loot and Crash movie plot: Dora and her father are lost in the kitchen (they have just fired their cook). An ad for new one in the newspaper attracts two crooks (one of which is Fritz Schade). He dresses like a woman to apply for the job. At his first opportunity he plans to loot the house, but just then, a cop on the beat stops in for coffee. Fritz locks the cop in the basement, picks up what things of value he can and escapes. He and his pal drive off in a Model T. Along the way Dora is kidnapped, the Keystone C

In [12]:
def call_api_openai(shots, example, max_retries=5, backoff_seconds=1.0):
    """Ask the chat model for recommendations for the request ``example``.

    Builds the context with ``get_chat_history`` (using the notebook-level
    ``dataset``, ``embeddings`` and ``ef``), appends the user message and
    sends everything to ``gpt-4o-mini`` through the notebook-level ``client``.

    Failures are retried a bounded number of times with exponential backoff.
    The previous unbounded loop never terminated on permanent errors such as
    an invalid API key, and retried without any pause.

    Args:
        shots: number of retrieved movies to put into the context.
        example: the user's request text.
        max_retries: maximum number of attempts (at least one is made).
        backoff_seconds: delay before the second attempt; doubled after
            every further failure.

    Returns:
        The text content of the first completion choice.

    Raises:
        RuntimeError: if every attempt failed; chained to the last error.
    """
    import time  # local import: only needed for the retry delay

    attempts = max(1, int(max_retries))
    message = {'role': USER_STR, 'content': example}
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            chat_history = get_chat_history(shots, dataset, embeddings, example, ef)
            chat_history.append(message)
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                temperature=0.0,
                messages=chat_history
            )
            return response.choices[0].message.content
        except Exception as err:
            last_error = err
            tqdm.write(f"Caught exception: {err}")
            if attempt < attempts:
                time.sleep(backoff_seconds * 2 ** (attempt - 1))
    raise RuntimeError(f"OpenAI request failed after {attempts} attempts") from last_error

In [14]:
# End-to-end run: retrieve the 10 most similar movies as context and ask the
# chat model for its recommendations.
text = "I love thrilling action movies set in space, with a comedic twist."
call_api_openai(shots=10, example=text)

'1. A Modern Musketeer\n2. Love, Loot and Crash\n3. Caught in the Rain'