In [None]:
# %pip install openai plotly

In [1]:
import os
from openai import AzureOpenAI

from recommenders.datasets import movielens

In [2]:
# Load the MovieLens ratings into a pandas DataFrame, asking the loader to
# also include each movie's title and genres as columns.
df = movielens.load_pandas_df(
    title_col='title',
    genres_col='genres',
)
df.head()

100%|██████████| 4.81k/4.81k [00:00<00:00, 6.91kKB/s]


Unnamed: 0,userID,itemID,rating,timestamp,title,genres
0,196,242,3.0,881250949,Kolya (1996),Comedy
1,63,242,3.0,875747190,Kolya (1996),Comedy
2,226,242,5.0,883888671,Kolya (1996),Comedy
3,154,242,3.0,879138235,Kolya (1996),Comedy
4,306,242,5.0,876503793,Kolya (1996),Comedy


In [3]:
def clean_title(title):
    """Move a trailing article to the front: 'Title, The (year)' -> 'The Title (year)'.

    Only the article "The" is handled, and only when it is directly followed
    by " (" (normally the opening of the release year). Everything that came
    after the article is kept verbatim, so the result does not depend on the
    title ending in exactly "(YYYY)": alternate titles in parentheses and
    trailing whitespace are preserved rather than being cut off.

    Args:
        title: Raw movie title string.

    Returns:
        The title with the article moved to the front, or the input unchanged
        when the ", The (" pattern is absent (or sits at position 0, i.e.
        there is no title text in front of it).
    """
    marker = ", The"
    i = title.find(marker + " (")
    if i > 0:
        # title[:i] is the name without the article; the slice after the
        # marker starts at the space preceding the opening parenthesis.
        title = f"The {title[:i]}{title[i + len(marker):]}"
    return title

In [30]:
# Normalize every title (this overwrites the df['title'] column), then list the
# distinct titles to check the result.
df['title'] = df['title'].apply(clean_title)
df['title'].unique()

array(['Kolya (1996)', 'L.A. Confidential (1997)', 'Heavyweights (1994)',
       ..., 'Girls Town (1996)', 'The Silence of the Palace (1994)',
       'Dadetown (1995)'], dtype=object)

In [14]:
from typing import Union, List
from tqdm.auto import tqdm

class OpenAIModel:
    """Callable wrapper around an Azure OpenAI embedding model deployment.

    Calling an instance with a single string returns one embedding; calling it
    with a list of strings returns a list of embeddings in the same order as
    the inputs. Inputs are sent to the service in batches of at most
    ``batch_size`` items.
    """

    def __init__(self, deployment_name, batch_size=20):
        """Create the client for an embedding deployment.

        The API key and endpoint are read from the environment variables
        OPENAI_API_KEY and OPENAI_API_ENDPOINT respectively.

        Args:
            deployment_name: Name of the OpenAI embedding model deployment.
            batch_size: Maximum number of inputs sent per request. Defaults to
                20, the limit cited at
                https://learn.microsoft.com/en-us/answers/questions/1334800/batching-requests-in-azure-openai

        Raises:
            ValueError: If ``batch_size`` is not a positive integer.
        """
        if not isinstance(batch_size, int) or batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
        self.model = deployment_name  # OpenAI embedding model deployment name
        self.batch_size = batch_size
        self.client = AzureOpenAI(
            api_key=os.getenv("OPENAI_API_KEY"),
            api_version="2023-05-15",
            azure_endpoint=os.getenv("OPENAI_API_ENDPOINT"),
        )

    def __call__(self, inputs: Union[str, List[str]]):
        """Embed one string or a list of strings.

        Args:
            inputs: A single string, or a list of strings.

        Returns:
            A single embedding when ``inputs`` is a string; otherwise a list
            with one embedding per input, in input order. An empty list yields
            an empty list without contacting the service.
        """
        is_str = isinstance(inputs, str)
        if is_str:
            inputs = [inputs]

        res = []
        # Stepping by batch_size yields exactly ceil(len / batch_size) batches,
        # so no empty batch is sent when the number of inputs is an exact
        # multiple of batch_size (or zero).
        for i_start in tqdm(range(0, len(inputs), self.batch_size)):
            batch = inputs[i_start:i_start + self.batch_size]
            response = self.client.embeddings.create(
                model=self.model,
                input=batch,
            )
            res.extend(data.embedding for data in response.data)

        return res[0] if is_str else res

    def __repr__(self):
        return f"OpenAIModel(model={self.model})"

In [15]:
# The embedding client reads its credentials from the environment: set
# OPENAI_API_KEY to your API key and OPENAI_API_ENDPOINT to the Azure OpenAI
# endpoint before running this cell.
# "jun-openai-ada" is the name of an embedding model deployment; replace it
# with the name of your own deployment.
embedding_model = OpenAIModel("jun-openai-ada")

In [16]:
import numpy as np
import pandas as pd


def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two
    Euclidean norms.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product


def search_docs(user_query_embedding, df, embedding_col, top_n=4):
    """Return the ``top_n`` rows of ``df`` most similar to a query embedding.

    Args:
        user_query_embedding: Embedding vector of the query.
        df: DataFrame holding one embedding per row in ``embedding_col``.
        embedding_col: Name of the column that contains the embeddings.
        top_n: Number of best-matching rows to return.

    Returns:
        A new DataFrame with the ``top_n`` highest-scoring rows, sorted by
        descending cosine similarity, with the score stored in a
        "similarities" column. ``df`` itself is left unmodified.
    """
    similarities = df[embedding_col].apply(
        lambda x: cosine_similarity(x, user_query_embedding)
    )
    # assign() returns a copy, so the caller's frame does not silently gain
    # (or have overwritten) a "similarities" column on every search.
    return (
        df.assign(similarities=similarities)
        .sort_values("similarities", ascending=False)
        .head(top_n)
    )

In [17]:
# One row per distinct title, re-indexed from zero.
title_embeddings_df = df[['title']].drop_duplicates().reset_index(drop=True)
# Embed every title and store the vectors alongside the titles.
title_embeddings_df['title_embedding'] = embedding_model(title_embeddings_df['title'].tolist())

title_embeddings_df.head()

  0%|          | 0/84 [00:00<?, ?it/s]

Unnamed: 0,title,title_embedding
0,Kolya (1996),"[0.0032892634626477957, -0.022517412900924683,..."
1,L.A. Confidential (1997),"[0.015241828747093678, -0.013112935237586498, ..."
2,Heavyweights (1994),"[-0.005445745307952166, -0.047639526426792145,..."
3,Legends of the Fall (1994),"[-0.0030133079271763563, -0.03167612478137016,..."
4,Jackie Brown (1997),"[-0.006625025998800993, -0.038863442838191986,..."


In [18]:
# Persist the titles and their embeddings to a parquet file so that they can
# be reloaded and reused later.
title_embeddings_df.to_parquet('title_embeddings.parquet')

In [46]:
# Reload the title embeddings from 'title_embeddings.parquet'.
title_embeddings_df = pd.read_parquet('title_embeddings.parquet')

In [47]:
# Attach each title's genres so that the plot can be colored by genre.
# The left join keeps every row of title_embeddings_df.
# NOTE(review): if a title appears in df with more than one distinct genres
# string, this merge yields one row per (title, genres) pair — confirm that
# titles map to a single genres value.
title_embeddings_df = title_embeddings_df.merge(
    df[['title', 'genres']].drop_duplicates(),
    on='title',
    how='left',
)

In [52]:
# Use t-SNE and plotly to visualize the embeddings in two dimensions.
import plotly.express as px
from sklearn.manifold import TSNE

# Stack the per-title embedding vectors into one 2-D array (one row per title).
embeddings = np.array(title_embeddings_df['title_embedding'].tolist())
# `init` and `learning_rate` are pinned to the defaults that were in effect
# when this notebook was run. scikit-learn warns that both defaults change in
# 1.2, so leaving them implicit would make the projection depend on the
# installed version.
embeddings_2d = TSNE(
    n_components=2,
    init='random',
    learning_rate=200.0,
    random_state=0,
).fit_transform(embeddings)

px.scatter(
    embeddings_2d, x=0, y=1, hover_name=title_embeddings_df['title'], color=title_embeddings_df['genres'],
    title="Movie Title Embeddings",
)



The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [53]:
# Embed a free-text query and retrieve the three titles whose embeddings are
# most similar to it.
query = "Movies about childhood."

res = search_docs(embedding_model(query), title_embeddings_df, 'title_embedding', top_n=3)

  0%|          | 0/1 [00:00<?, ?it/s]

In [54]:
# Display the best-matching titles together with their similarity scores.
res

Unnamed: 0,title,title_embedding,genres,similarities
1219,The Little Princess (1939),"[-0.007742746267467737, -0.05224580690264702, ...",Children's|Drama,0.845682
730,Kids (1995),"[0.01046755351126194, -0.01780427061021328, -0...",Drama,0.845399
1244,Little Lord Fauntleroy (1936),"[-0.00991661474108696, -0.023266395553946495, ...",Drama,0.839722
