In [6]:
import pandas as pd
import numpy as np

# Precomputed embedding matrix loaded from disk; find_recommendation_emb below
# compares user prompts against it as the "short plot summary" embeddings.
plot_embeddings = np.load('../raw_data/embedding_summary.npy')

In [33]:
# Movie table. Later cells read its 'title', 'plot_synopsis' and 'gen_summary'
# columns and add preprocessed text columns to it.
df = pd.read_csv('../raw_data/movie_with_summary.csv')

In [8]:
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Tokenizer / encoder pair used by the embedding-based functions below to turn
# a user prompt into a single vector.
model_name = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [9]:
# Precomputed embedding matrix for the full plot synopses; consumed by
# find_recommendation_emb_full and by the nearest-neighbour cells further down.
# NOTE(review): presumably row i corresponds to row i of df — confirm.
full_plot_embeddings = np.load('../raw_data/embeddings_plot.npy')

In [57]:
# Inspect the movie table.
# NOTE(review): the recorded output lists plot_synopsis_pre / plot_synopsis_str /
# gen_summary_pre / summary_str, which are assigned in later cells — this cell
# appears to have been executed after them (out-of-order execution).
df

Unnamed: 0.1,Unnamed: 0,title,plot_synopsis,gen_summary,plot_synopsis_pre,plot_synopsis_str,gen_summary_pre,summary_str
0,0,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","""Mr. Holland's Opus"" follows the life of a ded...","[glenn, holland, morning, person, anyones, sta...",glenn holland morning person anyones standard ...,"[mr, holland, opus, follows, life, dedicated, ...",mr holland opus follows life dedicated music t...
1,1,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","""Scarface"" follows Cuban immigrant Tony Montan...","[may, cuban, man, named, tony, montana, al, pa...",may cuban man named tony montana al pacino cla...,"[scarface, follows, cuban, immigrant, tony, mo...",scarface follows cuban immigrant tony montana ...
2,5,Flightplan,Kyle Pratt (Jodie Foster) is a propulsion engi...,"After her husband's sudden death, a grieving w...","[kyle, pratt, jodie, foster, propulsion, engin...",kyle pratt jodie foster propulsion engineer ba...,"[husband, sudden, death, grieving, widow, boar...",husband sudden death grieving widow board tran...
3,6,Little Caesar,Small-time Italian-American criminals Caesar E...,"""Little Caesar"" follows the rise and fall of a...","[smalltime, italianamerican, criminal, caesar,...",smalltime italianamerican criminal caesar enri...,"[little, caesar, follows, rise, fall, ambitiou...",little caesar follows rise fall ambitious gang...
4,7,Savages,The movie begins with a video being shot of me...,Two marijuana entrepreneurs in California are ...,"[movie, begin, video, shot, men, hand, tied, b...",movie begin video shot men hand tied behind ba...,"[two, marijuana, entrepreneur, california, for...",two marijuana entrepreneur california forced g...
...,...,...,...,...,...,...,...,...
6199,7969,Thunderheart,"During the early 1970s, FBI agent Ray Levoi is...",A young FBI agent is sent to investigate a mur...,"[early, fbi, agent, ray, levoi, assigned, aid,...",early fbi agent ray levoi assigned aid investi...,"[young, fbi, agent, sent, investigate, murder,...",young fbi agent sent investigate murder native...
6200,7970,One Night of Love,Opera singer Mary Barrett (Grace Moore) leaves...,"A young opera singer, Mary, falls in love with...","[opera, singer, mary, barrett, grace, moore, l...",opera singer mary barrett grace moore leaf stu...,"[young, opera, singer, mary, fall, love, vocal...",young opera singer mary fall love vocal coach ...
6201,7971,One False Move,"Three criminals, Ray, Pluto and Fantasia (Ray'...","In the film ""One False Move,"" a trio of crimin...","[three, criminal, ray, pluto, fantasia, ray, g...",three criminal ray pluto fantasia ray girlfrie...,"[film, one, false, move, trio, criminal, run, ...",film one false move trio criminal run law head...
6202,7972,Lucky Numbers,"In 1988 Russ Richards (John Travolta), the wea...","""Lucky Numbers"" follows the story of a TV weat...","[rus, richards, john, travolta, weatherman, ha...",rus richards john travolta weatherman harrisbu...,"[lucky, number, follows, story, tv, weatherman...",lucky number follows story tv weatherman consp...


In [10]:
# Uses the full embedded plot synopses and a BERT-embedded user prompt.
def find_recommendation_emb_full(text, top_n=50):
    """Rank movies by cosine similarity between a prompt and the full-synopsis embeddings.

    The prompt is tokenized, run through the module-level ``model`` and
    mean-pooled over the token axis into one vector, which is then compared
    against every row of the module-level ``full_plot_embeddings``.

    Args:
        text: Free-text user prompt.
        top_n: Number of rows to return. Defaults to 50, which is what the
            previous implementation returned (despite its "top 20" naming).

    Returns:
        DataFrame with columns ``title`` and ``similarity``, sorted by
        descending similarity and keeping the row index of ``df``.

    NOTE(review): assumes row i of ``full_plot_embeddings`` describes row i of
    ``df`` — confirm against the code that produced the .npy file.
    """
    # Truncate to the encoder's position limit so long prompts cannot overflow
    # the position embeddings.
    user_token = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        user_outputs = model(**user_token)
    # Mean over the token axis -> one vector for the whole prompt.
    user_embedded = user_outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    cos_similarities_emb = cosine_similarity([user_embedded], full_plot_embeddings).flatten()
    similar_movies_emb = pd.DataFrame({'title': df['title'], 'similarity': cos_similarities_emb})
    similar_movies_emb = similar_movies_emb.sort_values(by='similarity', ascending=False)
    return similar_movies_emb.head(top_n)[['title', 'similarity']]

In [11]:
# Uses the embedded short plot summaries and a BERT-embedded user prompt.
def find_recommendation_emb(text, top_n=50):
    """Rank movies by cosine similarity between a prompt and the short-summary embeddings.

    The prompt is tokenized, run through the module-level ``model`` and
    mean-pooled over the token axis into one vector, which is then compared
    against every row of the module-level ``plot_embeddings``.

    Args:
        text: Free-text user prompt.
        top_n: Number of rows to return. Defaults to 50, which is what the
            previous implementation returned (despite its "top 20" naming).

    Returns:
        DataFrame with columns ``title`` and ``similarity``, sorted by
        descending similarity and keeping the row index of ``df``.

    NOTE(review): assumes row i of ``plot_embeddings`` describes row i of
    ``df`` — confirm against the code that produced the .npy file.
    """
    # Truncate to the encoder's position limit so long prompts cannot overflow
    # the position embeddings.
    user_token = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        user_outputs = model(**user_token)
    # Mean over the token axis -> one vector for the whole prompt.
    user_embedded = user_outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    cos_similarities_emb = cosine_similarity([user_embedded], plot_embeddings).flatten()
    similar_movies_emb = pd.DataFrame({'title': df['title'], 'similarity': cos_similarities_emb})
    similar_movies_emb = similar_movies_emb.sort_values(by='similarity', ascending=False)
    return similar_movies_emb.head(top_n)[['title', 'similarity']]

In [12]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared by preprocess(): built once so they are not re-created per document.
# NOTE(review): presumably requires the NLTK 'stopwords' and 'wordnet' corpora
# to be downloaded beforehand — confirm in the environment setup.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalise raw text into a list of lemmatized, stop-word-free tokens.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace, tokenize, discard tokens found in ``stop_words``, lemmatize
    what remains.

    Args:
        text: Raw string to clean.

    Returns:
        List of lemmatized tokens in their original order.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in stop_words
    ]

In [None]:
# For full plot synopsis

In [35]:
# Apply preprocess() to the full plot synopses; each cell becomes a list of tokens.
df['plot_synopsis_pre'] = df['plot_synopsis'].apply(preprocess)

In [36]:
# Join each token list back into one space-separated string, the input format
# the TF-IDF vectorizer below is fitted on.
df['plot_synopsis_str'] = df['plot_synopsis_pre'].apply(lambda x: ' '.join(map(str, x)))

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# Fit TF-IDF on the preprocessed full synopses; tf_idf_matrix has one row per movie
# and is what find_recommendation compares user prompts against.
tf_idf_vectorizer = TfidfVectorizer(stop_words='english')
tf_idf_matrix = tf_idf_vectorizer.fit_transform(df['plot_synopsis_str'])

In [41]:
# Uses the full plot synopsis TF-IDF matrix and a TF-IDF-vectorized prompt.
def find_recommendation(text, top_n=20):
    """Rank movies by TF-IDF cosine similarity between a prompt and the full synopses.

    Args:
        text: Free-text user prompt.
        top_n: Number of rows to return (default 20).

    Returns:
        DataFrame with columns ``title`` and ``similarity``, sorted by
        descending similarity and keeping the row index of ``df``.

    NOTE(review): the vectorizer was fitted on text that went through
    ``preprocess`` (lemmatized), but the prompt is vectorized as-is, so
    inflected words in the prompt may miss their lemmatized vocabulary entry.
    Left unchanged so recorded scores stay comparable — confirm intent.
    """
    # Vectorise user input with the vocabulary learned from the synopses.
    user_tf_idf_vector = tf_idf_vectorizer.transform([text])
    # Similarity of the prompt to every movie row.
    cos_similarities = cosine_similarity(user_tf_idf_vector, tf_idf_matrix).flatten()
    similar_movies = pd.DataFrame({'title': df['title'], 'similarity': cos_similarities})
    similar_movies = similar_movies.sort_values(by='similarity', ascending=False)
    return similar_movies.head(top_n)[['title', 'similarity']]

In [27]:
# For short summary

In [37]:
# Apply preprocess() to the generated short summaries; each cell becomes a list of tokens.
df['gen_summary_pre'] = df['gen_summary'].apply(preprocess)

In [38]:
# Join each token list back into one space-separated string for TF-IDF fitting.
df['summary_str'] = df['gen_summary_pre'].apply(lambda x: ' '.join(map(str, x)))

In [42]:
# Separate TF-IDF model fitted on the preprocessed short summaries; used by
# find_recommendation_short.
tf_idf_vectorizer_new = TfidfVectorizer(stop_words='english')
tf_idf_matrix_new = tf_idf_vectorizer_new.fit_transform(df['summary_str'])

In [44]:
def find_recommendation_short(text, top_n=20):
    """Rank movies by TF-IDF cosine similarity between a prompt and the short summaries.

    Args:
        text: Free-text user prompt.
        top_n: Number of rows to return (default 20).

    Returns:
        DataFrame with columns ``title`` and ``similarity``, sorted by
        descending similarity and keeping the row index of ``df``.

    NOTE(review): the vectorizer was fitted on text that went through
    ``preprocess`` (lemmatized), but the prompt is vectorized as-is, so
    inflected words in the prompt may miss their lemmatized vocabulary entry.
    Left unchanged so recorded scores stay comparable — confirm intent.
    """
    # Vectorise user input with the vocabulary learned from the short summaries.
    user_tf_idf_vector = tf_idf_vectorizer_new.transform([text])
    # Similarity of the prompt to every movie row.
    cos_similarities = cosine_similarity(user_tf_idf_vector, tf_idf_matrix_new).flatten()
    similar_movies = pd.DataFrame({'title': df['title'], 'similarity': cos_similarities})
    similar_movies = similar_movies.sort_values(by='similarity', ascending=False)
    return similar_movies.head(top_n)[['title', 'similarity']]

In [21]:
# Held-out sample for evaluate_model, which reads its 'gen_summary' and 'title' columns.
df_test = pd.read_csv('../raw_data/testing_sample.csv')

In [51]:
def evaluate_model(df, model, max_rank=50, verbose=True):
    """Score a recommender by how high it ranks each row's own title.

    For every row of ``df`` the row's ``gen_summary`` is passed to ``model``.
    If the row's ``title`` appears within the first ``max_rank`` returned
    titles at zero-based position ``p``, the row earns ``max_rank - p``
    points; otherwise it earns 0. The result is the sum of points divided by
    the maximum achievable (``max_rank * len(df)``).

    Args:
        df: Frame with ``gen_summary`` and ``title`` columns.
        model: Callable taking a string and returning a DataFrame with a
            ``title`` column ordered best-first.
        max_rank: Number of top positions that can earn points (default 50,
            the previously hard-coded value). Rows beyond it are ignored so a
            longer result list can never contribute negative points.
        verbose: When True (default), print the found position or a
            not-found message per row, as before.

    Returns:
        Score in [0, 1]; 0.0 when ``df`` has no rows.
    """
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    total_points = 0
    max_points = max_rank * n_rows
    for _, row in df.iterrows():
        title = row['title']
        recommendations = model(row['gen_summary'])
        # Work on a plain list so the model's returned frame is not mutated.
        if recommendations.empty:
            ranked_titles = []
        else:
            ranked_titles = recommendations['title'].head(max_rank).tolist()

        if title in ranked_titles:
            position = ranked_titles.index(title)  # first occurrence wins
            total_points += max_rank - position
            if verbose:
                print(position)
        elif verbose:
            print(f"Title '{title}' not found in recommendations.")
    return total_points / max_points

In [25]:
# Score the full-synopsis TF-IDF recommender and both embedding recommenders on
# the test sample (evaluate_model prints a line per test row while it runs).
tf_score = evaluate_model(df_test,find_recommendation)
emb_full_score = evaluate_model(df_test,find_recommendation_emb_full)
emb_short_score = evaluate_model(df_test,find_recommendation_emb)

In [47]:
# Score the short-summary TF-IDF recommender on the same test sample.
tf_score_short = evaluate_model(df_test, find_recommendation_short)

In [49]:
# Side-by-side summary of the four evaluation scores computed in the cells above.
print(f"TF Vectorizer score: {tf_score}\nTF vectorizer for short summary score: {tf_score_short}\nCosine with full embedded synopsis: {emb_full_score}\nCosine with embedded short summary: {emb_short_score}")

TF Vectorizer score: 0.5284
TF vectorizer for short summary score: 0.9086
Cosine with full embedded synopsis: 0.1425
Cosine with embedded short summary: 0.1661


In [56]:
# Spot-check the short-summary TF-IDF recommender with a free-text query.
# NOTE(review): 'familt' is presumably a typo for 'family'; left as-is because
# the recorded output below was produced with this exact string.
find_recommendation_short('film about a bear in london happy familt film')

Unnamed: 0,title,similarity
122,Love Happy,0.305081
692,"Hey, Happy!",0.304814
5604,Care Bears Movie II: A New Generation,0.304606
5470,Gold Diggers: The Secret of Bear Mountain,0.233648
5252,Brother Bear,0.228432
20,Paddington,0.223562
3972,Grizzly Man,0.222038
1257,Happy Gilmore,0.218463
3839,She-Wolf of London,0.210943
3736,Piccadilly,0.185838


In [24]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize


In [25]:
# One row per embedding vector (stored as a Python list) with the movie title beside it.
# NOTE(review): concat(axis=1) aligns on index, so this assumes df has a default
# 0..N-1 index with as many rows as full_plot_embeddings — confirm.
full_plot_with_titles = pd.DataFrame({'embedded_plot': [list(row) for row in full_plot_embeddings]})
df_titles = df['title']
full_plot_with_titles = pd.concat([full_plot_with_titles, df_titles], axis = 1)

In [29]:
def find_n_nearest_neighbors(n: int, text):
    '''
    Finds the n nearest movies to a prompt using the NearestNeighbors algorithm.

    The prompt is embedded with the module-level tokenizer/model (mean-pooled
    over tokens) and compared, after L2 normalisation of both sides, against
    the vectors stored in full_plot_with_titles['embedded_plot'].

    n: number of neighbours requested; capped at the number of stored movies.
    text: free-text user prompt.

    Returns a list of titles ordered from nearest to farthest.
    '''
    # Truncate to the encoder's position limit so long prompts cannot overflow
    # the position embeddings.
    user_token = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        user_outputs = model(**user_token)
    user_embedded = user_outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    # Wrap stray scalars so every row stacks as a vector. Done on a local copy:
    # the shared DataFrame column is no longer rewritten on every call.
    embedded_rows = full_plot_with_titles['embedded_plot'].apply(
        lambda x: [x] if np.isscalar(x) else x
    )
    # Local name chosen so it does not shadow the global full_plot_embeddings.
    plot_matrix = np.vstack(embedded_rows.values)

    user_embedded_normalized = normalize([user_embedded])
    plot_matrix_normalized = normalize(plot_matrix)

    # kneighbors rejects requests for more neighbours than fitted samples.
    n_neighbors = min(n, len(plot_matrix_normalized))
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(plot_matrix_normalized)

    _, indices = nbrs.kneighbors(user_embedded_normalized)
    list_of_titles = full_plot_with_titles['title'].iloc[indices[0]].tolist()
    return list_of_titles

In [30]:
# Spot-check the nearest-neighbour lookup with a free-text query (10 titles requested).
find_n_nearest_neighbors(10, 'film in london about animal happy family film fun')

['Highway 61',
 'Aria',
 'Map of the Human Heart',
 'Party Monster',
 'InAPPropriate Comedy',
 'Diana Vreeland: The Eye Has to Travel',
 'Cecil B. DeMented',
 'Jimmy and Judy',
 'Beyond the Law',
 'Silent Movie']

In [165]:
# Rebuild the numeric embedding matrix from the per-row lists.
# The previous `.iloc[:, :-1].values` produced an (N, 1) object array whose
# cells are lists, which cosine_similarity in find_recommendation_emb_full
# cannot consume; stacking the rows restores one numeric row per movie.
full_plot_embeddings = np.vstack(full_plot_with_titles['embedded_plot'].to_numpy())