In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from recommender_helper import content_movie_recommender

### Change the cells below to match eda.ipynb

In [4]:
# Load (or reload) the `sql` IPython extension so the %sql magic used below is available.
%reload_ext sql
# Connect %sql to the DuckDB file at the relative path ../../movies_data.duckdb.
%sql duckdb:///../../movies_data.duckdb

In [5]:
df = %sql select * from movie_genre_data
df = pd.DataFrame(df)
df

Unnamed: 0,genre_names,id,original_language,overview,popularity,release_date,title,vote_average,vote_count
0,"Animation, Comedy, Family, Fantasy, Romance",976573,en,"In a city where fire, water, land and air resi...",4411.076,2023-06-14,Elemental,7.8,1500
1,"Action, Science Fiction, Horror",615656,en,An exploratory dive into the deepest depths of...,3247.593,2023-08-02,Meg 2: The Trench,6.9,828
2,"Thriller, Action",724209,en,An intelligence operative for a shadowy global...,2899.650,2023-08-09,Heart of Stone,6.9,824
3,"Animation, Action, Adventure",569094,en,"After reuniting with Gwen Stacy, Brooklyn’s fu...",1872.207,2023-05-31,Spider-Man: Across the Spider-Verse,8.4,3811
4,"Comedy, Adventure, Fantasy",346698,en,Barbie and Ken are having the time of their li...,1869.493,2023-07-19,Barbie,7.4,3433
...,...,...,...,...,...,...,...,...,...
490,Comedy,239563,en,A young boy whose parents just divorced finds ...,93.375,2014-10-09,St. Vincent,7.1,1710
491,"Horror, Thriller",760161,en,After escaping from an Estonian psychiatric fa...,90.271,2022-07-27,Orphan: First Kill,6.7,1801
492,"Family, Fantasy, Romance",321612,en,A live-action adaptation of Disney's version o...,79.570,2017-03-16,Beauty and the Beast,7.0,14821
493,"Crime, Drama, Thriller",842942,en,"After escaping a Michigan prison, a charming c...",76.312,2022-09-23,Bandit,6.2,207


In [6]:
# Build the TF-IDF representation of the movie overviews.
# The vectoriser is created with its default settings (no stop-word list is
# passed here, unlike the combined model further down).
tfidf = TfidfVectorizer()
overview_corpus = df["overview"]
tfidf_matrix = tfidf.fit_transform(overview_corpus)

In [26]:
# Pairwise cosine similarity between the TF-IDF vectors of all overviews.
similarity = cosine_similarity(tfidf_matrix)

# Label both axes with the movie titles so a score can be looked up by name.
similarity_df = pd.DataFrame(
    similarity,
    index=df["title"].values,
    columns=df["title"].values,
)
similarity_df.head()

Unnamed: 0,Elemental,Meg 2: The Trench,Heart of Stone,Spider-Man: Across the Spider-Verse,Barbie,The Flash,Cobweb,Babylon 5: The Road Home,Fast X,No Hard Feelings,...,"Ready, Jet, Go! Space Camp: The Movie",American Carnage,Blade Runner 2049,Fear the Invisible Man,The King's Man,St. Vincent,Orphan: First Kill,Beauty and the Beast,Bandit,Captain America: Civil War
Elemental,1.0,0.008911,0.006457,0.055169,0.130816,0.04796,0.011373,0.032326,0.059454,0.002879,...,0.048472,0.070145,0.004119,0.050329,0.00596,0.03681,0.015662,0.102265,0.019744,0.011245
Meg 2: The Trench,0.008911,1.0,0.020794,0.035152,0.049512,0.026546,0.010067,0.02036,0.0283,0.03273,...,0.043846,0.031304,0.069181,0.018413,0.02756,0.015012,0.021834,0.02936,0.015127,0.051357
Heart of Stone,0.006457,0.020794,1.0,0.020859,0.013872,0.024718,0.035961,0.007452,0.038257,0.024718,...,0.017508,0.020306,0.010836,0.015812,0.049341,0.013434,0.030184,0.00374,0.007909,0.01749
Spider-Man: Across the Spider-Verse,0.055169,0.035152,0.020859,1.0,0.054745,0.106811,0.0284,0.105084,0.08573,0.033223,...,0.05563,0.034266,0.063404,0.07056,0.079138,0.023394,0.047364,0.022846,0.046555,0.044975
Barbie,0.130816,0.049512,0.013872,0.054745,1.0,0.090942,0.016459,0.046423,0.086739,0.044911,...,0.078416,0.120896,0.030606,0.029163,0.051873,0.04876,0.03005,0.049331,0.037177,0.060203


In [8]:
# Column labels of the similarity matrix (the movie titles), kept as an array;
# this is handed to content_movie_recommender in a later cell.
movie_list = similarity_df.columns.values

In [30]:
# Inspect which container type movie_list is (displayed as the cell output).
type(movie_list)

numpy.ndarray

In [14]:
# Titles to look up in the overview-only similarity matrix.
sample_movies = ["Spider-Man: Across the Spider-Verse"]

# The helper's return value is not used here; the recorded output shows it
# reporting the recommendations itself.
for movie in sample_movies:
    content_movie_recommender(
        movie,
        similarity_df,
        movie_list,
        10,  # presumably the number of recommendations (the output lists ten titles)
    )



Top Recommended Movies for: Spider-Man: Across the Spider-Verse are:-
 ['Spider-Man: Into the Spider-Verse' 'The Amazing Spider-Man 2'
 'Spider-Man' 'Spider-Man 3' 'Thor: Ragnarok' 'Spider-Man: Homecoming'
 'The Amazing Spider-Man' 'Doctor Strange in the Multiverse of Madness'
 'Man of Steel' "Accident Man: Hitman's Holiday"]


### Using both genre and overview columns

In [15]:
# Build one text field per movie: the overview followed by the genre list.
#
# Missing values are replaced with "" before concatenating. In pandas,
# string + NaN evaluates to NaN, so without this a movie lacking either an
# overview or genres would lose its entire combined text, and the NaN entry
# would then be passed to the vectoriser as a document.
#
# The genre string is repeated twice to give genres more weight; per the
# original author's note this doesn't really affect the model.
df["combined"] = (
    df["overview"].fillna("")
    + " "
    + (df["genre_names"].fillna("") + ", ") * 2
)

# Show the first row's combined text as a spot check (positional lookup, so it
# does not depend on the index containing the label 0).
df["combined"].iloc[0]

'In a city where fire, water, land and air residents live together, a fiery young woman and a go-with-the-flow guy will discover something elemental: how much they have in common. Animation, Comedy, Family, Fantasy, Romance, Animation, Comedy, Family, Fantasy, Romance, '

In [16]:
# TF-IDF over the combined overview + genre text, this time with
# scikit-learn's built-in English stop-word list enabled.
tfidf_combined = TfidfVectorizer(stop_words="english")
combined_corpus = df["combined"]
tfidf_matrix_combined = tfidf_combined.fit_transform(combined_corpus)

In [25]:
# Pairwise cosine similarity between the combined (overview + genre) vectors.
similarity_combined = cosine_similarity(tfidf_matrix_combined)

# Same layout as the overview-only matrix: movie titles on both axes.
similarity_df_combined = pd.DataFrame(
    similarity_combined,
    index=df["title"].values,
    columns=df["title"].values,
)

similarity_df_combined.head()

Unnamed: 0,Elemental,Meg 2: The Trench,Heart of Stone,Spider-Man: Across the Spider-Verse,Barbie,The Flash,Cobweb,Babylon 5: The Road Home,Fast X,No Hard Feelings,...,"Ready, Jet, Go! Space Camp: The Movie",American Carnage,Blade Runner 2049,Fear the Invisible Man,The King's Man,St. Vincent,Orphan: First Kill,Beauty and the Beast,Bandit,Captain America: Civil War
Elemental,1.0,0.0,0.0,0.034904,0.140862,0.012993,0.0,0.041021,0.025744,0.089481,...,0.063136,0.051625,0.0,0.042484,0.0,0.065307,0.030367,0.266816,0.0,0.0
Meg 2: The Trench,0.0,1.0,0.024407,0.029075,0.0,0.08126,0.038962,0.08832,0.015264,0.0,...,0.081125,0.033254,0.088024,0.114707,0.022914,0.0,0.039885,0.010977,0.0,0.106471
Heart of Stone,0.0,0.024407,1.0,0.018181,0.0,0.019659,0.027418,0.021367,0.050375,0.0,...,0.0,0.030388,0.0,0.035782,0.113666,0.0,0.036448,0.014006,0.033516,0.021136
Spider-Man: Across the Spider-Verse,0.034904,0.029075,0.018181,1.0,0.031729,0.065768,0.0,0.073199,0.053809,0.0,...,0.050219,0.0,0.036303,0.012343,0.070627,0.016406,0.0,0.008177,0.007,0.038009
Barbie,0.140862,0.0,0.0,0.031729,1.0,0.034546,0.0,0.019368,0.0,0.044234,...,0.017654,0.047627,0.0,0.0,0.051024,0.07316,0.0,0.039682,0.0,0.037142


In [27]:
# Column labels of the combined similarity matrix (the movie titles), kept as
# an array; this is handed to content_movie_recommender in the next cell.
combined_movie_list = similarity_df_combined.columns.values

In [20]:
# Same query title as before, now run against the overview + genre similarities
# so the two recommendation lists can be compared.
sample_movies = ["Spider-Man: Across the Spider-Verse"]

for movie in sample_movies:
    content_movie_recommender(
        movie,
        similarity_df_combined,
        combined_movie_list,
        10,  # presumably the number of recommendations (the output lists ten titles)
    )



Top Recommended Movies for: Spider-Man: Across the Spider-Verse are:-
 ['Spider-Man: Into the Spider-Verse' 'The Amazing Spider-Man 2'
 'Spider-Man' 'Spider-Man 3' 'Spider-Man: Homecoming'
 'Doctor Strange in the Multiverse of Madness' 'Spider-Man: No Way Home'
 'Ice Age: Dawn of the Dinosaurs' 'Big Hero 6' 'Thor: Ragnarok']
