In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from recommender_helper import content_movie_recommender

### TODO: Change the cells below to match `eda.ipynb`

In [None]:
# (Re)load the `sql` IPython extension so the `%sql` magic is available in this kernel.
%reload_ext sql
# Open a connection to the DuckDB database file; the path is relative (two directories up).
%sql duckdb:///../../movies_data.duckdb

In [None]:
df = %sql select * from movie_genre_data
df = pd.DataFrame(df)
df

In [None]:
# Create the tf-idf matrix used to compare movie overviews (one row per movie).
#
# English stop words are removed here too, so this overview-only baseline uses
# the same vectorizer settings as the combined overview + genre model further
# down; the two sets of recommendations then differ only by the genre text.
#
# Missing overviews are replaced with an empty string because `fit_transform`
# raises on NaN documents; for rows that already have text this is a no-op.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["overview"].fillna(""))

In [None]:
# Pairwise cosine similarity between the tf-idf rows of all movie overviews.
similarity = cosine_similarity(tfidf_matrix)

# Label both axes with the movie titles so scores can be looked up by name.
movie_titles = df["title"].values
similarity_df = pd.DataFrame(
    data=similarity,
    index=movie_titles,
    columns=movie_titles,
)
similarity_df.head()

In [None]:
# Array of the titles labelling the similarity matrix's columns; handed to the recommender below.
movie_list = similarity_df.columns.values

In [None]:
# Titles to request overview-based recommendations for.
sample_movies = ["Spider-Man: Across the Spider-Verse"]

# NOTE(review): the trailing 10 is presumably the number of recommendations
# to return per title; confirm against recommender_helper.
for seed_title in sample_movies:
    content_movie_recommender(seed_title, similarity_df, movie_list, 10)

### Using both genre and overview columns

Let's now try to include the movies' genres in our recommendation system. To do so, we're going to create a `combined` column that contains both a movie's `overview` and its `genre_names`.

We can adjust how strongly genres influence our recommendation system (their "weight") by choosing how many times they are repeated in the `combined` column.

In [None]:
# How many times each movie's genre string is repeated in `combined`.
# Repeating the genres raises their term frequency relative to the overview
# text (original author's note: in practice this doesn't really affect the model).
GENRE_WEIGHT = 2

# Treat missing text as empty: string concatenation with NaN yields NaN, which
# would discard the other column's text and make the tf-idf vectorizer raise.
overview_text = df["overview"].fillna("")
genre_text = (df["genre_names"].fillna("") + ", ") * GENRE_WEIGHT

df["combined"] = overview_text + " " + genre_text

# Show the first movie's combined text as a sanity check.
df["combined"].iloc[0]

In [None]:
# Vectorise the combined overview + genre text, dropping English stop words.
combined_docs = df["combined"]
tfidf_combined = TfidfVectorizer(stop_words="english")
tfidf_matrix_combined = tfidf_combined.fit_transform(combined_docs)

In [None]:
# Pairwise cosine similarity between the tf-idf rows of the combined text.
similarity_combined = cosine_similarity(tfidf_matrix_combined)

# Label both axes with the movie titles so scores can be looked up by name.
combined_titles = df["title"].values
similarity_df_combined = pd.DataFrame(
    data=similarity_combined,
    index=combined_titles,
    columns=combined_titles,
)

similarity_df_combined.head()

In [None]:
# Array of the titles labelling the combined similarity matrix's columns; handed to the recommender below.
combined_movie_list = similarity_df_combined.columns.values

In [None]:
# Titles to request overview + genre based recommendations for.
sample_movies = ["Spider-Man: Across the Spider-Verse"]

# NOTE(review): the trailing 10 is presumably the number of recommendations
# to return per title; confirm against recommender_helper.
for seed_title in sample_movies:
    content_movie_recommender(
        seed_title,
        similarity_df_combined,
        combined_movie_list,
        10,
    )