In [1]:
# Importing Dependencies
from pathlib import Path
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


# Filter Warnings
from warnings import filterwarnings

In [2]:
# Load the cleaned movie data from a path relative to the notebook's working directory
df_path = Path('cleaned_data/movie_data.csv')
df = pd.read_csv(df_path)

# Preview the first rows to confirm the expected columns were read
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,director,cast,country,release_year,genre_types,rating,duration,description,popularity,production_companies,writers,combined,original_title
0,0,653574,dick johnson is dead,Kirsten Johnson,,United States,2020,Documentaries,PG-13,90 min,"As her father nears the end of his life, filmm...",12.0,Big Mouth Productions,"Kirsten Johnson, Nels Bangerter",kirsten johnsonunited states2020,Dick Johnson Is Dead
1,1,597316,my little pony: a new generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,2021,Children & Family Movies,PG,91 min,Equestria's divided. But a bright-eyed hero be...,25.85,"Boulder Media, Entertainment One","Gillian M. Berrow, Tim Sullivan","robert cullen, josé luis uchavanessa hudgens, ...",My Little Pony: A New Generation
2,2,68351,sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...",1993,"Dramas, Independent Movies, International Movies",TV-MA,125 min,"On a photo shoot in Ghana, an American model s...",3.48,"Diproci, Ghana National Commission on Culture,...",Haile Gerima,"haile gerimakofi ghanaba, oyafunmike ogunlano,...",Sankofa
3,3,468225,the starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,2021,"Comedies, Dramas",PG-13,104 min,A woman adjusting to life after a loss contend...,15.47,"Entertainment One, Boies/Schiller Film Group, ...",Matt Harris,"theodore melfimelissa mccarthy, chris o'dowd, ...",The Starling
4,4,786705,confessions of an invisible girl,Bruno Garotti,"Klara Castanho, Lucca Picon, Júlia Gomes, Marc...",,2021,"Children & Family Movies, Comedies",TV-PG,91 min,When the clever but socially-awkward Tetê join...,17.89,,Thalita Rebouças,"bruno garottiklara castanho, lucca picon, júli...",Confessions of an Invisible Girl


In [3]:
# Sanity check: (rows, columns) of the loaded data, so later results can be checked against it
df.shape

(5143, 16)

# NLP Machine Learning

In [4]:
# TF-IDF vectorization of the 'combined' text column, ignoring English stop words.
# fit_transform learns the vocabulary and returns one weighted term vector per row of df.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined'])

In [5]:
# Pairwise similarity between movies, based on the TF-IDF vectors of their 'combined' text
# (not 'description'). TfidfVectorizer L2-normalises each row by default, so this
# dot product is the cosine similarity; row/column i corresponds to row i of df.
similarity = linear_kernel(tfidf_matrix, tfidf_matrix)

In [6]:
# Wrap the similarity matrix in a DataFrame for inspection. It should be square,
# with one row and one column per movie (5143 x 5143 for the data loaded above).
simsy = pd.DataFrame(similarity)
simsy

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5133,5134,5135,5136,5137,5138,5139,5140,5141,5142
0,1.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1,0.0,1.000000,0.0,0.000000,0.0,0.017302,0.0,0.0,0.0,0.014726,...,0.010539,0.026542,0.0,0.0,0.000000,0.0,0.017951,0.000000,0.011487,0.026805
2,0.0,0.000000,1.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.021701,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
3,0.0,0.000000,0.0,1.000000,0.0,0.000000,0.0,0.0,0.0,0.072947,...,0.000000,0.000000,0.0,0.0,0.033603,0.0,0.000000,0.000000,0.018319,0.000000
4,0.0,0.000000,0.0,0.000000,1.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5138,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,1.0,0.000000,0.000000,0.000000,0.000000
5139,0.0,0.017951,0.0,0.000000,0.0,0.029128,0.0,0.0,0.0,0.012914,...,0.011897,0.000000,0.0,0.0,0.000000,0.0,1.000000,0.000000,0.041980,0.000000
5140,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,1.000000,0.068168,0.000000
5141,0.0,0.011487,0.0,0.018319,0.0,0.000000,0.0,0.0,0.0,0.020960,...,0.011629,0.026785,0.0,0.0,0.000000,0.0,0.041980,0.068168,1.000000,0.000000


In [9]:
# Building the function
def get_recs(movie_title, df, similarity, columns=['title', 'release_year', 'genre_types', 'description', 'rating', 'original_title'], top_n=10):
    """Return the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Title to look up. It is compared for exact equality against
        ``df['title']``; if several rows match, the first one is used.
    df : pandas.DataFrame
        Movie table. Row *position* ``i`` must correspond to row/column ``i``
        of ``similarity``.
    similarity : array-like, shape (len(df), len(df))
        Pairwise similarity scores; ``similarity[i]`` holds the scores of
        movie ``i`` against every movie.
    columns : list of str, optional
        Columns of ``df`` to include in the result. The default list is never
        mutated.
    top_n : int, optional
        Maximum number of recommendations to return (non-negative, default 10).

    Returns
    -------
    pandas.DataFrame or str
        A new DataFrame with the requested ``columns`` plus a
        ``similarity_score`` column, ordered from most to least similar and
        never containing the queried movie itself. If the title is not found,
        a human-readable message string is returned instead.
    """
    # Locate the movie by *position*: `similarity` and `df.iloc` are positional,
    # so index labels must not be used here (they differ on a filtered or
    # re-indexed DataFrame).
    matches = np.flatnonzero((df['title'] == movie_title).to_numpy())
    if matches.size == 0:
        return f"Movie '{movie_title}' not found in the dataset."

    # Grabbing the first match if the title appears more than once
    pos = int(matches[0])

    # Similarity of the queried movie against every movie in the dataset
    scores = np.asarray(similarity[pos])

    # Stable descending sort: ties keep ascending row order, exactly like
    # sorted(..., reverse=True) on (position, score) pairs.
    order = np.argsort(-scores, kind='stable')

    # Drop the queried movie explicitly instead of assuming it ranks first;
    # with tied scores (e.g. duplicate entries) or an all-zero row it may not.
    order = order[order != pos][:top_n]

    # Copy so that adding the score column works on an independent frame and
    # never writes into (or warns about) a slice of `df`.
    recommended_movies = df.iloc[order][list(columns)].copy()

    # Adding similarity scores to the DataFrame for assessment
    recommended_movies['similarity_score'] = scores[order]

    return recommended_movies

# Smoke test: request recommendations for a known title and show the result
recommended_movies = get_recs(movie_title='insidious', df=df, similarity=similarity)
print(recommended_movies)


                    title  release_year  \
3309    a dangerous woman          1993   
766   the next three days          2010   
4711       the black room          2016   
717       the conjuring 2          2016   
4940           the signal          2014   
2012          i am mother          2019   
281            winchester          2018   
2699                benji          2018   
4885          the natural          1984   
4232   miss sharon jones!          2015   

                                            genre_types  \
3309                            Dramas, Romantic Movies   
766                                   Dramas, Thrillers   
4711                                      Horror Movies   
717                                       Horror Movies   
4940                        Sci-Fi & Fantasy, Thrillers   
2012  International Movies, Sci-Fi & Fantasy, Thrillers   
281                     Horror Movies, Sci-Fi & Fantasy   
2699                   Children & Family Movies, Drama