In [1]:
# Import Dependencies
from pathlib import Path
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


# Filter Warnings
from warnings import filterwarnings

In [2]:
# Load the cleaned movie dataset; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows 'Unnamed: 0.1' and 'Unnamed: 0' columns, which look
# like index columns written out by earlier to_csv calls — confirm and drop them upstream.
df_path = Path('cleaned_data/movie_data.csv')
df = pd.read_csv(df_path)

# Preview the first rows
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,director,cast,country,release_year,genre_types,rating,duration,description,popularity,production_companies,writers,combined
0,0,653574,Dick Johnson Is Dead,Kirsten Johnson,,United States,2020,Documentaries,PG-13,90 min,"As her father nears the end of his life, filmm...",12.0,Big Mouth Productions,"Kirsten Johnson, Nels Bangerter",Kirsten JohnsonUnited States2020
1,1,597316,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,2021,Children & Family Movies,PG,91 min,Equestria's divided. But a bright-eyed hero be...,25.85,"Boulder Media, Entertainment One","Gillian M. Berrow, Tim Sullivan","Robert Cullen, José Luis UchaVanessa Hudgens, ..."
2,2,68351,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...",1993,"Dramas, Independent Movies, International Movies",TV-MA,125 min,"On a photo shoot in Ghana, an American model s...",3.48,"Diproci, Ghana National Commission on Culture,...",Haile Gerima,"Haile GerimaKofi Ghanaba, Oyafunmike Ogunlano,..."
3,3,468225,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,2021,"Comedies, Dramas",PG-13,104 min,A woman adjusting to life after a loss contend...,15.47,"Entertainment One, Boies/Schiller Film Group, ...",Matt Harris,"Theodore MelfiMelissa McCarthy, Chris O'Dowd, ..."
4,4,786705,Confessions of an Invisible Girl,Bruno Garotti,"Klara Castanho, Lucca Picon, Júlia Gomes, Marc...",,2021,"Children & Family Movies, Comedies",TV-PG,91 min,When the clever but socially-awkward Tetê join...,17.89,,Thalita Rebouças,"Bruno GarottiKlara Castanho, Lucca Picon, Júli..."


In [3]:
# Sanity check: show (rows, columns) of the loaded frame so that a truncated or
# wrong input file is noticed before any similarity scores are computed
df.shape

(5143, 15)

# NLP Machine Learning

In [7]:
# TF-IDF vectorization: weight every term of the 'combined' column by how
# distinctive it is across movies (English stop words are ignored).
# Missing values are replaced with empty strings first, because TfidfVectorizer
# raises on NaN documents; an empty document simply becomes an all-zero row.
# NOTE(review): the preview of 'combined' shows fields joined without a separator
# (e.g. "Kirsten JohnsonUnited States2020"), so neighbouring words are presumably
# fused into a single token — confirm how the column is built upstream.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined'].fillna(''))

In [9]:
# Pairwise similarity between all movies, computed from the TF-IDF vectors of the
# 'combined' column (not from 'description' alone). TfidfVectorizer L2-normalises
# each row by default, so this dot product is the cosine similarity.
# The result has one row and one column per movie, in the same order as df.
similarity = linear_kernel(tfidf_matrix, tfidf_matrix)

In [22]:
# Creating function to find similar movies based on movie name given
def get_recs(title, similarity=similarity):
    """Return the titles of the ten movies most similar to ``title``.

    Titles are read from the module-level ``df``.

    NOTE(review): a later cell redefines ``get_recs`` with the signature
    ``(title, df, similarity, columns)``; once that cell has run, this
    one-argument form is shadowed.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['title']``. When several rows match,
        the first one is used.
    similarity : 2-D array
        Pairwise similarity scores; row ``i`` must describe the ``i``-th row
        of ``df`` (by position, not by index label).

    Returns
    -------
    pandas.Series
        Up to ten titles, most similar first; ties keep ``df`` row order.

    Raises
    ------
    IndexError
        If no row of ``df`` has the given title.
    """
    # The similarity matrix is positional, so locate the row by position
    # rather than by index label (the two differ on a filtered/re-indexed df).
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie '{title}' not found in the dataset.")
    movie_pos = matches[0]

    scores = np.asarray(similarity[movie_pos]).ravel()

    # Stable sort on the negated scores: descending similarity, ties in row order
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query movie explicitly; it is not guaranteed to be ranked first
    # when another movie has an identical score.
    ranked = ranked[ranked != movie_pos]

    # return top 10 movies
    return df['title'].iloc[ranked[:10]]
    
# Show the recommendations for a sample title
print(get_recs(title='Insidious'))


3309      A Dangerous Woman
766     The Next Three Days
4711         The Black Room
717         The Conjuring 2
4940             The Signal
2012            I Am Mother
281              Winchester
2699                  Benji
4885            The Natural
4232     Miss Sharon Jones!
Name: title, dtype: object


In [5]:
# TODO: exploratory cell — not sure yet whether genre embeddings will be needed
from gensim.models import Word2Vec

# Movie genres to embed
genres = ["Action", "Comedy", "Drama", "Science Fiction", "Horror", "Romance", "Thriller", "Adventure", "Fantasy"]

# Placeholder corpus: one single-token "sentence" per genre
# (replace with your actual genre data)
# NOTE(review): with only one token per sentence there are no context pairs to
# learn from, so the vectors are presumably close to their random
# initialisation — confirm before relying on them.
genre_data = [[name] for name in genres]

# Fit a skip-gram (sg=1) Word2Vec model on the genre corpus
model = Word2Vec(sentences=genre_data, vector_size=100, window=5, min_count=1, sg=1)

# Look up the learned vector of every genre
genre_embeddings = {name: model.wv[name] for name in genres}

# Example usage: show the embedding of the genre "Action"
print(genre_embeddings["Action"])


[-9.5785465e-03  8.9431154e-03  4.1650687e-03  9.2347348e-03
  6.6435025e-03  2.9247368e-03  9.8040197e-03 -4.4246409e-03
 -6.8033109e-03  4.2273807e-03  3.7290000e-03 -5.6646108e-03
  9.7047603e-03 -3.5583067e-03  9.5494064e-03  8.3472609e-04
 -6.3384566e-03 -1.9771170e-03 -7.3770545e-03 -2.9795230e-03
  1.0416972e-03  9.4826873e-03  9.3558477e-03 -6.5958775e-03
  3.4751510e-03  2.2755705e-03 -2.4893521e-03 -9.2291720e-03
  1.0271263e-03 -8.1657059e-03  6.3201892e-03 -5.8000805e-03
  5.5354391e-03  9.8337233e-03 -1.6000033e-04  4.5284927e-03
 -1.8094003e-03  7.3607611e-03  3.9400971e-03 -9.0103243e-03
 -2.3985039e-03  3.6287690e-03 -9.9568366e-05 -1.2012708e-03
 -1.0554385e-03 -1.6716016e-03  6.0495257e-04  4.1650953e-03
 -4.2527914e-03 -3.8336217e-03 -5.2816868e-05  2.6935578e-04
 -1.6880632e-04 -4.7855065e-03  4.3134023e-03 -2.1719194e-03
  2.1035396e-03  6.6652300e-04  5.9696771e-03 -6.8423809e-03
 -6.8157101e-03 -4.4762576e-03  9.4358288e-03 -1.5918827e-03
 -9.4292425e-03 -5.45041

In [46]:
def get_recs(title, df, similarity, columns=None, top_n=10):
    """Return the movies most similar to ``title`` as a DataFrame.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['title']``. When several rows match,
        the first one is used.
    df : pandas.DataFrame
        Movie table; must contain a ``'title'`` column and every name in
        ``columns``.
    similarity : 2-D array
        Pairwise similarity scores; row ``i`` must describe the ``i``-th row
        of ``df`` (by position, not by index label).
    columns : list of str, optional
        Columns of ``df`` to return. Defaults to
        ``['title', 'release_year', 'genre_types', 'description']``.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.DataFrame or str
        The ``top_n`` most similar rows, most similar first (ties keep ``df``
        row order), restricted to ``columns``. If the title is not present, a
        "not found" message string is returned instead.
    """
    if columns is None:
        columns = ['title', 'release_year', 'genre_types', 'description']

    # Locate the movie by row position: the similarity matrix is positional,
    # so index labels would be wrong on a filtered or re-indexed frame.
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        return f"Movie '{title}' not found in the dataset."

    # Use the first match when the title occurs more than once
    movie_pos = matches[0]

    # Similarity of every movie to the requested one
    scores = np.asarray(similarity[movie_pos]).ravel()

    # Stable sort on the negated scores: descending similarity, ties in row order
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the requested movie explicitly; it is not guaranteed to be ranked
    # first when another movie has an identical score.
    ranked = ranked[ranked != movie_pos]

    # Rows of the top matches, restricted to the requested columns
    recommended_movies = df.iloc[ranked[:top_n]][columns]

    return recommended_movies

# Demo: fetch the movies closest to 'Insidious' and print the resulting table
recommended_movies = get_recs(title='Insidious', df=df, similarity=similarity)
print(recommended_movies)


                    title  release_year  \
3309    A Dangerous Woman          1993   
766   The Next Three Days          2010   
4711       The Black Room          2016   
717       The Conjuring 2          2016   
4940           The Signal          2014   
2012          I Am Mother          2019   
281            Winchester          2018   
2699                Benji          2018   
4885          The Natural          1984   
4232   Miss Sharon Jones!          2015   

                                            genre_types  \
3309                            Dramas, Romantic Movies   
766                                   Dramas, Thrillers   
4711                                      Horror Movies   
717                                       Horror Movies   
4940                        Sci-Fi & Fantasy, Thrillers   
2012  International Movies, Sci-Fi & Fantasy, Thrillers   
281                     Horror Movies, Sci-Fi & Fantasy   
2699                   Children & Family Movies, Drama