In [4]:
import ast

import pandas as pd

# Read the user ratings table from disk
data = pd.read_csv(filepath_or_buffer='ratings_small.csv')

# Preview the leading rows to sanity-check the columns
data.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
# Read the per-movie metadata; low_memory=False makes pandas parse the file in one
# pass instead of in chunks, which avoids mixed-dtype inference within a column
movies_metadata = pd.read_csv(filepath_or_buffer='movies_metadata.csv', low_memory=False)

# Preview the leading rows
movies_metadata.head(n=5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [6]:
# Replace missing values in the text columns with empty strings so that
# they can be parsed and concatenated further down
movies_metadata[['overview', 'tagline', 'genres']] = (
    movies_metadata[['overview', 'tagline', 'genres']].fillna('')
)

# Extract genres from the genres column
def extract_genres(genres_str):
    """Turn a stringified list of genre dicts into a space-separated string of names.

    Parameters
    ----------
    genres_str : str
        Text of a Python literal such as
        ``"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}]"``.

    Returns
    -------
    str
        The ``'name'`` values joined by single spaces, e.g. ``'Animation Comedy'``.
        An empty string is returned when the input cannot be parsed as a literal
        (this includes ``''`` and non-string values) or when the parsed value is
        not an iterable of mappings that each carry a string ``'name'``.
    """
    try:
        # literal_eval accepts only Python literals, so text read from the CSV
        # can never execute code the way eval() would.
        genres = ast.literal_eval(genres_str)
        return ' '.join([g['name'] for g in genres])
    except (ValueError, SyntaxError, TypeError, KeyError, MemoryError, RecursionError):
        # Best-effort: malformed or unexpectedly shaped rows contribute no genre text.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        return ''

# Derive a plain-text genre string for every movie
movies_metadata['genres_str'] = movies_metadata['genres'].apply(extract_genres)

# Build one text document per movie: genres, overview and tagline,
# separated by single spaces
movies_metadata['combined_text'] = movies_metadata['genres_str'].str.cat(
    [movies_metadata['overview'], movies_metadata['tagline']],
    sep=' ',
)

# Preview titles alongside their combined text
movies_metadata.loc[:, ['title', 'combined_text']].head()

Unnamed: 0,title,combined_text
0,Toy Story,"Animation Comedy Family Led by Woody, Andy's t..."
1,Jumanji,Adventure Fantasy Family When siblings Judy an...
2,Grumpier Old Men,Romance Comedy A family wedding reignites the ...
3,Waiting to Exhale,"Comedy Drama Romance Cheated on, mistreated an..."
4,Father of the Bride Part II,Comedy Just when George Banks has recovered fr...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the vectorizer; the built-in English stop-word list is excluded from the vocabulary
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and vectorize every movie's combined text in one call
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=movies_metadata['combined_text'])

# (number of documents, vocabulary size)
tfidf_matrix.shape

(45466, 77139)

In [8]:
from sklearn.metrics.pairwise import linear_kernel

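# Compute the full pairwise cosine similarity matrix.
# NOTE(review): left disabled, presumably because a dense 45466 x 45466 result is too
# large to keep in memory; get_movie_recommendations computes a single row on demand
# when no precomputed matrix is passed.
#cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)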

In [10]:
def get_movie_recommendations(title, cosine_sim_matrix=None, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Similarity scores are read from ``cosine_sim_matrix`` when one is supplied;
    otherwise a single row of scores is computed on demand with ``linear_kernel``
    against the notebook-level ``tfidf_matrix``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``movies_metadata['title']``. If several rows
        share the title, the first one is used.
    cosine_sim_matrix : array-like, optional
        Precomputed similarity matrix; row ``i`` must hold the scores of row ``i``
        of ``movies_metadata`` against every row.
    top_n : int, default 10
        Maximum number of recommendations. Values <= 0 yield an empty result.

    Returns
    -------
    pandas.Series
        Titles of up to ``top_n`` *other* movies, most similar first. Ties keep
        their original row order.

    Raises
    ------
    IndexError
        If no movie has the given title.
    """
    # Work with row *positions*, not index labels: tfidf_matrix rows and .iloc
    # below are positional, so this stays correct even when the frame's index
    # is not the default 0..n-1 range.
    positions = (movies_metadata['title'] == title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"No movie titled {title!r} found in movies_metadata")
    idx = int(positions[0])

    if cosine_sim_matrix is not None:
        # Use the caller's precomputed row of scores
        scores = cosine_sim_matrix[idx]
    else:
        # Otherwise score just this movie against every movie
        scores = linear_kernel(tfidf_matrix[idx], tfidf_matrix).flatten()

    # Exclude the query movie by position rather than assuming it sorts first:
    # with tied scores (duplicate rows, or an all-zero vector from empty text)
    # another row can take first place, and slicing [1:] would then return the
    # query movie itself while dropping a genuine neighbour.
    candidates = [(pos, score) for pos, score in enumerate(scores) if pos != idx]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Clamp so that a negative top_n cannot turn into a "all but the last k" slice
    top_positions = [pos for pos, _ in candidates[:max(top_n, 0)]]

    return movies_metadata['title'].iloc[top_positions]

# Smoke-test the recommender on a well-known title
sample_movie = "Star Wars"
recommendations = get_movie_recommendations(title=sample_movie)
recommendations


1154           The Empire Strikes Back
30434    The Star Wars Holiday Special
1167                Return of the Jedi
26555     Star Wars: The Force Awakens
22939               Threads of Destiny
8850                   The Ice Pirates
22118               The Galaxy Invader
20812                           Erased
26240            Princess and the Pony
1267                      Mad Dog Time
Name: title, dtype: object