Step 1: Import Necessary Libraries

In [17]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

Step 2: Load Datasets

In [None]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')
"""genres =movies_df["genres"].unique()
for genre in genres:
    print(genre)"""

Adventure|Animation|Children|Comedy|Fantasy
Adventure|Children|Fantasy
Comedy|Romance
Comedy|Drama|Romance
Comedy
Action|Crime|Thriller
Adventure|Children
Action
Action|Adventure|Thriller
Comedy|Horror
Adventure|Animation|Children
Drama
Action|Adventure|Romance
Crime|Drama
Drama|Romance
Action|Comedy|Crime|Drama|Thriller
Comedy|Crime|Thriller
Crime|Drama|Horror|Mystery|Thriller
Drama|Sci-Fi
Children|Drama
Adventure|Drama|Fantasy|Mystery|Sci-Fi
Mystery|Sci-Fi|Thriller
Adventure|Romance|IMAX
Documentary|IMAX
Children|Comedy
Drama|War
Action|Crime|Drama
Action|Adventure|Fantasy
Comedy|Drama|Thriller
Mystery|Thriller
Animation|Children|Drama|Musical|Romance
Crime|Mystery|Thriller
Action|Drama|Thriller
Adventure|Drama
Adventure|Children|Comedy|Fantasy
Drama|Mystery
Drama|Thriller
Comedy|Crime
Action|Sci-Fi|Thriller
Action|Comedy|Horror|Thriller
Comedy|Drama
Documentary
Action|Crime|Drama|Thriller
Crime|Drama|Romance
Action|Adventure|Drama
Action|Thriller
Drama|Horror|Thriller
Comedy|Horror|

Step 3: Preprocessing

In [52]:
# Extract the release year from the movie title
movies_df['release_year'] = movies_df['title'].str.extract(r'\((\d{4})\)').astype(float)

# Compute average ratings for each movie
average_ratings = ratings_df.groupby('movieId')['rating'].mean().reset_index()
average_ratings.rename(columns={'rating': 'average_rating'}, inplace=True)

# Merge the movie data with their average ratings
movies_with_ratings = pd.merge(movies_df, average_ratings, on='movieId', how='left')

# Handle missing average ratings by filling with 0
movies_with_ratings['average_rating'] = movies_with_ratings['average_rating'].fillna(0)

# Preprocess 'genres' text for vectorization by cleaning special characters and converting to lowercase
movies_with_ratings['processed_text'] = movies_with_ratings['genres'].str.lower().str.replace(r'[^\w\s]', ' ', regex=True)
for genre in movies_with_ratings['processed_text']:
    print(genre) 

adventure animation children comedy fantasy
adventure children fantasy
comedy romance
comedy drama romance
comedy
action crime thriller
comedy romance
adventure children
action
action adventure thriller
comedy drama romance
comedy horror
adventure animation children
drama
action adventure romance
crime drama
drama romance
comedy
comedy
action comedy crime drama thriller
comedy crime thriller
crime drama horror mystery thriller
action crime thriller
drama sci fi
drama romance
drama
children drama
drama romance
adventure drama fantasy mystery sci fi
crime drama
drama
mystery sci fi thriller
adventure romance imax
children drama
drama romance
crime drama
documentary imax
children comedy
comedy romance
drama
drama war
action crime drama
drama
action adventure fantasy
comedy drama thriller
drama romance
mystery thriller
animation children drama musical romance
drama romance
crime mystery thriller
action drama thriller
comedy drama romance
adventure drama
children comedy
drama
adventure chil

Step 4: Vectorize the Text Data

In [53]:
# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=10000)

# Fit transform the genre text data
sparse_X = vectorizer.fit_transform(movies_with_ratings['processed_text'])

Step 5: Define Cosine Similarity Logic

In [54]:
def retrieve_similar_movies(query, k=5):
    """
    Retrieve the top k most similar movies to the given query using cosine similarity
    sorted by average rating and release year.
    """
    # Transform the query string using the pre-fitted TF-IDF vectorizer
    query_vec = vectorizer.transform([query])
    
    # Compute cosine similarity
    cosine_sim = cosine_similarity(query_vec, sparse_X)  # Calculate similarity against all movies
    
    # Flatten the cosine similarity array to access all computed scores
    similarity_scores = cosine_sim.flatten()
    
    # Attach similarity scores to the DataFrame
    movies_with_ratings['similarity'] = similarity_scores

    # Sort by similarity, average rating, and release year
    sorted_movies = movies_with_ratings.sort_values(by=['similarity', 'average_rating', 'release_year'], 
                                                    ascending=[False, False, False])
    
    # Return the top k movies
    return sorted_movies.head(k)

Step 6: Test the Recommendation Logic

In [55]:
# Example: Query with a test string
query = 'adventure'
similar_movies = retrieve_similar_movies(query)
print(f"Movies similar to the query '{query}':")
print(similar_movies[['title', 'genres', 'average_rating', 'release_year', 'similarity']])

query = 'comedy'
similar_movies = retrieve_similar_movies(query)
print(f"\nMovies similar to the query '{query}':")
print(similar_movies[['title', 'genres', 'average_rating', 'release_year', 'similarity']])

Movies similar to the query 'adventure':
                                                 title     genres  \
15013                           Treasure Island (1934)  Adventure   
27588                                Wolf Totem (2015)  Adventure   
21357  Belle and Sebastien (Belle et Sébastien) (2013)  Adventure   
13145                                Billy Budd (1962)  Adventure   
60446                  The Peanut Butter Falcon (2019)  Adventure   

       average_rating  release_year  similarity  
15013             5.0        1934.0         1.0  
27588             4.5        2015.0         1.0  
21357             4.5        2013.0         1.0  
13145             4.5        1962.0         1.0  
60446             4.0        2019.0         1.0  

Movies similar to the query 'comedy':
                         title  genres  average_rating  release_year  \
59879              Poms (2019)  Comedy             5.0        2019.0   
53474         Candy Jar (2018)  Comedy             5.0       

Step 7: Save Processed Data

In [56]:
import pickle

# Save the processed data
with open('movies_with_ratings.pkl', 'wb') as f:
    pickle.dump(movies_with_ratings, f)

# Save the vectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)