Step 1: Import Necessary Libraries

In [22]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

Step 2: Load Datasets

In [23]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')

Step 3: Preprocessing

In [24]:
# Extract the release year from the movie title
movies_df['release_year'] = movies_df['title'].str.extract(r'\((\d{4})\)').astype(float)

# Compute average ratings for each movie
average_ratings = ratings_df.groupby('movieId')['rating'].mean().reset_index()
average_ratings.rename(columns={'rating': 'average_rating'}, inplace=True)

# Merge the movie data with their average ratings
movies_with_ratings = pd.merge(movies_df, average_ratings, on='movieId', how='left')

# Handle missing average ratings by filling with 0
movies_with_ratings['average_rating'] = movies_with_ratings['average_rating'].fillna(0)

# Preprocess 'genres' text for vectorization by cleaning special characters and converting to lowercase
movies_with_ratings['processed_text'] = movies_with_ratings['genres'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

Step 4: Vectorize the Text Data

In [25]:
# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=10000)

# Fit transform the genre text data
sparse_X = vectorizer.fit_transform(movies_with_ratings['processed_text'])

Step 5: Define Cosine Similarity Logic

In [None]:
def retrieve_similar_movies(query, k=20):
    """
    Retrieve the top k most similar movies to the given query using cosine similarity
    sorted by average rating and release year.
    """
    # Transform the query string using the pre-fitted TF-IDF vectorizer
    query_vec = vectorizer.transform([query])
    
    # Compute cosine similarity
    cosine_sim = cosine_similarity(query_vec, sparse_X)  # Calculate similarity against all movies
    
    # Flatten the cosine similarity array to access all computed scores
    similarity_scores = cosine_sim.flatten()
    
    # Attach similarity scores to the DataFrame
    movies_with_ratings['similarity'] = similarity_scores

    # Sort by similarity, average rating, and release year
    sorted_movies = movies_with_ratings.sort_values(by=['similarity', 'average_rating', 'release_year'], 
                                                    ascending=[False, False, False])
    
    # Return the top k movies
    return sorted_movies.head(k)

Step 6: Test the Recommendation Logic

In [27]:
# Example: Query with a test string
query = 'adventure'
similar_movies = retrieve_similar_movies(query)
print(f"Movies similar to the query '{query}':")
print(similar_movies[['title', 'genres', 'average_rating', 'release_year', 'similarity']])

query = 'comedy'
similar_movies = retrieve_similar_movies(query)
print(f"\nMovies similar to the query '{query}':")
print(similar_movies[['title', 'genres', 'average_rating', 'release_year', 'similarity']])

Movies similar to the query 'adventure':
                                                 title              genres  \
15013                           Treasure Island (1934)           Adventure   
27588                                Wolf Totem (2015)           Adventure   
21357  Belle and Sebastien (Belle et Sébastien) (2013)           Adventure   
13145                                Billy Budd (1962)           Adventure   
60446                  The Peanut Butter Falcon (2019)           Adventure   
...                                                ...                 ...   
62071                           Tales of Found Footage  (no genres listed)   
62104                                 Enduring Destiny  (no genres listed)   
62285      Punk the Capital: Building a Sound Movement         Documentary   
62326                     Yosemite: The Fate of Heaven  (no genres listed)   
62380              The Falklands War: The Untold Story  (no genres listed)   

       average_rating 

Step 7: Save Processed Data

In [28]:
import pickle

# Save the processed data
with open('movies_with_ratings.pkl', 'wb') as f:
    pickle.dump(movies_with_ratings, f)

# Save the vectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)