In [1]:
import dask.dataframe as dd

# Read the three CSV inputs as Dask DataFrames
final_movies = dd.read_csv('Data/final_dataset.csv')
new_ratings = dd.read_csv('Data/ratings.csv')
genome = dd.read_csv('Data/new_genome.csv')

# Attach each movie's title and genres to its genome rows (join on movieId)
movie_info = final_movies[['movieId', 'title', 'genres']]
genome_merged = genome.merge(movie_info, on='movieId')

# Collaborative-filtering input: (user, movie, rating) triples without missing values
collab_data = new_ratings[['userId', 'movieId', 'rating']].dropna()

# Content-based input: per-movie tag relevance plus title/genres, without missing values
content_columns = ['movieId', 'title', 'genres', 'tagId', 'relevance']
content_data = genome_merged[content_columns].dropna()


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
import dask.dataframe as dd
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

# Read the ratings CSV as a Dask DataFrame
new_ratings = dd.read_csv('Data/ratings.csv')

# Make 'movieId' categorical with known categories so it can later be used
# as the column axis of a pivot table
movie_id_categorical = new_ratings['movieId'].astype('category').cat.as_known()
new_ratings = new_ratings.assign(movieId=movie_id_categorical)


In [3]:
# Build the user x movie ratings matrix with Dask's pivot_table (default
# aggregation), then mark every (user, movie) pair without a rating as 0
ratings_matrix = (
    new_ratings
    .pivot_table(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

In [4]:
# Materialize the lazily-built pivot as an in-memory pandas DataFrame.
#
# compute() on its own is sufficient here: persist() only pays off when the
# same Dask collection is reused several times, whereas this one is converted
# to pandas immediately and never touched as a Dask object again.
#
# The hasattr guard keeps the cell re-runnable: after the first run
# ratings_matrix is already a pandas DataFrame, which has no .compute().
#
# NOTE(review): a dense user x movie frame can be extremely large; if this cell
# exhausts memory, consider building the scipy sparse matrix directly from the
# (userId, movieId, rating) triples instead of pivoting - TODO confirm.
if hasattr(ratings_matrix, 'compute'):
    ratings_matrix = ratings_matrix.compute()


: 

In [None]:
# Convert the user-item matrix to a sparse (CSR) matrix; the zeros that stand
# for "no rating" are then not stored explicitly
ratings_sparse = csr_matrix(ratings_matrix.values)

# Factorize into 50 latent components.
# TruncatedSVD defaults to a randomized solver, so the seed is fixed to make
# svd_matrix (and every recommendation derived from it) reproducible across
# kernel restarts.
svd = TruncatedSVD(n_components=50, random_state=42)
svd_matrix = svd.fit_transform(ratings_sparse)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# The steps below need pandas semantics (pivot, label-based .loc, index
# alignment), so bring the Dask frames into memory first. The hasattr guards
# keep the cell re-runnable once final_movies has been replaced by a pandas frame.
if hasattr(final_movies, 'compute'):
    final_movies = final_movies.compute()
content_frame = content_data.compute() if hasattr(content_data, 'compute') else content_data

# One movie per row; duplicated ids would break the index alignment below
final_movies = final_movies.drop_duplicates(subset='movieId')

# One-hot encode genres, indexed by movieId so rows line up with the genome
# pivot (a positional index would be matched against movieIds by the concat)
genres_one_hot = (
    final_movies
    .set_index('movieId')['genres']
    .str.get_dummies(sep='|')
)

# Create a genome relevance matrix: one row per movie, one column per tagId
genome_pivot = (
    content_frame
    .drop_duplicates(subset=['movieId', 'tagId'])
    .pivot(index='movieId', columns='tagId', values='relevance')
    .fillna(0)
)

# Combine genres and genome relevance, keeping only movies present in both
content_matrix = pd.concat([genres_one_hot, genome_pivot], axis=1, join='inner')

# Keep final_movies and content_matrix in the same row order, so that
# position i in one refers to the same movie as position i in the other
final_movies = (
    final_movies[final_movies['movieId'].isin(content_matrix.index)]
    .reset_index(drop=True)
)
content_matrix = content_matrix.loc[final_movies['movieId']]

# Compute the movie x movie cosine similarity matrix. The raw array is passed
# because the frame mixes string (genre) and integer (tagId) column labels,
# which scikit-learn's feature-name validation does not accept.
content_values = content_matrix.to_numpy()
cosine_sim_content = cosine_similarity(content_values, content_values)


In [None]:
def hybrid_recommendations(user_id, movie_title=None, genre=None, n_recommendations=10):
    """Blend collaborative and content-based movie recommendations.

    One ranked candidate list is built per supplied argument (user, movie,
    genre). The lists are then merged round-robin and de-duplicated, so every
    requested source contributes to the final result instead of the first
    source filling all the slots.

    Relies on module-level objects built earlier in the notebook:
    ``ratings_matrix``, ``svd``, ``svd_matrix``, ``final_movies`` and
    ``cosine_sim_content``. Row/column ``i`` of ``cosine_sim_content`` is
    expected to describe the movie at position ``i`` of ``final_movies``.

    Args:
        user_id: Label of the user in ``ratings_matrix.index``; pass ``None``
            to skip collaborative filtering.
        movie_title: Optional title; movies most similar to it are suggested.
        genre: Optional genre name; movies most similar on average to the
            movies of that genre are suggested.
        n_recommendations: Maximum number of items to return.

    Returns:
        list: Up to ``n_recommendations`` unique movie titles. A collaborative
        pick whose movieId has no entry in ``final_movies`` is returned as the
        raw movieId.

    Raises:
        KeyError: If ``user_id`` is not present in ``ratings_matrix.index``.
        IndexError: If ``movie_title`` is not present in ``final_movies``.
    """
    limit = max(int(n_recommendations), 0)
    titles = final_movies['title'].tolist()
    sources = []

    # Collaborative filtering: rank movies by the SVD-reconstructed ratings
    if user_id is not None:
        user_idx = ratings_matrix.index.get_loc(user_id)
        # inverse_transform expects a 2-D (n_samples, n_components) array
        latent = svd_matrix[user_idx].reshape(1, -1)
        pred_ratings = pd.Series(svd.inverse_transform(latent)[0], index=ratings_matrix.columns)
        # The ratings matrix stores 0 for "not rated"; anything above 0 is a
        # movie the user has already rated and should not be recommended again
        already_rated = ratings_matrix.iloc[user_idx].to_numpy() > 0
        top_ids = (
            pred_ratings[~already_rated]
            .sort_values(ascending=False)
            .head(limit)
            .index.tolist()
        )
        # The content-based lists hold titles, so translate movieIds to titles
        # to keep the merged result homogeneous
        title_by_id = dict(zip(final_movies['movieId'], final_movies['title']))
        sources.append([title_by_id.get(movie_id, movie_id) for movie_id in top_ids])

    # Content-based: movies most similar to the given title
    if movie_title:
        matches = [pos for pos, title in enumerate(titles) if title == movie_title]
        if not matches:
            raise IndexError(f"movie_title {movie_title!r} not found in final_movies")
        idx = matches[0]
        scores = cosine_sim_content[idx]
        ranked = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
        # Drop the query movie explicitly; with tied scores it is not
        # guaranteed to be ranked first
        ranked = [pos for pos in ranked if pos != idx]
        sources.append([titles[pos] for pos in ranked[:limit]])

    # Content-based: movies most similar on average to the genre's movies
    if genre:
        genre_lists = final_movies['genres'].fillna('').str.split('|')
        genre_idx = [pos for pos, parts in enumerate(genre_lists) if genre in parts]
        # Without any matching movie there is nothing to average over
        if genre_idx:
            mean_scores = cosine_sim_content[genre_idx].mean(axis=0)
            ranked = sorted(range(len(mean_scores)), key=lambda pos: mean_scores[pos], reverse=True)
            # No "self" entry exists for an averaged profile, so the top
            # ranked movie is kept
            sources.append([titles[pos] for pos in ranked[:limit]])

    # Interleave the sources rank by rank, skipping duplicates
    merged = []
    seen = set()
    for rank in range(max((len(source) for source in sources), default=0)):
        for source in sources:
            if rank < len(source) and source[rank] not in seen:
                seen.add(source[rank])
                merged.append(source[rank])
    return merged[:limit]

# Demo: recommendations for user 1, seeded with a movie they might like
user_id = 1
recommendations = hybrid_recommendations(
    user_id=user_id,
    movie_title='Toy Story (1995)',
    n_recommendations=10,
)
print(recommendations)


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Define function to compute evaluation metrics
def evaluate_recommendations(y_true, y_pred):
    """Score binary recommendation labels.

    Args:
        y_true: Actual binary labels.
        y_pred: Predicted binary labels, in the same order as ``y_true``.

    Returns:
        tuple: ``(precision, recall, f1)`` computed with ``average='binary'``.
    """
    scorers = (precision_score, recall_score, f1_score)
    return tuple(scorer(y_true, y_pred, average='binary') for scorer in scorers)

# Toy example: actual preferences vs. predicted recommendations (1 = relevant)
y_true, y_pred = [1, 0, 0, 1, 1], [1, 1, 0, 1, 0]

metrics = evaluate_recommendations(y_true, y_pred)
precision, recall, f1 = metrics
print(f"Precision: {precision}, Recall: {recall}, F1-score: {f1}")
