In [1]:
# Importing necessary libraries
import os

import numpy as np
import pandas as pd

In [2]:
# Load the raw MovieLens 25M tables that the content-based cells below use.
# The paths are relative, so they resolve against the kernel's working directory.
movies_ml_25m = pd.read_csv('../data/raw/ml-25m/movies.csv')
# The ratings load is disabled: within the cells visible here, the only code
# that touches ratings is the commented-out Surprise section, and that section
# reads ratings.csv directly from disk instead of using a DataFrame.
# ratings_ml_25m = pd.read_csv('../data/raw/ml-25m/ratings.csv')
tags_ml_25m = pd.read_csv('../data/raw/ml-25m/tags.csv')

## Collaborative Filtering with Surprise

In [3]:
# from surprise import Dataset, Reader, SVD
# from surprise.model_selection import train_test_split
# from surprise import accuracy
# 
# # Load the ratings data
# reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
# ratings_data_for_surprise = Dataset.load_from_file('../data/raw/ml-25m/ratings.csv', reader=reader)

In [4]:
# # Split the data into training and testing sets
# trainset, testset = train_test_split(ratings_data_for_surprise, test_size=0.1)
# 
# # Use the SVD algorithm
# algo = SVD()
# 
# # Train the model
# algo.fit(trainset)

In [5]:
# Predict and evaluate
# predictions = algo.test(testset)
# rmse = accuracy.rmse(predictions)

In [6]:
# from joblib import dump
# 
# # Save the trained model
# dump(algo, '../models/collaborative_filtering_model.joblib')

## Content-Based Filtering

In [7]:
# str.join cannot handle NaN, so blank out missing tags before aggregating.
tags_ml_25m['tag'] = tags_ml_25m['tag'].fillna('')

# Collapse the tag table to one row per movieId whose 'tag' value is every tag
# applied to that movie, joined with single spaces. (No merge happens here;
# the join onto the movie table is done in a later cell.)
movie_tags = (
    tags_ml_25m
    .groupby('movieId')['tag']
    .agg(' '.join)
    .reset_index()
)

In [8]:
# Left-join the per-movie tag strings onto the movie table so that every movie
# row is kept; movies without any tag rows end up with NaN in 'tag'.
movies_with_tags = pd.merge(movies_ml_25m, movie_tags, how='left', on='movieId')

In [9]:
# The left merge leaves NaN in 'tag' for movies that have no tag rows. Replace
# those with '' so the string concatenation in the next step does not turn the
# whole combined value into NaN.
movies_with_tags['tag'] = movies_with_tags['tag'].fillna('')

In [10]:
# Build one whitespace-separated text field per movie from its genres and tags,
# then turn every '|' (presumably the genre delimiter in movies.csv) into a
# space so each genre becomes its own token.
#
# regex=False is passed explicitly: as a regular expression '|' is the
# alternation operator, and the default value of `regex` differs between
# pandas versions (pandas 1.x also emits a FutureWarning for this call).
# Forcing a literal match keeps the result identical on every version.
movies_with_tags['combined_features'] = (
    movies_with_tags['genres'] + ' ' + movies_with_tags['tag']
).str.replace('|', ' ', regex=False)

In [11]:
# Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize each movie's combined genre/tag text into TF-IDF weights, using
# scikit-learn's built-in English stop-word list. fit_transform both fits the
# vectorizer on this text and returns one matrix row per input row, in the
# same order as movies_with_tags.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_with_tags['combined_features'])

In [12]:
from scipy.sparse import csr_matrix

# Make sure the TF-IDF matrix is a SciPy CSR matrix before it is row-sliced in
# batches below.
# NOTE(review): TfidfVectorizer.fit_transform is documented to return a sparse
# matrix already, so this conversion is likely a no-op safeguard — confirm and
# consider dropping it.
tfidf_sparse_matrix = csr_matrix(tfidf_matrix)

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix, vstack

# Function to compute cosine similarity in batches and convert to sparse format
def compute_cosine_similarity_in_batches(tfidf_matrix, batch_size=1000):
    """Compute the all-pairs cosine similarity between the rows of a matrix.

    Rows are processed in consecutive slices of at most ``batch_size`` rows.
    Each slice is compared against the full matrix and the resulting blocks
    are stacked vertically, so only one block is being computed at a time.

    Parameters
    ----------
    tfidf_matrix : sparse matrix or array of shape (n_rows, n_features)
        Feature matrix with one row per item. It must expose ``.shape`` and
        support row slicing (``tfidf_matrix[start:end]``).
    batch_size : int, default=1000
        Maximum number of rows compared against the full matrix per step.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (n_rows, n_rows)
        Entry ``[i, j]`` is the cosine similarity between rows ``i`` and ``j``.
        For an input with zero rows an empty ``(0, 0)`` matrix is returned.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # range() with step 0 fails with an unrelated message, and a negative
        # step silently produces no batches at all; reject both up front.
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    n_movies = tfidf_matrix.shape[0]
    if n_movies == 0:
        # vstack() raises on an empty list of blocks.
        return csr_matrix((0, 0))

    sparse_cosine_sim = []
    for start in range(0, n_movies, batch_size):
        end = min(start + batch_size, n_movies)
        # dense_output=False keeps the block sparse when the input is sparse,
        # instead of building a dense (batch, n_rows) array that is converted
        # straight back to sparse.
        batch_cosine_sim = cosine_similarity(
            tfidf_matrix[start:end], tfidf_matrix, dense_output=False
        )
        # csr_matrix() is a cheap wrap for sparse results and still converts
        # the dense array returned when the input itself is dense.
        sparse_batch = csr_matrix(batch_cosine_sim)
        # Converting from dense never stored zeros; drop any explicit ones so
        # the stored pattern matches that behaviour.
        sparse_batch.eliminate_zeros()
        sparse_cosine_sim.append(sparse_batch)

    # Stack the per-batch blocks into the full matrix, explicitly in CSR format.
    return vstack(sparse_cosine_sim, format='csr')

In [14]:
# Build the full row-by-row cosine-similarity matrix from the TF-IDF rows.
# batch_size=500 overrides the function's default of 1000, so each step
# compares at most 500 rows against the whole matrix.
cosine_sim_sparse = compute_cosine_similarity_in_batches(tfidf_sparse_matrix, batch_size=500)

In [15]:
# Save the cosine similarity matrix
from joblib import dump

# Persist the fitted TF-IDF vectorizer and the cosine-similarity matrix.
# joblib.dump does not create missing directories, so make sure ../models
# exists first; otherwise this cell fails with FileNotFoundError on a fresh
# checkout where the directory has not been created yet.
os.makedirs('../models', exist_ok=True)
dump(tfidf, '../models/tfidf_vectorizer.joblib')
dump(cosine_sim_sparse, '../models/cosine_similarity_matrix.joblib')

['../models/cosine_similarity_matrix.joblib']

In [16]:
from scipy.sparse import save_npz

# Also write the similarity matrix in SciPy's .npz sparse format. This is the
# same cosine_sim_sparse object that the previous cell already dumped to
# cosine_similarity_matrix.joblib, so the two files hold the same data.
save_npz('../models/sparse_similarity_matrix.npz', cosine_sim_sparse)