In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

# Step 1: Preprocess combined_data.txt in chunks
chunk_size = 100000
data = []

def process_chunk(chunk):
    current_movieId = None
    chunk_data = []
    for line in chunk:
        line = line.strip()
        if line.endswith(':'):
            current_movieId = int(line[:-1])
        else:
            cust_id, rating, date = line.split(',')
            chunk_data.append([current_movieId, int(cust_id), int(rating), date])
    return chunk_data

with open('combined_data_1.txt', 'r') as file:
    chunk = []
    for line in file:
        chunk.append(line)
        if len(chunk) >= chunk_size:
            data.extend(process_chunk(chunk))
            chunk = []
    if chunk:
        data.extend(process_chunk(chunk))

ratings_data = pd.DataFrame(data, columns=['movieId', 'Cust_Id', 'Rating', 'Date'])
ratings_data['Date'] = pd.to_datetime(ratings_data['Date'])

# Step 2: Load movies.csv and merge datasets
movies_data = pd.read_csv('movies (2).csv')
combined_data = pd.merge(ratings_data, movies_data, on='movieId')

# Step 3: Create a sparse utility matrix
pivot_table = combined_data.pivot(index='Cust_Id', columns='movieId', values='Rating').fillna(0)
sparse_matrix = csr_matrix(pivot_table.values)

# Step 4: Apply SVD
svd = TruncatedSVD(n_components=50)  # You can adjust the number of components
matrix_svd = svd.fit_transform(sparse_matrix)

# Step 5: Define a function to recommend movies
def recommend_movies(user_id, genre, top_n=10):
    if user_id not in pivot_table.index:
        return f"User ID {user_id} not found in the dataset."
    
    user_index = pivot_table.index.get_loc(user_id)
    
    # Get the user's rating vector from the SVD matrix
    user_ratings_svd = matrix_svd[user_index]
    
    # Compute the cosine similarity between the user and all movie vectors
    similarity = np.dot(matrix_svd, user_ratings_svd)
    
    # Sort the movies by similarity score
    similar_movies = np.argsort(-similarity)
    
    # Filter for movies in the desired genre
    genre_movies = combined_data[combined_data['genres'].str.contains(genre)]
    genre_movie_ids = genre_movies['movieId'].unique()
    
    # Recommend the top N movies in the genre
    recommended_movie_ids = [movie_id for movie_id in similar_movies if movie_id in genre_movie_ids]
    
    return combined_data[combined_data['movieId'].isin(recommended_movie_ids[:top_n])][['movieId', 'title']]

# Example usage
user_id = 1  # Replace with actual user ID
genre = 'Action'  # Replace with desired genre
recommended_movies = recommend_movies(user_id, genre)
if isinstance(recommended_movies, str):
    print(recommended_movies)
else:
    print(f"Recommended movies for user {user_id} in genre {genre}:")
    print(recommended_movies)

# Find the most popular and liked genre
combined_data['Genres'] = combined_data['genres'].apply(lambda x: x.split('|'))
genre_ratings = combined_data.explode('Genres')
genre_avg_ratings = genre_ratings.groupby('Genres')['Rating'].mean().reset_index()
most_popular_genre = genre_avg_ratings.loc[genre_avg_ratings['Rating'].idxmax()]

print(f"Most popular genre: {most_popular_genre['Genres']} with an average rating of {most_popular_genre['Rating']}")

# Find best and worst-rated genres
genre_avg_ratings = genre_ratings.groupby('Genres')['Rating'].mean().reset_index()
best_genre = genre_avg_ratings.loc[genre_avg_ratings['Rating'].idxmax()]
worst_genre = genre_avg_ratings.loc[genre_avg_ratings['Rating'].idxmin()]

print(f"Best-rated genre: {best_genre['Genres']} with an average rating of {best_genre['Rating']}")
print(f"Worst-rated genre: {worst_genre['Genres']} with an average rating of {worst_genre['Rating']}")

User ID 1 not found in the dataset.
Most popular genre: IMAX with an average rating of 3.8263583259982394
Best-rated genre: IMAX with an average rating of 3.8263583259982394
Worst-rated genre: Documentary with an average rating of 3.5094068246122965
