In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

# Define column names for the datasets
item_columns = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL', 
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

rating_columns = ['user id', 'movie id', 'rating', 'timestamp']

# Read the datasets
movies = pd.read_csv('ml-100k/u.item', sep='|', names=item_columns, encoding='latin-1')
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=rating_columns)

# Merge the datasets on 'movie id'
merged_data = pd.merge(movies, ratings, on='movie id')

# Define categories
categories = [
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

# Create pivot table and replace NaN with 0
datama = ratings.pivot_table(index='user id', columns='movie id', values='rating')
data_matrix = datama.replace(np.nan, 0)

# Calculate pairwise distances
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

# Prediction function
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

# Ensure inputs are NumPy arrays
data_matrix = np.array(data_matrix)
user_similarity = np.array(user_similarity)
item_similarity = np.array(item_similarity)

# Prediction
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

# Convert user similarity table into DataFrame
user_sim_table = pd.DataFrame(user_similarity)

# Replace NaN values in the original data matrix with the user predictions
predicted_data_matrix = datama.fillna(pd.DataFrame(user_prediction, index=datama.index, columns=datama.columns))

# Display the data matrix with predictions
print("Data Matrix with Predictions:")
print(predicted_data_matrix)

# Function to get top 5 movies for a given category
def get_top_movies(category):
    category_movies = merged_data[merged_data[category] == 1]
    category_movie_ratings = category_movies.groupby(['movie id', 'movie title'])['rating'].mean().reset_index()
    top_category_movies = category_movie_ratings.sort_values(by='rating', ascending=False).head(5)
    return top_category_movies

# Get top 5 movies for each category and display
for category in categories:
    top_movies = get_top_movies(category)
    print(f"Top 5 Rated {category} Movies:")
    print(top_movies)
    print("\n")


Data Matrix with Predictions:
movie id      1         2         3         4         5         6     \
user id                                                                
1         5.000000  3.000000  4.000000  3.000000  3.000000  5.000000   
2         4.000000  0.384040  0.196179  0.731538  0.225643  0.003892   
3         1.795904  0.329047  0.158829  0.684154  0.173277 -0.035621   
4         1.729951  0.293913  0.127741  0.644932  0.142143 -0.062261   
5         4.000000  3.000000  0.354422  0.763130  0.359539  0.195987   
...            ...       ...       ...       ...       ...       ...   
939       1.676950  0.346339  0.177518  0.689906  0.199740  0.003297   
940       1.822346  0.419125  0.286430  2.000000  0.294442  0.106633   
941       5.000000  0.275269  0.102195  0.624383  0.133762 -0.069553   
942       1.810363  0.404799  0.275450  0.726616  0.281316  0.087068   
943       1.838431  5.000000  0.384963  0.780521  0.388442  0.240998   

movie id      7         8        