In [1]:
# importing libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
# Define column names for the datasets
item_columns = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL', 
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

rating_columns = ['user id', 'movie id', 'rating', 'timestamp']

# Read the datasets
movies = pd.read_csv('ml-100k/u.item', sep='|', names=item_columns, encoding='latin-1')
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=rating_columns)

# Merge the datasets on 'movie id'
merged_data = pd.merge(movies, ratings, on='movie id')

# Define categories
categories = [
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]


In [3]:
# Create pivot table and replace NaN with 0
datama = ratings.pivot_table(index='user id', columns='movie id', values='rating')
data_matrix = datama.replace(np.nan, 0)

# Display the data matrix
print("Data Matrix:")
print(data_matrix)


Data Matrix:
movie id  1     2     3     4     5     6     7     8     9     10    ...  \
user id                                                               ...   
1          5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2          4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0  ...   
3          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
5          4.0   3.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
...        ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
939        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   5.0   0.0  ...   
940        0.0   0.0   0.0   2.0   0.0   0.0   4.0   5.0   3.0   0.0  ...   
941        5.0   0.0   0.0   0.0   0.0   0.0   4.0   0.0   0.0   0.0  ...   
942        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
943        0.0   5.0   0.0   0.0   0.0   0.0   0.0   0.0   3.0 

In [4]:
# Calculate user and item similarities using cosine distances
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

# Display the similarities
print("User Similarity Matrix:")
print(user_similarity)
print("\nItem Similarity Matrix:")
print(item_similarity)

User Similarity Matrix:
[[2.22044605e-15 8.33069016e-01 9.52540457e-01 ... 8.51383057e-01
  8.20492117e-01 6.01825261e-01]
 [8.33069016e-01 0.00000000e+00 8.89408675e-01 ... 8.38515222e-01
  8.27732187e-01 8.94202122e-01]
 [9.52540457e-01 8.89408675e-01 0.00000000e+00 ... 8.98757435e-01
  8.66583851e-01 9.73444131e-01]
 ...
 [8.51383057e-01 8.38515222e-01 8.98757435e-01 ... 0.00000000e+00
  8.98358201e-01 9.04880419e-01]
 [8.20492117e-01 8.27732187e-01 8.66583851e-01 ... 8.98358201e-01
  0.00000000e+00 8.17535338e-01]
 [6.01825261e-01 8.94202122e-01 9.73444131e-01 ... 9.04880419e-01
  8.17535338e-01 0.00000000e+00]]

Item Similarity Matrix:
[[0.00000000e+00 5.97617822e-01 6.69755213e-01 ... 1.00000000e+00
  9.52816933e-01 9.52816933e-01]
 [5.97617822e-01 0.00000000e+00 7.26930825e-01 ... 1.00000000e+00
  9.21700637e-01 9.21700637e-01]
 [6.69755213e-01 7.26930825e-01 2.22044605e-16 ... 1.00000000e+00
  1.00000000e+00 9.03124947e-01]
 ...
 [1.00000000e+00 1.00000000e+00 1.00000000e+00 ..

In [5]:
# Function to get top 5 movies for a given category
def get_top_movies(category):
    category_movies = merged_data[merged_data[category] == 1]
    category_movie_ratings = category_movies.groupby(['movie id', 'movie title'])['rating'].mean().reset_index()
    top_category_movies = category_movie_ratings.sort_values(by='rating', ascending=False).head(5)
    return top_category_movies

# Get top 5 movies for each category and display
for category in categories:
    top_movies = get_top_movies(category)
    print(f"Top 5 Rated {category} Movies:")
    print(top_movies)
    print("\n")

Top 5 Rated Action Movies:
    movie id                      movie title    rating
11        50                 Star Wars (1977)  4.358491
28       127            Godfather, The (1972)  4.283293
38       174   Raiders of the Lost Ark (1981)  4.252381
80       313                   Titanic (1997)  4.245714
36       172  Empire Strikes Back, The (1980)  4.204360


Top 5 Rated Adventure Movies:
     movie id                      movie title    rating
120      1293                  Star Kid (1997)  5.000000
5          50                 Star Wars (1977)  4.358491
24        174   Raiders of the Lost Ark (1981)  4.252381
65        511        Lawrence of Arabia (1962)  4.231214
22        172  Empire Strikes Back, The (1980)  4.204360


Top 5 Rated Animation Movies:
    movie id                                        movie title    rating
13       408                              Close Shave, A (1995)  4.491071
8        169                         Wrong Trousers, The (1993)  4.466102
7        