In [89]:
import pandas as pd
import numpy as np
import os 
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
import random
from datetime import datetime
import heapq

In [90]:
def collaborative_filtering_user(R, id, N=30):
    k=30
    row = R.loc[id]
    kNN = NearestNeighbors(n_neighbors = k + 1, algorithm="brute", metric='cosine')
    kNN.fit(R)
    distances, indices = kNN.kneighbors(row.values.reshape(1, -1), return_distance=True)
    similarities = 1 / (distances[0][1:] + 1e-8)
    similarities = similarities / np.sum(similarities)
    user_ratings = row.to_numpy() 
    predicted_ratings = np.zeros_like(user_ratings, dtype='float64')
    for idx, neighbor_idx in enumerate(indices[0][1:]):
        neighbor_ratings = R.iloc[neighbor_idx].to_numpy() 
        predicted_ratings += similarities[idx] * neighbor_ratings
    already_rated = R.loc[id] > 0
    predicted_ratings[already_rated] = 0
    #top_movies_indices = np.argsort(predicted_ratings)[::-1][:k]
    #return [idx for idx in top_movies_indices]
    top_movies_indices = heapq.nlargest(N, range(len(predicted_ratings)), key=lambda i: predicted_ratings[i])
    return [idx for idx in top_movies_indices]

In [91]:
def collaborative_filtering_item(R, id, N=30):
    #item_matrix = pd.DataFrame(cosine_similarity(R.T),
    #                           index=R.columns,
    #                           columns=R.columns)
    item_matrix = cosine_similarity(R.T)
    #item_matrix =  1 - pairwise_distances(R.values.T, metric='jaccard')
    #item_matrix = np.dot(R.T, R)
    #print(user_matrix)
    #print(item_matrix)
    #print(norm, score)
    predicted_ratings = np.dot(item_matrix, R.loc[id].values) / (np.sum(item_matrix, axis=1)+ 1e-8)
    #predictions = pd.Series(predicted_ratings, index=R.columns)
    already_rated = R.loc[id] > 0
    predicted_ratings[already_rated] = 0
    top_movies_indices = heapq.nlargest(N, range(len(predicted_ratings)), key=lambda i: predicted_ratings[i])
    return [idx for idx in top_movies_indices]
    #return predictions[~already_rated].sort_values(ascending=False).head(N)

    

In [92]:
movies = pd.read_csv('../data/ml-latest-small/movies.csv')
ratings = pd.read_csv('../data/ml-latest-small/ratings.csv')

ratings_sorted = ratings.sort_values("timestamp")

training_data = ratings_sorted[pd.to_datetime(ratings_sorted['timestamp'], unit='s')<datetime(2018, 1, 1, 0, 0, 0)]
simulation_data = ratings_sorted[(pd.to_datetime(ratings_sorted['timestamp'], unit='s')>=datetime(2015, 1, 1, 0, 0, 0)) & 
                                 (pd.to_datetime(ratings_sorted['timestamp'], unit='s')<=datetime(2025, 6, 1, 0, 0, 0))]
simulation_users = pd.DataFrame({'userId': list(set(simulation_data['userId'])-set(training_data['userId'])), 
                                 'follow':0,
                                 'total':0})
#simulation_users = pd.DataFrame({'userId': list(set(simulation_data['userId'])), 
#                                 'follow':0,
#                                 'total':0})
simulation_users = simulation_users.set_index('userId')
movieList = sorted(movies['movieId'].unique())
userList = sorted(ratings['userId'].unique())

full_matrix = pd.DataFrame(0.0 , index=userList, columns=movieList)
pivot_matrix = training_data.pivot(index='userId', columns='movieId', values='rating')
full_matrix.update(pivot_matrix, overwrite=True)

print(ratings_sorted.shape)
print(training_data.shape)
print(simulation_data.shape)
print(simulation_users.shape)

siz = simulation_data.shape[0]

simulation_data.head()

(100836, 4)
(94418, 4)
(27935, 4)
(29, 2)


Unnamed: 0,userId,movieId,rating,timestamp
70178,448,105954,3.0,1420150263
68361,443,64957,4.0,1420236273
68353,443,4993,4.5,1420236297
68350,443,2762,3.5,1420236300
68347,443,1704,4.0,1420236319


In [94]:
# setting
k1 = 100
k2 = 15
method = "user" # user or item

i = 1
for _ , row in simulation_data.iterrows():
    i += 1
    user_id = int(row['userId'])
    movie_id = row['movieId']
    rating = row['rating']
    print(f"{round(i/siz*100,2)}%\t",end="\r")
    if user_id in simulation_users.index:
        if method.lower()=="item":
            recommendation = collaborative_filtering_item(full_matrix, user_id,k1)
        else:
            recommendation = collaborative_filtering_user(full_matrix, user_id,k1)
        if movie_id in recommendation:
            simulation_users.loc[user_id, 'follow'] += 1
        simulation_users.loc[user_id, 'total'] += 1
    full_matrix.loc[user_id,movie_id] = rating
simulation_users.head(30)


100.0%	

Unnamed: 0_level_0,follow,total
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
514,4,397
258,0,25
519,0,26
272,0,31
153,1,179
25,0,26
417,0,67
296,0,27
556,0,32
306,0,112


In [None]:
#simulation_users.to_csv("simulation.csv")

In [None]:
# compute tag genome
tag_genome = pd.read_csv('../data/ml-latest-small/genome-scores.csv')
tag_genome.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.032
1,1,2,0.02225
2,1,3,0.07
3,1,4,0.059
4,1,5,0.123


In [95]:
def computeAverageDistance(recommendation):
    # Edge case: fewer than 2 movies
    if len(recommendation) < 2:
        return 0  # No meaningful distance to compute

    total_distance = 0
    num_pairs = 0

    # Pre-filter tag genome data for movies in the recommendation list
    filtered_tags = tag_genome[tag_genome['movieId'].isin(recommendation)].set_index('movieId')

    # Compute pairwise distances for unique movie pairs
    for i, m1 in enumerate(recommendation):
        for m2 in recommendation[i + 1:]:  # Ensure unique pairs (m1 < m2)
            # Extract tag vectors for the two movies
            #print(m1,m2)

            if m2 in filtered_tags.index and m1 in filtered_tags.index:
                vec1 = filtered_tags.loc[m1].values
                vec2 = filtered_tags.loc[m2].values
                    
                # Compute Euclidean distance
                distance = np.linalg.norm(vec1 - vec2)

                # Update totals
                total_distance += distance
                num_pairs += 1

    # Return the average distance
    return total_distance / num_pairs if num_pairs > 0 else 0

In [None]:
#simulation_user = pd.read_csv('simulation_item_with_similarity_1.csv')

In [96]:
def computeDiversity(id):
    if method.lower()=="item":
        recommendation = collaborative_filtering_item(full_matrix, id,k2)
    else:
        recommendation = collaborative_filtering_user(full_matrix, id,k2)
    return computeAverageDistance(recommendation)

simulation_users['userId']=simulation_users.index

simulation_users['diversity'] = simulation_users['userId'].apply(computeDiversity)

simulation_users.head(50)


Unnamed: 0_level_0,follow,total,userId,diversity
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
514,4,397,514,5.758075
258,0,25,258,5.760152
519,0,26,519,5.820309
272,0,31,272,5.224762
153,1,179,153,5.088434
25,0,26,25,5.499204
417,0,67,417,4.989015
296,0,27,296,5.361119
556,0,32,556,5.268702
306,0,112,306,5.917535


In [None]:
#simulation_users.to_csv("simulation_user_100_100_with_similarity.csv")