In [16]:
import pandas as pd
import scipy
from scipy.sparse import csr_matrix, coo_matrix
from tqdm import tqdm
import numpy as np
import implicit
from sklearn.model_selection import KFold
import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError

In [2]:
# Load the ratings table and cast every column to an integer dtype.
# NOTE(review): `.astype(int)` truncates fractional values, so if the CSV
# stores half-star ratings (e.g. 3.5) they become 3 here, and a 0.5 rating
# becomes 0 - confirm this is intended.
rating_df = pd.read_csv('Dataset/ratings.csv')
rating_df = rating_df.astype(int)
rating_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3,1112486027
1,1,29,3,1112484676
2,1,32,3,1112484819
3,1,47,3,1112484727
4,1,50,3,1112484580
...,...,...,...,...
20000258,138493,68954,4,1258126920
20000259,138493,69526,4,1259865108
20000260,138493,69644,3,1260209457
20000261,138493,70286,5,1258126944


In [3]:
def create_sparse_matrix(data):
    """Build a user x movie CSR matrix from the ``normalized_rating`` column.

    The frame is pivoted into a dense table first (one row per ``userId``,
    one column per ``movieId``); missing user/movie pairs are filled with 0
    before the table is converted to sparse form.
    """
    dense_table = data.pivot(index='userId', columns='movieId', values='normalized_rating')
    dense_table = dense_table.fillna(0)
    return csr_matrix(dense_table.values)

def normalize_data(data):
    """Mean-centre each user's ratings.

    Returns a tuple ``(centered, per_user_mean)``:

    * ``centered`` - a new frame in which ``rating`` has been replaced by
      ``normalized_rating`` (rating minus that user's mean rating);
    * ``per_user_mean`` - a frame with columns ``userId`` and ``mean_rating``.

    The input frame is not modified.
    """
    per_user_mean = (
        data.groupby('userId')['rating']
        .mean()
        .reset_index(name='mean_rating')
    )
    merged = data.merge(per_user_mean, on='userId')
    merged['normalized_rating'] = merged['rating'] - merged['mean_rating']
    centered = merged.drop(columns=['rating', 'mean_rating'], errors='ignore')
    return centered, per_user_mean

In [4]:
def create_hash_vectors(length: int, count: int, seed=None) -> list:
    """Draw ``count`` random projection vectors of dimension ``length``.

    Each vector has components sampled uniformly from [-1, 1). ``create_hash``
    uses the sign of the dot product with each vector as one hash bit.

    Parameters
    ----------
    length : int
        Dimension of every vector (the number of item columns).
    count : int
        Number of vectors, i.e. number of hash bits.
    seed : int, numpy.random.Generator or None, optional
        When given, the vectors are drawn from ``np.random.default_rng(seed)``
        so the same seed always yields the same vectors. When ``None``
        (default) NumPy's global random state is used, as before.

    Returns
    -------
    list of numpy.ndarray
        ``count`` arrays of shape ``(length,)``.
    """
    if seed is None:
        # Legacy path: draws depend on the global NumPy RNG state.
        return [np.random.uniform(-1, 1, size=length) for _ in range(count)]

    rng = np.random.default_rng(seed)
    return [rng.uniform(-1, 1, size=length) for _ in range(count)]
        
def create_hash(vector: list, min_hash):
    """Return one bit per hash vector: 1 if its dot product with ``vector``
    is strictly positive, otherwise 0.

    ``min_hash`` is the iterable of projection vectors; the result is a list
    of Python ints in the same order.
    """
    return [1 if np.dot(vector, plane) > 0 else 0 for plane in min_hash]

In [5]:
def calculate_bucket_number(hash_vector):
    """Interpret a sequence of 0/1 bits (most significant first) as an integer.

    The bits are concatenated into a binary string and parsed with base 2.
    """
    binary_digits = ''.join(map(str, hash_vector))
    return int(binary_digits, 2)

In [6]:
def create_neighborhood_matrix(new_user_vector, buckets, min_hash, sparse_matrix) -> csr_matrix:
    """Collect the rating rows of all users that share the new user's bucket.

    Parameters
    ----------
    new_user_vector : array-like
        Rating vector of the user to find neighbours for.
    buckets : dict
        Maps a bucket number to the list of row indices of ``sparse_matrix``
        that were hashed into that bucket.
    min_hash : iterable of array-like
        Projection vectors passed to ``create_hash``.
    sparse_matrix : csr_matrix
        User x item rating matrix the bucket indices refer to.

    Returns
    -------
    csr_matrix
        One row per neighbour (in bucket order) with ``sparse_matrix.shape[1]``
        columns; a matrix with zero rows when the bucket is missing or empty.
    """
    hash_vector = create_hash(new_user_vector, min_hash)
    bucket_number = calculate_bucket_number(hash_vector)

    neighbor_users = buckets.get(bucket_number)
    if neighbor_users is None or len(neighbor_users) == 0:
        return csr_matrix((0, sparse_matrix.shape[1]))

    # Select the neighbour rows directly in sparse form. The previous version
    # converted each row to a dense array and stacked them, which allocates
    # len(bucket) x n_items dense values only to sparsify them again.
    neighborhood = csr_matrix(sparse_matrix[list(neighbor_users)])
    # Row indexing keeps explicitly stored zeros; drop them so the result
    # matches what the dense round-trip used to produce.
    neighborhood.eliminate_zeros()
    return neighborhood

In [7]:
# def matrix_factorization(sparse_ratings: csr_matrix):
#     model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=64, use_native=True, num_threads=0)
#     model.fit(sparse_ratings.T, show_progress=False)
#     user_factors = model.user_factors
#     item_factors = model.item_factors
#     return item_factors @ user_factors.T

def matrix_factorization(sparse_ratings: csr_matrix, k):
    """Return the dense rank-``k`` truncated-SVD reconstruction of a matrix.

    Parameters
    ----------
    sparse_ratings : csr_matrix
        Matrix to approximate (rows x columns).
    k : int
        Number of singular triplets to keep. ``svds`` requires
        ``1 <= k < min(sparse_ratings.shape)``.

    Returns
    -------
    numpy.ndarray
        Dense array with the same shape as ``sparse_ratings`` equal to
        ``U @ diag(S) @ Vt`` for the ``k`` computed singular triplets.
    """
    # Imported here so the function does not depend on `scipy.sparse.linalg`
    # having been loaded as a side effect of another import.
    from scipy.sparse.linalg import svds

    left_vectors, singular_values, right_vectors_t = svds(sparse_ratings, k=k)

    # Scaling the columns of U by S is the same as U @ diag(S) without
    # building the k x k diagonal matrix. The product is computed once
    # (the original evaluated the identical dense product twice).
    row_factors = left_vectors * singular_values
    return row_factors @ right_vectors_t

In [8]:
def pearson_correlation_coefficient(array1, array2):
    """Pearson correlation between two equally long numeric arrays.

    Uses population (divide-by-n) covariance and standard deviations.
    Returns 0 when either array has zero variance.

    Raises
    ------
    ValueError
        If the two arrays differ in length.
    """
    n = len(array1)
    if n != len(array2):
        raise ValueError("Arrays must be of the same length")

    deviations1 = array1 - np.mean(array1)
    deviations2 = array2 - np.mean(array2)

    covariance = np.sum(deviations1 * deviations2) / n
    spread1 = np.sqrt(np.sum(deviations1**2) / n)
    spread2 = np.sqrt(np.sum(deviations2**2) / n)

    denominator = spread1 * spread2
    if denominator == 0:
        # Correlation is undefined for a constant array; report "no correlation".
        return 0

    return covariance / denominator

In [9]:
def predict_ratings(user_vector, similar_users_count, buckets, min_hash, sparse_matrix):
    """Predict ratings for the items a user has already rated.

    The user's LSH bucket supplies a neighbourhood matrix, which is smoothed
    with a truncated SVD. Each neighbour is weighted by the absolute Pearson
    correlation between its reconstructed row and ``user_vector``. For every
    item with a nonzero entry in ``user_vector`` the prediction is the
    similarity-weighted average of the reconstructed values of the
    ``similar_users_count`` most similar neighbours that have a nonzero value
    for that item, plus the mean of the user's nonzero ratings.

    Parameters
    ----------
    user_vector : numpy.ndarray
        Dense rating vector of the user; 0 marks an unrated item.
    similar_users_count : int
        Maximum number of neighbours contributing to each item.
    buckets, min_hash, sparse_matrix
        Passed through to ``create_neighborhood_matrix``.

    Returns
    -------
    numpy.ndarray
        One value per item column; positions the user has not rated are 0.
        An empty array is returned when the neighbourhood is too small for a
        truncated SVD (fewer than two rows or columns).

    Notes
    -----
    NOTE(review): similarities are taken in absolute value, so negatively
    correlated neighbours contribute with a positive weight - confirm that
    this is intended.
    """
    nm = create_neighborhood_matrix(user_vector, buckets, min_hash, sparse_matrix)

    # svds needs 1 <= k < min(shape); with k = max(1, min_shape // 2) that is
    # only possible when the smaller dimension is at least 2.
    min_shape = min(nm.shape)
    if min_shape < 2:
        return np.array([])
    k = max(1, int(0.5 * min_shape))
    neighborhood_matrix = matrix_factorization(nm, k)

    user_vector = np.asarray(user_vector)
    similarities = np.array([
        np.abs(pearson_correlation_coefficient(neighborhood_matrix[i], user_vector))
        for i in range(neighborhood_matrix.shape[0])
    ])
    # Neighbour indices from most to least similar.
    ranking = similarities.argsort()[::-1]

    predictions = np.zeros(neighborhood_matrix.shape[1], dtype=float)
    rated_items = np.flatnonzero(user_vector)
    if rated_items.size == 0:
        return predictions
    user_mean = user_vector[rated_items].mean()

    # Work only on the columns that receive a prediction:
    # rows are neighbours in ranking order, columns are the rated items.
    ranked_ratings = neighborhood_matrix[np.ix_(ranking, rated_items)]
    ranked_weights = similarities[ranking]

    # For each item keep the first `similar_users_count` neighbours (in
    # ranking order) whose reconstructed value is nonzero.
    has_value = ranked_ratings != 0
    selected = has_value & (np.cumsum(has_value, axis=0) <= similar_users_count)
    weights = np.where(selected, ranked_weights[:, None], 0.0)

    weighted_sum = (ranked_ratings * weights).sum(axis=0)
    weight_total = weights.sum(axis=0)
    # Items without any usable neighbour fall back to the user's mean alone.
    offsets = np.divide(
        weighted_sum,
        weight_total,
        out=np.zeros_like(weighted_sum, dtype=float),
        where=weight_total > 0,
    )

    predictions[rated_items] = offsets + user_mean
    return predictions

In [10]:
def df_to_sparse(rating_df, df, key= 'normalized_rating'):
    """Place the ``key`` column of ``df`` into a user x movie CSR matrix.

    Row and column positions come from the sorted unique ``userId`` and
    ``movieId`` values of ``rating_df``, so the result always has one row per
    user and one column per movie of ``rating_df`` even when ``df`` only
    covers some of them. The shape and the number of nonzero entries are
    printed before the matrix is returned.
    """
    def _position_lookup(column):
        # Sorted unique ids -> consecutive positions 0..n-1.
        ordered_ids = pd.Series(rating_df[column].unique()).sort_values()
        return pd.Series(index=ordered_ids, data=range(len(ordered_ids)))

    user_mapping = _position_lookup('userId')
    movie_mapping = _position_lookup('movieId')

    row_positions = df['userId'].map(user_mapping)
    col_positions = df['movieId'].map(movie_mapping)

    matrix = coo_matrix(
        (df[key], (row_positions, col_positions)),
        shape=(len(user_mapping), len(movie_mapping)),
    )
    print(matrix.shape)
    print(matrix.count_nonzero())
    return matrix.tocsr()

In [19]:
# User-level K-fold cross-validation: users (not individual ratings) are split
# into folds, neighbours are looked up among the training users only, and the
# loss is the mean squared error on each validation user's own ratings.
unique_users = rating_df['userId'].unique()
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

N_HASH_BITS = 14                  # number of projection vectors -> 2**14 possible buckets
N_SIMILAR_USERS = 20              # neighbours contributing to each predicted rating
PREDICTION_TIMEOUT_SECONDS = 1    # per-user budget before the user is skipped
PAUSE_SECONDS = 10                # NOTE(review): purpose of these pauses is not evident; set to 0 to skip
HASH_SEED = 42                    # makes the random projection vectors reproducible

total_loss = 0
evaluated_folds = 0
for fold, (train_index, val_index) in enumerate(kf.split(unique_users), start=1):
    time.sleep(PAUSE_SECONDS)
    train_users = unique_users[train_index]
    val_users = unique_users[val_index]

    # Filter data based on split users
    train_data = rating_df[rating_df['userId'].isin(train_users)]
    val_data = rating_df[rating_df['userId'].isin(val_users)]

    # Training ratings are mean-centred per user; validation ratings stay raw.
    # Both matrices are indexed by the ids of the full `rating_df`, so they
    # share one shape and rows of users outside the respective split are empty.
    train_data, mean_ratings_train = normalize_data(train_data)
    train_sparse_matrix = df_to_sparse(rating_df, train_data)
    val_sparse_matrix = df_to_sparse(rating_df, val_data, 'rating')

    # Create hash vectors (seeded per fold so reruns give the same buckets).
    np.random.seed(HASH_SEED + fold)
    min_hash = create_hash_vectors(train_sparse_matrix.shape[1], N_HASH_BITS)

    # Bucket only rows that hold training ratings. Rows of validation users
    # are empty in the training matrix; an all-zero row hashes to bucket 0,
    # so including them would fill that bucket with useless neighbours.
    rows_with_ratings = np.flatnonzero(np.diff(train_sparse_matrix.indptr)).tolist()
    buckets = {}
    for i in tqdm(rows_with_ratings, desc='Bucketing'):
        user = train_sparse_matrix[i].toarray()[0]
        bucket_number = calculate_bucket_number(create_hash(user, min_hash))
        buckets.setdefault(bucket_number, []).append(i)

    # Rows of the validation matrix that contain at least one nonzero rating.
    desire_indices = [
        i for i in range(val_sparse_matrix.shape[0])
        if val_sparse_matrix[i].count_nonzero() > 0
    ]

    time.sleep(PAUSE_SECONDS)

    loss = 0
    processed_count = 0

    def evaluate_with_timeout(i):
        """Return ``(mse, 1)`` for validation row ``i`` or ``(None, 0)`` if it is skipped."""
        # The executor is shut down with wait=False: leaving a `with` block
        # would wait for the worker to finish, so a timed-out prediction
        # would still block for its full duration. A running thread cannot
        # be killed; after a timeout it finishes in the background and its
        # result is discarded.
        executor = ThreadPoolExecutor(max_workers=1)
        try:
            user_vector = val_sparse_matrix[i].toarray()[0]
            future = executor.submit(
                predict_ratings, user_vector, N_SIMILAR_USERS, buckets, min_hash, train_sparse_matrix
            )
            predicted = future.result(timeout=PREDICTION_TIMEOUT_SECONDS)
            if len(predicted) == 0:
                return None, 0
            mask = user_vector != 0
            actual = user_vector[mask]
            pred = predicted[mask]
            user_loss = np.mean((actual - pred) ** 2)
            return user_loss, 1
        except TimeoutError:
            print(f"Timeout occurred for index {i}")
            return None, 0
        except Exception as e:
            # Best effort: one failing user must not abort the whole fold.
            print(f'Error in {i} -> {e}')
            return None, 0
        finally:
            executor.shutdown(wait=False)

    for i in tqdm(desire_indices, desc='Evaluating'):
        user_loss, count = evaluate_with_timeout(i)
        if user_loss is not None:
            loss += user_loss
            processed_count += count

    if processed_count == 0:
        # Nothing to average; avoid a ZeroDivisionError and leave the fold out.
        print(f'Fold {fold}: no validation user could be evaluated, fold skipped')
        continue

    loss /= processed_count
    total_loss += loss
    evaluated_folds += 1

    print(f'Validation Loss: {loss}')

# Average over the folds that produced a loss (all of them in the normal case).
average_loss = total_loss / evaluated_folds if evaluated_folds else float('nan')
print(f'Average Cross-Validation Loss: {average_loss}')