In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds
import dask.dataframe as dd
from scipy.sparse import coo_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

Collaborative Filtering

In [2]:
review_df = pd.read_csv('data/review_data.csv',low_memory=False)
review_df.head()

Unnamed: 0,review_id,user_id,business_id,review_rating
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0


In [3]:
#Split train and test data for colaborative filtering
train_df, test_df = train_test_split(review_df,test_size=0.2,random_state=42)
print(f"Train data size:{train_df.shape}")
print(f"Test data size:{test_df.shape}")

Train data size:(5592224, 4)
Test data size:(1398056, 4)


In [4]:
# Convert to Dask DataFrame
train_ddf = dd.from_pandas(train_df, npartitions=10)

# Map user_id and business_id to indices
user_mapping = {user: idx for idx, user in enumerate(train_ddf['user_id'].unique())}
business_mapping = {business: idx for idx, business in enumerate(train_ddf['business_id'].unique())}

train_ddf['user_idx'] = train_ddf['user_id'].map(user_mapping)
train_ddf['business_idx'] = train_ddf['business_id'].map(business_mapping)

# Create a sparse matrix
train_sparse_matrix = coo_matrix(
    (train_ddf['review_rating'], (train_ddf['user_idx'], train_ddf['business_idx'])),
    shape=(len(user_mapping), len(business_mapping))
)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('user_id', 'float64'))

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('business_id', 'float64'))



In [5]:
# Perform SVD on the sparse matrix
U, sigma, Vt = svds(train_sparse_matrix, k=100)  # k is the number of latent features

# Convert sigma to a diagonal matrix
sigma = np.diag(sigma)

print(f"U matrix shape:{U.shape}")
print(f"sigma matrix shape:{sigma.shape}")
print(f"Vt matrix shape:{Vt.shape}")

U matrix shape:(1746593, 100)
sigma matrix shape:(100, 100)
Vt matrix shape:(100, 150344)


In [6]:
#Mapping the users and business for the test set
test_df['user_idx'] = test_df['user_id'].map(user_mapping).fillna(-1).astype(int)
test_df['business_idx'] = test_df['business_id'].map(business_mapping).fillna(-1).astype(int)

In [7]:
def batch_predict(test, U, sigma, Vt, batch_size=1000):
    predictions = []
    for i in range(0, len(test), batch_size):
        batch = test.iloc[i:i + batch_size]
        batch_ratings = []
        for _, row in batch.iterrows():
            user_idx = row['user_idx']
            business_idx = row['business_idx']
            # Predict rating
            predicted_rating = np.dot(np.dot(U[user_idx], sigma), Vt[:, business_idx])
            batch_ratings.append(predicted_rating)
        predictions.extend(batch_ratings)
    return np.array(predictions)

# Perform batch predictions on the test set
test_predictions = batch_predict(test_df, U, sigma, Vt)

print(test_predictions)

[ 1.02702614e-02 -7.50603855e-08  2.02125104e-05 ...  8.30224959e-02
  4.94407700e-04  8.68592168e-02]


In [8]:
def precision_at_k(true_ratings, predicted_ratings, k):
    """
    Calculates Precision@K.

    Parameters:
    - true_ratings: List of true ratings.
    - predicted_ratings: List of predicted ratings.
    - k: Number of top recommendations to consider.

    Returns:
    - Precision@K value.
    """
    # Get top K indices for predicted ratings
    top_k_indices = np.argsort(predicted_ratings)[-k:]
    # Count relevant items (e.g., rating >= 4) in top K
    relevant = sum(1 for i in top_k_indices if true_ratings[i] >= 4)
    return relevant / k

In [9]:
def mean_average_precision(true_ratings, predicted_ratings):
    """
    Calculates Mean Average Precision (MAP).

    Parameters:
    - true_ratings: List of true ratings.
    - predicted_ratings: List of predicted ratings.

    Returns:
    - MAP value.
    """
    # Sort indices by predicted scores in descending order
    sorted_indices = np.argsort(predicted_ratings)[::-1]
    relevant_count = 0
    precision_sum = 0.0

    for i, idx in enumerate(sorted_indices):
        if true_ratings[idx] >= 4:  # Define "relevant" as ratings >= 4
            relevant_count += 1
            precision_sum += relevant_count / (i + 1)

    return precision_sum / relevant_count if relevant_count > 0 else 0.0

In [10]:
test_true_ratings = test_df['review_rating'].values
# RMSE
rmse = np.sqrt(mean_squared_error(test_true_ratings, test_predictions))
print(f"Test RMSE: {rmse:.4f}")

# Precision@K
k = 5
p_at_k = precision_at_k(test_true_ratings, test_predictions, k)
print(f"Precision@{k}: {p_at_k:.4f}")

# MAP
map_score = mean_average_precision(test_true_ratings, test_predictions)
print(f"Mean Average Precision (MAP): {map_score:.4f}")


Test RMSE: 4.0138
Precision@5: 1.0000
Mean Average Precision (MAP): 0.7178


In [11]:
# Predict ratings for the training set
predicted_ratings = []
for _, row in train_ddf.iterrows():
    user_idx = row['user_idx']
    business_idx = row['business_idx']
    
    # Predict rating using SVD matrices
    predicted_rating = np.dot(np.dot(U[user_idx], sigma), Vt[:, business_idx])
    predicted_ratings.append(predicted_rating)

# Compute RMSE
rmse_train = np.sqrt(mean_squared_error(train_ddf['review_rating'], predicted_ratings))
print(f"Train RMSE: {rmse_train}")



Train RMSE: 3.9378633218042363
