In [1]:
import os

# Repository coordinates: project (repo) name, branch, and owning account.
project_name = "reco-tut-mlh"
branch = "main"
account = "sparsh-ai"

# Directory under /content where the working copy of the project lives.
project_path = os.path.join('/content', project_name)

In [3]:
if not os.path.exists(project_path):
    !cp /content/drive/MyDrive/mykeys.py /content
    import mykeys
    !rm /content/mykeys.py
    path = "/content/" + project_name; 
    !mkdir "{path}"
    %cd "{path}"
    import sys; sys.path.append(path)
    !git config --global user.email "recotut@recohut.com"
    !git config --global user.name  "reco-tut"
    !git init
    !git remote add origin https://"{mykeys.git_token}":x-oauth-basic@github.com/"{account}"/"{project_name}".git
    !git pull origin "{branch}"
    !git checkout main
else:
    %cd "{project_path}"

/content/reco-tut-mlh
Initialized empty Git repository in /content/reco-tut-mlh/.git/
remote: Enumerating objects: 44, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 44 (delta 9), reused 41 (delta 8), pack-reused 0[K
Unpacking objects: 100% (44/44), done.
From https://github.com/sparsh-ai/reco-tut-mlh
 * branch            main       -> FETCH_HEAD
 * [new branch]      main       -> origin/main
Branch 'main' set up to track remote branch 'main' from 'origin'.
Switched to a new branch 'main'


In [None]:
# Show the working-tree status of the repository in the current directory.
!git status

In [None]:
# Stage everything, commit with a fixed message, and push to the configured
# branch; each step runs only if the previous one succeeded (`&&`).
!git add . && git commit -m 'commit' && git push origin "{branch}"

In [4]:
import sys
# Put ./code at the front of the module search path so imports look there
# first (presumably where the local `utils` and `metrics` modules live -- verify).
sys.path.insert(0, './code')

---

# Singular Value Decomposition (SVD & SVD++)

SVD was heavily used in the Netflix Prize competition in 2009. The grand prize of $1,000,000 was won by BellKor's Pragmatic Chaos. SVD uses stochastic gradient descent to attempt to decompose the original sparse matrix into lower-rank user and item factor matrices (matrix factorization). These two matrices are then multiplied together to predict the unknown values in the original sparse matrix.

SVD++ adds a new factor: the effect of implicit information, in addition to the explicit information.

# Imports

In [6]:
!pip install -q surprise

[K     |████████████████████████████████| 11.8 MB 51 kB/s 
[?25h  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone


In [7]:
import os
import pandas as pd
import surprise

from utils import stratified_split
import metrics

# Prepare data

## Load data

In [8]:
# The ratings file is tab-separated and read without a header row, so the
# column names are supplied explicitly.
rating_columns = ['userId', 'movieId', 'rating', 'timestamp']
fp = os.path.join('./data/bronze', 'u.data')
raw_data = pd.read_csv(fp, sep='\t', names=rating_columns)

print(f'Shape: {raw_data.shape}')
# Reproducible random peek at the data (fixed seed).
raw_data.sample(10, random_state=123)

Shape: (100000, 4)


Unnamed: 0,userId,movieId,rating,timestamp
42083,600,651,4,888451492
71825,607,494,5,883879556
99535,875,1103,5,876465144
47879,648,238,3,882213535
36734,113,273,4,875935609
48636,536,213,5,882360704
59566,684,395,2,878762243
44826,608,423,4,880406727
51584,697,628,4,882622016
4368,130,930,3,876251072


## Train test split

In [10]:
# Fraction passed to the splitter as the training share.
train_size = 0.75
train, test = stratified_split(raw_data, 'userId', train_size)

# Report both shapes and whether the two splits cover the same set of users.
print(
    f'Train Shape: {train.shape}',
    f'Test Shape: {test.shape}',
    f'Do they have the same users?: {set(train.userId) == set(test.userId)}',
    sep='\n',
)

Train Shape: (74992, 4)
Test Shape: (25008, 4)
Do they have the same users?: True


# SVD and SVD++

In [11]:
# surprise only accepts dataframes with exactly three columns in the order
# (userid, itemid, rating), so the timestamp column is dropped first.
ratings_only = train.drop('timestamp', axis=1)
reader = surprise.Reader('ml-100k')
surprise_train = surprise.Dataset.load_from_df(ratings_only, reader=reader).build_full_trainset()

# Both models are configured with the same hyperparameters.
shared_params = dict(random_state=0, n_factors=64, n_epochs=10, verbose=True)
svd = surprise.SVD(**shared_params)
svdpp = surprise.SVDpp(**shared_params)
models = [svd, svdpp]

# Train each model on the full training set.
for model in models:
    model.fit(surprise_train)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9


## Recommend

In [12]:
# Score every (user, item) pair from the training data with each model and
# keep only the pairs the user has NOT already rated.

# The candidate users/items and the "already seen" lookup depend only on the
# training data, so they are built once rather than once per model.
users = train['userId'].unique()
items = train['movieId'].unique()

# Marker frame: one row per (user, movie) pair present in train, flagged seen=1.
temp = train[['userId', 'movieId']].copy()
temp['seen'] = 1

all_preds = []
for model in models:
    # Predict ratings for ALL movies for all users.
    predictions = pd.DataFrame(
        [[user, item, model.predict(user, item).est] for user in users for item in items],
        columns=['userId', 'movieId', 'prediction'],
    )

    # Outer join against the marker frame: pairs that were never rated end up
    # with a null 'seen', so keeping only those removes already-seen movies.
    merged = pd.merge(temp, predictions, on=['userId', 'movieId'], how="outer")
    merged = merged[merged['seen'].isnull()].drop('seen', axis=1)

    all_preds.append(merged)

In [13]:
# Number of recommendations kept per user.
TOP_K = 10

# The set of test users is the same for every model, so compute it once.
test_users = set(test['userId'])

recommendations = []
for predictions in all_preds:
    # Users that appear in both the predictions (built from train) and the test set.
    pred_users = set(predictions['userId'])
    common_users = test_users.intersection(pred_users)

    # Filter the test set and the predictions down to those shared users.
    test_common = test[test['userId'].isin(common_users)]
    svd_pred_common = predictions[predictions['userId'].isin(common_users)]

    if pred_users != test_users:
        # Warn, but keep going with the shared users. Skipping the model here
        # would leave `recommendations` shorter than the list of model names it
        # is zipped with in the evaluation cell, silently mislabelling results.
        print('Number of users in train and test are NOT equal')
        print(f"# of users in train and test respectively: {len(pred_users)}, {len(test_users)}")
        print(f"# of users in BOTH train and test: {len(set(svd_pred_common['userId']))}")

    # From the predictions, keep only the top k rows per user.
    # Two stable sorts (prediction descending, then userId ascending) followed
    # by groupby().head() give users in ascending order with their highest
    # predictions first and ties in original row order. This avoids
    # groupby().apply() on a frame that still contains the grouping column,
    # which recent pandas versions deprecate.
    top_movies = (
        svd_pred_common
        .sort_values('prediction', ascending=False, kind='stable')
        .sort_values('userId', kind='stable')
        .groupby('userId', sort=False)
        .head(TOP_K)
        .reset_index(drop=True)
    )
    # 1-based position of each recommendation within its user's list.
    top_movies['rank'] = top_movies.groupby('userId', sort=False).cumcount() + 1

    recommendations.append(top_movies)

# Evaluation metrics

We see how SVD++ performs better than normal SVD in all metrics.

In [14]:
# Metric name -> function from the local `metrics` module; all four are called
# with the same arguments.
metric_functions = {
    'precision': metrics.precision_at_k,
    'recall': metrics.recall_at_k,
    'MAP': metrics.mean_average_precision,
    'NDCG': metrics.ndcg,
}

# Results are paired with model names by position, so `recommendations` must be
# in the same order as the keys below.
model_metrics = {'svd':{}, 'svd++':{}}
for recommendation, model in zip(recommendations, model_metrics):
    # Work on a copy and (re)compute each movie's 1-based rank within its user.
    top_k = recommendation.copy()
    top_k['rank'] = recommendation.groupby('userId', sort=False).cumcount() + 1

    # Evaluate every metric against the held-out test set.
    for metric_name, metric_fn in metric_functions.items():
        model_metrics[model][metric_name] = metric_fn(top_k, test, 'userId', 'movieId', 'rank')

In [15]:
# Print one block per model: a header, the four metrics, then a blank line.
for model, values in model_metrics.items():
    report_lines = [
        f'------ {model} -------',
        f'Precision: {values["precision"]:.6f}',
        f'Recall: {values["recall"]:.6f}',
        f'MAP: {values["MAP"]:.6f} ',
        f'NDCG: {values["NDCG"]:.6f}',
        '',
    ]
    print(*report_lines, sep='\n')

------ svd -------
Precision: 0.093531
Recall: 0.033000
MAP: 0.011672 
NDCG: 0.092656

------ svd++ -------
Precision: 0.108271
Recall: 0.038600
MAP: 0.015655 
NDCG: 0.114023

