# Recommender Systems Baseline

Notebook covers the user-based KNN and SVD recommender systems to compare against the Social Media Recommender Systems

In [30]:
import numpy as np
import pandas as pd
import json
from surprise import Dataset, Reader, KNNBasic, SVD, AlgoBase, KNNBaseline
from surprise.model_selection import cross_validate, train_test_split, LeaveOneOut
from collections import Counter

## Data loading and exploration

In [2]:
df = pd.read_csv('./data/user-items-ratings-improved.csv', 
                 names=['user_id', 'repo_id', 'rating'], 
                 skiprows=1)
df.head()

Unnamed: 0,user_id,repo_id,rating
0,0x00evil,SwiftGGTeam/the-swift-programming-language-in-...,2
1,0x00evil,atom/atom,3
2,0x00evil,capistrano/capistrano,3
3,0x00evil,git/git,3
4,0x00evil,golang/go,3


In [3]:
df[(df['user_id'] == '0x00evil') & df['rating']]

Unnamed: 0,user_id,repo_id,rating
1,0x00evil,atom/atom,3
2,0x00evil,capistrano/capistrano,3
3,0x00evil,git/git,3
4,0x00evil,golang/go,3
6,0x00evil,rails/rails,1
10,0x00evil,torvalds/linux,1


In [4]:
n_users = df.user_id.unique().shape[0]
n_repos = df.repo_id.unique().shape[0]

print(f'Total obs: {df.shape[0]}')
print(f'Total users: {n_users}')
print(f'Total repos: {n_repos}')

Total obs: 9184
Total users: 1162
Total repos: 272


## Prep data for Rec Sys

Data has already been loaded in as a sparse matrix, we collect the per repo rankings, that is those repositories with most stars in order.

In [5]:
repo_rankings = df.groupby(['repo_id']).rating.sum().sort_values(ascending=False)
repo_rankings.head()

repo_id
twbs/bootstrap                            340
EbookFoundation/free-programming-books    296
nodejs/node-v0.x-archive                  224
d3/d3                                     221
sindresorhus/awesome                      204
Name: rating, dtype: int64

In [6]:
# Alter the ratings? - this has to be done to compute cosine sims, the distance remains the same so it shouldnt be a problem
temp = df.copy()
# temp['rating'] = temp['rating']+1

temp['user_id'] = temp['user_id'].astype('category')
temp['repo_id'] = temp['repo_id'].astype('category')

# ulabels, ulevels = pd.factorize(temp['user_id'])
# rlabels, rlevels = pd.factorize(temp['repo_id'])
temp = pd.DataFrame({
    'user_name': temp['user_id'],
    'user_id': temp['user_id'].cat.codes,
    'repo_name': temp['repo_id'],
    'repo_id': temp['repo_id'].cat.codes,
    'rating': temp['rating']
})

temp.head()

Unnamed: 0,user_name,user_id,repo_name,repo_id,rating
0,0x00evil,0,SwiftGGTeam/the-swift-programming-language-in-...,28,2
1,0x00evil,0,atom/atom,53,3
2,0x00evil,0,capistrano/capistrano,64,3
3,0x00evil,0,git/git,110,3
4,0x00evil,0,golang/go,115,3


# CUSTOM RECOMMENDER

Open triads recommender here.

In [40]:
class FollowingOnlyAlgorithm(AlgoBase):
    """This rec algo takes in the repositories that a followed users have starred at gives a 2.0 if present else 1.0"""
    
    def __init__(self):
        AlgoBase.__init__(self)
        

    def estimate(self, u, i):
        """Update to previous, now we only calc those users we know"""
        if self.trainset.knows_user(u) and self.trainset.knows_item(i):
            raw_uid = self.trainset.to_raw_uid(u)
            raw_iid = self.trainset.to_raw_iid(i)
            
            # Extract the original user name and repo name
            uname = list(temp[temp['user_id'] == raw_uid].user_name)[0]
            rname = list(temp[temp['repo_id'] == raw_iid].repo_name)[0]
            following_repos = self.get_following_set(uname)
            
            # If the user is following someone that stars the current repo (raw_iid) then return 1, else 0
            if rname in following_repos:
                return np.clip(following_repos[uname], 1, 3)
            else:
                return 0
        
        return self.trainset.global_mean

    
    def get_following_set(self, uname):
        """Finds the egonet of following for a given user and returns the set of those"""
        curr_following = None
        try:
            
            with open('./data/egonets/following/'+uname+'.json') as fp:
                curr_following = json.load(fp)
        except FileNotFoundError:
            print(f'Error: unable to find file for {raw_uid}.')
            return set()
        except Exception as e:
            print(f'ERROR: failed on raw uid: {raw_uid}')
            print(e)
            return set()


        following_repos = []
        for f in curr_following['nodes']:
            repos = f['starredRepositories']['nodes']
            following_repos += repos

        following_repos = Counter([repo['nameWithOwner'] for repo in following_repos])
        return following_repos


## Model Evaluation

> Note the below code was derived and adapted from COSC2933.


In [42]:
import itertools
from surprise import accuracy
from collections import defaultdict


class EData:
    def __init__(self, data, rankings):
        # Generate entire training set for evaluating properties 
        self.full_trainset = data.build_full_trainset()
        self.full_anti_testset = self.full_trainset.build_anti_testset()
        
        # Set rankings
        self.rankings = rankings
        
        # Generate a train (75) / test (25) split for measuring accuracy
        self.trainset, self.testset = train_test_split(data, test_size=0.25, random_state=1)
        
        # Create leave-one-out train/test split for eval of top-N recs
        # and we create an anti-test-set for generating predictions
        LOOCV = LeaveOneOut(n_splits=1, random_state=1)
        for train, test in LOOCV.split(data):
            self.LOOCVTrain = train
            self.LOOCVTest = test
        self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()
        
        # Now compute sim matrix between users to measure diversity
        self.simAlgo = KNNBaseline(
            sim_options={'name': 'cosine', 'user_based': True}
        )
        self.simAlgo.fit(self.full_trainset)
        
        
    def get_full_trainset(self):
        return self.full_trainset
    
    def get_full_anti_testset(self):
        return self.full_anti_testset
    
    def get_trainset(self):
        return self.trainset
    
    def get_testset(self):
        return self.testset
    
    def get_LOOCV_testset(self):
        return self.LOOCVTest
    
    def get_LOOCV_trainset(self):
        return self.LOOCVTrain
    
    def get_LOOCV_anti_testset(self):
        return self.LOOCVAntiTestSet
    
    def get_rankings(self):
        return self.rankings
    
    def get_sims(self):
        return self.simAlgo

class Algorithm:
    
    def __init__(self, name, algorithm):
        self.name = name
        self.algo = algorithm
        
    def evaluate(self, eval_data):
        metrics = {}
        
        # Compute accuracy
        print("Evaluating accuracy...")
        self.algo.fit(eval_data.get_trainset())
        preds = self.algo.test(eval_data.get_testset())
        metrics['RMSE'] = Metrics.RMSE(preds)
        metrics['MAE'] = Metrics.MAE(preds)
        
        # Eval top-10 via leave-one-out
        print("Evaluating top-10 with LOOCV..")
        self.algo.fit(eval_data.get_LOOCV_trainset())
        lo_preds = self.algo.test(eval_data.get_LOOCV_testset())
            
        # Generate preds for ratings not in training
        all_preds = self.algo.test(eval_data.get_LOOCV_anti_testset())
      
        # Get top 10 recs per user
        topN_preds = Metrics.get_topN(all_preds, 10)
        
        print('Evaluating rank metrics...')
        # Hit-rate - how often a repo that the user liked was recommended
        metrics['HR'] = Metrics.hit_rate(topN_preds, lo_preds)
        
        # Cumulative-hit-rate
        metrics['CHR'] = Metrics.cumulative_hit_rate(topN_preds, lo_preds)

        metrics["ARHR"] = Metrics.avg_reciprocal_hit_rate(topN_preds, lo_preds)

        print('Computing recs with complete dataset...')
        self.algo.fit(eval_data.get_full_trainset())
        all_preds = self.algo.test(eval_data.get_full_anti_testset())
        topN_preds = Metrics.get_topN(all_preds, 10)
        
        metrics['Coverage'] = Metrics.user_coverage(topN_preds, eval_data.get_full_trainset().n_users)
        metrics['Diversity'] = Metrics.diversity(topN_preds, eval_data.get_sims())
        metrics['Novelty'] = Metrics.novelty(topN_preds, eval_data.get_rankings())
        
        print('Done.')
        
        return metrics
        
        
class Evaluator:
    algorithms = []
    
    def __init__(self, data, rankings):
        self.data = EData(data, rankings)
    
    def add_algorithm(self, name, algorithm):
        self.algorithms.append(Algorithm(name, algorithm))
    
    def evaluate(self):
        results = {}
        for algo in self.algorithms:
            print(f'Evaluating {algo.name}...')
            results[algo.name] = algo.evaluate(self.data)
        
        # Display results
        print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format("Algorithm", "RMSE", "MAE", "HR", "CHR", "ARHR", "Coverage", "Diversity", "Novelty" ))

        for name, metrics in results.items():
                print("{:<10} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
                        name, metrics["RMSE"], metrics["MAE"], metrics["HR"], metrics["CHR"],  metrics["ARHR"], metrics["Coverage"], metrics["Diversity"], metrics["Novelty"]))
     
    
    
class Metrics:
    
    def MAE(predictions):
        return accuracy.mae(predictions, verbose=False)
    
    def RMSE(predictions):
        return accuracy.rmse(predictions, verbose=False)
    
    def get_topN(predictions, n=10):
        top_n = defaultdict(list)
        for uid, iid, actual, estimated, _ in predictions:
            if (estimated >= 1.0): # >1 is min rating
                top_n[int(uid)].append((int(iid), estimated))
                
        for uid, ratings in top_n.items():
            ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[int(uid)] = ratings[:n]
        
        return top_n
        
    def hit_rate(top_n_preds, left_out_preds):
        """Determine the hit-rate (how good) of top-N list"""
        hits = 0
        total = 0
        for lo_pred in left_out_preds:
            lo_uid = lo_pred[0]
            lo_iid = lo_pred[1]
            
            # Check if in top 10
            is_hit = False
            for tn_iid, tn_pred in top_n_preds[int(lo_uid)]:
                if (int(lo_iid) == int(tn_iid)):
                    is_hit = True
                    break
            if is_hit:
                hits += 1
            total += 1
            
        precision = hits / total
        return precision
            
    
    def cumulative_hit_rate(topN_preds, left_out_preds):
        hits = 0
        total = 0
        for lo_uid, lo_iid, actual, estimated, _ in left_out_preds:
            if (actual >= 1.0): # Only look at things the user starred
                is_hit = False
                for tn_iid, tn_pred in topN_preds[int(lo_uid)]:
                    if (int(lo_iid) == tn_iid):
                        is_hit = True
                        break
                if is_hit:
                    hits += 1
                total += 1
        precision = hits/total
        return precision
    
    def avg_reciprocal_hit_rate(topN_preds, left_out_preds):
        S = 0
        total = 0
        for lo_uid, lo_iid, actual, estimated, _ in left_out_preds:
            hit_rank = 0
            rank = 0
            for tn_iid, tn_pred in topN_preds[int(lo_uid)]:
                rank += 1
                if (int(lo_iid) == tn_iid):
                    hit_rank = rank
                    break
            if hit_rank > 0:
                S += 1.0 / hit_rank
            total += 1
        return S / total
    
    def user_coverage(topN_preds, num_users):
        # Calc the percentage of users that have at least 1 good rec
        hits = 0
        for tn_uid in topN_preds.keys():
            is_hit = False
            for tn_iid, tn_pred in topN_preds[tn_uid]:
                if tn_pred >= 2.0: # Todo is 1 correct number to use??
                    is_hit = True
                    break
            if is_hit:
                hits += 1
        return hits / num_users
    
    
    # TODO: NOVELTY AND DIVERSITY ARE NOT WORKING -- most likely due to topN_preds not working??
    def diversity(topN_preds, sim_algorithm):
        n = 0
        total = 0
        mat = sim_algorithm.compute_similarities()
        for uid in topN_preds.keys():
            pairs = itertools.combinations(topN_preds[uid], 2)
            for pair in pairs:
                repo1 = pair[0][0]
                repo2 = pair[1][0]
                if sim_algorithm.trainset.knows_item(repo1) and sim_algorithm.trainset.knows_item(repo2):
                    inner_id1 = sim_algorithm.trainset.to_inner_iid(repo1) # used to be str(repo1)
                    inner_id2 = sim_algorithm.trainset.to_inner_iid(repo2)
                sim = mat[inner_id1][inner_id2]
                total += sim
                n += 1
        return 1 - (total / n) 
    
    def novelty(topN_preds, rankings):
        n = 0
        total = 0
        for uid in topN_preds.keys():
            for rating in topN_preds[uid]:
                iid = rating[0]
                rank = rankings[iid]
                total += rank
                n += 1
        return total / n

In [45]:
# Load and process our data
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(temp[['user_id', 'repo_id', 'rating']], reader)

# setup an evaluator
evaluator = Evaluator(data, temp.groupby(['repo_id']).rating.sum().sort_values(ascending=False))

evaluator.algorithms = []

# USER-BASED KNN
user_knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})

# SVD
svd = SVD()

# Add the algorithms
evaluator.add_algorithm('User KNN', user_knn)
evaluator.add_algorithm('SVD', svd)

# Following Only
evaluator.add_algorithm('Following Only', FollowingOnlyAlgorithm())

evaluator.algorithms


Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


[<__main__.Algorithm at 0x124face20>,
 <__main__.Algorithm at 0x120dc9ca0>,
 <__main__.Algorithm at 0x120dc9370>]

In [46]:
# Run the evals
evaluator.evaluate()

Evaluating User KNN...
Evaluating accuracy...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating top-10 with LOOCV..
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating rank metrics...
Computing recs with complete dataset...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done.
Evaluating SVD...
Evaluating accuracy...
Evaluating top-10 with LOOCV..
Evaluating rank metrics...
Computing recs with complete dataset...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done.
Evaluating Following Only...
Evaluating accuracy...
Evaluating top-10 with LOOCV..
Evaluating rank metrics...
Computing recs with complete dataset...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done.

Algorithm  RMSE       MAE        HR         CHR        ARHR       Coverage   Diversity  Nove