In [None]:
# default_exp models.jaccard_similarity

# Jaccard Similarity
> Implementation of a Jaccard-similarity-based item-to-item recommender model.

In [None]:
#export
import pandas as pd
import operator

In [None]:
#export
class JaccardSimilarity:
    """Item-to-item recommender based on the Jaccard index.

    Every item is represented by the set of users found alongside it in the
    training frame. The similarity of two items is the size of the
    intersection of their user sets divided by the size of their union, so
    items that share most of their users score close to 1. ``recommend`` uses
    these scores to return the top-k items most similar to a given item.
    """

    def fit(self, df, user_col='user_id', item_col='item_id'):
        """Build the item -> set-of-users lookup from an interaction table.

        Args:
            df: pandas DataFrame with one row per user-item interaction.
            user_col: name of the column holding user identifiers.
            item_col: name of the column holding item identifiers.
        """
        self.item_col = item_col
        # One set of users per distinct item; groupby yields (item, Series) pairs.
        self.item_sets = {
            item: set(users) for item, users in df.groupby(item_col)[user_col]
        }

    def _jaccard(self, item1, item2):
        """Return the Jaccard index (a float in [0, 1]) of two fitted items.

        Raises:
            KeyError: if either item was not present in the data given to ``fit``.
        """
        a = self.item_sets[item1]
        b = self.item_sets[item2]
        intersection = len(a & b)
        # |A ∪ B| computed by inclusion-exclusion to avoid building the union.
        union = len(a) + len(b) - intersection
        # Two empty sets share nothing; report 0.0 instead of dividing by zero.
        return intersection / union if union else 0.0

    def recommend(self, itemid, top_k=5):
        """Return up to ``top_k`` items most similar to ``itemid``.

        Args:
            itemid: identifier of the query item; must have been seen in ``fit``.
            top_k: maximum number of recommendations; values <= 0 yield ``[]``.

        Returns:
            A list of ``(item, score)`` tuples sorted by descending score. The
            query item itself is never included.

        Raises:
            KeyError: if ``itemid`` was not present in the data given to ``fit``.
        """
        # Exclude the query item by key rather than by dropping position 0 of
        # the ranking: another item with an identical user set also scores 1.0
        # and, because sorting is stable, may sort ahead of the query item.
        jaccard_dict = {
            x: self._jaccard(x, itemid) for x in self.item_sets if x != itemid
        }
        ranked_items = sorted(
            jaccard_dict.items(), key=operator.itemgetter(1), reverse=True
        )
        return ranked_items[:max(top_k, 0)]

Example

In [None]:
# Load the ratings table from the public rec-data repository; the first row
# of the CSV is used as the header.
rating_df = pd.read_csv(
    'https://raw.githubusercontent.com/sparsh-ai/rec-data-public/master/ml-other/ml100k_ratings.csv',
    sep=',',
    header=0,
)

In [None]:
# Fit on the ratings frame: users come from 'userId', items from 'movieId'.
model = JaccardSimilarity()
model.fit(rating_df, user_col='userId', item_col='movieId')

# Top-5 (default top_k) items most similar to item 10.
model.recommend(10)

[(165, 0.46808510638297873),
 (377, 0.42924528301886794),
 (380, 0.42857142857142855),
 (592, 0.4266666666666667),
 (316, 0.4166666666666667)]