In [1]:
from surprise import Dataset
from surprise import Reader, KNNWithMeans
from surprise.model_selection import cross_validate
import pandas as pd

In [2]:
jokes = pd.read_csv("jester_items.tsv",sep="\t",
                    names=["ItemID","Joke"])

In [3]:
jokes.shape

(149, 2)

In [4]:
jokes.head()

Unnamed: 0,ItemID,Joke
0,1:,"A man visits the doctor. The doctor says, ""I h..."
1,2:,This couple had an excellent relationship goin...
2,3:,Q. What's 200 feet long and has 4 teeth? A. Th...
3,4:,Q. What's the difference between a man and a t...
4,5:,Q. What's O. J. Simpson's web address? A. Slas...


In [5]:
ratings = pd.read_csv("jester_ratings.csv")
ratings.head()

Unnamed: 0,UserID,ItemID,Rating
0,1,5,0.219
1,1,7,-9.281
2,1,8,-9.281
3,1,13,-6.781
4,1,15,0.875


In [6]:
ratings.shape

(1761439, 3)

In [7]:
ratings.ItemID.nunique()

140

In [8]:
#Defining the parser to read data into surprise dateframe
#The parser requires the scale of ratings, and the columns, to be mentioned using rating_scale and line_format
#Limit to 1000 users, to avoid the memory error.
no_of_users = 1000
reader = Reader(line_format= 'user item rating',rating_scale=(-10,10))
data = Dataset.load_from_df(ratings[ratings.UserID < no_of_users],reader)

In [10]:
print(type(reader))
print(type(data))

<class 'surprise.reader.Reader'>
<class 'surprise.dataset.DatasetAutoFolds'>


In [11]:
len(data.df)

38204

In [12]:
sim_parameters = {'name' : 'cosine',
                 'user_based': True}
algo = KNNWithMeans(sim_options=sim_parameters)

In [14]:
cross_validate(algo,data,measures=['RMSE','MAE'],cv=2,verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 2 split(s).

                  Fold 1  Fold 2  Mean    Std     
RMSE (testset)    4.7994  4.8417  4.8206  0.0211  
MAE (testset)     3.7044  3.7251  3.7147  0.0103  
Fit time          7.19    12.78   9.98    2.80    
Test time         29.73   34.54   32.14   2.41    


{'test_rmse': array([4.79944281, 4.84169627]),
 'test_mae': array([3.70437467, 3.72505404]),
 'fit_time': (7.186546802520752, 12.783195972442627),
 'test_time': (29.72819185256958, 34.54183483123779)}

In [15]:
# Using full data for training
trainset = data.build_full_trainset()
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x25071b10d30>

In [18]:
# Getting data points where predictions can be made
testset = trainset.build_anti_testset()

In [19]:
print(type(testset))

<class 'list'>


In [20]:
print(len(testset))

88496


In [21]:
# Making predictions
predictions = algo.test(testset)

In [22]:
# Fetching top 10 predictions for each user
from collections import defaultdict
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))


In [23]:
top_n = get_top_n(predictions, n=10)

In [24]:
type(top_n)

collections.defaultdict

In [25]:
top_n.items()

dict_items([(1, [(117, 4.8882114195487105), (129, 4.3611747288837925), (114, 3.9788196841493217), (111, 3.9070119902787024), (126, 3.8458037788307493), (148, 3.7959692697497704), (125, 3.793188507028539), (132, 3.5767393271537795), (143, 3.5673889808035337), (96, 3.2931436669386347)]), (2, [(105, 5.597777465897284), (132, 5.554658803726586), (106, 5.462948105977226), (127, 5.327903166278386), (121, 5.220864891736671), (129, 5.189587159838794), (119, 5.170515869245996), (148, 5.089316888888302), (87, 4.982025188794193), (126, 4.932081449164221)]), (3, [(35, -3.1198343766200063), (127, -3.140919937948558), (106, -3.3247424835386745), (89, -3.362969581456582), (36, -3.8240733070566026), (117, -4.05560501766157), (114, -4.081276240657747), (72, -4.099014718585527), (119, -4.124169796685025), (53, -4.267355330219663)]), (4, [(35, -1.9504786743262628), (32, -2.2522650521827288), (106, -2.5750974668978674), (119, -2.7285668005358943), (138, -2.730413530900151), (53, -2.776008896908724), (114,

In [26]:
take(10, top_n.items())

[(1,
  [(117, 4.8882114195487105),
   (129, 4.3611747288837925),
   (114, 3.9788196841493217),
   (111, 3.9070119902787024),
   (126, 3.8458037788307493),
   (148, 3.7959692697497704),
   (125, 3.793188507028539),
   (132, 3.5767393271537795),
   (143, 3.5673889808035337),
   (96, 3.2931436669386347)]),
 (2,
  [(105, 5.597777465897284),
   (132, 5.554658803726586),
   (106, 5.462948105977226),
   (127, 5.327903166278386),
   (121, 5.220864891736671),
   (129, 5.189587159838794),
   (119, 5.170515869245996),
   (148, 5.089316888888302),
   (87, 4.982025188794193),
   (126, 4.932081449164221)]),
 (3,
  [(35, -3.1198343766200063),
   (127, -3.140919937948558),
   (106, -3.3247424835386745),
   (89, -3.362969581456582),
   (36, -3.8240733070566026),
   (117, -4.05560501766157),
   (114, -4.081276240657747),
   (72, -4.099014718585527),
   (119, -4.124169796685025),
   (53, -4.267355330219663)]),
 (4,
  [(35, -1.9504786743262628),
   (32, -2.2522650521827288),
   (106, -2.5750974668978674),

In [27]:
# Printing top predictions
for uid, user_ratings in take(10,top_n.items()):
    print(uid, [iid for (iid, _) in user_ratings])

1 [117, 129, 114, 111, 126, 148, 125, 132, 143, 96]
2 [105, 132, 106, 127, 121, 129, 119, 148, 87, 126]
3 [35, 127, 106, 89, 36, 117, 114, 72, 119, 53]
4 [35, 32, 106, 119, 138, 53, 114, 76, 93, 143]
5 [119, 129, 143, 132, 148, 105, 126, 127, 110, 117]
6 [32, 127, 105, 119, 148, 138, 126, 69, 117, 129]
7 [119, 148, 129, 105, 132, 126, 110, 104, 111, 47]
8 [148, 129, 127, 119, 126, 111, 117, 143, 114, 110]
9 [130, 132, 27, 105, 119, 94, 69, 29, 53, 91]
10 [126, 127, 148, 114, 129, 111, 143, 110, 150, 121]


In [28]:
# Printing top predictions
for uid, user_ratings in take(10,top_n.items()):
    print("For User",uid)
    for  (iid, _) in user_ratings:
        print(iid)
        print(jokes.loc[int(iid),"Joke"])

For User 1
117
A man goes into a drug store and asks the pharmacist if he can give him something for the hiccups. The pharmacist promptly reaches out and slaps the man's face. "What the heck did you do that for?!" the man screams. "Well, you don't have the hiccups anymore, do you?" The man says, "No I don't, you IDIOT...but my wife out in the car still does!"
129
An old man goes to the doctor for his yearly physical, his wife tagging along. When the doctor enters the examination room, he tells the old man, "I need a urine sample, a stool sample and a sperm sample." The old man, being hard of hearing, looks at his wife and yells: "WHAT? What did he say? What's he want?" His wife yells back, "He needs your underwear."
114
A lady bought a new Lexus. It cost a bundle. Two days later, she brought it back, complaining that the radio was not working. "Madam," said the sales manager, "the audio system in this car is completely automatic. All you need to do is tell it what you want to listen to

KeyError: 'the label [150] is not in the [index]'