# A simple Popularity Recommendation System for the Jester Dataset!

In [1]:
import numpy as np
import pandas as pd
import cPickle as pickle

### Load the dataset

In [2]:
# Read the training and test splits of the Jester ratings from their CSV
# files (paths are relative to the current working directory).
training_set, test_set = map(
    pd.read_csv,
    ('jester_train.csv', 'jester_test.csv'),
)

In [3]:
# Sanity check: display the first five rows of the training data.
training_set.head(n=5)

Unnamed: 0,user_id,joke_id,rating
0,7302,29,7.156
1,61815,46,6.375
2,31128,96,2.281
3,36125,147,-1.781
4,18007,60,2.188


In [4]:
# Sanity check: display the first five rows of the test data.
test_set.head(n=5)

Unnamed: 0,user_id,joke_id
0,30762,24
1,54667,128
2,38515,68
3,44643,39
4,58677,13


In [5]:
# The joke text is NOT used by this recommender, but the following code
# shows how to load the joke text in case you want to use it.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file
# that comes from a source you trust.
# Open the file in a `with` block so the handle is closed as soon as
# loading finishes, even if unpickling raises.
with open('jester_jokes.pkl', 'rb') as jokes_file:
    jokes = pickle.load(jokes_file)

In [6]:
# Sanity checks: print the number of entries in `jokes`, then the entry
# looked up with 19.
print(len(jokes))
print(jokes[19])

150
What's the difference between a Macintosh and an Etch-a-Sketch?

You don't have to shake the Mac to clear the screen.


### Compute the average rating of every joke in the training set

We expect some jokes to be rated very well on average (high values) and others to be rated very poorly (low values). We'll call the highly rated jokes "popular" and rank jokes by popularity in decreasing order of their average rating in the training set. This ranking is the basis of our popularity recommender.

TODO: Check whether every joke is equally represented in this dataset. Some jokes may have been rated only a few times, in which case their average rating is not a reliable proxy for popularity (for example, inspect the per-joke rating counts with `training_set.groupby('joke_id').size()`). We'll leave this for the next person to work on. :P

In [7]:
# Mean rating of each joke over the training set: a Series indexed by
# joke_id and named 'rating'. Use the built-in groupby `mean()` aggregation
# rather than `.apply(np.mean)`, which would call a Python function once
# per group.
avg_ratings = training_set.groupby('joke_id')['rating'].mean()

In [8]:
# Sanity checks: list the five lowest- and the five highest-rated jokes by
# average rating, each under its own heading and followed by a blank line.
for heading, lowest_first in (('worst rated jokes', True),
                              ('best rated jokes', False)):
    print(heading)
    print(avg_ratings.sort_values(ascending=lowest_first).head())
    print('')

worst rated jokes
joke_id
141   -2.792088
124   -2.214896
7     -1.803772
5     -1.601697
16    -1.573728
Name: rating, dtype: float64

best rated jokes
joke_id
105    3.748893
53     3.682430
89     3.585759
35     3.558687
129    3.545080
Name: rating, dtype: float64



### Build the predictions

We'll use the average rating of each joke to predict the future rating of that joke.

In [9]:
# Recall the layout of the test set (first five rows):
test_set.head(n=5)

Unnamed: 0,user_id,joke_id
0,30762,24
1,54667,128
2,38515,68
3,44643,39
4,58677,13


In [10]:
# Create the predictions by joining the joke_id of each row in the test set
# with the average rating for that joke, which will be our prediction of the
# future rating of that joke.
# The join is a left join, so a test-set joke that never appears in the
# training set would get a NaN prediction; fall back to the overall mean
# training rating for such jokes so that every row has a numeric prediction.
test_set_predictions = (
    test_set
    .join(avg_ratings, on='joke_id')
    .fillna({'rating': training_set['rating'].mean()})
)

# Sanity check:
test_set_predictions.head()

Unnamed: 0,user_id,joke_id,rating
0,30762,24,-1.14145
1,54667,128,2.539236
2,38515,68,3.38216
3,44643,39,1.750292
4,58677,13,-0.561356


### Score the predictions

There are two ways to do this:

1. We can write our predictions to a file and run the stand-alone scoring script.

2. We can import the scoring function directly and invoke it.

I'll show both ways below.

In [11]:
# WAY 1: write the predictions to a CSV file, then run the stand-alone
# scoring script on that file.

# index=False leaves the DataFrame's row index out of the written file.
test_set_predictions.to_csv('predictions.csv', index=False)

# The leading `!` is IPython's shell escape: this line is run as a shell
# command, so it only works inside IPython/Jupyter, not as plain Python.
!python scoring.py predictions.csv

2.60245618197


In [12]:
# WAY 2: import the scoring function from the `scoring` module (presumably
# the same scoring.py that WAY 1 runs as a script -- confirm) and call it on
# the predictions DataFrame directly, with no intermediate file.

from scoring import score_top_5_percent

score_top_5_percent(test_set_predictions)

2.6024561819684955