In [1]:
import json
import os
import random
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from IPython.display import Image
from IPython.display import display
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error

In [2]:
# Load the rated-movies table from the working directory; the first CSV
# column is used as the DataFrame index.
dfratedmovies = pd.read_csv('./ratedmovies.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Drop the columns that are not used below. Rebinding (instead of
# inplace=True) together with errors='ignore' keeps this cell idempotent:
# re-running it no longer raises KeyError on the already-removed columns.
dfratedmovies = dfratedmovies.drop(['filename', 'modifiedtitle', 'genres'], axis=1, errors='ignore')

In [4]:
# Preview the first rows of the ratings table.
dfratedmovies.head()

Unnamed: 0,movieId,title,userId,rating,timestamp
0,1,Toy Story (1995),3,4.0,944919407
1,1,Toy Story (1995),6,5.0,858275452
2,1,Toy Story (1995),8,4.0,833981871
3,1,Toy Story (1995),10,4.0,943497887
4,1,Toy Story (1995),11,4.5,1230858821


In [5]:
# Number of distinct users / movies; these become the matrix dimensions.
n_users = len(dfratedmovies['userId'].unique())
n_items = len(dfratedmovies['movieId'].unique())
print('{} users'.format(n_users))
print('{} items'.format(n_items))

138493 users
26744 items


In [6]:
# Fit one encoder per id column; each maps raw ids to contiguous integer
# codes that are used as matrix column (items) / row (users) positions.
leForItems = preprocessing.LabelEncoder().fit(dfratedmovies['movieId'])
leForUsers = preprocessing.LabelEncoder().fit(dfratedmovies['userId'])
leForUsers

LabelEncoder()

# User-Item matrix generation

In [7]:
def fillMatrix(row):
    """Store a single rating record in the global ``ratings`` matrix.

    ``row`` must expose 'userId', 'movieId' and 'rating'. The ids are
    converted to matrix positions with the global ``leForUsers`` /
    ``leForItems`` encoders. Nothing is returned; ``ratings`` is mutated.
    """
    user_pos = leForUsers.transform([row['userId']])[0]
    item_pos = leForItems.transform([row['movieId']])[0]
    ratings[user_pos, item_pos] = row['rating']
    return None

In [8]:
# Build the dense user-item matrix (rows = encoded users, columns = encoded
# items, 0 = no rating). The ids are encoded once for the whole column and
# written with a single fancy-index assignment instead of calling
# LabelEncoder.transform twice per row through DataFrame.apply.
# NOTE(review): assumes each (userId, movieId) pair occurs at most once;
# NumPy does not define which value wins for repeated index pairs.
ratings = np.zeros((n_users, n_items))
user_idx = leForUsers.transform(dfratedmovies['userId'])
item_idx = leForItems.transform(dfratedmovies['movieId'])
ratings[user_idx, item_idx] = dfratedmovies['rating'].values
ratings

array([[ 0. ,  3.5,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  4. , ...,  0. ,  0. ,  0. ],
       [ 4. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       ..., 
       [ 2. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 3.5,  4. ,  0. , ...,  0. ,  0. ,  0. ]])

### Saving User-Item matrix to the disk

In [None]:
# Persist the full user-item matrix as CSV; the default integer row/column
# labels written here are the encoded user/item positions.
pd.DataFrame(ratings).to_csv('CF_ratings.csv')
# np.save('CF_ratings.npy', ratings)

### Subset selection of User-Item matrix in order to decrease memory requirements

In [3]:
import pandas as pd

In [None]:
# Reload the saved user-item matrix; the first CSV column becomes the index.
df = pd.read_csv('CF_ratings.csv', index_col=0)
# np.load('CF_ratings.npy')

In [None]:
# df = pd.DataFrame(ratings)

In [None]:
# matrix = ratings[:, np.random.choice(ratings.shape[1], 4000, replace=False)] # random 4K items are selected
# A fixed random_state makes the selected columns, and therefore the saved
# subset and every metric computed from it, reproducible across runs.
df = df.sample(4000, axis=1, random_state=42) # random 4K items are selected

In [None]:
# Shape after the column (item) sampling step.
df.shape

In [None]:
# Keep only the rows (users) that have more than 20 non-zero entries among
# the sampled columns.
df_sub = df[(df!=0).sum(axis=1)>20]
df_sub.shape

In [None]:
# A fixed random_state makes the selected rows reproducible across runs.
df_sub = df_sub.sample(4000, axis=0, random_state=42) # random 4K users are selected
df_sub.shape

### Saving reduced User-Item matrix to the disk

In [None]:
# Persist the reduced matrix; its row/column labels are written along with it.
df_sub.to_csv('CF_ratings_sub.csv')

## Function Definitions

In [None]:
def train_test_split(ratings, n_test=10):
    """Split a user-item rating frame into disjoint train and test frames.

    For every row (user), ``n_test`` of its non-zero ratings are picked at
    random (NumPy global RNG, without replacement), zeroed in the train copy
    and copied into the otherwise all-zero test matrix.

    Parameters
    ----------
    ratings : pandas.DataFrame
        User-item matrix; 0 marks "no rating".
    n_test : int, default 10
        Number of ratings held out per user.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Both have the shape and index of ``ratings``; column labels are the
        default integer range.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when a user has fewer than ``n_test``
        non-zero ratings.
    """
    values = ratings.values
    test = np.zeros(values.shape)
    train = values.copy()
    # range (not xrange) so the function runs on both Python 2 and Python 3.
    for user in range(values.shape[0]):
        rated_items = values[user, :].nonzero()[0]
        test_ratings = np.random.choice(rated_items,
                                        size=n_test,
                                        replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = values[user, test_ratings]

    # Test and training are truly disjoint
    assert(np.all((train * test) == 0))
    return pd.DataFrame(train, index=ratings.index), pd.DataFrame(test, index=ratings.index)

### Mean Squared Error (MSE)

In [None]:
def get_mse(pred, actual):
    """Mean squared error over the positions where ``actual`` is non-zero.

    Zero entries of ``actual`` (no rating) are excluded; the same positions
    are read from ``pred``.
    """
    # Evaluate only where a real (non-zero) rating exists.
    rated = actual.nonzero()
    pred_rated = pred[rated].flatten()
    actual_rated = actual[rated].flatten()
    return mean_squared_error(pred_rated, actual_rated)

### Find similarities

In [None]:
def similarity(ratings, kind='user', epsilon=1e-9):
    """Cosine-style similarity between the rows or columns of ``ratings``.

    Parameters
    ----------
    ratings : 2-D numpy array (users x items)
    kind : 'user' compares rows (result is users x users);
           'item' compares columns (result is items x items).
    epsilon : small constant added to every dot product so that an all-zero
        row/column does not produce a division by zero.

    Returns
    -------
    2-D numpy array: each (shifted) dot product divided by the square roots
    of the two corresponding diagonal entries.

    Raises
    ------
    ValueError
        If ``kind`` is neither 'user' nor 'item'.
    """
    # epsilon -> small number for handling divide-by-zero errors
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    else:
        # Previously fell through and failed later with an unbound `sim`.
        raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))
    # The diagonal holds each vector's squared norm (plus epsilon).
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

### Predict ratings from similarities

In [None]:
def predict_simple(ratings, similarity, kind='user'):
    """Predict every rating as a similarity-weighted average of all ratings.

    Parameters
    ----------
    ratings : 2-D numpy array (users x items)
    similarity : square similarity matrix matching ``kind``
        (users x users for 'user', items x items for 'item').
    kind : 'user' or 'item'.

    Returns
    -------
    2-D numpy array with the shape of ``ratings``. Each weighted sum is
    divided by the sum of absolute similarities of the corresponding
    user (row) or item (column).

    Raises
    ------
    ValueError
        If ``kind`` is neither 'user' nor 'item' (previously returned None).
    """
    if kind == 'user':
        # Column vector of per-user normalisers, broadcast across items.
        return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        # Row vector of per-item normalisers, broadcast across users.
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))

In [None]:
def predict_topk(ratings, similarity, kind='user', k=40):
    """Predict ratings from only the k most similar users or items.

    Parameters
    ----------
    ratings : 2-D numpy array (users x items)
    similarity : square similarity matrix matching ``kind``.
    kind : 'user' or 'item'.
    k : number of nearest neighbours (by similarity, largest first) used for
        each user / item. The neighbour list is taken from the similarity
        column and the weights from the matching row, so a symmetric matrix
        is expected.

    Returns
    -------
    2-D numpy array with the shape of ``ratings``: the similarity-weighted
    sum over the k neighbours divided by the sum of their absolute
    similarities.

    Raises
    ------
    ValueError
        If ``kind`` is neither 'user' nor 'item'.
    """
    pred = np.zeros(ratings.shape)
    if kind == 'user':
        for i in range(ratings.shape[0]):
            # Plain index array: wrapping it in a list relied on NumPy's
            # removed "non-tuple sequence" indexing and yields a (1, k) array.
            top_k_users = np.argsort(similarity[:, i])[:-k-1:-1]
            weights = similarity[i, top_k_users]
            # All items of user i at once instead of one dot product per item.
            pred[i, :] = weights.dot(ratings[top_k_users, :]) / np.sum(np.abs(weights))
    elif kind == 'item':
        for j in range(ratings.shape[1]):
            top_k_items = np.argsort(similarity[:, j])[:-k-1:-1]
            weights = similarity[j, top_k_items]
            # All users for item j at once.
            pred[:, j] = ratings[:, top_k_items].dot(weights) / np.sum(np.abs(weights))
    else:
        raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))

    return pred

#### Predict ratings with the bias subtracted (to offset the effect of popular items)

In [None]:
def predict_nobias(ratings, similarity, kind='user'):
    """Similarity-weighted prediction on mean-centred ratings.

    The per-user (kind='user') or per-item (kind='item') mean is subtracted
    before the weighted average and added back afterwards. The mean is taken
    over every entry of the row/column, zeros included.

    Parameters
    ----------
    ratings : 2-D numpy array (users x items)
    similarity : square similarity matrix matching ``kind``.
    kind : 'user' or 'item'.

    Returns
    -------
    2-D numpy array with the shape of ``ratings``.

    Raises
    ------
    ValueError
        If ``kind`` is neither 'user' nor 'item' (previously an unbound
        local error on ``pred``).
    """
    if kind == 'user':
        user_bias = ratings.mean(axis=1)
        # The subtraction already creates a new array; the caller's matrix
        # is left untouched.
        centered = ratings - user_bias[:, np.newaxis]
        pred = similarity.dot(centered) / np.array([np.abs(similarity).sum(axis=1)]).T
        pred += user_bias[:, np.newaxis]
    elif kind == 'item':
        item_bias = ratings.mean(axis=0)
        centered = ratings - item_bias[np.newaxis, :]
        pred = centered.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
        pred += item_bias[np.newaxis, :]
    else:
        raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))

    return pred

In [None]:
def predict_topk_nobias(ratings, similarity, kind='user', k=40):
    """Top-k neighbour prediction on mean-centred ratings.

    The per-user (kind='user') or per-item (kind='item') mean, taken over
    every entry including zeros, is subtracted first. Each prediction is
    then the similarity-weighted sum over the k most similar users / items
    divided by the sum of their absolute similarities, and the mean is
    added back.

    Parameters
    ----------
    ratings : 2-D numpy array (users x items)
    similarity : square similarity matrix matching ``kind``; neighbours are
        ranked on the column and weighted from the matching row, so a
        symmetric matrix is expected.
    kind : 'user' or 'item'.
    k : number of nearest neighbours used.

    Returns
    -------
    2-D numpy array with the shape of ``ratings``.

    Raises
    ------
    ValueError
        If ``kind`` is neither 'user' nor 'item'.
    """
    pred = np.zeros(ratings.shape)
    if kind == 'user':
        user_bias = ratings.mean(axis=1)
        centered = ratings - user_bias[:, np.newaxis]
        for i in range(ratings.shape[0]):
            # Plain index array: wrapping it in a list relied on NumPy's
            # removed "non-tuple sequence" indexing and yields a (1, k) array.
            top_k_users = np.argsort(similarity[:, i])[:-k-1:-1]
            weights = similarity[i, top_k_users]
            # All items of user i at once instead of one dot product per item.
            pred[i, :] = weights.dot(centered[top_k_users, :]) / np.sum(np.abs(weights))
        pred += user_bias[:, np.newaxis]
    elif kind == 'item':
        item_bias = ratings.mean(axis=0)
        centered = ratings - item_bias[np.newaxis, :]
        for j in range(ratings.shape[1]):
            top_k_items = np.argsort(similarity[:, j])[:-k-1:-1]
            weights = similarity[j, top_k_items]
            # All users for item j at once.
            pred[:, j] = centered[:, top_k_items].dot(weights) / np.sum(np.abs(weights))
        pred += item_bias[np.newaxis, :]
    else:
        raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))

    return pred

## Get the poster for a specific movie

In [None]:
def get_poster(base_url, movie_id, movie_title=None):
    """Return the full URL of the first poster themoviedb.org lists for a movie.

    Parameters
    ----------
    base_url : str
        Image base URL (including the size segment) that the poster's
        ``file_path`` is appended to.
    movie_id : str or int
        Id placed in the ``/3/movie/<id>/images`` request.
    movie_title : str, optional
        Title used for a ``/3/search/movie`` fallback when the direct lookup
        yields no poster. The first search result's id is then used.

    Returns
    -------
    str : ``base_url + file_path`` of the first poster.

    Raises
    ------
    ValueError
        If the direct lookup yields no poster and no ``movie_title`` was
        given. (The old fallback referenced an undefined ``imdb_url`` and
        always failed with NameError.)
    KeyError, IndexError
        If the fallback search or its poster lookup returns no usable entry.
    """
    headers = {'Accept': 'application/json'}
    # NOTE(review): the literal key is kept only as a backward-compatible
    # fallback; prefer setting TMDB_API_KEY and removing the committed key.
    payload = {'api_key': os.environ.get('TMDB_API_KEY', 'eb9700de294fb0f20a755faf91117006')}

    def _first_poster_path(lookup_id):
        # Query themoviedb.org API for movie poster path.
        movie_url = 'http://api.themoviedb.org/3/movie/{:}/images'.format(lookup_id)
        response = requests.get(movie_url, params=payload, headers=headers)
        return json.loads(response.text)['posters'][0]['file_path']

    try:
        file_path = _first_poster_path(movie_id)
    except (KeyError, IndexError, ValueError):
        # Missing 'posters' key, empty poster list, or a non-JSON body.
        # IMDB movie ID is sometimes no good. Need to get correct one.
        if movie_title is None:
            raise ValueError(
                'No poster found for movie id {!r} and no movie_title was '
                'given for the search fallback'.format(movie_id))
        search_params = dict(payload, query=movie_title)
        response = requests.get('http://api.themoviedb.org/3/search/movie', params=search_params, headers=headers)
        movie_id = json.loads(response.text)['results'][0]['id']
        file_path = _first_poster_path(movie_id)

    return base_url + file_path

### Loading reduced User-Item matrix into the memory

In [None]:
# ratings = np.load('CF_ratings_sub.npy')
# Load the reduced user-item matrix; the first CSV column becomes the index.
ratings = pd.read_csv('CF_ratings_sub.csv', index_col=0)
ratings.shape

In [None]:
# Percentage of matrix cells that are zero (no rating).
sparsity = float(len(ratings.values.nonzero()[0]))
sparsity /= (ratings.values.shape[0] * ratings.values.shape[1])
sparsity = 1-sparsity
sparsity *= 100
# print() call form: valid on Python 2 and 3, matching the earlier cells.
print('Sparsity: {:4.2f}%'.format(sparsity))

## Data splitting

In [None]:
# train_test_split draws the held-out ratings from NumPy's global RNG; seed
# it first so the split and the MSE figures below are reproducible.
np.random.seed(42)
train, test = train_test_split(ratings)

In [None]:
# print() call form: valid on Python 2 and 3, matching the earlier cells.
print(train.shape)
print(test.shape)

## User-User and Item-Item Similarity Calculations

In [None]:
# Similarities are computed on the training ratings only.
user_similarity = similarity(train.values, kind='user')
item_similarity = similarity(train.values, kind='item')
# print() call form: valid on Python 2 and 3, matching the earlier cells.
print(item_similarity[:4, :4])

In [None]:
# print() call form: valid on Python 2 and 3, matching the earlier cells.
print(user_similarity.shape)
print(item_similarity.shape)

## User-based vs. Item-based prediction comparison

In [None]:
# Predict from the training ratings, score on the held-out test ratings.
user_prediction = predict_simple(train.values, user_similarity, kind='user')
item_prediction = predict_simple(train.values, item_similarity, kind='item')

# print() call form: valid on Python 2 and 3, matching the earlier cells.
print('User-based CF MSE: ' + str(get_mse(user_prediction, test.values)))
print('Item-based CF MSE: ' + str(get_mse(item_prediction, test.values)))

In [None]:
# print() call form: valid on Python 2 and 3, matching the earlier cells.
print(user_prediction.shape)
print(item_prediction.shape)

# Top-k Collaborative Filtering

In [None]:
# Top-k predictions with k=20 neighbours, scored on the held-out ratings.
# print() call form: valid on Python 2 and 3, matching the earlier cells.
user_pred = predict_topk(train.values, user_similarity, kind='user', k=20)
print('Top-k User-based CF MSE: ' + str(get_mse(user_pred, test.values)))

item_pred = predict_topk(train.values, item_similarity, kind='item', k=20)
print('Top-k Item-based CF MSE: ' + str(get_mse(item_pred, test.values)))

### Tuning the k parameter

In [None]:
# Sweep k = 10, 20, ..., 100 and record train/test MSE for both variants.
k_array = np.arange(10, 110, 10)
user_train_mse, user_test_mse = [], []
item_test_mse, item_train_mse = [], []

for k in k_array:
    user_pred = predict_topk(train.values, user_similarity, kind='user', k=k)
    item_pred = predict_topk(train.values, item_similarity, kind='item', k=k)

    user_train_mse.append(get_mse(user_pred, train.values))
    user_test_mse.append(get_mse(user_pred, test.values))

    item_train_mse.append(get_mse(item_pred, train.values))
    item_test_mse.append(get_mse(item_pred, test.values))

In [None]:
%matplotlib inline
sns.set()

# Two colours: index 0 for user-based curves, index 1 for item-based curves.
pal = sns.color_palette("Set2", 2)

# MSE versus k; train curves are drawn semi-transparent (alpha=0.5),
# test curves fully opaque.
plt.figure(figsize=(6, 6))
plt.plot(k_array, user_train_mse, c=pal[0], label='User-based train', alpha=0.5, linewidth=4)
plt.plot(k_array, user_test_mse, c=pal[0], label='User-based test', linewidth=4)
plt.plot(k_array, item_train_mse, c=pal[1], label='Item-based train', alpha=0.5, linewidth=4)
plt.plot(k_array, item_test_mse, c=pal[1], label='Item-based test', linewidth=4)
plt.legend(loc='best', fontsize=15)
# Trailing semicolons suppress the return-value repr in the cell output.
plt.xticks(fontsize=15);
plt.yticks(fontsize=15);
plt.xlabel('k', fontsize=15);
plt.ylabel('MSE', fontsize=15);

## Bias-subtracted Collaborative Filtering

In [None]:
# Bias-subtracted predictions, scored on the held-out ratings.
# print() call form: valid on Python 2 and 3, matching the earlier cells.
user_pred = predict_nobias(train.values, user_similarity, kind='user')
print('Bias-subtracted User-based CF MSE: ' + str(get_mse(user_pred, test.values)))

item_pred = predict_nobias(train.values, item_similarity, kind='item')
print('Bias-subtracted Item-based CF MSE: ' + str(get_mse(item_pred, test.values)))

In [None]:
# Same k sweep (10, 20, ..., 100) for the bias-subtracted top-k predictor.
k_array = np.arange(10, 110, 10)
user_train_mse, user_test_mse = [], []
item_test_mse, item_train_mse = [], []

for k in k_array:
    user_pred = predict_topk_nobias(train.values, user_similarity, kind='user', k=k)
    item_pred = predict_topk_nobias(train.values, item_similarity, kind='item', k=k)

    user_train_mse.append(get_mse(user_pred, train.values))
    user_test_mse.append(get_mse(user_pred, test.values))

    item_train_mse.append(get_mse(item_pred, train.values))
    item_test_mse.append(get_mse(item_pred, test.values))

In [None]:
# Two colours: index 0 for user-based curves, index 1 for item-based curves.
pal = sns.color_palette("Set2", 2)

# MSE versus k for the lists filled by the preceding sweep; train curves are
# semi-transparent (alpha=0.5), test curves fully opaque.
plt.figure(figsize=(6, 6))
plt.plot(k_array, user_train_mse, c=pal[0], label='User-based train', alpha=0.5, linewidth=4)
plt.plot(k_array, user_test_mse, c=pal[0], label='User-based test', linewidth=4)
plt.plot(k_array, item_train_mse, c=pal[1], label='Item-based train', alpha=0.5, linewidth=4)
plt.plot(k_array, item_test_mse, c=pal[1], label='Item-based test', linewidth=4)
plt.legend(loc='best', fontsize=15)
# Trailing semicolons suppress the return-value repr in the cell output.
plt.xticks(fontsize=15);
plt.yticks(fontsize=15);
plt.xlabel('k', fontsize=15);
plt.ylabel('MSE', fontsize=15);

In [None]:
# Decode encoded item index 1 back to its movieId. inverse_transform takes
# an array-like of labels (as transform is called in fillMatrix), so wrap
# the scalar in a list and unwrap the single result.
leForItems.inverse_transform([1])[0]

## Similar Movies

In [None]:
# Load links.csv; the lookup further down reads its 'movieId' and 'imdbId'
# columns. Note: this rebinds `df`, which earlier held the ratings matrix.
df = pd.read_csv("./ml-20m/links.csv")

In [None]:
# Preview the first rows of the links table.
df.head()

In [None]:
# `i` is a column position in the reduced matrix, not an encoder index: the
# reduced matrix holds a random subset of item columns, and its column
# labels (read back from CSV as strings) are the encoded item indices.
# Map position -> column label -> original movieId before the lookup.
for i in test.values.nonzero()[1].flatten()[0:2]:
    encoded_item = int(ratings.columns[i])
    movie_id = leForItems.inverse_transform([encoded_item])[0]
    print(df[df['movieId'] == movie_id]['imdbId'])

In [None]:
# Get base url filepath structure. w185 corresponds to size of movie poster.
headers = {'Accept': 'application/json'}
# NOTE(review): the literal key is kept only as a backward-compatible
# fallback; prefer setting TMDB_API_KEY and removing the committed key.
payload = {'api_key': os.environ.get('TMDB_API_KEY', 'eb9700de294fb0f20a755faf91117006')}
response = requests.get("http://api.themoviedb.org/3/configuration", params=payload, headers=headers)
# Parse into a separate name: the following cell reads `response.headers`,
# which only exists on the requests response object, not on the parsed dict.
config = json.loads(response.text)
base_url = config['images']['base_url'] + 'w185'

In [None]:
# print() call form: valid on Python 2 and 3, matching the earlier cells.
print(response.headers)

In [None]:
# Presumably the IMDb id of Toy Story (TODO confirm); it is passed to
# get_poster as the movie id and the returned URL is rendered inline.
toy_story = 'tt0114709'
Image(url=get_poster(base_url, toy_story))