In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score

# Data manipulation
We load the data, ignoring timestamp

In [2]:
dataframe = pd.read_csv("ml-100k/u.data", delimiter="\t", names=["userId", "itemId", "rating", "timestamp"], usecols=["userId", "itemId", "rating"])

We split the data into training and test data

In [3]:
X = dataframe[["userId", "itemId"]]
y = dataframe["rating"]
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.33, random_state=42)

# Learning
2a) Collaborative filtering fits the data best, as we do not have any content data. We only have information about the user and their ratings

Subtract the movie and user means from the training data (the pre-processing step from the slides).

In [9]:
train = pd.concat([X_train, y_train], axis=1)
movie_mean = train.groupby(["itemId"])["rating"].mean()
user_mean = train.groupby(["userId"])["rating"].mean()
avg_rating = train["rating"].mean()
# preprocess
train["rating"] = train.apply(lambda x: x.rating - movie_mean[x.itemId] - user_mean[x.userId] + avg_rating, axis=1)

3.53014925373


2c) Construct a matrix factorization CF model (a.k.a. “Funk-SVD”) for this training data. Use between 10 and 50 latent factors.

In [36]:
# We want to find A, B such that R ~ AB
def matrix_factorization(R, movie_count, user_count, K, learning_rate=0.001):
    # shape(A) = len(movies), K
    A = np.ones((movie_count + 1, K))
    
    # shape(B) = K, len(users)
    B = np.ones((K, user_count + 1))
    for x in range(0, 10000):
        random_sample = R.sample(1)
        sgd(random_sample, A, B, K, learning_rate)
        
    return (A, B)
        
def sgd(random_sample, A, B, K, learning_rate, weight_decay=0.647):
    u = random_sample.userId.values[0]
    m = random_sample.itemId.values[0]
    Rmu = random_sample.rating.values[0]
    
    sum_count = 0
    for i in range(0, K):
        sum_count += A[m, i] * B[i, u]
    
    for k in range(0, K):
        # update A
        A[m, k] += learning_rate * (Rmu - sum_count) * B[k, u] - weight_decay * A[m, k]
        # update B
        B[k, u] += learning_rate * A[m, k] * (Rmu - sum_count) - weight_decay * B[k, u]

max_movie_id = X["itemId"].max()
max_user_id = X["userId"].max()
A, B = matrix_factorization(train, max_movie_id, max_user_id, 40)

model = np.dot(A, B)

#Add the "obvious" structure back to the model
for (movie,user), rating in np.ndenumerate(model):
    movie_val = movie_mean.get(movie, default=0)
    user_val = user_mean.get(user, default=0)
    value = movie_val + user_val - avg_rating
    model[movie, user] += value

2d) This technique would scale good when data increases. The model can be calculated offline, and then updated periodically. The prediction is as simple as indexing a matrix.

# Scoring
3a)

In [19]:
def predict(model, user_id, movie_id):
    return model[movie_id, user_id]
predict(model, 196, 242)

4.0507856518098091

In [37]:
test = pd.concat([X_test, y_test], axis=1)
y_pred = test.apply(lambda x: predict(model, x.userId, x.itemId), axis=1)


MSE = mean_squared_error(y_true = y_test.values, y_pred = y_pred)
MSE
MSE**(0.5)

1.0928296327092968