In [1]:
!pip install pandas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy

n_users = 6040
n_movies = 3952

def get_user_data():
    return pd.read_csv('movielens/users.dat', header=None, names=['user_id', 'gender', 'age', 'occupation', 'zip_code'], sep='::', engine='python')

def get_movie_data():
    return pd.read_csv('movielens/movies.dat', header=None, names=['movie_id', 'title', 'genre'], sep='::', engine='python')

def get_rating_data():
    return pd.read_csv('movielens/ratings.dat', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'], sep='::', engine='python')

def split_train_val_test(ratings, train=0.8, val=0.1):
    shuffled = np.random.RandomState(0).permutation(ratings.index)
    n_train = int(len(shuffled) * train)
    n_val = int(len(shuffled) * val)
    i_train, i_val, i_test = shuffled[:n_train], shuffled[n_train: n_train + n_val], shuffled[-n_val:]
    return ratings.loc[i_train], ratings.loc[i_val], ratings.loc[i_test]

def get_dense_array(ratings_df):
    ratings = np.zeros((n_users, n_movies))
    ratings[ratings_df['user_id'] - 1, ratings_df['movie_id'] - 1] = ratings_df['rating']
    return ratings



In [2]:
users = get_user_data()
movies = get_movie_data()
ratings = get_rating_data()
train_ratings_df, val_ratings_df, test_ratings_df = split_train_val_test(ratings)
train_ratings, val_ratings, test_ratings = get_dense_array(train_ratings_df), get_dense_array(val_ratings_df), get_dense_array(test_ratings_df)

# 4.1 Singular Value Thresholding

In [None]:
# Code for Singular Value Thresholding 

# Let's first interpolate our ratings and fill in with mean value
mu = np.mean(train_ratings[train_ratings!=0])
A = copy.deepcopy(train_ratings)
A[A==0] = mu

# Now center our data for USVT
A_centered = (A-3)/2

# Initialize output data structure
A_hat = np.zeros(A_centered.shape)

# Compute SVD
u,s,v = np.linalg.svd(A_centered,full_matrices=False)
print("Shape of u: {}".format(u.shape))
print("Shape of s: {}".format(np.diag(s).shape))
print("Shape of v: {}".format(v.shape))


# Compute components of threshold tau
n = A_centered.shape[0]
p = np.count_nonzero(train_ratings) / (train_ratings.shape[0]*train_ratings.shape[1])

# Collect MSE/hyperparam vals
MSEs = []
taus = []

# Pick random hyperparam values to iterate over
eta_vals = np.random.random(size=50)
eta_vals.sort()

# Iterate over hyperparameter values
for eta in eta_vals:

    print("Eta is: {}".format(eta))
    
    # Set hyperparameters for Universal SVT
    tau = (2+eta)*np.sqrt(n*p)

    # Threshold and sum to compute matrix estimate
    s_thresh = np.multiply(s,s>tau)
    A_hat = u @ np.diag(s_thresh) @ v # Matrix estimate

    # Clip values of estimated matrix
    A_hat[A_hat < -1] = -1
    A_hat[A_hat > 1] = 1

    # Set matrix on 1-5 scale, as in original case
    A_hat = 2*A_hat + 3

    # Now compute MSE for SVT matrix and interpolated matrix
    nz_indices = val_ratings!=0 # Binary mask

    # Compute MSE for SVT matrix and save value
    MSE_SVT = np.mean(np.square(np.subtract(A_hat[nz_indices],val_ratings[nz_indices])))
    MSEs.append(MSE_SVT)
    taus.append(tau)
    
# Compute MSE for interpolated matrix
MSE_INTERP = np.mean(np.square(np.subtract(mu,val_ratings[nz_indices])))

# Now graph results
plt.plot(taus,MSEs)
plt.xlabel("Thresholding Value (Tau)")
plt.ylabel("MSE")
plt.title("MSE of SVT Estimate as a Function of Thresholding Value Tau")
plt.show()
plt.clf()

In [None]:
# Let's try for arbitrary values of Tau

# Let's first interpolate our ratings and fill in with mean value
mu = np.mean(train_ratings[train_ratings!=0])
A = copy.deepcopy(train_ratings)
A[A==0] = mu

# Now center our data for USVT
A_centered = (A-3)/2

# Initialize output data structure
A_hat = np.zeros(A_centered.shape)

# Compute SVD
u,s,v = np.linalg.svd(A_centered,full_matrices=False)
print("Shape of u: {}".format(u.shape))
print("Shape of s: {}".format(np.diag(s).shape))
print("Shape of v: {}".format(v.shape))


# Compute components of threshold tau
n = A_centered.shape[0]
p = np.count_nonzero(train_ratings) / (train_ratings.shape[0]*train_ratings.shape[1])

# Collect MSE/hyperparam vals
MSEs = []
taus = []

# Pick random hyperparam values to iterate over
eta_vals = np.random.random(size=50)
eta_vals.sort()

# Iterate over hyperparameter values
for tau in [0,0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 25, 100]:

    print("Eta is: {}".format(tau))
    
    # Threshold and sum to compute matrix estimate
    s_thresh = np.multiply(s,s>tau)
    A_hat = u @ np.diag(s_thresh) @ v # Matrix estimate

    # Clip values of estimated matrix
    A_hat[A_hat < -1] = -1
    A_hat[A_hat > 1] = 1

    # Set matrix on 1-5 scale, as in original case
    A_hat = 2*A_hat + 3

    # Now compute MSE for SVT matrix and interpolated matrix
    nz_indices = val_ratings!=0 # Binary mask

    # Compute MSE for SVT matrix and save value
    MSE_SVT = np.mean(np.square(np.subtract(A_hat[nz_indices],val_ratings[nz_indices])))
    MSEs.append(MSE_SVT)
    taus.append(tau)
    
# Compute MSE for interpolated matrix
MSE_INTERP = np.mean(np.square(np.subtract(mu,val_ratings[nz_indices])))

# Now graph results
plt.plot(taus,MSEs)
plt.xlabel("Thresholding Value (Tau)")
plt.ylabel("MSE")
plt.title("MSE of SVT Estimate as a Function of Thresholding Value Tau")
plt.show()
plt.clf()

In [None]:
# Let's try for more arbitrary values of Tau

# Let's first interpolate our ratings and fill in with mean value
mu = np.mean(train_ratings[train_ratings!=0])
A = copy.deepcopy(train_ratings)
A[A==0] = mu

# Now center our data for USVT
A_centered = (A-3)/2

# Initialize output data structure
A_hat = np.zeros(A_centered.shape)

# Compute SVD
u,s,v = np.linalg.svd(A_centered,full_matrices=False)
print("Shape of u: {}".format(u.shape))
print("Shape of s: {}".format(np.diag(s).shape))
print("Shape of v: {}".format(v.shape))


# Compute components of threshold tau
n = A_centered.shape[0]
p = np.count_nonzero(train_ratings) / (train_ratings.shape[0]*train_ratings.shape[1])

# Collect MSE/hyperparam vals
MSEs = []
taus = []

# Pick random hyperparam values to iterate over
eta_vals = np.random.random(size=50)
eta_vals.sort()

# Iterate over hyperparameter values
for tau in [2*i for i in range(50)]:

    print("Tau is: {}".format(tau))
    
    # Threshold and sum to compute matrix estimate
    s_thresh = np.multiply(s,s>tau)
    A_hat = u @ np.diag(s_thresh) @ v # Matrix estimate

    # Clip values of estimated matrix
    A_hat[A_hat < -1] = -1
    A_hat[A_hat > 1] = 1

    # Set matrix on 1-5 scale, as in original case
    A_hat = 2*A_hat + 3

    # Now compute MSE for SVT matrix and interpolated matrix
    nz_indices = val_ratings!=0 # Binary mask

    # Compute MSE for SVT matrix and save value
    MSE_SVT = np.mean(np.square(np.subtract(A_hat[nz_indices],val_ratings[nz_indices])))
    MSEs.append(MSE_SVT)
    taus.append(tau)
    
# Compute MSE for interpolated matrix
MSE_INTERP = np.mean(np.square(np.subtract(mu,val_ratings[nz_indices])))

# Now graph results
plt.plot(taus,MSEs)
plt.xlabel("Thresholding Value (Tau)")
plt.ylabel("MSE")
plt.title("MSE of SVT Estimate as a Function of Thresholding Value Tau")
plt.show()
plt.clf()

In [69]:
# Now we can compute test MSE for optimal hyperparam

# Let's first interpolate our ratings and fill in with mean value
mu = np.mean(train_ratings[train_ratings!=0])
A = copy.deepcopy(train_ratings)
A[A==0] = mu

# Now center our data for USVT
A_centered = (A-3)/2

# Initialize output data structure
A_hat = np.zeros(A_centered.shape)

# Compute SVD
#u,s,v = np.linalg.svd(A_centered,full_matrices=False)
print("Shape of u: {}".format(u.shape))
print("Shape of s: {}".format(np.diag(s).shape))
print("Shape of v: {}".format(v.shape))


# Compute components of threshold tau
n = A_centered.shape[0]
p = np.count_nonzero(train_ratings) / (train_ratings.shape[0]*train_ratings.shape[1])

# Set hyperparameter
tau = 25
    
# Threshold and sum to compute matrix estimate
s_thresh = np.multiply(s,s>tau)
A_hat = u @ np.diag(s_thresh) @ v # Matrix estimate

# Clip values of estimated matrix
A_hat[A_hat < -1] = -1
A_hat[A_hat > 1] = 1

# Set matrix on 1-5 scale, as in original case
A_hat = 2*A_hat + 3

# Sanity check
print(np.amax(A_hat), np.amin(A))
print(np.amax(test_ratings), np.amin(test_ratings[test_ratings!=0]))

# Now compute MSE for SVT matrix and interpolated matrix
nz_indices = test_ratings!=0 # Binary mask for test ratings

# Compute MSE for SVT matrix (test)
MSE_SVT = np.mean(np.square(np.subtract(A_hat[nz_indices],test_ratings[nz_indices])))

    
# Compute MSE for interpolated matrix (test)
MSE_INTERP = np.mean(np.square(np.subtract(mu,test_ratings[nz_indices])))

print("Validation Mean Squared Error using interpolated matrix: {}".format(MSE_INTERP))
print("Test Mean Squared Error using tau = {}: {}".format(tau, MSE_SVT))

Shape of u: (6040, 3952)
Shape of s: (3952, 3952)
Shape of v: (3952, 3952)
5.0 1.0
5.0 1.0
Validation Mean Squared Error using interpolated matrix: 1.2456092571485262
Test Mean Squared Error using tau = 25: 1.0033262138317458


# 4.2 Alternating Least Squares

In [60]:
# Code for Alternating Least Squares

# First, set hyperparameter value k and max_epochs
k = 5
max_epochs = 10000

# Get shape of matrix we're trying to estimate
n,m = train_ratings.shape

# First, we need to initialize U and V
U = np.random.random(size=((n,k)))
V = np.random.random(size=((m,k)))

# Now we iterate until U and V converge
for epoch in range(max_epochs):
    # First, perform gradient update
    loss = 


# 4.3 Collaborative Filtering

In [3]:
import pickle

# Code for Collaborative Filtering
def cosine_similarity(a, b, ratings):
    ra, rb = ratings[a], ratings[b]
    # Code to get common ids
    a_ids = list(ra.keys())
    b_ids = list(rb.keys())

    a_set = set(list(ra.keys()))
    b_set = set(list(rb.keys()))
    
    common_ids = list(a_set.intersection(b_set))
    
    mu_a = np.mean([ra[common_ids[i]] for i in range(len(common_ids))])
    mu_b = np.mean([rb[common_ids[i]] for i in range(len(common_ids))])
    
    ra_p = np.array([ra[common_ids[i]] for i in range(len(common_ids))]) - mu_a
    rb_p = np.array([rb[common_ids[i]] for i in range(len(common_ids))]) - mu_b

    # Get cosine angle
    nz_check = len(np.nonzero(np.linalg.norm(ra_p)*np.linalg.norm(rb_p))[0])
    if nz_check:
        return np.dot(ra_p,rb_p)/(np.linalg.norm(ra_p)*np.linalg.norm(rb_p))
    else:
        return 0
        

def make_rating_dict(ratings):
    # Get matrix of ratings such that R[a] maps to a dictionary of {id: R_a(id)}.
    user_ids = ratings["user_id"].to_numpy()
    movie_ids = ratings["movie_id"].to_numpy()
    rating_ids = ratings["rating"].to_numpy()
    user_set = list(set(user_ids))
    R = {user_set[i]:{} for i in range(len(user_set))}

    counter = 0
    for user_id, movie_id, rating_id in zip(user_ids, movie_ids, rating_ids):
        if counter % 100000 == 0:
            print("Iterated through {} ratings".format(counter))
        R[user_id][movie_id] = rating_id
        counter += 1
    return R, user_set

def make_similarity_matrix(R, user_set):
    pair_counter = 0
    sim = np.zeros((len(user_set),len(user_set)))
    for a in user_set:
        for b in user_set: # Users are most similar to themselves
            if a == b:
                pass
            if pair_counter % 1000000 == 0:
                print("Iterated through {} pairs of users".format(pair_counter))
            sim[a-1][b-1] = cosine_similarity(a, b, R)
            pair_counter += 1
    np.save("SIM_MATRIX.npy", sim, allow_pickle=True)
    return sim



In [None]:
# Make rating dict and similarity matrix
R, user_set = make_rating_dict(ratings)
S = make_similarity_matrix(R, user_set)
print("SIMILARITY MATRIX SAVED AS: SIM_MATRIX.NPY")

Iterated through 0 ratings
Iterated through 100000 ratings
Iterated through 200000 ratings
Iterated through 300000 ratings
Iterated through 400000 ratings
Iterated through 500000 ratings
Iterated through 600000 ratings
Iterated through 700000 ratings
Iterated through 800000 ratings
Iterated through 900000 ratings
Iterated through 1000000 ratings
Iterated through 0 pairs of users
Iterated through 1000000 pairs of users
Iterated through 2000000 pairs of users
Iterated through 3000000 pairs of users
Iterated through 4000000 pairs of users
Iterated through 5000000 pairs of users
Iterated through 6000000 pairs of users
Iterated through 7000000 pairs of users
Iterated through 8000000 pairs of users
Iterated through 9000000 pairs of users
Iterated through 10000000 pairs of users
Iterated through 11000000 pairs of users
Iterated through 12000000 pairs of users
Iterated through 13000000 pairs of users
Iterated through 14000000 pairs of users
Iterated through 15000000 pairs of users
Iterated thr

In [None]:
# Import for plotting
import matplotlib.pyplot as plt

# Now make code for collaborative filtering
def collaborative_filtering(train, sim, k=100):
    n,m = train.shape
    R_hat = np.zeros((n,m))
    for user in range(n): # User-specific code
        
        # Take specific row of matrix for user
        user_sim = sim[a]
        
        # Find top k most similar users to user in question        
        k_indices = list(np.argsort(user_sim))[:k]
        
        # Now we can iterate over movies
        for movie in range(m):
            if train[user][movie] != 0:
                R[hat] = train[user][movie]
            # Get indices for similar people who have rated the movie and average
            average = np.sum(np.multiply(train[k_indices][movie], 
                                         train[k_indices][movie] != 0))/np.nonzero(train[k_indices][movie])
            R_hat[user][movie] = average
            
    return R_hat

# Function for computing Mean Squared Error
def compute_MSE(R_hat,test_ratings):
    # Binary mask for test/validation ratings
    nz_indices = test_ratings!=0 

    # Compute MSE for Collaborative Filtering matrix (validation)
    MSE_SVT = np.mean(np.square(np.subtract(R_hat[nz_indices],test_ratings[nz_indices])))
    return MSE_SVT

##########################
#HYPERPARAMS
##########################
K = [1,2,5,10,15,20,25,40,50,75,100,125,150,200,250,300,400,500,600,700]

# Now load train ratings and similarity matrix
train = train_ratings
val = val_ratings
test = test_ratings
sim = np.load("SIM_MATRIX.npy")

ks = []
val_MSEs = []
test_MSEs = []

# Iterate over different values of our hyperparameter
for k in K:
    R_hat = collaborative_filtering(train, sim, k)
    # Now get validation MSE
    val_MSE = compute_MSE(R_hat, val)
    test_MSE = compute_MSE(R_hat, test)

    # Append values for later plotting
    ks.append(k)
    val_MSEs.append(val_MSE)
    test_MSEs.append(test_MSE)
    print("k: {}, val_MSE: {}, test_MSE: {}".format(k, val_MSE, test_MSE))

# Print out ks, val_MSEs, test_MSEs
print("Hyperparameter values are: {}".format(ks))
print("Validation errors are: {}".format(val_MSEs))
print("Test errors are: {}".format(test_MSEs))


        

In [None]:
# Now plot validation error
plt.plot(ks,val_MSEs)
plt.xlabel("Number of similar users (k)")
plt.ylabel("Validation Error (MSE)")
plt.title("Validation Error of Collaborative Filtering as a Function of k")
plt.show()
plt.clf()

# Now plot test error
plt.plot(ks,test_MSEs)
plt.xlabel("Number of similar users (k)")
plt.ylabel("Test Error (MSE)")
plt.title("Test Error of Collaborative Filtering as a Function of k")
plt.show()
plt.clf()


# 4.4 Neural Network

In [None]:
# Code for Neural Networks

# Import packages
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class LensNet(nn.Module):
    def __init__(self, in_params=100, dr=0.5):
        super(LensNet, self).__init__()
        self.drop = nn.Dropout(dr)
        self.fc1 = nn.Linear(in_params, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 8)
        self.fc4 = nn.Linear(8,4)
        self.fc5 = nn.Linear(4,2)
        self.fc6 = nn.Linear(2,1)
        
    def forward(self, features):
        x = torch.tensor(features)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = self.fc6(x)
        return x



# Test neural net
"""
Net = LensNet(in_params=10, dr=0.5)
x = torch.tensor(np.random.randint(0, high=5, size=10))
result = Net.forward(x)
print(result)
"""

# Now get user and movie data
user_data = []
with open("movielens/users.dat") as f:
    for l in enumerate(f):
        user_data.append(l[1].split("::"))
        
movie_data = []
with open("movielens/movies.dat", encoding="ISO-8859-1") as f:
    for l in enumerate(f):
        movie_data.append(l[1].split("::"))

    
rating_data = []
with open("movielens/ratings.dat") as f:
    for l in enumerate(f):
        rating_data.append(l[1].split("::"))
        
print(rating_data)


