In [4]:
'''
Tested on: Python 3.5.2, numpy 1.14.1, pandas 0.20.3

Please read README for info about the dataset.

**Please note that all code here is optional -- feel free to
use a completely different implementation.**
'''
import numpy as np
import pandas as pd

np.random.seed(42)

def get_user_data():
    '''Returns user info as a Pandas DataFrame,
    where rows are users and columns are the features
    UserID, Gender, Age, Occupation, Zip Code'''
    users_filename = "users.dat"
    return pd.read_csv(users_filename, header=None, sep='::', engine='python')


def get_movie_data():
    '''Returns movie info as a Pandas DataFrame,
    where rows are movies and columns are the features
    MovieID, Title, Genres'''
    movies_filename = "movies.dat"
    return pd.read_csv(movies_filename, header=None, sep='::', engine='python')


def get_rating_data():
    '''Returns rating info as two 2D numpy arrays:
    one for training and one for testing.
    Rows are ratings and columns are
    UserID, MovieID, Rating, Timestamp'''
    ratings_filename = "ratings.dat"
    df = pd.read_csv(ratings_filename, header=None, sep='::', engine='python')
    data = df.ix[:,:2].values
    np.random.shuffle(data)

    train_length = int(df.shape[0] * .8)

    train = data[:train_length]
    test = data[train_length:]
    return train, test

data_train, data_test = get_rating_data()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [5]:
def get_rating_matrix(df, num_users, num_movies):
    '''Given a Pandas Dataframe containing UserID, MovieID, and Rating,
    returns a 2D numpy array Y where Y[UserID][MovieID] = Rating for all entries
    in df, with all other elements equal to None.'''
    # Y = np.
    #train, text = get_ratings_data()

    matrix = np.zeros(shape=(num_users,num_movies))

    max_movie = 0
    max_user = 0

    for row in df:
        user = row[0]
        movie = row[1]
        rating = row[2]
        matrix[user][movie] = rating

        if movie > max_movie:
            max_movie = movie
        if user > max_user:
            max_user = user
    print(max_user, max_movie)


    return matrix
Y_prime = get_rating_matrix(data_train, 6041, 3953)

6040 3952


In [7]:
U,sigma,V=np.linalg.svd(Y_prime)

In [12]:
print(U[0].T.shape)
print(V[0].reshape(3953,1).shape)

print(U[0].reshape(6041,1)@V[0].reshape(3953,1).T)

(6041,)
(3953, 1)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
tau_h=2
A_hat = np.zeros(shape=(6041, 3953))

for i in range(len(V)):
    if sigma[i] > tau_h:
        A_hat += sigma[i]*U[i].reshape(6041,1)@V[i].reshape(3953,1).T