### In this set of experiments, we are given a completed ratings matrix.

In [1]:
import numpy as np
import math
import time
import matplotlib.pyplot as plt
import random

from scipy import stats

In [2]:
# Load "ratingsSummary.csv" into two lookup tables:
#   userRatings:    user id -> {movie id -> rating}
#   relevantMovies: movie id -> True (used as a set of every rated movie)
# Each non-blank line is read as comma-separated numbers: user, movie, rating.
userRatings = dict()
relevantMovies = dict()

# `with` guarantees the file is closed, even if a line fails to parse.
with open("ratingsSummary.csv", "r") as ratingsFile:
    for line in ratingsFile:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        # Build a real list so the fields can be indexed on Python 2 and 3.
        data = [float(field) for field in line.split(',')]

        usr = int(data[0])
        mov = int(data[1])
        # NOTE(review): int() truncates fractional ratings (3.5 -> 3);
        # confirm the ratings in this file are integral.
        r = int(data[2])

        userRatings.setdefault(usr, dict())[mov] = r
        relevantMovies[mov] = True

# Number of distinct users, then number of distinct rated movies.
print("%d %d" % (len(userRatings), len(relevantMovies)))

200 1995


### Now, read in the movies.

In [3]:
# Genre-level tables.
movies = {}     # category ID -> list of movie IDs (integers) in that category
movieCats = {}  # movie ID -> list of category names the movie is part of

# Category name <-> category index: catID maps a category name (string) to
# its index, catName maps the index back to the name.
catID = {}
catName = {}

# Same idea for movies: movieID maps a title to its ID, movieName the reverse.
movieID = {}
movieName = {}

In [4]:
# Parse "movies.csv" (lines of the form id,title,genre1|genre2|...) and fill
# the lookup tables for every movie that appears in relevantMovies:
#   movieID / movieName : title <-> movie id
#   movieCats           : movie id -> list of genre names
#   catID / catName     : genre name <-> genre index (assigned in first-seen order)
#   movies              : genre index -> list of movie ids
numMovies = 0
numGenres = 0

# `with` guarantees the file is closed, even if a line fails to parse.
with open("movies.csv", "r") as allData:
    # The first line contains header info; skip it (no error on an empty file).
    next(allData, None)

    for line in allData:
        # Drop the line terminator up front so the last genre is clean for
        # both "\r\n" and "\n" line endings.
        line = line.rstrip("\r\n")
        if not line:
            continue

        fields = line.split(",", 1)
        curMovieID = int(fields[0])
        # Direct dict membership test; no need to materialise the key list.
        if curMovieID not in relevantMovies:
            continue

        # The title may itself contain commas, so it is everything between
        # the first and the last comma; the genres follow the last comma.
        curMovieName = fields[1].rsplit(",", 1)[0]
        curCategories = line.rsplit(",", 1)[1].split("|")

        # update movieID, movieName, movieCats
        movieID[curMovieName] = curMovieID
        movieName[curMovieID] = curMovieName
        movieCats[curMovieID] = curCategories

        for cat in curCategories:
            # Register a genre the first time it is seen.
            if cat not in catID:
                catID[cat] = numGenres
                catName[numGenres] = cat
                numGenres += 1

            movies.setdefault(catID[cat], []).append(curMovieID)

        numMovies += 1

print("we have %d movies" % numMovies)

we have 1995 movies


In [5]:
# Spot-check the parsed tables using the 'Horror' genre: its first ten movie
# ids, then the title and genre list of its first movie, then all genre names.
horrorIDs = movies[catID['Horror']]
firstHorror = horrorIDs[0]

print(horrorIDs[:10])
print(movieName[firstHorror])
print(movieCats[firstHorror])
print(list(catID.keys()))

[593, 1076, 1200, 1214, 1219, 1258, 1261, 1333, 1340, 1348]
"Silence of the Lambs, The (1991)"
['Crime', 'Horror', 'Thriller']
['Mystery', 'Romance', 'Sci-Fi', 'Musical', 'Film-Noir', 'Crime', 'Drama', 'Fantasy', 'Western', 'Animation', 'War', 'Adventure', 'Horror', 'Action', '(no genres listed)', 'Comedy', 'Documentary', 'Children', 'Thriller', 'IMAX']


In [6]:
# Report the size of each lookup table built above.
for count, label in ((len(userRatings), 'users'),
                     (len(movieCats), 'movies'),
                     (len(catID), 'genres')):
    print("We have %d %s" % (count, label))

We have 200 users
We have 1995 movies
We have 20 genres


In [7]:
# Dimensions of the ratings matrix: one column per movie, one row per user.
numMovies, numUsers = len(movieCats), len(userRatings)

print("%d %d" % (numMovies, numUsers))

1995 200


### Now, complete the ratings matrix

In [18]:
# Build the (numUsers x numMovies) ratings matrix. Every entry starts as NaN
# ("not rated") and is overwritten wherever userRatings holds a rating.
ratingsIncomplete = np.full((numUsers, numMovies), np.nan, dtype=float)

# Row / column order of the matrix. Materialised as lists so they can be
# indexed by position on both Python 2 and Python 3.
movieList = list(movieName.keys())
userList = list(userRatings.keys())

# Position lookups computed once, instead of a linear list.index() scan for
# every single rating inside the nested loop.
movieCol = {mov: col for col, mov in enumerate(movieList)}
userRow = {user: row for row, user in enumerate(userList)}

for user, ratings in userRatings.items():
    row = userRow[user]
    for mov, rating in ratings.items():
        # A rated movie missing from movieName raises KeyError here.
        ratingsIncomplete[row, movieCol[mov]] = rating

In [19]:
from fancyimpute import SoftImpute

# Fill in the NaN entries of ratingsIncomplete with SoftImpute and time the run.
# min_value / max_value presumably bound the imputed values to [0, 5] - confirm
# against the fancyimpute version in use.
startTime = time.time()

solver = SoftImpute(min_value=0.0, max_value=5.0)
ratingsComplete = solver.complete(ratingsIncomplete)

# Elapsed wall-clock time in seconds.
elapsed = time.time() - startTime
print(elapsed)

[SoftImpute] Max Singular Value of X_init = 1123.683327
[SoftImpute] Iter 1: observed MAE=0.522688 rank=196
[SoftImpute] Iter 2: observed MAE=0.537585 rank=193
[SoftImpute] Iter 3: observed MAE=0.548181 rank=177
[SoftImpute] Iter 4: observed MAE=0.548157 rank=153
[SoftImpute] Iter 5: observed MAE=0.544339 rank=134
[SoftImpute] Iter 6: observed MAE=0.539971 rank=116
[SoftImpute] Iter 7: observed MAE=0.535780 rank=104
[SoftImpute] Iter 8: observed MAE=0.532160 rank=93
[SoftImpute] Iter 9: observed MAE=0.528867 rank=85
[SoftImpute] Iter 10: observed MAE=0.526145 rank=78
[SoftImpute] Iter 11: observed MAE=0.523777 rank=74
[SoftImpute] Iter 12: observed MAE=0.521712 rank=71
[SoftImpute] Iter 13: observed MAE=0.519792 rank=68
[SoftImpute] Iter 14: observed MAE=0.518061 rank=65
[SoftImpute] Iter 15: observed MAE=0.516570 rank=62
[SoftImpute] Iter 16: observed MAE=0.515245 rank=61
[SoftImpute] Iter 17: observed MAE=0.514135 rank=61
[SoftImpute] Iter 18: observed MAE=0.513141 rank=60
[SoftImput

In [20]:
# Save the completed matrix as space-separated text. tofile() writes the
# values flat, without shape information, so readers must reshape the data
# back to (numUsers, numMovies).
# "%.17g" keeps enough digits to round-trip a float64 exactly; the previous
# "%s" goes through str(float), which on Python 2 keeps only 12 significant
# digits and silently loses precision.
ratingsComplete.tofile('completedMatrix.txt', sep=" ", format="%.17g")

In [21]:
# Read the saved matrix back. The text file carries no shape information, so
# the values arrive as a flat vector and are reshaped to users x movies.
new_data = np.loadtxt('completedMatrix.txt')
print(new_data.shape)
new_data = new_data.reshape(numUsers, numMovies)

# First 100 entries of the first user's row, before and after completion.
print(ratingsIncomplete[0, :100])
print(ratingsComplete[0, :100])

(399000,)
[  4.   2.   1.   4.   1.   1.  nan   2.   1.  nan   2.  nan   3.   2.  nan
  nan   2.   5.  nan   2.  nan  nan   1.  nan  nan  nan   3.  nan  nan   1.
  nan  nan   3.   2.  nan  nan   4.  nan   1.  nan  nan   3.  nan  nan  nan
   3.   3.   2.   4.   2.  nan   3.  nan  nan   3.   4.  nan  nan  nan   1.
  nan  nan   4.  nan  nan  nan   4.  nan   4.  nan  nan  nan   1.   4.   4.
   3.  nan   1.   3.   2.  nan   2.   2.   3.  nan   1.   2.   3.  nan  nan
  nan   2.   2.   2.  nan   3.   5.  nan  nan   1.]
[ 4.          2.          1.          4.          1.          1.
  2.72127607  2.          1.          2.25120966  2.          1.57156987
  3.          2.          2.25764631  2.49203325  2.          5.
  1.52421058  2.          2.24577134  1.93828605  1.          1.94505764
  2.48057829  2.25902861  3.          2.47004573  2.27368885  1.
  2.31629372  1.77087709  3.          2.          1.74061402  2.21564699
  4.          2.31416744  1.          2.03823933  2.43110943  3.
  2