# Part I. Ingesting data.

Create a dictionary with movies for each genre.

In [1]:
import numpy as np
import time

### First, read in the similarity matrix. Not all movies have ratings, so we'll ignore the ones that are unrated.

In [2]:
# Load the per-movie rating vectors from mvs.csv. Every non-blank row is parsed
# into a list of floats; the first field of a row is the movie ID (it is used
# as the dictionary key when simMat is built).
# The `with` block closes the file handle as soon as parsing finishes, instead
# of leaving it open for the lifetime of the kernel.
with open("mvs.csv", "r") as simDataFile:
    simData = [
        [float(field) for field in line.split(',')]
        for line in simDataFile
        if line.strip()  # skip blank lines (e.g. a trailing newline at EOF)
    ]

In [3]:
# Report how many rows (rated movies) were loaded. Joining on a single space
# reproduces the separator the comma-form print statement inserts.
print(' '.join([str(len(simData)), ' movies have ratings']))

10473  movies have ratings


In [4]:
# Map each movie ID (first field of a row, cast to int) to the remaining
# fields of that row scaled to unit Euclidean length.
simMat = {}

for row in simData:
    key, vector = int(row[0]), row[1:]
    simMat[key] = vector / np.linalg.norm(vector)

In [5]:
# Sanity check: inner product of the normalized vectors of movies 1 and 4.
simMat[1].dot(simMat[4])

0.75932462063717177

### Now, read in the movies.

In [6]:
# Category index -> list of the movie IDs (integers) filed under that category.
movies = {}

# Two-way category lookup: catID maps a category name (string) to its integer
# index, and catName maps that index back to the name.
catID, catName = {}, {}

# Two-way movie lookup, organised the same way as the category tables:
# movieID maps a movie name to its ID, movieName maps the ID back to the name.
movieID, movieName = {}, {}

In [7]:
# Parse movies.csv and fill the lookup tables (movies, catID, catName, movieID,
# movieName). Each data row has the form "<movieID>,<title>,<cat1|cat2|...>".
# The title may itself contain commas, so the ID is split off from the left
# and the category list from the right.

# keep the data-set small for now
maxMovies = 2000

numMovies = 0
numCategories = 0

# `with` guarantees the file is closed, even if a row fails to parse.
with open("movies.csv", "r") as allData:
    # this first line contains header info
    next(allData, None)

    for line in allData:
        # Strip the line terminator whether the file uses "\n" or "\r\n";
        # otherwise an LF-only file leaves "\n" glued to the last category.
        line = line.rstrip("\r\n")
        if not line:
            continue  # tolerate blank lines instead of crashing in int()

        idField, rest = line.split(",", 1)
        curMovieID = int(idField)
        curMovieName = rest.rsplit(",", 1)[0]
        curCategories = line.rsplit(",", 1)[1].split("|")

        # Skip movies without a rating vector. Testing the dict directly is a
        # hash lookup; `.keys()` builds and scans a list in Python 2.
        if curMovieID not in simMat:
            continue

        # update movieID, movieName
        movieID[curMovieName] = curMovieID
        movieName[curMovieID] = curMovieName

        for cat in curCategories:
            # First time this category is seen: assign it the next free index.
            if cat not in catID:
                catID[cat] = numCategories
                catName[numCategories] = cat
                numCategories += 1

            # File the movie under the category's index.
            movies.setdefault(catID[cat], []).append(curMovieID)

        numMovies += 1

        if numMovies == maxMovies:
            break

print("we have %d movies" % numMovies)

we have 2000 movies


In [8]:
# Peek at the first ten movie IDs filed under the 'Horror' category.
horrorIndex = catID['Horror']
movies[horrorIndex][:10]

[12, 22, 70, 92, 93, 152, 177, 183, 188, 196]

# Part II. Build the algorithm in Spark

In [9]:
import findspark
import os

findspark.init()
import pyspark

In [10]:
# Create the Spark context with default settings; `sc` is what the timing cell
# further down uses to parallelize the movie IDs.
sc = pyspark.SparkContext()

In [11]:
from novel import wrapper

In [12]:
# Show the number of movies kept and the number of distinct categories seen.
print('%s %s' % (numMovies, numCategories))

2000 19


In [13]:
# Build the selection routine from the project's `wrapper` factory. The
# resulting callable is applied per Spark partition and then once more to the
# merged partition results in the timing cell.
# NOTE(review): the literals 7 and 3 are undocumented parameters of `wrapper`.
# Presumably 7 is the number of movies to select (the final result below has
# 7 entries) — confirm against novel.py and consider naming both values.
novel = wrapper(numMovies, numCategories, 7, 3, simMat, movies);

In [14]:
# start = time.time()
# print sorted(novel(movieName.keys())), time.time() - start

In [15]:
# Two-stage run: first apply `novel` to each of 4 partitions of the movie IDs
# on Spark, then apply it once more on the driver to the collected results.
start = time.time()

allMovs = (
    sc.parallelize(movieName.keys(), 4)
      .mapPartitions(novel)
      .collect()
)

# Elapsed wall-clock time of the distributed stage.
print(time.time() - start)

goodMovs = novel(allMovs)

# Final selection (sorted) and total elapsed time for both stages.
print('%s %s' % (sorted(goodMovs), time.time() - start))

60.4492821693
3969.80251817
[335, 354, 415, 616, 688, 1342, 2120] 60.6553859711


In [16]:
# for mov in novel(goodMovs):
#     print movieName[mov]