# Part I. Ingesting data.

Create a dictionary with movies for each genre.

In [1]:
import numpy as np
import math
import time

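### First, read in the similarity data: each row of `mvs.csv` is a movie ID followed by that movie's vector. Not all movies have ratings, so unrated movies (those missing from this file) are skipped when we read the movie list below.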

In [2]:
# Load the similarity data: one row per movie, comma-separated, with the
# movie ID in the first column and the movie's vector in the remaining ones.
# The `with` block guarantees the file handle is closed once parsing is done.
with open("mvs.csv", "r") as simDataFile:
    simData = [
        [float(field) for field in line.split(',')]
        for line in simDataFile
        if line.strip()  # tolerate blank lines (e.g. a trailing newline)
    ]

In [3]:
print len(simData), ' movies have ratings'

10473  movies have ratings


In [4]:
# Map each movie ID (first column of a row) to its unit-length vector, so
# that a plain dot product of two entries is their cosine similarity.
simMat = {}

for row in simData:
    vector = np.asarray(row[1:])
    simMat[int(row[0])] = vector / np.linalg.norm(vector)

In [5]:
# Sanity check: similarity between movies 1 and 4 (dot product of their
# normalised vectors).
simMat[1].dot(simMat[4])

0.75932462063717177

### Now, read in the movies.

In [6]:
# category ID (int) -> list of movie IDs (ints) belonging to that category
movies = {}

# Two-way category lookup: catID maps a category name (string) to its integer
# index, catName maps the index back to the name.
catID = {}
catName = {}

# Two-way movie lookup, analogous to the category tables: movieID maps a
# title to its ID, movieName maps an ID back to the title.
movieID = {}
movieName = {}

In [7]:
allData = open("movies.csv", "r")

# this first line contains header info
allData.readline()

numMovies = 0
numCategories = 0

while True:
    line = allData.readline()
    
    if line == '':
        break
    
    curMovieID = int(line.split(",", 1)[0])
    curMovieName = (line.split(",", 1)[1]).rsplit(",", 1)[0]
    curCategories = line.rsplit(",", 1)[1].rsplit("\r")[0].split("|")
    
    if curMovieID not in simMat.keys():
        continue
    
    # update catID, catName, movieID, movieName
    movieID[curMovieName] = curMovieID
    movieName[curMovieID] = curMovieName
    
    for cat in curCategories:
        if not (cat in catID):
            catID[cat] = numCategories
            catName[numCategories] = cat
            
            numCategories = numCategories + 1
            
    for cat in curCategories:
        if catID[cat] in movies:
            movies[catID[cat]].append(movieID[curMovieName])
        else:
            movies[catID[cat]] = [movieID[curMovieName]]
    
    numMovies = numMovies + 1

    # keep the data-set small for now
#     if numMovies == 3000:
#         break

print "we have", numMovies, "movies"

we have 10473 movies


In [8]:
# Spot-check: the first ten movie IDs filed under the 'Horror' category.
movies[catID['Horror']][:10]

[12, 22, 70, 92, 93, 152, 177, 183, 188, 196]

# Part II. Build the algorithm in Spark

In [9]:
import findspark
import os

# Order matters: findspark.init() has to run before `import pyspark`
# (it is expected to locate the local Spark installation and make pyspark
# importable -- see the findspark documentation).
findspark.init()
import pyspark

In [10]:
# Reuse the already-running SparkContext when this cell is executed again;
# calling the constructor a second time in the same kernel raises an error
# because only one context may be active at a time.
sc = pyspark.SparkContext.getOrCreate()

In [11]:
from novel import wrapper

In [12]:
print numMovies, numCategories

10473 19


In [13]:
# Build the per-partition selection routine from the project's `novel` module.
# NOTE(review): 15 presumably is the solution size l (the loop below uses
# goal = 4 * 15 and the final solution has 15 movies); the meaning of 5 is not
# visible in this notebook -- confirm against novel.wrapper.
novel = wrapper(
    numMovies,
    numCategories,
    15,
    5,
    simMat,
    movies
)

In [14]:
# start = time.time()
# print sorted(novel(movieName.keys())), time.time() - start

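### We will run the algorithm in several stages. At each stage we split the remaining movies into partitions of about $n \approx 4l$ movies each, with $l = 15$ here (see `goal = 4 * 15` below), and repeat until at most $4l$ movies remain.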

In [15]:
start = time.time()

startingMovs = movieName.keys()
goal = 4 * 15

while len(startingMovs) > goal:
    start_iter = time.time()
    
    numPartitions = math.ceil(len(startingMovs) / goal)
    
    allMovs = sc.parallelize(startingMovs, numPartitions) \
                .mapPartitions(novel) \
                .collect()
            
    startingMovs = np.copy(allMovs)
    
    print "Ran a new step with ", numPartitions, " partitions in ", time.time() - start_iter, " seconds"
  
print "Above approach took ", time.time() - start, " seconds"

goodMovs = novel(allMovs)

print 'Solution set = ', sorted(goodMovs)

print 'Total runtime = ', time.time() - start, ' seconds'

Ran a new step with  174.0  partitions in  223.89783287  seconds
Ran a new step with  43.0  partitions in  80.4402859211  seconds
Ran a new step with  10.0  partitions in  27.2010560036  seconds
Ran a new step with  2.0  partitions in  13.7275018692  seconds
Above approach took  345.26769495  seconds
Solution set =  [546, 1907, 2327, 2800, 3054, 4781, 6120, 46062, 46337, 54978, 60522, 62376, 81132, 89002, 97757]
Total runtime =  346.872270823  seconds


In [16]:
for mov in sorted(goodMovs):
    print movieName[mov]

Super Mario Bros. (1993)
Mulan (1998)
Tales from the Darkside: The Movie (1990)
Little Nemo: Adventures in Slumberland (1992)
Pokémon: The First Movie (1998)
Megiddo: The Omega Code 2 (2001)
Q: The Winged Serpent (1982)
High School Musical (2006)
Garfield: A Tail of Two Kitties (2006)
"Good Night, The (2007)"
"Machine Girl, The (Kataude mashin gâru) (2008)"
City of Ember (2008)
Rubber (2010)
Spy Kids: All the Time in the World in 4D (2011)
'Hellboy': The Seeds of Creation (2004)
