# Part I. Ingesting data.

Create a dictionary with movies for each genre.

In [1]:
import numpy as np
import math
import time

### First, read in the similarity matrix. Not all movies have ratings, so we'll ignore the ones that are unrated.

In [2]:
# Each row of mvs.csv is parsed entirely as floats; later cells use the first
# field as the movie id and the remaining fields as that movie's vector.
# The context manager closes the file as soon as every row has been read.
with open("mvs.csv", "r") as simDataFile:
    simData = [[float(field) for field in line.split(',')] for line in simDataFile]

In [3]:
print len(simData), ' movies have ratings'

10473  movies have ratings


In [4]:
# movie id (first field of each row) -> remaining fields scaled to unit L2 norm.
simMat = {}

for sd in simData:
    features = np.asarray(sd[1:])
    simMat[int(sd[0])] = features / np.linalg.norm(features)

In [5]:
# The stored vectors have unit norm, so this dot product is the cosine
# similarity between movies 1 and 4.
simMat[1].dot(simMat[4])

0.75932462063717177

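### Read in the ratings. We will be working only with the top 100 movies. The raw ratings are in the format userId,movieId,rating,timestamp; the preprocessed file read here has one line per movie, with the movieId as the first field and its rating as the second.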

In [6]:
# movie id -> rating. Only the first two fields of each line are used, but
# every field on the line must parse as a float.
ratings = dict()

# The context manager closes the file once all lines have been consumed.
with open("procRatings.csv", "r") as ratingsFile:
    for line in ratingsFile:
        data = [float(field) for field in line.split(',')]
        ratings[int(data[0])] = data[1]

# Spot-check a few entries.
ratings[1], ratings[2], ratings[3], ratings[30848]

(3.92123956132, 3.21197680169, 3.15104043973, 3.61520190024)

#### Set the rating of the 100th highest-rated movie as the threshold

In [7]:
# Rating of the 100th best-rated movie: the 100th element from the end of the
# ascending order. Movies rated below this value are skipped when loading.
minRating = sorted(ratings.values())[-100]

### Now, read in the movies.

In [8]:
# category id -> list of the movie ids (integers) that belong to that category
movies = {}

# Two-way category lookup: catID maps a category name (string) to its integer
# index, catName maps that index back to the name.
catID, catName = {}, {}

# Same two-way lookup for movies: title -> id and id -> title.
movieID, movieName = {}, {}

In [9]:
allData = open("movies.csv", "r")

# this first line contains header info
allData.readline()

numMovies = 0
numCategories = 0

while True:
    line = allData.readline()
    
    if line == '':
        break
    
    curMovieID = int(line.split(",", 1)[0])
    if curMovieID not in simMat.keys():
        continue
    if ratings[curMovieID] < minRating:
        continue
    
    curMovieName = (line.split(",", 1)[1]).rsplit(",", 1)[0]
    curCategories = line.rsplit(",", 1)[1].rsplit("\r")[0].split("|")
    
    
    # update catID, catName, movieID, movieName
    movieID[curMovieName] = curMovieID
    movieName[curMovieID] = curMovieName
    
    for cat in curCategories:
        if not (cat in catID):
            catID[cat] = numCategories
            catName[numCategories] = cat
            
            numCategories = numCategories + 1
            
    for cat in curCategories:
        if catID[cat] in movies:
            movies[catID[cat]].append(movieID[curMovieName])
        else:
            movies[catID[cat]] = [movieID[curMovieName]]
    
    numMovies = numMovies + 1

print "we have", numMovies, "movies"

we have 100 movies


In [10]:
# Show at most the first ten movie ids filed under the 'Horror' category.
horrorID = catID['Horror']
movies[horrorID][:10]

[593]

# Part II. Run the algorithms

In [11]:
l = 10
k = 5

print numMovies, numCategories, l, k

100 19 10 5


In [12]:
from novel import wrapper
novel = wrapper(numMovies, numCategories, l, k, simMat, movies);

start = time.time()

novS, novCost, novEvals, novBstS = novel(movieName.keys())

print "Number of calls = ", novEvals

print "Algorithm runtime = ", time.time() - start

Our solution gives totalCost =  250.059830688
Number of calls =  62358
Algorithm runtime =  4.89803791046


In [13]:
# sanity check
print len(novS)

for i in range(numCategories):
    print set(novBstS[i]).issubset(set(novS)), len(novBstS[i]) == k

10
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True


In [14]:
from greedysum import gsWrapper

gs = gsWrapper(numMovies, numCategories, l, k, simMat, movies)

start = time.time()

gsS, gsCost, gsEvals = gs(movieName.keys())

print "Number of calls = ", gsEvals

print "Algorithm runtime = ", time.time() - start

Greedy Sum gives cost =  249.953779285
Number of calls =  19779
Algorithm runtime =  1.7732899189


In [15]:
from greedymerge import gmWrapper

gm = gmWrapper(numMovies, numCategories, l, k, simMat, movies)

start = time.time()

gmS, gmCost, gmEvals = gm(movieName.keys())

print "Number of calls = ", gmEvals

print "Algorithm runtime = ", time.time() - start

Greedy Merge gives cost =  250.859756776
Size of S is  47
Number of calls =  9329
Algorithm runtime =  0.520768165588


In [16]:
from localsearch import lsWrapper

ls = lsWrapper(numMovies, numCategories, l, k, 0.5, simMat, movies)

start = time.time()

lsS, lsCost, lsEvals = ls(movieName.keys())

print "Number of calls = ", lsEvals

print "Local Search should do O(k m l n^2 log(n)) evaluations = ", \
    k * numCategories * l * numMovies**2 * np.log(numMovies)

print "Algorithm runtime = ", time.time() - start

Picking l random elements gives cost =  248.093638343
Working for step  0
Replaced index  0  with index  1  =>  [1203, 670, 6669, 2324, 4973, 1178, 2186, 6016, 1224, 2571]
Intermediate cost =  249.123229373

Working for step  1
Replaced index  3  with index  28  =>  [1203, 670, 2324, 4973, 1178, 2186, 6016, 1224, 2571, 1196]
Intermediate cost =  249.368145142

Working for step  2
Replaced index  24  with index  3  =>  [1203, 2324, 4973, 1178, 2186, 6016, 1224, 2571, 1196, 6669]
Intermediate cost =  249.48053595

Working for step  3
Replaced index  21  with index  38  =>  [1203, 2324, 4973, 2186, 6016, 1224, 2571, 1196, 6669, 58559]
Intermediate cost =  249.623676981

Working for step  4
Replaced index  1  with index  2  =>  [1203, 2324, 4973, 2186, 6016, 1224, 1196, 6669, 58559, 77658]
Intermediate cost =  249.66980868

Working for step  5
Replaced index  2  with index  17  =>  [1203, 2324, 4973, 2186, 6016, 1224, 1196, 6669, 58559, 1148]
Intermediate cost =  249.73773223

Working for 