# Part I. Ingesting data.

Create a dictionary with movies for each genre.

In [1]:
import numpy as np
import math
import time

### First, read in the similarity matrix. Not all movies have ratings, so we'll ignore the ones that are unrated.

In [2]:
# Load mvs.csv: every non-blank line is parsed as a list of comma-separated floats.
# The `with` block guarantees the file handle is closed once the rows are read
# (the handle used to stay open for the lifetime of the kernel).
with open("mvs.csv", "r") as simDataFile:
    # list(map(...)) yields a real list, so rows can be indexed and sliced later;
    # blank lines (e.g. a trailing newline at end of file) are skipped instead of
    # making float() raise ValueError.
    simData = [ list(map(float, line.split(','))) for line in simDataFile if line.strip() ]

In [3]:
print len(simData), ' movies have ratings'

10473  movies have ratings


In [4]:
# movie ID -> unit-length vector.
# The first entry of each row is used as the (integer) movie ID; the remaining
# entries are scaled to Euclidean norm 1.
simMat = {}

for sd in simData:
    vec = np.asarray(sd[1:], dtype=float)
    simMat[int(sd[0])] = vec / np.linalg.norm(vec)

In [5]:
# Spot check: dot product of the normalized vectors stored for movie IDs 1 and 4.
simMat[1].dot(simMat[4])

0.75932462063717177

### Read in the ratings. We will be working only with the top 100 movies. The original ratings are in the format userId,movieId,rating,timestamp; `procRatings.csv` is parsed below as `movieId,rating`, one movie per line.

In [6]:
# movie ID (int) -> rating (float).
# Each non-blank line of procRatings.csv is parsed as comma-separated floats; the
# first value is used as the movie ID and the second as its rating. Any further
# columns are parsed but ignored.
ratings = dict()

# `with` closes the file as soon as the loop finishes (the handle used to leak).
with open("procRatings.csv", "r") as ratingsFile:
    for line in ratingsFile:
        if not line.strip():
            # tolerate blank lines, e.g. a trailing newline at end of file
            continue
        data = list(map(float, line.split(',')))
        ratings[int(data[0])] = data[1]

# Show a few entries as a quick sanity check.
ratings[1], ratings[2], ratings[3], ratings[30848]

(3.92123956132, 3.21197680169, 3.15104043973, 3.61520190024)

#### Set the rating of the 100th highest-rated movie as the threshold

In [7]:
# Rating threshold: the value at index 99 of the ratings sorted from highest to
# lowest, i.e. the 100th-highest rating.
descendingRatings = sorted(ratings.values(), reverse=True)
minRating = descendingRatings[99]

### Now, read in the movies.

In [8]:
# category ID (int) -> list of movie IDs (ints) belonging to that category
movies = {}

# Category lookups in both directions:
#   catID:   category name (string) -> category index
#   catName: category index         -> category name
catID = {}
catName = {}

# Movie lookups in both directions, same layout as the category tables:
#   movieID:   movie name -> movie ID
#   movieName: movie ID   -> movie name
movieID = {}
movieName = {}

In [9]:
allData = open("movies.csv", "r")

# this first line contains header info
allData.readline()

numMovies = 0
numCategories = 0

while True:
    line = allData.readline()
    
    if line == '':
        break
    
    curMovieID = int(line.split(",", 1)[0])
    if curMovieID not in simMat.keys():
        continue
    if ratings[curMovieID] < minRating:
        continue
    
    curMovieName = (line.split(",", 1)[1]).rsplit(",", 1)[0]
    curCategories = line.rsplit(",", 1)[1].rsplit("\r")[0].split("|")
    
    
    # update catID, catName, movieID, movieName
    movieID[curMovieName] = curMovieID
    movieName[curMovieID] = curMovieName
    
    for cat in curCategories:
        if not (cat in catID):
            catID[cat] = numCategories
            catName[numCategories] = cat
            
            numCategories = numCategories + 1
            
    for cat in curCategories:
        if catID[cat] in movies:
            movies[catID[cat]].append(movieID[curMovieName])
        else:
            movies[catID[cat]] = [movieID[curMovieName]]
    
    numMovies = numMovies + 1

print "we have", numMovies, "movies"

we have 100 movies


In [10]:
# Peek at (at most) the first ten movie IDs filed under the 'Horror' category.
movies[catID['Horror']][0:10]

[593]

### Get the similarity distance matrix (pairwise dot products of the normalized movie vectors, i.e. their cosine similarity)

In [11]:
# (movie ID, movie ID) -> dot product of the two movies' normalized vectors.
# Every ordered pair of kept movies is stored, including each movie with itself.
simDist = {}

for k1 in movieName:
    for k2 in movieName:
        simDist[k1, k2] = np.dot(simMat[k1], simMat[k2])

# Part II. Run the algorithms

In [12]:
l = 30
k = 5

print numMovies, numCategories, l, k

100 19 30 5


In [13]:
# Run the algorithm from the project-local `novel` module on the kept movies and
# report the evaluation count and wall-clock runtime.
# NOTE(review): `wrapper` is defined outside this notebook; the meaning of its
# arguments and of the returned triple is inferred from the names used here --
# confirm against novel.py.
from novel import wrapper
# Bind the problem data; the result is a callable that is applied to the movie IDs below.
novel = wrapper(numMovies, numCategories, l, k, simDist, movies);

# Start the timer after construction, so only the call itself is measured.
start = time.time()

# presumably (selected movies, total cost, number of evaluations) -- TODO confirm
novS, novCost, novEvals = novel(movieName.keys())

print "Number of calls = ", novEvals

# Seconds elapsed since `start`; note this also includes the print just above.
print "Algorithm runtime = ", time.time() - start

Our solution gives totalCost =  251.010041042
Category  0
858   25.4370721867
1212   0.685539181522
4226   0.348690323272
58559   0.0825852687581
904   0.065836905251
Total cost  26.6197238655 

Category  1
904   13.6110426076
1212   0.0611189691355
4226   0.208805402058
79132   0.0485518613432
908   0.00458893347119
Total cost  13.9341077736 

Category  2
541   20.7261812928
1212   0.464735451369
4226   0.416627371119
58559   0.0566058607828
904   0.0873429380345
Total cost  21.7514929141 

Category  3
1209   19.5383833948
908   0.323571662857
1197   0.343973631076
1196   0.219376341408
58559   0.217030966953
Total cost  20.6423359971 

Category  4
4993   14.5298121764
6016   0.763080633693
1136   0.162964849971
1196   0.114390013877
908   0.224550040324
Total cost  15.7947977143 

Category  5
541   5.54353232624
2571   0.251799756307
94466   0.0661075536662
1196   0.0921210614769
79132   0.0418856820157
Total cost  5.99544637971 

Category  6
2324   19.5342602911
1212   1.65878366351

In [14]:
# # sanity check
# print len(novS)

# for i in range(numCategories):
#     print set(novBstS[i]).issubset(set(novS)), len(novBstS[i]) == k

In [15]:
# from greedysum import gsWrapper

# gs = gsWrapper(numMovies, numCategories, l, k, simDist, movies)

# start = time.time()

# gsS, gsCost, gsEvals = gs(movieName.keys())

# print "Number of calls = ", gsEvals

# print "Algorithm runtime = ", time.time() - start

In [16]:
# Run the algorithm from the project-local `greedymerge` module on the same inputs
# and report the evaluation count and wall-clock runtime.
# NOTE(review): `gmWrapper` is defined outside this notebook; the meaning of its
# arguments and of the returned triple is inferred from the names used here --
# confirm against greedymerge.py.
from greedymerge import gmWrapper

# Bind the problem data; the result is a callable that is applied to the movie IDs below.
gm = gmWrapper(numMovies, numCategories, l, k, simDist, movies)

# Start the timer after construction, so only the call itself is measured.
start = time.time()

# presumably (selected movies, total cost, number of evaluations) -- TODO confirm
gmS, gmCost, gmEvals = gm(movieName.keys())

print "Number of calls = ", gmEvals

# Seconds elapsed since `start`; note this also includes the print just above.
print "Algorithm runtime = ", time.time() - start

Category  0
3468   25.7599992293
4226   0.523646603443
1212   0.148516905654
58559   0.0825852687581
1213   0.0661551983517
Total cost =  26.5809032055 

Category  1
904   13.6110426076
4226   0.208805402058
1212   0.0611189691355
79132   0.0485518613432
1131   0.0177925021294
Total cost =  13.9473113423 

Category  2
1203   21.0984783388
4226   0.33014849819
2186   0.198913025771
58559   0.0566058607828
593   0.0449049375959
Total cost =  21.7290506612 

Category  3
1203   19.8598670966
2571   0.430641586733
1196   0.145773504013
2019   0.112298528881
58559   0.0662541419019
Total cost =  20.6148348582 

Category  4
1203   15.1870637642
1196   0.381138676566
2019   0.0916846745355
4993   0.07865214725
1136   0.0554501279005
Total cost =  15.7939893904 

Category  5
2571   5.68638124535
77658   0.111644983456
1196   0.0921210614769
79132   0.0418856820157
541   0.0319313747077
Total cost =  5.963964347 

Category  6
2859   21.079932198
4226   0.220812373793
950   0.158343499483
92259  

In [17]:
# from localsearch import lsWrapper

# ls = lsWrapper(numMovies, numCategories, l, k, 0.4, simDist, movies)

# start = time.time()

# lsS, lsCost, lsEvals = ls(movieName.keys())

# print "Number of calls = ", lsEvals

# print "Local Search should do O(k m l n^2 log(n)) evaluations = ", \
#     k * numCategories * l * numMovies**2 * np.log(numMovies)

# print "Algorithm runtime = ", time.time() - start