# Part I. Ingesting data.

Create a dictionary with movies for each genre.

In [1]:
import numpy as np
import math
import time
import matplotlib.pyplot as plt
import random

from scipy import stats

### Read in the ratings. Ratings are in the format - userId,movieId,rating

In [2]:
ratingsFile = open("ratingsSummary.csv", "r")

userRatings = dict()
relevantMovies = dict()

for line in ratingsFile:
    data = map(float,line.split(','))
    
    usr = int(data[0])
    mov = int(data[1])
    r = int(data[2])
    
    if usr in userRatings:
        userRatings[usr][mov] = r
    else:
        userRatings[usr] = dict()
        userRatings[usr][mov] = r
        
    relevantMovies[mov] = True
    
print len(userRatings.keys()), len(relevantMovies.keys())

200 1995


### Now, read in the movies.

In [3]:
# Genre-level lookups.
movies = {}     # genre index -> list of integer movie ids in that genre
movieCats = {}  # movie id -> list of genre names the movie belongs to

# Genre name <-> genre index: catID maps a genre name (string) to its index,
# catName maps the index back to the name.
catID, catName = {}, {}

# Movie title <-> movie id, analogous to catID / catName.
movieID, movieName = {}, {}

In [4]:
allData = open("movies.csv", "r")

# this first line contains header info
allData.readline()

numMovies = 0
numGenres = 0

while True:
    line = allData.readline()
    
    if line == '':
        break
    
    curMovieID = int(line.split(",", 1)[0])
    if curMovieID not in relevantMovies.keys():
        continue
    
    curMovieName = (line.split(",", 1)[1]).rsplit(",", 1)[0]
    curCategories = line.rsplit(",", 1)[1].rsplit("\r")[0].split("|")
    
    
    # update catID, catName, movieID, movieName
    movieID[curMovieName] = curMovieID
    movieName[curMovieID] = curMovieName
    
    for cat in curCategories:
        if not (cat in catID):
            catID[cat] = numGenres
            catName[numGenres] = cat
            
            numGenres = numGenres + 1
            
    movieCats[curMovieID] = curCategories
            
    for cat in curCategories:
        if catID[cat] in movies:
            movies[catID[cat]].append(movieID[curMovieName])
        else:
            movies[catID[cat]] = [movieID[curMovieName]]
    
    numMovies = numMovies + 1

print "we have", numMovies, "movies"

we have 1995 movies


In [5]:
print movies[catID['Horror']][:10]
print movieName[movies[catID['Horror']][0]]
print movieCats[movies[catID['Horror']][0]]
print catID.keys()

[593, 1076, 1200, 1214, 1219, 1258, 1261, 1333, 1340, 1348]
"Silence of the Lambs, The (1991)"
['Crime', 'Horror', 'Thriller']
['Mystery', 'Romance', 'Sci-Fi', 'Musical', 'Film-Noir', 'Crime', 'Drama', 'Fantasy', 'Western', 'Animation', 'War', 'Adventure', 'Horror', 'Action', '(no genres listed)', 'Comedy', 'Documentary', 'Children', 'Thriller', 'IMAX']


In [6]:
print 'We have', len(userRatings.keys()), 'users'
print 'We have', len(movieCats), 'movies'
print 'We have', len(catID.keys()), 'genres'

We have 200 users
We have 1995 movies
We have 20 genres


In [7]:
numMovies = len(movieCats)
numUsers = len(userRatings.keys())

print numMovies, numUsers

1995 200


#### compute the weights for each user

In [8]:
# weights[user][genre] is the share of the user's (rated movie, genre) pairs
# that fall into that genre, so each user's weights form a distribution.
weights = dict()
for user in userRatings.keys():
    # start every genre at zero so genres the user never rated are still present
    genreCounts = dict((genre, 0) for genre in catID.keys())

    tot = 0
    for mov in userRatings[user]:
        for genre in movieCats[mov]:
            genreCounts[genre] += 1
            tot += 1

    # normalise the counts by the total number of (movie, genre) pairs
    weights[user] = dict((genre, 1.0 * genreCounts[genre] / tot) for genre in catID.keys())

    # verify that they sum to 1
    curSum = sum(weights[user].values())
    assert 0.9999 <= curSum <= 1.0001, "weights don't sum up to 1"

# Part II. Fix k = 3. Increase l from 3 to 60

In [9]:
# k is the number of greedy picks greedyOuter makes per user (it loops over
# range(k)); it is also passed to each solver constructor.
k = 3

In [10]:
from replacementGreedy import replacementGreedy
from greedysum import gsWrapper
from greedymerge import gmWrapper

In [11]:
# the functions necessary to compute the plots

def computeCostOuter(userIndex, A):
    """Return the genre-weighted value of the movie collection A for one user.

    userIndex is a position into userRatings.keys(). Only the movies of A that
    this user rated are considered. For every genre, the user's highest rating
    among those movies (0 when there is none) is multiplied by the user's
    weight for the genre; the sum of these products is returned.
    """
    curUser = userRatings.keys()[userIndex]
    ratings = userRatings[curUser]

    # best rating found per genre, restricted to movies the current user rated
    bestRating = dict((cat, 0) for cat in catID.keys())
    for mov in set(A).intersection(ratings.keys()):
        for cat in movieCats[mov]:
            bestRating[cat] = max(bestRating[cat], ratings[mov])

    return sum(weights[curUser][cat] * bestRating[cat] for cat in catID.keys())

# compute the greedy maximization solution for S for the second stage submodular maximization
def greedyOuter(userIndex, S):
    """Greedily select up to k elements of S for one user and return their value.

    In each of at most k rounds, the still-unused element of S whose addition
    gives the largest computeCostOuter value is added to the selection. The
    value of the final selection is returned.

    userIndex -- position of the user in userRatings.keys()
    S         -- indexable sequence of movie ids to choose from

    If S has fewer than k elements the loop stops once every element is used;
    in particular an empty S yields the value of the empty selection instead of
    an IndexError.
    """
    greedyS = []

    use = [False for s in S]

    for times in range(k):
        # at each step, add the element that gives the greatest marginal gain

        bestInd = -1
        bestCost = -1

        for ind in range(len(S)):
            if use[ind]:
                continue

            # tentatively add the candidate, evaluate, then undo
            greedyS.append(S[ind])
            curCost = computeCostOuter(userIndex, greedyS)
            greedyS.pop()

            if curCost > bestCost:
                bestCost = curCost
                bestInd = ind

        # no unused candidate is left: S[-1] must not be added a second time
        if bestInd == -1:
            break

        greedyS.append(S[bestInd])
        use[bestInd] = True

    return computeCostOuter(userIndex, greedyS)

# return the mean, the confidence interval, and the runtime to compute
def computeRatio(S, otherUsers):
    groundSetMean = 0
    for other in otherUsers:
        groundSetMean = groundSetMean + greedyOuter(other, movieName.keys())
    groundSetMean = 1.0 * groundSetMean / len(otherUsers)

    rtS = time.time()
    Sval = []
    for other in otherUsers:
        curVal = greedyOuter(other, S) / groundSetMean
        Sval.append(curVal)
    rtS = time.time() - rtS
    
    print np.mean(Sval)
    
    return np.mean(Sval), \
        stats.norm.interval(0.95, loc=np.mean(Sval), scale=np.std(Sval)/np.sqrt(len(otherUsers))), \
        rtS

In [12]:
# set l: it is passed to every solver below and is the size of the random
# baseline set randS
l = 60

# Each solver is constructed from the same inputs and then called on the full
# list of movie ids. numUsers/2 is the second constructor argument -- presumably
# the number of users the solvers may use, since the evaluation cell samples its
# users from range(numUsers/2, numUsers); TODO confirm against the solver modules.
# rgSs is later iterated as a sequence of candidate sets, one per evaluated size.
rg = replacementGreedy(numMovies, numUsers/2, l, k, \
                            userRatings, weights, movieCats, catID.keys())
rgSs, rgCosts, rgEvals, rgTimes = rg(movieName.keys())

print ''
# greedy sum: same call shape as replacement greedy; gsSs is iterated the same way
gs = gsWrapper(numMovies, numUsers/2, l, k, userRatings, weights, movieCats, catID.keys())
gsSs, gsCosts, gsEvals, gsTimes = gs(movieName.keys())

print ''
# greedy merge: the results are bound to singular names (gmS, gmCost, gmTime).
# NOTE(review): gmTime is overwritten by the evaluation cell, which reuses the
# name for its own timing -- the solver's reported time is lost at that point.
gm = gmWrapper(numMovies, numUsers/2, l, k, userRatings, weights, movieCats, catID.keys())
gmS, gmCost, gmEvals, gmTime = gm(movieName.keys())

# also, generate a set of random movies, to test the ratios we get against it
randS = random.sample(movieName.keys(), l)

Finished step  2  with cost  347.862098744 ; number of evals 1198282 ; total runtime 66.0293478966
Finished step  3  with cost  359.253561037 ; number of evals 2011176 ; total runtime 120.363067865
Finished step  4  with cost  369.90519365 ; number of evals 2835684 ; total runtime 173.689929008
Finished step  5  with cost  374.869752177 ; number of evals 3657044 ; total runtime 228.1973629
Finished step  6  with cost  378.78850894 ; number of evals 4471876 ; total runtime 281.607676983
Finished step  7  with cost  381.908183562 ; number of evals 5281960 ; total runtime 335.34878397
Finished step  8  with cost  384.461374617 ; number of evals 6091312 ; total runtime 388.470757961
Finished step  9  with cost  386.872590033 ; number of evals 6900336 ; total runtime 440.262492895
Finished step  10  with cost  388.918715987 ; number of evals 7708580 ; total runtime 495.315165043
Finished step  11  with cost  390.569644981 ; number of evals 8514656 ; total runtime 546.634392977
Finished step

In [13]:
# compute the value ratios and runtime ratios for replacement greedy, greedy sum, greedy merge and the random set
otherUsers = random.sample([x for x in range(numUsers/2, numUsers)], 20)

# compute the runtime for greedyMerge
gmTime = time.time()
val = 0
for other in otherUsers:
    val = val + greedyOuter(other, movieName.keys())
gmTime = time.time() - gmTime
print 'Running greedy merge on the ground set takes', gmTime, 'and returns', val, '\n'

ratioRG = []
runtimeRatioRG = []
for S in rgSs:
    mean, ci, rt = computeRatio(S, otherUsers)
    ratioRG.append((mean, ci))
    runtimeRatioRG.append(rt / gmTime)
print 'Done for replacement greedy\n'
    
ratioGS = []
runtimeRatioGS = []
for S in gsSs:
    mean, ci, rt = computeRatio(S, otherUsers)
    ratioGS.append((mean, ci))
    runtimeRatioGS.append(rt / gmTime)
print 'Done for greedy sum\n'

ratioGM = []
runtimeRatioGM = []
for times in range(len(rgSs)):
    ratioGM.append((1, (1,1)))
    runtimeRatioGM.append(1)
#     mean, ci, rt = computeRatio(movieName.keys(), otherUsers)
#     ratioGM.append((mean, ci))
#     runtimeRatioGM.append(rt / gmTime)
print 'Done for greedy merge\n'

ratioRand = []
runtimeRatioRand = []
for times in range(len(rgSs)):
    mean, ci, rt = computeRatio(randS[:(times+k)], otherUsers)
    ratioRand.append((mean, ci))
    runtimeRatioRand.append(rt / gmTime)
print 'Done for random set\n'

Running greedy merge on the ground set takes 5.35930895805 and returns 83.9354964189 

0.843098107074
0.873659338135
0.894818328481
0.902194535995
0.914251237579
0.911064182285
0.917379842559
0.917972724219
0.932194810127
0.930765646223
0.935141724626
0.935576000597
0.935576000597
0.935576000597
0.937334969834
0.937805255778
0.940555987123
0.942503079402
0.942039405569
0.942039405569
0.942039405569
0.938661645407
0.938661645407
0.939470639547
0.939429634352
0.943544022223
0.954142585708
0.956431310638
0.956431310638
0.956105088329
0.956105088329
0.959225580029
0.95965629597
0.95965629597
0.95965629597
0.95965629597
0.95965629597
0.95965629597
0.959927066666
0.959927066666
0.960592648263
0.961747353379
0.961747353379
0.961747353379
0.961747353379
0.962110145393
0.962110145393
0.962110145393
0.962110145393
0.962110145393
0.962110145393
0.962110145393
0.962110145393
0.962110145393
0.962110145393
0.962110145393
0.96543154089
0.966034123073
Done for replacement greedy

0.843098107074
0.8719

In [23]:
# Plot the value ratio of every method against l (x runs from k to l), with the
# confidence intervals as error bars, and save the figure for the write-up.
plt.clf()

ax = plt.subplot(111)

fs = 17

# http://matplotlib.org/users/text_intro.html
ax.set_xlabel('l', fontsize=fs)
ax.set_ylabel('Ratio', fontsize=fs)

plt.ylim(0.1,1.05)
plt.xlim(k, l)

# one tick at k, then one at every multiple of 10 up to l
xticks = [k] + list(range(10, l + 10, 10))
ax.set_xticks(xticks)

# Tick.label is no longer available in current matplotlib releases;
# tick_params sets the same tick-label size through the supported API.
ax.tick_params(axis='both', which='major', labelsize=fs)

colors = ['r', 'b', 'g', 'y']
labelNames = ['Replacement Greedy', 'Greedy Sum', 'Greedy Merge', 'Random Selection']
allRatios = [ratioRG, ratioGS, ratioGM, ratioRand]

# every ratio list holds one entry per value of l in k..l
xs = list(range(k, l + 1))

# iterate in reverse so the first method is drawn last, i.e. on top
for ind in range(len(colors)-1, -1,-1):
    ratios = allRatios[ind]

    # each entry is (mean, (ciLow, ciHigh)); errorbar wants distances from the mean
    vals = [entry[0] for entry in ratios]
    errBot = [entry[0] - entry[1][0] for entry in ratios]
    errTop = [entry[1][1] - entry[0] for entry in ratios]

    plt.plot(xs, vals, c = colors[ind][0], linewidth=2, label = labelNames[ind])

    # greedy merge is the constant reference line, so it gets no error bars
    if labelNames[ind] != 'Greedy Merge':
        plt.errorbar(xs, vals, yerr = [errBot, errTop], fmt = colors[ind])

# http://matplotlib.org/1.3.0/examples/pylab_examples/legend_demo.html
legend = ax.legend(loc='lower right')

# Set the fontsize
for label in legend.get_texts():
    label.set_fontsize(fs)

# plt.show()
plt.savefig("../../writeup/images/movielens-sublinear-exp1-ratio")

plt.close()

In [41]:
# Bar charts comparing objective value and runtime of the methods, one figure
# per selected index into the ratio lists.

def autolabel(rects):
    """
    Attach a centered percentage label above each bar in rects.

    The first bar gets a plain percentage, every other bar a '~' prefix.
    Draws on the module-level `ax` of the figure currently being built.
    """
    for rect in rects:
        height = rect.get_height()
        # Identity test against the first bar. The previous form
        # `rect != rects[0] > 0` was a chained comparison that also ordered a
        # bar object against 0, which Python 3 rejects with a TypeError.
        labelFmt = '  %.f%%' if rect is rects[0] else '~%.f%%   '
        ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,
                labelFmt % (100 * height),
                ha='center', va='bottom')

def autolabel2(rects):
    """
    Attach a left-aligned percentage label above each bar in rects.

    Same labelling rule as autolabel, without the padding spaces.
    """
    for rect in rects:
        height = rect.get_height()
        labelFmt = '%.f%%' if rect is rects[0] else '~%.f%%'
        ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,
                labelFmt % (100 * height),
                ha='left', va='bottom')

lfs = 15

N = 4
ind = np.arange(N)  # the x locations for the groups
width = 0.3       # the width of the bars

# NOTE(review): curL indexes the ratio lists, whose entry i is plotted at
# l = k + i in the ratio plot; the loss bar charts name their files with
# curL + k while this cell uses curL -- confirm which naming is intended.
for curL in [7,27,57]:
    # greedy merge is the reference, hence the fixed 1.0 in first position
    means = (1.0, ratioRG[curL][0], ratioGS[curL][0], ratioRand[curL][0])
    runtimes = (1.0, runtimeRatioRG[curL], runtimeRatioGS[curL], runtimeRatioRand[curL])

    fig, ax = plt.subplots()
    rects1 = ax.bar(ind, means, width, color='r')
    rects2 = ax.bar(ind + width, runtimes, width, color='y')

    # add some text for labels, title and axes ticks
    ax.set_ylabel('Performance', fontsize = lfs)
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(('Greedy\nMerge', 'Replacement\nGreedy', 'Greedy\nSum', 'Random'), fontsize = lfs)

    ax.legend((rects1[0], rects2[0]), ('Objective Value', 'Runtime'))

    plt.ylim(0,1.3)

    autolabel(rects1)
    autolabel2(rects2)

    # plt.show()

    plt.savefig("../../writeup/images/movielens-sublinear-exp1-barplot-" + str(curL))

In [43]:
for curL in [7,27,57]:
    """
    ========
    Barchart
    ========

    A bar plot with errorbars and height labels on individual bars
    """
    import numpy as np
    import matplotlib.pyplot as plt

    fs = 17
    lfs = 15

    N = 4

    eps = 0.005
    means = (max(1 - ratioRG[curL][0], eps), 1 - ratioGS[curL][0], 1 - ratioRand[curL][0], eps)
    runtimes = (max(runtimeRatioRG[curL], eps), runtimeRatioGS[curL], runtimeRatioRand[curL], 1.0)
    
    print means
    
    ind = np.arange(N)  # the x locations for the groups
    width = 0.3       # the width of the bars

    fig, ax = plt.subplots()
    rects1 = ax.bar(ind, means, width, color='r')
    rects2 = ax.bar(ind + width, runtimes, width, color='y')

    # add some text for labels, title and axes ticks
    ax.set_ylabel('Performance', fontsize = lfs)
    # ax.set_title('Scores by group and gender')
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(('Replacement\nGreedy', 'Greedy\nSum', 'Random', 'Greedy\nMerge'), fontsize = lfs)
    #                    rotation='vertical')

    ax.legend((rects1[0], rects2[0]), ('Loss', 'Runtime'), loc='upper left')

    plt.ylim(0,1.2)

    def autolabel(rects):
        """
        Attach a text label above each bar displaying its height
        """

        for rect in rects:
            height = rect.get_height()
            if rect != rects[0] > 0:
                ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,
                        '~%.f%%   ' % (100 * height),
                        ha='center', va='bottom')
            else:
                ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,
                        '  %.f%%' % (100 * height),
                        ha='center', va='bottom')

    def autolabel2(rects):
        """
        Attach a text label above each bar displaying its height
        """
        for rect in rects:
            height = rect.get_height()
            if rect != rects[0] > 0:
                ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,
                        '~%.f%%' % (100 * height),
                        ha='left', va='bottom')
            else:
                ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,
                        '%.f%%' % (100 * height),
                        ha='left', va='bottom')

    autolabel(rects1)
    autolabel2(rects2)

    # plt.show()

    plt.savefig("../../writeup/images/movielens-sublinear-exp1-barplot-loss-" + str(curL + k))

(0.082027275781207498, 0.097684634810453419, 0.51999724004288483, 0.005)
(0.043568689361834778, 0.043751616847824915, 0.30060073138593923, 0.005)
(0.033965876927237182, 0.045351308614314467, 0.20662820574161556, 0.005)


In [19]:
# store locally
def writeInfo(text, values, rt):
    """Write one method's results to the open results file `filename`.

    text   -- label written on a line of its own before the data
    values -- sequence of (mean, (ciLow, ciHigh)) entries
    rt     -- sequence of runtime ratios, indexed in parallel with values

    Each entry becomes one line: "mean ciLow ciHigh runtime", space separated.
    """
    filename.write(str(text) + '\n')
    for ind in range(len(values)):
        fields = [values[ind][0], values[ind][1][0], values[ind][1][1], rt[ind]]
        filename.write(' '.join([str(field) for field in fields]) + '\n')

# The with-block closes the file when done, which also flushes everything
# written; without it the buffered output is not guaranteed to reach disk.
with open('../data/movielens-exp1.txt', 'w') as filename:
    writeInfo('RG',ratioRG, runtimeRatioRG)
    writeInfo('GS',ratioGS, runtimeRatioGS)
    writeInfo('GM',ratioGM, runtimeRatioGM)
    writeInfo('Rand',ratioRand, runtimeRatioRand)