# Part I. Ingesting data.

Create a dictionary with movies for each genre.

In [1]:
import numpy as np
import math
import time

In [2]:
n = 2000
m = 200
nr = 20

### Read in the ratings. Ratings are in the format - userId,movieId,rating,timestamp

In [3]:
userRatingsFile = open("ratings.csv", "r")
userRatings = [ map(float,line.split(',')) for line in userRatingsFile ]

In [4]:
userRatings[0], userRatings[1]

([1.0, 2.0, 3.5, 1112486027.0], [1.0, 29.0, 3.5, 1112484676.0])

In [5]:
ratings = dict()

for r in userRatings:
    mov = r[1]
    val = r[2]
    
    if mov in ratings:
        ratings[mov].append(val)
    else:
        ratings[mov] = [val]
    
# assign a positive rating only to movies that have at least nr users rating them
for r in ratings:
    if len(ratings[r]) >= nr:
        ratings[r] = np.mean(ratings[r])
    else:
        ratings[r] = 0

In [6]:
ratings[1], ratings[3], ratings[100]

(3.9212395613240769, 3.1510404397330194, 3.2213851761846901)

### Pick the movies with the best $n$ ratings

In [7]:
lowest = sorted(ratings.values())[len(ratings.values()) - n - 1]
print lowest

3.73705722071


In [8]:
bestRatings = dict()

for mov in ratings:
    if ratings[mov] > lowest:
        bestRatings[mov] = ratings[mov]
        
print len(bestRatings.keys())

2000


### Get the users who rated these movies. From their list, pick the $m$ of them that rated most movies

In [9]:
numRatings = dict()

for r in userRatings:
    userID = r[0]
    mov = r[1]
    
    if mov in bestRatings:
        if userID in numRatings:
            numRatings[userID] = numRatings[userID] + 1
        else:
            numRatings[userID] = 0

In [10]:
lowest = sorted(numRatings.values())[len(numRatings.values()) - m]
print lowest

630


In [11]:
mostRatings = dict()

for user in numRatings:
    if numRatings[user] >= lowest:
        mostRatings[user] = numRatings[user]

In [12]:
print 'Number of users:', len(mostRatings)
print 'Some user ids..', mostRatings.keys()[:5]
print 'Number of movies', len(bestRatings)
print 'Some movie ids..', bestRatings.keys()[:5]

Number of users: 200
Some user ids.. [86529.0, 63147.0, 135425.0, 87561.0, 64949.0]
Number of movies 2000
Some movie ids.. [1.0, 8195.0, 8197.0, 6.0, 8199.0]


#### write users ratings to a file to save time next time

In [13]:
procRatings = open("ratingsSummary.csv", "w")

for r in userRatings:
    usr = r[0]
    mov = r[1]
    rating = r[2]
    
    if (usr in mostRatings) and (mov in bestRatings):
        procRatings.write(str(usr) + "," + str(mov) + "," + str(rating) + "\n")
    
procRatings.close()