#MLib

##Download Data & Extract data

In [None]:
%%bash
cd data/meetup/movielens
rm -rf *
wget http://www.grouplens.org/system/files/ml-100k.zip

unzip -j ml-100k.zip "ml-100k/u.data" 
unzip -j ml-100k.zip "ml-100k/u.item" 
unzip -j ml-100k.zip "ml-100k/u.user" 
unzip -j ml-100k.zip "ml-100k/README"

ls -lh

In [None]:
%%bash 
cat data/meetup/movielens/README 
head -n 5 data/meetup/movielens/u.item

##Unpersonalized Recommendation Using KMeans
Give the movie metadata one simply idea to generate movie recommendation is to cluster movies. One simple clustering technique is KMeans. You can find more information about KMeans over [here](https://en.wikipedia.org/wiki/K-means_clustering). Also there is a cool visualization to understand how KMeans work over [here](http://shabal.in/visuals/kmeans/2.html). Below we will do following steps:
1. Load dataset: We will use [namedtuple](https://docs.python.org/2/library/collections.html#collections.namedtuple) to help track of different elements of our data
2. Learn cluster of movies using KMeans algorithm. 
3. Assign names to each cluster. For this we will simply find top 3 tags related to each cluster and use them as cluster name
4. Generate HTML Page. We will pull poster for each title using IMDB API


###Load Dataset

In [190]:
import collections
Movie = collections.namedtuple('Movie', ['title', 'genres'], verbose=False)
# Movie = collections.namedtuple('Movie', ['title', 'genres'], verbose=True)

def parseMovieData(record, type="binary"):
    """
        Parse movie data
    """
    tokens = record.split('|')
    title = tokens[1].strip()
    genres = [int(y) for y in tokens[6:]]
    return Movie(title, genres)

item = sc.textFile("data/meetup/movielens/u.item").map(parseMovieData)
for i in item.take(5):
    print i

Movie(title=u'Toy Story (1995)', genres=[0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Movie(title=u'GoldenEye (1995)', genres=[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])
Movie(title=u'Four Rooms (1995)', genres=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])
Movie(title=u'Get Shorty (1995)', genres=[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Movie(title=u'Copycat (1995)', genres=[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])


### Train Model

In [192]:
from pyspark.mllib.clustering import KMeans, KMeansModel
from math import sqrt

trainData, testData = item.map(lambda x: x.genres).randomSplit([0.8, 0.2], 100)

# Build the model (cluster the data)
csize = 10
clusters = KMeans.train(trainData, 10, maxIterations=10,
        runs=10, initializationMode="random", seed=100)

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = testData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error for Cluster Size {0} = {1}".format(csize, WSSSE))

Within Set Sum of Squared Error for Cluster Size 10 = 190.455799945


### Generate Movie Clusters

In [203]:
def center(point):
    return clusters.predict(point)

labeledData = item.map(lambda x: (center(x.genres), x)).cache()
for cid, cnt in labeledData.countByKey().items():
    print "Cluster ID: {cid}, Number of Titles: {cnt}".format(cid=cid, cnt=cnt)

Cluster ID: 0, Number of Titles: 230
Cluster ID: 1, Number of Titles: 84
Cluster ID: 2, Number of Titles: 136
Cluster ID: 3, Number of Titles: 160
Cluster ID: 4, Number of Titles: 50
Cluster ID: 5, Number of Titles: 78
Cluster ID: 6, Number of Titles: 96
Cluster ID: 7, Number of Titles: 311
Cluster ID: 8, Number of Titles: 80
Cluster ID: 9, Number of Titles: 457


In [None]:
# Print Cluster
for cluster in labeledData.groupByKey().collect():
    print "========== CLUSTER ID = {0} ==========".format(cluster[0])
    for idx, movie in enumerate(cluster[1]):
        print "{0}: {1}".format(idx+1, movie.title.encode('utf8')) 

### Name Cluster


In [303]:
import itertools

GENRES = [x.strip() for x in "Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western".split('|')]
IndexToGenres = sc.broadcast(GENRES)

def getGenres(record):
    label, movie = record
    output = []
    for idx, g in enumerate(movie.genres):
        if g: 
            output.append((label, IndexToGenres.value[idx]))
    return output

counts = (labeledData
           .flatMap(getGenres)
           .map(lambda x: ((x[0], x[1]), 1))
           .countByKey())

clusters = [[key[0], key[1], value] for key, value in counts.items()]
sortedClusters = sorted(clusters, key=lambda x: x[0])
clusterNames = [None] * 10
for grp in itertools.groupby(sortedClusters, key=lambda x: x[0]):
    clusterNames[grp[0]] = sorted(list(grp[1]), key=lambda x: x[2], reverse=True)[0:3]

for cluster in labeledData.groupByKey().collect():
    name = " ".join([x[1] for x in clusterNames[cluster[0]]])
    print "========== CLUSTER ID = {0} ==========".format(name)
    for idx, movie in enumerate(list(cluster[1])[0:5]):
        print "{0}: {1}".format(idx+1, movie.title.encode('utf8'))     
    

1: Four Rooms (1995)
2: Seven (Se7en) (1995)
3: Usual Suspects, The (1995)
4: Crumb (1994)
5: Hoop Dreams (1994)
1: Net, The (1995)
2: Strange Days (1995)
3: Star Wars (1977)
4: Stargate (1994)
5: Jurassic Park (1993)
1: GoldenEye (1995)
2: From Dusk Till Dawn (1996)
3: Muppet Treasure Island (1996)
4: Rumble in the Bronx (1995)
5: Bad Boys (1995)
1: Copycat (1995)
2: Postino, Il (1994)
3: Angels and Insects (1995)
4: Taxi Driver (1976)
5: Crimson Tide (1995)
1: True Romance (1993)
2: Dirty Dancing (1987)
3: Top Gun (1986)
4: Psycho (1960)
5: Bram Stoker's Dracula (1992)
1: Get Shorty (1995)
2: Babe (1995)
3: Doom Generation, The (1995)
4: Eat Drink Man Woman (1994)
5: Ed Wood (1994)
1: French Twist (Gazon maudit) (1995)
2: I.Q. (1994)
3: While You Were Sleeping (1995)
4: Forrest Gump (1994)
5: Four Weddings and a Funeral (1994)
1: Mighty Aphrodite (1995)
2: Birdcage, The (1996)
3: Brothers McMullen, The (1995)
4: To Wong Foo, Thanks for Everything! Julie Newmar (1995)
5: Billy Madison

In [301]:
import requests
from IPython.display import display, HTML 

def poster(title):
    res = requests.get("http://www.imdbapi.com/?i=&t=" + title)
    poster_url = res.json().get('Poster', None)
    return """
        <div style="float:left; height:100px; width:100px; overflow:hidden;">
            <img src="{0}" alt="{1}" height="75px" width="auto">
        </div>
    """.format(poster_url, title)
    
outStr = "<html><head></head><body style='background-color:black; color:white;'>"
for cluster in labeledData.groupByKey().collect():
    name = " ".join([x[1] for x in clusterNames[cluster[0]]])
    outStr += '<div style="clear:both;height:10px;">&nbsp;</div><h1>CLUSTER ID = {0}</h1>'.format(name)
    for movie in list(cluster[1])[0:10]:
        title = movie.title.split('(')[0]
        outStr += poster(title) 

outStr += "</body></html>"
display(HTML(outStr))
    



### Issues / Future Work
1. We use 10 clusters but didn't realy validated that its the optimal number of clusters. Try running the KMeans algorithm with different number of clusters and find optimal number of clusters by looking at With group sum of squared distances
2. The computation to identify top 3 genres in each cluster is currently happening on a driver side. Can we do it in a distributed fashion. Essentially we need to find complement of [Top](http://pig.apache.org/docs/r0.11.1/func.html#topx) function in Pig. 
3. Rewrite the logic so that its easy to add/remove features


# Personalized Recommendation System -- Experimental (Not Yet Complete)

In [135]:
# Select Equal Number of Samples for each rating
counts = dataset.map(lambda x: (x.label, None)).countByKey()
size = float(min(counts.values()))
broadcastSamplePercentage = sc.broadcast(dict([(k, size/v) for k, v in counts.items()]))

print "Number of Records Per Label: ", counts
print "Percentage of Records For Balanced Dataset: ", broadcastSamplePercentage.value

Number of Records Per Label:  defaultdict(<type 'int'>, {1.0: 6110, 2.0: 11370, 3.0: 27145, 4.0: 34174, 5.0: 21201})
Percentage of Records For Balanced Dataset:  {1.0: 1.0, 2.0: 0.5373790677220757, 3.0: 0.22508749309265058, 4.0: 0.1787908936618482, 5.0: 0.28819395311541907}


In [149]:
def parseRatingData(record, type="simple"):
    """
        Parse Rating data
    """
    tokens = record.split("\t")
    user = int(tokens[0])
    movie = int(tokens[1])
    rating = int(tokens[2])
    if type == "logistic":
        rating = 0 if int(tokens[2]) <= 3 else 1
    return tuple([user, movie, rating])

def parseUserData(record):
    """
        Parse user data
    """
    tokens = record.split('|')
    user = int(tokens[0])
    age = int(tokens[1])
    male = 1 if tokens[2].strip() == 'M' else 0
    female = 1 if tokens[2].strip() == 'F' else 0
    
    return tuple([user, age, male, female])





In [None]:
rating = sc.textFile("data/meetup/movielens/u.data").map(parseRatingData)
user = sc.textFile("data/meetup/movielens/u.user").map(parseUserData)
item = sc.textFile("data/meetup/movielens/u.item").map(parseMovieData)

print rating.take(3)
print user.take(3)
print item.take(3)

##Create Training Dataset

In [134]:
from pyspark.mllib.regression import LabeledPoint


def getLabeledPoint(rating, user, movie = None):
    label = rating[2]
    features = user[1:] 
    if movie:
        features += movie[1:]
        
    return LabeledPoint(label, features)        
        
dataset = (rating
                .map(lambda x: (x[0], x))
                .join(user.map(lambda x: (x[0], x)))
                .map(lambda x: (x[1][0][1], x[1]))
                .join(item.map(lambda x: (x[0], x)))
                .map(lambda x: getLabeledPoint(x[1][0][0], x[1][0][1], x[1][1]))
#                 .map(lambda x: getLabeledPoint(x[1][0][0], x[1][0][1]))
          )
print dataset.take(1)

[LabeledPoint(3.0, [27.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0])]


##Split into training and testing dataset

In [144]:
def predict(record):
    predicted = model.predict(record.features)
    return predicted

predicted = testDataset.map(lambda x: ((x.label, predict(x)), None))
print predicted.countByKey()
print model.weights

defaultdict(<type 'int'>, {(1.0, 0): 10965, (0.0, 0): 8964})
[-1.16431339272,-0.0961499464779,-0.0242242142667,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745,0.0,-0.120374160745]


In [136]:
# Randomly select records based on given sample percentage
import random
random.seed(100)

def balancer(record, percentage):
    """
        Randomly samples records within the same label
        
        @param: record - LabelledPoint
        @param: percnetage - Dictionary where key is same as the label and value indicates percentage of sample to be retained
        
    """
    if percentage == None:
        raise Exception("Percentage cannot be null")
    label = record.label
    return (random.random() <= percentage.get(record.label, -1))

balancedDataset = dataset.filter(lambda x: balancer(x, broadcastSamplePercentage.value))
print "Balanced Labeled Dataset: ", balancedDataset.map(lambda x: (x.label, None)).countByKey()
print "Total Number of Records: ", balancedDataset.count()

Balanced Labeled Dataset:  defaultdict(<type 'int'>, {1.0: 6110, 2.0: 6112, 3.0: 6023, 4.0: 6193, 5.0: 6285})
Total Number of Records:  30447


In [137]:
# Now split the dataset into training and testing
trainDataset, testDataset = balancedDataset.randomSplit([0.9, 0.1], 100)
print "Number of Training Records: ", trainDataset.count()
print "Number of Testing Records: ", testDataset.count()


Number of Training Records:  27530
Number of Testing Records:  3042


## Train Linear Regression Model

In [138]:
from pyspark.mllib.regression import LinearRegressionWithSGD

model = LinearRegressionWithSGD.train(trainDataset, 
                                           iterations=100, 
                                           miniBatchFraction = 1.0, 
                                           regParam = 1e-2,
                                           regType = 'l2',
                                           intercept = False)

# weightsLR1 stores the model weights; interceptLR1 stores the model intercept
weights = model.weights
intercept = model.intercept
print weights, intercept


[-4.42203673316e+227,-9.00843448045e+225,-3.13512595855e+225,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226,0.0,-1.2143560439e+226] 0.0


## Evaluate Model

In [139]:
def predict(record):
    predicted = model.predict(record.features)
    if(predicted < 0): return 0
    return round(predicted, 0)

predicted = testDataset.map(lambda x: ((x.label, predict(x)), None))
cnts = predicted.countByKey()
print cnts

defaultdict(<type 'int'>, {(3.0, 0): 612, (2.0, 0): 632, (1.0, 0): 605, (5.0, 0): 579, (4.0, 0): 617})


#Logistic Regression

In [145]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
rating = sc.textFile("data/meetup/movielens/u.data").map(lambda x: parseRatingData(x, type="logistic"))
dataset = (rating
                .map(lambda x: (x[0], x))
                .join(user.map(lambda x: (x[0], x)))
                .map(lambda x: (x[1][0][1], x[1]))
                .join(item.map(lambda x: (x[0], x)))
                .map(lambda x: getLabeledPoint(x[1][0][0], x[1][0][1], x[1][1]))
#                 .map(lambda x: getLabeledPoint(x[1][0][0], x[1][0][1]))
          )
trainDataset, testDataset = dataset.randomSplit([0.8, 0.2], 100)
# model = LogisticRegressionWithSGD.train(trainDataset)
print trainDataset.map(lambda x: (x.label, None)).countByKey()
print testDataset.map(lambda x: (x.label, None)).countByKey()

defaultdict(<type 'int'>, {0.0: 35661, 1.0: 44410})
defaultdict(<type 'int'>, {0.0: 8964, 1.0: 10965})
