In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import datetime
from collections import defaultdict
from math import sqrt
import os
import heapq
from operator import itemgetter

from google.colab import files
uploaded = files.upload()

  import pandas.util.testing as tm


Saving u.data to u.data
Saving u.item.item to u.item.item


In [0]:
items = open('u.item.item','r',encoding="ISO-8859-1")
item_data = items.read()
item_data
items.close()

In [0]:
data = pd.read_csv('u.data',sep = '\t',header=None)
data

Unnamed: 0,0,1,2,3
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [0]:
def load_reviews(path, **kwargs):
    options = {
        'fieldnames':('userid','movieid','rating','timestamp'),
        'delimiter' : '\t'
    }
    options.update(kwargs)
    parse_date = lambda r,k : datetime.datetime.fromtimestamp(float(r[k]))
    parse_int = lambda r,k : int(r[k])
    with open(path,'r',encoding = "ISO-8859-1") as reviews:
        reader = csv.DictReader(reviews,**options)
        for row in reader:
            row['userid'] = parse_int(row, 'userid') 
            row['movieid'] = parse_int(row, 'movieid') 
            row['rating'] = parse_int(row, 'rating') 
            row['timestamp'] = parse_date(row, 'timestamp') 
            yield row


In [0]:
def relative_path(path):
    dirname = os.path.dirname(os.path.realpath('__file__')) 
    path = os.path.join(dirname, path) 
    return os.path.normpath(path)

In [0]:
def load_movies(path, **kwargs):
    options = {
        'fieldnames': ('movieid', 'title', 'release', 'video', 'url'),
        'delimiter': '|','restkey': 'genre'
    }
    options.update(kwargs)
    parse_int = lambda r,k: int(r[k]) 
    parse_date = lambda r,k: datetime.datetime.strptime(r[k], '%d-%b-%Y') if r[k] else None 
    with open(path, 'r',encoding = "ISO-8859-1") as movies: 
        reader = csv.DictReader(movies, **options) 
        for row in reader: 
            row['movieid'] = parse_int(row, 'movieid') 
            row['release'] = parse_date(row, 'release') 
            row['video'] = parse_date(row, 'video') 
            yield row



In [0]:
class MovieLens(object): 

    def __init__(self, udata, uitem):
        self.udata = udata 
        self.uitem = uitem 
        self.movies = {} 
        self.reviews = defaultdict(dict) 
        self.load_dataset()

    def load_dataset(self):
        for movie in load_movies(self.uitem): 
            self.movies[movie['movieid']] = movie 
        for review in load_reviews(self.udata): 
            self.reviews[review['userid']][review['movieid']] = review
    
    def reviews_for_movie(self, movieid):
        for review in self.reviews.values(): 
            if movieid in review: 
                yield review[movieid]

    def average_reviews(self): 
        for movieid in self.movies: 
            reviews = list(r['rating'] for r in self.reviews_for_movie(movieid)) 
            average = sum(reviews) / float(len(reviews)) 
            yield (movieid, average, len(reviews))

    def top_rated(self, n=10): 
        return heapq.nlargest(n, self.bayesian_average(), key=itemgetter(1))

    def bayesian_average(self, c=59, m=3): 
        for movieid in self.movies: 
            reviews = list(r['rating'] for r in self.reviews_for_movie(movieid)) 
            average = ((c * m) + sum(reviews)) / float(c + len(reviews)) 
            yield (movieid, average, len(reviews))

    def shared_preferences(self, criticA, criticB): 
        if criticA not in self.reviews: 
            raise KeyError("Couldn't find critic '%s' in data" % criticA) 
        if criticB not in self.reviews: 
            raise KeyError("Couldn't find critic '%s' in data" % criticB) 
        moviesA = set(self.reviews[criticA].keys()) 
        moviesB = set(self.reviews[criticB].keys()) 
        shared = moviesA & moviesB  
        reviews = {} 
        for movieid in shared: 
            reviews[movieid] = ( self.reviews[criticA][movieid]['rating'], 
                                self.reviews[criticB][movieid]['rating'], ) 
        return reviews
    
    def euclidean_distance(self, criticA, criticB): 
        preferences = self.shared_preferences(criticA, criticB) 
        if len(preferences) == 0: 
            return 0 
        sum_of_squares = sum([pow(a-b, 2) for a, b in preferences.values()]) 
        return 1 / (1 + sqrt(sum_of_squares))

    def pearson_correlation(self, criticA, criticB):
        
        preferences = self.shared_preferences(criticA, criticB)
        length = len(preferences)
        if length == 0: return 0
        sumA = sumB = sumSquareA = sumSquareB = sumProducts = 0
        for a, b in preferences.values():
            sumA += a
            sumB += b
            sumSquareA += pow(a, 2)
            sumSquareB += pow(b, 2)
            sumProducts += a*b
        numerator = (sumProducts*length) - (sumA*sumB)
        denominator = sqrt(((sumSquareA*length) - pow(sumA, 2)) *
        ((sumSquareB*length) - pow(sumB, 2)))
        if denominator == 0: return 0
        return abs(numerator / denominator)

    def similar_critics(self, user, metric='euclidean', n=None):
    
        metrics = {
        'euclidean': self.euclidean_distance,
        'pearson': self.pearson_correlation,
        }
        distance = metrics.get(metric, None)
        if user not in self.reviews:
            raise KeyError("Unknown user, '%s'." % user)
        if not distance or not callable(distance):
            raise KeyError("Unknown or unprogrammed distance metric'%s'." % metric)
        critics = {}
        for critic in self.reviews:
            if critic == user:
                continue
            critics[critic] = distance(user, critic)
        if n:
            return heapq.nlargest(n, critics.items(),key=itemgetter(1))
        return critics
    
    def predict_ranking(self, user, movie, metric='euclidean',critics=None):
    
        critics = critics or self.similar_critics(user,metric=metric)
        total = 0.0
        simsum = 0.0
        for critic, similarity in critics.items():
            if movie in self.reviews[critic]:
                total += similarity * self.reviews[critic][movie]['rating']
            simsum += similarity
        if simsum == 0.0: 
            return 0.0
        return total / simsum

    def predict_all_rankings(self, user, metric='euclidean',n=None):

        critics = self.similar_critics(user, metric=metric)
        movies = {
            movie: self.predict_ranking(user, movie, metric,critics) for movie in self.movies
        }
        if n:
            return heapq.nlargest(n, movies.items(),key=itemgetter(1))
        return movies

    def shared_critics(self, movieA, movieB):
        
        if movieA not in self.movies:
            raise KeyError("Couldn't find movie '%s' in data" %movieA)
        if movieB not in self.movies:
            raise KeyError("Couldn't find movie '%s' in data" %movieB)
        criticsA = set(critic for critic in self.reviews if movieA in self.reviews[critic])
        criticsB = set(critic for critic in self.reviews if movieB in self.reviews[critic])
        shared = criticsA & criticsB 
        reviews = {}
        for critic in shared:
            reviews[critic] = (
            self.reviews[critic][movieA]['rating'],
            self.reviews[critic][movieB]['rating'],
            )
        return reviews
    
    def similar_items(self, movie, metric='euclidean', n=None):
        prefs = None
        metrics = {
        'euclidean': self.euclidean_distance,
        'pearson': self.pearson_correlation,
        }
        distance = metrics.get(metric,None)
        if movie not in self.reviews:
            raise KeyError("Unknown movie, '%s'." % movie)
        if not distance or not callable(distance):
            raise KeyError("Unknown or unprogrammed distance metric'%s'." % metric)
        items = {}
        for item in self.movies:
            if item == movie:
                continue
            items[item] = distance(item, movie)
        if n:
            return heapq.nlargest(n, items.items(),key=itemgetter(1))
        return items

    def predict_ranking(self, user, movie, metric='euclidean'):
        movies = self.similar_items(movie, metric=metric)
        total = 0.0
        simsum = 0.0
        for relmovie, similarity in movies.items():
            if relmovie in self.reviews[user]:
                total += similarity *self.reviews[user][relmovie]['rating']
            simsum += similarity
        if simsum == 0.0: return 0.0
        return total / simsum






In [35]:
data = relative_path('u.data') 
item = relative_path('u.item.item') 
model = MovieLens(data, item)
model.euclidean_distance(232, 532)
model.pearson_correlation(232,532)

0.06025793538385047

In [0]:
for mid, avg, num in model.top_rated(10): 
    title = model.movies[mid]['title'] 
    print("[%0.3f average rating (%i reviews)] %s" % (avg, num,title))


[4.234 average rating (583 reviews)] Star Wars (1977)
[4.224 average rating (298 reviews)] Schindler's List (1993)
[4.196 average rating (283 reviews)] Shawshank Redemption, The (1994)
[4.172 average rating (243 reviews)] Casablanca (1942)
[4.135 average rating (267 reviews)] Usual Suspects, The (1995)
[4.123 average rating (413 reviews)] Godfather, The (1972)
[4.120 average rating (390 reviews)] Silence of the Lambs, The (1991)
[4.098 average rating (420 reviews)] Raiders of the Lost Ark (1981)
[4.082 average rating (209 reviews)] Rear Window (1954)
[4.066 average rating (350 reviews)] Titanic (1997)


In [0]:
for item in model.similar_critics(232, 'euclidean', n=10):
    print("%4i: %0.3f" % item)

 688: 1.000
 914: 1.000
 170: 0.500
  78: 0.500
  47: 0.500
 335: 0.500
 341: 0.500
 155: 0.414
 101: 0.414
 309: 0.414


In [0]:
for item in model.similar_critics(232, 'pearson', n=10):
    print("%4i: %0.3f" % item)

 260: 1.000
 155: 1.000
  36: 1.000
 302: 1.000
  33: 1.000
 289: 1.000
 309: 1.000
 317: 1.000
 511: 1.000
 769: 1.000


In [0]:
model.predict_ranking(422, 50, 'euclidean')

2.337352499545245

In [0]:
for mid, rating in model.predict_all_rankings(578, 'pearson',10):
    print("%0.3f: %s" % (rating, model.movies[mid]['title']))

2.644: Star Wars (1977)
2.146: Fargo (1996)
2.121: Return of the Jedi (1983)
2.038: Contact (1997)
1.856: English Patient, The (1996)
1.786: Toy Story (1995)
1.784: Godfather, The (1972)
1.750: Air Force One (1997)
1.746: Scream (1996)
1.710: Raiders of the Lost Ark (1981)
