In [2]:
import csv
from math import sqrt

In [3]:
def loadMovieLens():
    movies = {}
    with open('movies.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            index = row['movieId']
            title = row['title']
            movies[index] = row['title']

    prefs = {}
    with open('ratings.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            userId = row['userId']
            movieId = row['movieId']
            rating = row['rating']
            prefs.setdefault(userId, {})
            prefs[userId][movies[movieId]] = float(rating)
    return prefs

def getMovieID(movie_name):
    for i in range(len(movies)):
        if movie_name in str(movies[i][1]):
            return i
    return False

def simPearson(prefs, p1, p2):
    si={}
    for item in prefs[p1]:
        if item in prefs[p2]: si[item]=1

    n = len(si)

    if n==0:
        return 0

    sum1 = sum([prefs[p1][it] for it in si])
    sum2 = sum([prefs[p2][it] for it in si])

    sum1Sq = sum([pow(prefs[p1][it], 2) for it in si])
    sum2Sq = sum([pow(prefs[p2][it], 2) for it in si])

    pSum = sum([prefs[p1][it]*prefs[p2][it] for it in si])

    num = pSum - (sum1*sum2/n)
    den = sqrt((sum1Sq - pow(sum1,2)/n)*(sum2Sq - pow(sum2,2)/n))

    if den==0:
        return 0

    r = num/den
    return r


def getRecommendations(prefs, person, similarity=simPearson):

    totals = {}
    simSums = {}
    for other in prefs:
        #skip Toby
        if other == person:
            continue
        sim = similarity(prefs, person, other)
        if sim <= 0:
            continue
        for item in prefs[other]:
            # Only score movies Toby hasn't seen yet
            if item not in prefs[person] or prefs[person][item] == 0:
                totals.setdefault(item, 0)
                totals[item] += prefs[other][item] * sim
                simSums.setdefault(item, 0)
                simSums[item] += sim
    rankings = [(total / simSums[item], item) for (item, total) in
                totals.items()]
    rankings.sort()
    rankings.reverse()
    return rankings


def topMatches(prefs, person, n=5, similarity=simPearson):
    scores=[(similarity(prefs,person,other),other) for other in prefs if other!=person]

    scores.sort()
    scores.reverse()
    return scores[0:n]


def transformPrefs(prefs):
    '''
    Transform the recommendations into a mapping where persons are described
    with interest scores for a given title e.g. {title: person} instead of
    {person: title}.
    '''

    result = {}
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item, {})
            # Flip item and person
            result[item][person] = prefs[person][item]
    return result



def calculateSimilarItems(prefs, n=10):
    '''
    Create a dictionary of items showing which other items they are
    most similar to.
    '''

    result = {}
    # Invert the preference matrix to be item-centric
    itemPrefs = transformPrefs(prefs)
    c = 0
    for item in itemPrefs:
        # Status updates for large datasets
        c += 1
        if c % 100 == 0:
            print '%d / %d' % (c, len(itemPrefs))
        # Find the most similar items to this one
        scores = topMatches(itemPrefs, item, n=n, similarity=sim_distance)
        result[item] = scores
    return result


def getRecommendedItems(prefs, itemMatch, user):
    userRatings = prefs[user]
    scores = {}
    totalSim = {}
    # Loop over items rated by this user
    for (item, rating) in userRatings.items():
        # Loop over items similar to this one
        for (similarity, item2) in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings:
                continue
            # Weighted sum of rating times similarity
            scores.setdefault(item2, 0)
            scores[item2] += similarity * rating
            # Sum of all the similarities
            totalSim.setdefault(item2, 0)
            totalSim[item2] += similarity
    # Divide each total score by total weighting to get an average
    rankings = [(score / totalSim[item], item) for (item, score) in
                scores.items()]
    # Return the rankings from highest to lowest
    rankings.sort()
    rankings.reverse()
    return rankings


def sim_distance(prefs, p1, p2):
    '''
    Returns a distance-based similarity score for person1 and person2.
    '''

    # Get the list of shared_items
    si = {}
    for item in prefs[p1]:
        if item in prefs[p2]:
            si[item] = 1
    # If they have no ratings in common, return 0
    if len(si) == 0:
        return 0
    # Add up the squares of all the differences
    sum_of_squares = sum([pow(prefs[p1][item] - prefs[p2][item], 2) for item in
                         prefs[p1] if item in prefs[p2]])
    return 1 / (1 + sqrt(sum_of_squares))

In [4]:
prefs = loadMovieLens()
print(prefs['999'])

{'Wizard of Oz, The (1939)': 4.0, 'Lion King, The (1994)': 3.5, 'Air Bud (1997)': 3.0, 'Notorious (1946)': 4.5, 'Forrest Gump (1994)': 4.5, 'Home Alone 3 (1997)': 4.5, 'Apollo 13 (1995)': 4.0, 'Free Willy 2: The Adventure Home (1995)': 3.5, 'Oliver! (1968)': 2.5, 'Ace Ventura: Pet Detective (1994)': 4.5, 'Toy Story (1995)': 4.5, 'Matilda (1996)': 3.5}


In [5]:
item = calculateSimilarItems(prefs, n=50)

100 / 9064
200 / 9064
300 / 9064
400 / 9064
500 / 9064
600 / 9064
700 / 9064
800 / 9064
900 / 9064
1000 / 9064
1100 / 9064
1200 / 9064
1300 / 9064
1400 / 9064
1500 / 9064
1600 / 9064
1700 / 9064
1800 / 9064
1900 / 9064
2000 / 9064
2100 / 9064
2200 / 9064
2300 / 9064
2400 / 9064
2500 / 9064
2600 / 9064
2700 / 9064
2800 / 9064
2900 / 9064
3000 / 9064
3100 / 9064
3200 / 9064
3300 / 9064
3400 / 9064
3500 / 9064
3600 / 9064
3700 / 9064
3800 / 9064
3900 / 9064
4000 / 9064
4100 / 9064
4200 / 9064
4300 / 9064
4400 / 9064
4500 / 9064
4600 / 9064
4700 / 9064
4800 / 9064
4900 / 9064
5000 / 9064
5100 / 9064
5200 / 9064
5300 / 9064
5400 / 9064
5500 / 9064
5600 / 9064
5700 / 9064
5800 / 9064
5900 / 9064
6000 / 9064
6100 / 9064
6200 / 9064
6300 / 9064
6400 / 9064
6500 / 9064
6600 / 9064
6700 / 9064
6800 / 9064
6900 / 9064
7000 / 9064
7100 / 9064
7200 / 9064
7300 / 9064
7400 / 9064
7500 / 9064
7600 / 9064
7700 / 9064
7800 / 9064
7900 / 9064
8000 / 9064
8100 / 9064
8200 / 9064
8300 / 9064
8400 / 9064
8

In [7]:
print(item)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [9]:
getRecommendedItems(prefs, item, '999')

[(4.5, 'xXx (2002)'),
 (4.5, 'Zero Effect (1998)'),
 (4.5, 'Zeitgeist: The Movie (2007)'),
 (4.5, 'Young Einstein (1988)'),
 (4.5, 'Young Doctors in Love (1982)'),
 (4.5, 'Yes Men Fix the World, The (2009)'),
 (4.5, 'X: The Man with the X-Ray Eyes (1963)'),
 (4.5, 'Written on the Wind (1956)'),
 (4.5, 'Worth Winning (1989)'),
 (4.5, 'Wonderful World of the Brothers Grimm, The (1962)'),
 (4.5, 'Women, The (2008)'),
 (4.5, 'Woman in the Fifth, The (Femme du V\xc3\xa8me, La) (2011)'),
 (4.5, 'Wolf Man, The (1941)'),
 (4.5, 'Without a Paddle (2004)'),
 (4.5, 'Witchfinder General (Conquerer Worm, The) (1968)'),
 (4.5, 'Wind That Shakes the Barley, The (2006)'),
 (4.5, 'Whole Nine Yards, The (2000)'),
 (4.5, "Who's Harry Crumb? (1989)"),
 (4.5, 'White Zombie (1932)'),
 (4.5, 'White Fang (1991)'),
 (4.5, 'Whiplash (2014)'),
 (4.5, "While We're Young (2014)"),
 (4.5, 'When a Stranger Calls (1979)'),
 (4.5, 'Whatever It Takes (2000)'),
 (4.5, 'What Maisie Knew (2012)'),
 (4.5, 'Welcome to L.A. 

In [11]:
topMatches = topMatches(prefs, '999')

In [12]:
print(topMatches)

[(1.000000000000016, '102'), (1.0000000000000107, '671'), (1.000000000000007, '13'), (1.0000000000000018, '522'), (1.0, '85')]


In [13]:
print(prefs['102'])

{'Solaris (Solyaris) (1972)': 5.0, 'Hamlet (1964)': 4.0, 'Rear Window (1954)': 5.0, 'Commitments, The (1991)': 3.0, 'In the Line of Fire (1993)': 3.0, 'Force 10 from Navarone (1978)': 4.0, "Muriel's Wedding (1994)": 4.0, 'Paths of Glory (1957)': 5.0, 'Producers, The (1968)': 5.0, 'Fanny and Alexander (Fanny och Alexander) (1982)': 5.0, 'Exorcist, The (1973)': 5.0, 'Interview with the Vampire: The Vampire Chronicles (1994)': 3.0, 'Mommie Dearest (1981)': 4.0, 'Talented Mr. Ripley, The (1999)': 2.0, 'Best Years of Our Lives, The (1946)': 5.0, 'Halloween H20: 20 Years Later (Halloween 7: The Revenge of Laurie Strode) (1998)': 3.0, 'My Fair Lady (1964)': 5.0, 'Cat on a Hot Tin Roof (1958)': 4.0, 'Barry Lyndon (1975)': 4.0, 'I Know What You Did Last Summer (1997)': 2.0, 'I Shot Andy Warhol (1996)': 4.0, 'Mass Appeal (1984)': 3.0, 'Rocky (1976)': 5.0, 'Atlantic City (1980)': 5.0, 'Gingerbread Man, The (1998)': 4.0, 'Taxi Driver (1976)': 5.0, 'Twin Peaks: Fire Walk with Me (1992)': 3.0, 'M (1

In [14]:
print(prefs['13'])

{'Iron Giant, The (1999)': 4.0, 'City Slickers (1991)': 2.5, 'Shrek (2001)': 4.0, 'Jurassic Park (1993)': 3.0, 'Rain Man (1988)': 4.0, 'Pulp Fiction (1994)': 3.5, 'Donnie Darko (2001)': 4.5, 'John Carter (2012)': 3.0, 'League of Their Own, A (1992)': 3.5, 'Wizard of Oz, The (1939)': 3.5, "Schindler's List (1993)": 4.0, 'Sixth Sense, The (1999)': 3.0, 'Monsters, Inc. (2001)': 3.5, 'Ghost (1990)': 3.0, 'Shanghai Noon (2000)': 3.0, 'Seven (a.k.a. Se7en) (1995)': 2.5, 'Toy Story (1995)': 5.0, 'Toy Story 3 (2010)': 4.0, "Bug's Life, A (1998)": 4.0, 'Eternal Sunshine of the Spotless Mind (2004)': 4.0, 'My Fair Lady (1964)': 4.0, 'Robin Hood: Prince of Thieves (1991)': 3.0, 'Matrix, The (1999)': 3.0, 'Muppet Movie, The (1979)': 3.5, 'Secret Garden, The (1993)': 4.0, 'Gran Torino (2008)': 4.5, 'Braveheart (1995)': 4.0, 'Toy Story 2 (1999)': 3.0, 'Finding Nemo (2003)': 4.5, '10 Things I Hate About You (1999)': 3.5, 'Groundhog Day (1993)': 2.5, 'Stand by Me (1986)': 4.0, 'Christmas Story, A (198

In [15]:
print(prefs['671'])

{'Waking Ned Devine (a.k.a. Waking Ned) (1998)': 4.0, 'Star Wars: Episode IV - A New Hope (1977)': 5.0, 'Wizard of Oz, The (1939)': 4.0, 'Toy Story (1995)': 5.0, 'Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)': 3.5, 'JFK (1991)': 4.0, 'Terminator 2: Judgment Day (1991)': 5.0, 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)': 5.0, "Bug's Life, A (1998)": 4.0, 'Seabiscuit (2003)': 3.5, 'Matrix Reloaded, The (2003)': 4.0, "City Slickers II: The Legend of Curly's Gold (1994)": 2.5, 'Matrix, The (1999)': 4.5, 'Lord of the Rings: The Fellowship of the Ring, The (2001)': 5.0, 'Whale Rider (2002)': 2.5, 'Office Space (1999)': 3.5, 'Hunt for Red October, The (1990)': 4.0, 'Shakespeare in Love (1998)': 4.0, 'Magnolia (1999)': 3.0, "Amelie (Fabuleux destin d'Am\xc3\xa9lie Poulain, Le) (2001)": 4.5, 'Dead Man Walking (1995)': 4.0, 'Animal House (1978)': 4.0, 'Good Will Hunting (1997)': 4.0, 'Amadeus (1984)': 4.0, 'Green Mile, The (1999)': 4.0, 'Fugitive, T