In [14]:
import pandas as pd
import numpy as np
from pprint import pprint
from math import sqrt

In [15]:
column_names =['user_id','item_id','rating','timestamp']
df = pd.read_csv('./ml-100k/u.data', sep='\t',names=column_names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [16]:
df2 = pd.read_csv('./ml-100k/u.item', sep='\|',encoding = "ISO-8859-1")
df2.head()

  df2 = pd.read_csv('./ml-100k/u.item', sep='\|',encoding = "ISO-8859-1")


Unnamed: 0,1,Toy Story (1995),01-Jan-1995,Unnamed: 3,http://us.imdb.com/M/title-exact?Toy%20Story%20(1995),0,0.1,0.2,1.1,1.2,...,0.6,0.7,0.8,0.9,0.10,0.11,0.12,0.13,0.14,0.15
0,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
#Collab Filtering Recommendation Process

In [5]:
#loadin the movie and rating 
def loadMovieLens(path='./ml-100k'):
    # Get movie titles
    movies={}
    for line in open(path+'/u.item',encoding = "ISO-8859-1"):
        (id,title)=line.split('|')[0:2]
        movies[id]=title
    # Load data
    prefs={}
    for line in open(path+'/u.data'):
        (user,movieid,rating,ts)=line.split('\t')
        prefs.setdefault(user,{})
        prefs[user][movies[movieid]]=float(rating)
    return prefs

In [6]:
#This function calculates Pearson Correlation Score
def sim_pearson(prefs,p1,p2): 
    # Get the list of mutually rated items
    si={}
    for item in prefs[p1]:
        if item in prefs[p2]: si[item]=1
    # Find the number of elements
    n=len(si)
    # if they are no ratings in common, return 0
    if n==0: return 0
    # Add up all the preferences
    
    sum1=sum([prefs[p1][it] for it in si])
    sum2=sum([prefs[p2][it] for it in si])
    # Sum up the squares
    sum1Sq=sum([pow(prefs[p1][it],2) for it in si])
    sum2Sq=sum([pow(prefs[p2][it],2) for it in si])
    # Sum up the products
    pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
    # Calculate Pearson score
    num=pSum-(sum1*sum2/n)
    den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
    if den==0: return 0
    r=num/den
    return r

In [8]:
#Returns top score
def topMatches(prefs,person,n=5,similarity=sim_pearson):
    scores=[(similarity(prefs,person,other),other) for other in prefs if other!=person]
    # print(scores)
    # Sort the list so the highest scores appear at the top
    scores.sort()
    scores.reverse()
    return scores[0:n]

In [9]:
#gives recommendation
def getCollabRecommendations(prefs,person,similarity=sim_pearson):
    totals={}
    simSums={}
    for other in prefs:
        # don't compare me to myself
        if other==person: continue
        sim=similarity(prefs,person,other)
        # ignore scores of zero or lower
        if sim<=0: continue
        for item in prefs[other]:
            # only score movies I haven't seen yet
            if item not in prefs[person] or prefs[person][item]==0:
                # Similarity * Score
                totals.setdefault(item,0)
                totals[item]+=prefs[other][item]*sim
                # Sum of similarities
                simSums.setdefault(item,0)
                simSums[item]+=sim
        # print(totals)
        # print(simSums)
        # Create the normalized list
        rankings=[(total/simSums[item],item) for item,total in totals.items()]
        # Return the sorted list
        rankings.sort( )
        rankings.reverse( )
        return rankings

In [11]:
#load movie lens dataset
prefs=loadMovieLens()
#prefs['87']

In [17]:
#get the recommendation for the user Eg getContentBasedRecommended(loaded_dataset,'user_id')
pprint(getCollabRecommendations(prefs,'87')[0:30])

[(5.0, 'Stand by Me (1986)'),
 (5.0, 'Secrets & Lies (1996)'),
 (5.0, 'English Patient, The (1996)'),
 (5.0, 'Being There (1979)'),
 (4.0, 'Waiting for Guffman (1996)'),
 (4.0, "Muriel's Wedding (1994)"),
 (4.0, 'Mrs. Brown (Her Majesty, Mrs. Brown) (1997)'),
 (4.0, 'Kids in the Hall: Brain Candy (1996)'),
 (4.0, 'Harold and Maude (1971)'),
 (4.0, 'Fast, Cheap & Out of Control (1997)'),
 (3.0, 'Van, The (1996)'),
 (3.0, 'Shall We Dance? (1996)'),
 (3.0, "Marvin's Room (1996)"),
 (3.0, 'Kolya (1996)'),
 (3.0, 'Full Monty, The (1997)'),
 (3.0, 'Cold Comfort Farm (1995)'),
 (3.0, 'Boogie Nights (1997)'),
 (3.0, 'Beautiful Girls (1996)'),
 (2.0, 'Princess Bride, The (1987)'),
 (2.0, 'Men in Black (1997)'),
 (2.0, 'Englishman Who Went Up a Hill, But Came Down a Mountain, The (1995)'),
 (1.0, 'Operation Dumbo Drop (1995)')]
