## Writing a data source, to test the pearson and euclidean distance functions

In [1]:
# Sample ratings used throughout the notebook.
# Layout: critic name -> {movie title -> numeric rating}.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}

The dataset is a dictionary of dictionaries: each key is the name of a critic, and each value is a dictionary mapping the movies rated by that critic to the ratings given. Accessing a rating therefore requires two keys: the name of the critic, then the name of the film. The first distance function used is the simple Euclidean distance.

In [2]:
# Look up one rating: the outer key is the critic, the inner key is the movie.
critics['Lisa Rose']['Lady in the Water']

2.5

In [3]:
# Show every rating recorded for a single critic.
critics['Toby']

{'Snakes on a Plane': 4.5, 'Superman Returns': 4.0, 'You, Me and Dupree': 1.0}

## Finding Similar Critics

In [4]:
def euclidean(ratings,cr1,cr2):
    """Return a similarity score for two critics based on squared rating differences.

    Only movies rated by both critics are compared. The score is
    ``1 / (1 + sum of squared differences)``, so identical ratings give 1.0
    and larger disagreements push the score towards 0.

    Args:
        ratings: mapping of critic name -> {movie title -> rating}.
        cr1: key of the first critic in ``ratings``.
        cr2: key of the second critic in ``ratings``.

    Returns:
        0 when the critics share no movie, otherwise a float in (0, 1].

    Raises:
        KeyError: if ``cr1`` or ``cr2`` is not a key of ``ratings``.
    """
    shared = [movie for movie in ratings[cr1] if movie in ratings[cr2]]
    if not shared:
        return 0
    sum_of_sq = sum(pow(ratings[cr1][movie] - ratings[cr2][movie], 2) for movie in shared)
    # 1.0 forces true division: with integer ratings on Python 2 a plain
    # 1/(1+n) would truncate to 0.
    return 1.0 / (1 + sum_of_sq)
        

In [5]:
# Score one specific pair of critics with the euclidean-based similarity.
si=euclidean(critics,'Lisa Rose','Toby')

In [6]:
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print(si)

0.222222222222


Instead of running the function by hand for each pair of critics, I wrote a snippet that builds a dictionary in which each key is a critic and each value is the list of similarity scores with respect to every other critic. The similarity of a critic with respect to himself/herself would obviously be 1, so that comparison is skipped.

In [7]:
def all_sims(critics, similarity=None):
    """Score every critic against every other critic.

    Args:
        critics: mapping of critic name -> {movie title -> rating}.
        similarity: optional callable ``similarity(critics, a, b)`` returning
            the score for critics ``a`` and ``b``. Defaults to ``euclidean``.

    Returns:
        dict mapping each critic to a list of ``{'Critic': other, 'Score': s}``
        entries, one per other critic. A critic is never compared with
        itself, so a dataset with a single critic yields an empty dict.
    """
    if similarity is None:
        # Looked up at call time so the default follows the notebook's
        # current definition of euclidean.
        similarity = euclidean
    sim_all = {}
    # Iterating the dict directly works on both Python 2 and 3
    # (dict.iteritems() exists only on Python 2).
    for critic in critics:
        for c_in in critics:
            if critic == c_in:
                continue
            entry = {'Critic': c_in, 'Score': similarity(critics, critic, c_in)}
            sim_all.setdefault(critic, []).append(entry)
    return sim_all

In [8]:
# Build the pairwise score table for every critic in the dataset.
sims=all_sims(critics)

In [9]:
# Display the table: critic -> list of {'Critic': ..., 'Score': ...} entries.
sims

{'Claudia Puig': [{'Critic': 'Jack Matthews', 'Score': 0.18181818181818182},
  {'Critic': 'Mick LaSalle', 'Score': 0.17391304347826086},
  {'Critic': 'Lisa Rose', 'Score': 0.2857142857142857},
  {'Critic': 'Toby', 'Score': 0.23529411764705882},
  {'Critic': 'Gene Seymour', 'Score': 0.13333333333333333},
  {'Critic': 'Michael Phillips', 'Score': 0.5714285714285714}],
 'Gene Seymour': [{'Critic': 'Jack Matthews', 'Score': 0.8},
  {'Critic': 'Mick LaSalle', 'Score': 0.12903225806451613},
  {'Critic': 'Claudia Puig', 'Score': 0.13333333333333333},
  {'Critic': 'Lisa Rose', 'Score': 0.14814814814814814},
  {'Critic': 'Toby', 'Score': 0.10810810810810811},
  {'Critic': 'Michael Phillips', 'Score': 0.21052631578947367}],
 'Jack Matthews': [{'Critic': 'Mick LaSalle', 'Score': 0.13793103448275862},
  {'Critic': 'Claudia Puig', 'Score': 0.18181818181818182},
  {'Critic': 'Lisa Rose', 'Score': 0.21052631578947367},
  {'Critic': 'Toby', 'Score': 0.11764705882352941},
  {'Critic': 'Gene Seymour', '

Above is a dictionary of all critics, along with their similarity scores for each other critic. I preferred a dictionary to a DataFrame, as it makes further computations easy. 

# Pearson Correlation Distance

In [10]:
from math import sqrt
def pearson(critics,cr1,cr2):
    """Return the Pearson correlation coefficient between two critics' ratings.

    Only movies rated by both critics are used.

    Args:
        critics: mapping of critic name -> {movie title -> rating}.
        cr1: key of the first critic in ``critics``.
        cr2: key of the second critic in ``critics``.

    Returns:
        The correlation coefficient, or 0 when it is undefined: the critics
        share no movie, or at least one of them gave the same rating to
        every shared movie (zero variance).

    Raises:
        KeyError: if ``cr1`` or ``cr2`` is not a key of ``critics``.
    """
    shared = [movie for movie in critics[cr1] if movie in critics[cr2]]
    if not shared:
        # Without this guard the divisions by n below raise ZeroDivisionError.
        return 0
    # Float count keeps every division below a true division, even with
    # integer ratings on Python 2.
    n = float(len(shared))

    sum1 = sum(critics[cr1][movie] for movie in shared)
    sum2 = sum(critics[cr2][movie] for movie in shared)

    sum1_sq = sum(pow(critics[cr1][movie], 2) for movie in shared)
    sum2_sq = sum(pow(critics[cr2][movie], 2) for movie in shared)

    pr_cr = sum(critics[cr1][movie] * critics[cr2][movie] for movie in shared)

    num = pr_cr - (sum1 * sum2 / n)
    var1 = sum1_sq - pow(sum1, 2) / n
    var2 = sum2_sq - pow(sum2, 2) / n
    # Mathematically both terms are >= 0; rounding can leave a tiny negative
    # value, which would make sqrt() raise ValueError. Treat it as zero variance.
    if var1 <= 0 or var2 <= 0:
        return 0
    den = sqrt(var1 * var2)
    if den == 0:
        return 0
    return num / den

In [11]:
def all_sims_pearson(critics):
    """Compute the Pearson score of every critic against every other critic.

    Args:
        critics: mapping of critic name -> {movie title -> rating}.

    Returns:
        dict mapping each critic to a list of ``{'Critic': other, 'Score': r}``
        entries, one per other critic, where ``r`` is ``pearson(critics,
        critic, other)``. A critic is never compared with itself, so a
        dataset with a single critic yields an empty dict.
    """
    sim_all = {}
    # Iterating the dict directly works on both Python 2 and 3
    # (dict.iteritems() exists only on Python 2).
    for critic in critics:
        for c_in in critics:
            if critic == c_in:
                continue
            entry = {'Critic': c_in, 'Score': pearson(critics, critic, c_in)}
            sim_all.setdefault(critic, []).append(entry)
    return sim_all

In [12]:
# Display the Pearson score of every critic against every other critic.
all_sims_pearson(critics)

{'Claudia Puig': [{'Critic': 'Jack Matthews', 'Score': 0.02857142857142857},
  {'Critic': 'Mick LaSalle', 'Score': 0.5669467095138411},
  {'Critic': 'Lisa Rose', 'Score': 0.5669467095138396},
  {'Critic': 'Toby', 'Score': 0.8934051474415647},
  {'Critic': 'Gene Seymour', 'Score': 0.31497039417435607},
  {'Critic': 'Michael Phillips', 'Score': 1.0}],
 'Gene Seymour': [{'Critic': 'Jack Matthews', 'Score': 0.963795681875635},
  {'Critic': 'Mick LaSalle', 'Score': 0.41176470588235276},
  {'Critic': 'Claudia Puig', 'Score': 0.31497039417435607},
  {'Critic': 'Lisa Rose', 'Score': 0.39605901719066977},
  {'Critic': 'Toby', 'Score': 0.38124642583151164},
  {'Critic': 'Michael Phillips', 'Score': 0.20459830184114206}],
 'Jack Matthews': [{'Critic': 'Mick LaSalle', 'Score': 0.21128856368212925},
  {'Critic': 'Claudia Puig', 'Score': 0.02857142857142857},
  {'Critic': 'Lisa Rose', 'Score': 0.7470178808339965},
  {'Critic': 'Toby', 'Score': 0.66284898035987},
  {'Critic': 'Gene Seymour', 'Score':

The above dictionary provides the Pearson correlation coefficient between each pair of critics. The Pearson coefficient corrects for cases such as critics who are consistently harsher than the others: it compares ratings relative to each critic's own scale rather than as absolute values, as is the case with the Euclidean distance. Other possible measures are Jaccard similarity and Manhattan distance, both of which have their own set of flaws.

# Ranking the Critics

The rankings below sort the similarity scores and select the top 3 most similar critics for each critic.

In [13]:
# Keep the Pearson score table so it can be ranked in the next step.
sim_all=all_sims_pearson(critics)

In [14]:
from operator import itemgetter
def most_sim(sim_all, top_n=3):
    """Keep the highest-scoring entries for each critic.

    Args:
        sim_all: mapping of critic -> list of dicts that each contain a
            ``'Score'`` key (the structure produced by ``all_sims`` /
            ``all_sims_pearson``).
        top_n: how many entries to keep per critic; defaults to 3.

    Returns:
        dict mapping each critic to its entries sorted by ``'Score'`` in
        descending order and truncated to ``top_n`` items. Entries with equal
        scores keep their original relative order.
    """
    ranks_top3 = {}
    # .items() works on both Python 2 and 3 (dict.iteritems() is Python 2 only).
    for critic, scores in sim_all.items():
        ranked = sorted(scores, key=itemgetter('Score'), reverse=True)
        ranks_top3[critic] = ranked[:top_n]
    return ranks_top3
   

In [15]:
# Rank the stored Pearson table and keep the best matches per critic.
top3_critics=most_sim(sim_all)

In [16]:
# Display the ranked matches for each critic.
top3_critics

{'Claudia Puig': [{'Critic': 'Michael Phillips', 'Score': 1.0},
  {'Critic': 'Toby', 'Score': 0.8934051474415647},
  {'Critic': 'Mick LaSalle', 'Score': 0.5669467095138411}],
 'Gene Seymour': [{'Critic': 'Jack Matthews', 'Score': 0.963795681875635},
  {'Critic': 'Mick LaSalle', 'Score': 0.41176470588235276},
  {'Critic': 'Lisa Rose', 'Score': 0.39605901719066977}],
 'Jack Matthews': [{'Critic': 'Gene Seymour', 'Score': 0.963795681875635},
  {'Critic': 'Lisa Rose', 'Score': 0.7470178808339965},
  {'Critic': 'Toby', 'Score': 0.66284898035987}],
 'Lisa Rose': [{'Critic': 'Toby', 'Score': 0.9912407071619299},
  {'Critic': 'Jack Matthews', 'Score': 0.7470178808339965},
  {'Critic': 'Mick LaSalle', 'Score': 0.5940885257860044}],
 'Michael Phillips': [{'Critic': 'Claudia Puig', 'Score': 1.0},
  {'Critic': 'Lisa Rose', 'Score': 0.40451991747794525},
  {'Critic': 'Gene Seymour', 'Score': 0.20459830184114206}],
 'Mick LaSalle': [{'Critic': 'Toby', 'Score': 0.9244734516419049},
  {'Critic': 'Lisa

# Making recommendations 

### Recommending other critics

In [31]:
def recommend(critics,critic,similarity=None):
    """Rank movies the given critic has not rated, using similar critics' ratings.

    Each other critic whose similarity to ``critic`` is positive contributes
    its ratings, weighted by that similarity. A movie's predicted score is the
    similarity-weighted average of those ratings.

    Args:
        critics: mapping of critic name -> {movie title -> rating}.
        critic: key in ``critics`` to produce recommendations for.
        similarity: optional callable ``similarity(critics, a, b)``.
            Defaults to ``pearson``.

    Returns:
        list of ``(predicted_score, movie)`` tuples, highest score first.
        Empty when no other critic has a positive similarity or there is
        nothing unrated to recommend.

    Raises:
        KeyError: if ``critic`` is not a key of ``critics``.
    """
    if similarity is None:
        # Looked up at call time so the default follows the notebook's
        # current definition of pearson.
        similarity = pearson
    totals = {}    # movie -> sum of (rating * similarity)
    sim_sums = {}  # movie -> sum of similarities that contributed
    own_ratings = critics[critic]
    for cr_iter in critics:
        if cr_iter == critic:
            continue
        sim = similarity(critics, critic, cr_iter)
        if sim <= 0:
            # Dissimilar or uncorrelated critics carry no weight.
            continue
        for movie, rating in critics[cr_iter].items():
            # Only score movies the target critic has not rated; a rating
            # of 0 is treated as "not rated". The check must be against the
            # target critic's ratings, not the other critic's own.
            if movie in own_ratings and own_ratings[movie] != 0:
                continue
            totals[movie] = totals.get(movie, 0.0) + rating * sim
            sim_sums[movie] = sim_sums.get(movie, 0.0) + sim
    rankings = [(total / sim_sums[movie], movie) for movie, total in totals.items()]
    rankings.sort(reverse=True)
    return rankings

In [32]:
# Run recommend() for the critic 'Toby'.
recommend(critics,'Toby')