In [2]:
import pandas as pd
import numpy as np

In [None]:
# Load the movie catalogue and the user ratings from the local data directory.
movies = pd.read_csv('data/movies.csv')
# The rating timestamp is not used below, so drop it right after loading
# instead of mutating the frame in place.
ratings = pd.read_csv('data/ratings.csv').drop(['timestamp'], axis=1)

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,6,2.0
1,1,22,3.0
2,1,32,2.0
3,1,50,5.0
4,1,110,4.0


In [6]:
# Build the movieId -> title lookup once. Filtering `movies` inside the mapper
# rescans the whole frame for every single rating row.
# drop_duplicates keeps the first row per movieId, matching the first-match
# semantics of the previous `.values[0]` lookup.
title_by_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['title']


def replace_name(x):
    """Return the title of the movie whose ``movieId`` equals ``x``.

    Raises ``KeyError`` when ``x`` is not a ``movieId`` present in ``movies``.
    """
    return title_by_id.loc[x]


# Fail loudly when an id has no title. This also catches re-running the cell
# after ``ratings.movieId`` has already been replaced by titles.
known = ratings['movieId'].isin(title_by_id.index)
if not known.all():
    raise KeyError(
        'ratings.movieId contains values missing from movies.movieId '
        '(was this cell already run?): {!r}'.format(
            ratings.loc[~known, 'movieId'].unique()[:5].tolist()
        )
    )

# Vectorised replacement of every movieId by its title.
ratings['movieId'] = ratings['movieId'].map(title_by_id)

In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,Heat (1995),2.0
1,1,Copycat (1995),3.0
2,1,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),2.0
3,1,"Usual Suspects, The (1995)",5.0
4,1,Braveheart (1995),4.0


In [8]:
# User x movie matrix of ratings: one row per userId, one column per value of
# ratings.movieId, NaN where a user has no rating for that column.
# Duplicate (userId, movieId) pairs are combined by pivot_table's default aggfunc.
M = ratings.pivot_table(
    values='rating',
    index=['userId'],
    columns=['movieId'],
)

In [9]:
M.shape

(706, 8551)

In [10]:
M

movieId,"""Great Performances"" Cats (1998)",'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 (1979),10 Items or Less (2006),10 Things I Hate About You (1999),...,Zorba the Greek (Alexis Zorbas) (1964),Zulu (1964),Zulu Dawn (1979),[REC] (2007),[REC]² (2009),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,1.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


If you remember from stats 101, Pearson's r is:

$$r = \frac{\sum {(x-\overline{x})(y-\overline{y})}}{\sqrt{\sum {(x-\overline{x})^2}}\,\sqrt{\sum {(y-\overline{y})^2}}}$$

and takes the value 1 for perfectly correlated data, -1 for perfectly negatively correlated data, and 0 when there is no linear correlation.

In [11]:
def pearson(s1, s2, pairwise_complete=False):
    """Take two pd.Series objects and return a pearson correlation.

    Parameters
    ----------
    s1, s2 : pd.Series
        Numeric series; they are aligned on their index when multiplied.
    pairwise_complete : bool, default False
        If False (the original behaviour), each series is centred on the mean
        of its own non-null values and the denominator sums over each series'
        own non-null values, while the numerator only counts rows where both
        values are present (pandas sums skip NaN). With missing data this
        shrinks the result towards 0 compared with the textbook r.
        If True, both series are first restricted to the rows where both
        values are present, which gives the textbook Pearson r on the
        overlapping observations.

    Returns
    -------
    float
        The correlation, or NaN when the denominator is zero (for example a
        constant series or no usable observations).
    """
    if pairwise_complete:
        # Keep only labels present in both series, then only the rows where
        # both values are non-null.
        s1, s2 = s1.align(s2, join='inner')
        both = s1.notna() & s2.notna()
        s1, s2 = s1[both], s2[both]

    d1 = s1 - s1.mean()
    d2 = s2 - s2.mean()
    numerator = (d1 * d2).sum()
    # np.sqrt returns a numpy float, so a zero denominator yields NaN (with a
    # RuntimeWarning) rather than raising ZeroDivisionError.
    denominator = np.sqrt((d1 ** 2).sum() * (d2 ** 2).sum())
    return numerator / denominator

In [12]:
pearson(M["'burbs, The (1989)"], M['10 Things I Hate About You (1999)'])

0.22416722776742143

In [13]:
pearson(
    M["Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)"],
    M['Harry Potter and the Half-Blood Prince (2009)'],
)

0.26225756321650101

In [14]:
pearson(
    M['Mission: Impossible II (2000)'],
    M['Erin Brockovich (2000)'],
)

-0.026284040284270269

In [15]:
pearson(M['Clerks (1994)'], M['Mallrats (1995)'])

0.31399102539078944

In [16]:
def get_recs(movie_name, M, num=None):
    """Rank every other column of ``M`` by its correlation with ``movie_name``.

    Parameters
    ----------
    movie_name : column label
        Column of ``M`` to compare all other columns against.
    M : pd.DataFrame
        Ratings matrix; each column is scored against ``M[movie_name]`` using
        ``pearson``.
    num : int or None, default None
        Maximum number of results to return. ``None`` returns the full ranking.

    Returns
    -------
    list of (title, correlation) tuples
        Sorted by correlation, highest first. Columns whose correlation is NaN
        are left out.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of ``M``.
    """
    # Loop-invariant: select the reference column once instead of per title.
    target = M[movie_name]

    scored = []
    for title in M.columns:
        if title == movie_name:
            continue
        cor = pearson(target, M[title])
        # NaN means the correlation is undefined for this pair; skip it.
        if not np.isnan(cor):
            scored.append((title, cor))

    # list.sort is stable, so ties keep the column order of M.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:num]

    

In [17]:
# Top 10 titles most positively correlated with Clerks.
recs = get_recs(movie_name='Clerks (1994)', M=M, num=10)

In [18]:
recs[:10]

[('Jay and Silent Bob Strike Back (2001)', 0.34718831440868414),
 ('Mallrats (1995)', 0.31399102539078944),
 ('Pink Flamingos (1972)', 0.30348675721906476),
 ('Incredibly True Adventure of Two Girls in Love, The (1995)',
  0.27353105716343368),
 ('Single White Female (1992)', 0.27316404890607199),
 ('Heavy Metal (1981)', 0.26224404141430091),
 ('American Pimp (1999)', 0.26135184335341582),
 ('Dazed and Confused (1993)', 0.26008403861889751),
 ('Chaplin (1992)', 0.25788755384718542),
 ('Love! Valour! Compassion! (1997)', 0.25788755384718542)]

In [19]:
# Ask for the full ranking so the tail holds the most negatively correlated
# titles. The limit is derived from M rather than hard-coding its column count,
# which goes stale as soon as the data changes.
anti_recs = get_recs('Clerks (1994)', M, len(M.columns))

In [20]:
anti_recs[-10:]

[('New York Ripper, The (Squartatore di New York, Lo) (1982)',
  -0.21210788326258792),
 ('City of the Living Dead (a.k.a. Gates of Hell, The) (Paura nella città dei morti viventi) (1980)',
  -0.22333717295148994),
 ('Glass Shield, The (1994)', -0.22333717295148994),
 ('Month by the Lake, A (1995)', -0.22333717295148994),
 ('Ichi the Killer (Koroshiya 1) (2001)', -0.22921833173707287),
 ('Baby Boy (2001)', -0.25788755384718542),
 ('Cosi (1996)', -0.25788755384718542),
 ('That Old Feeling (1997)', -0.25788755384718542),
 ('Milagro Beanfield War (1988)', -0.25899604930508913),
 ('Wild Reeds (Les roseaux sauvages) (1994)', -0.26354737747118984)]