In [1]:
import numpy as np
import pandas as pd

In [2]:
#!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip

In [3]:
%cd recommendation

/tmp/working/recommendation


In [4]:
datafile = 'ml-100k/u.data'
data = pd.read_csv(datafile, sep='\t', header=None,
                  names=['userid', 'itemid', 'rating', 'timestamp'])

In [5]:
data.head()

Unnamed: 0,userid,itemid,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
movieinfofile='ml-100k/u.item'


In [7]:
movieinfo = pd.read_csv(movieinfofile, sep='|', header = None, index_col=False,
                   names=['itemid', 'title'], usecols=[0,1], encoding= "ISO-8859-1")

In [8]:
movieinfo.head()

Unnamed: 0,itemid,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [9]:
data = pd.merge(data, movieinfo, left_on='itemid', right_on='itemid')
data.head()

Unnamed: 0,userid,itemid,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [10]:
userids = data.userid
userids2 = data[['userid']]

In [11]:
userids.head()

0    196
1     63
2    226
3    154
4    306
Name: userid, dtype: int64

In [12]:
userids2.head()

Unnamed: 0,userid
0,196
1,63
2,226
3,154
4,306


In [13]:
data.loc[0:10, ['userid']]

Unnamed: 0,userid
0,196
1,63
2,226
3,154
4,306
5,296
6,34
7,271
8,201
9,209


In [14]:
toystoryusers = data[data.title == 'Toy Story (1995)']
toystoryusers.head()

Unnamed: 0,userid,itemid,rating,timestamp,title
3397,308,1,4,887736532,Toy Story (1995)
3398,287,1,5,875334088,Toy Story (1995)
3399,148,1,4,877019411,Toy Story (1995)
3400,280,1,4,891700426,Toy Story (1995)
3401,66,1,3,883601324,Toy Story (1995)


In [15]:
data = pd.DataFrame.sort_values(data, ['userid', 'itemid'], ascending=[0,1])

data.head()

Unnamed: 0,userid,itemid,rating,timestamp,title
23781,943,2,5,888639953,GoldenEye (1995)
65410,943,9,3,875501960,Dead Man Walking (1995)
35098,943,11,4,888639000,Seven (Se7en) (1995)
43773,943,12,5,888639093,"Usual Suspects, The (1995)"
57040,943,22,4,888639042,Braveheart (1995)


In [16]:
max(data.userid)

943

In [17]:
max(data.itemid)

1682

In [18]:
moviesperuser = data.userid.value_counts()
userspermovie = data.title.value_counts()
userspermovie

Star Wars (1977)                                             583
Contact (1997)                                               509
Fargo (1996)                                                 508
Return of the Jedi (1983)                                    507
Liar Liar (1997)                                             485
English Patient, The (1996)                                  481
Scream (1996)                                                478
Toy Story (1995)                                             452
Air Force One (1997)                                         431
Independence Day (ID4) (1996)                                429
Raiders of the Lost Ark (1981)                               420
Godfather, The (1972)                                        413
Pulp Fiction (1994)                                          394
Twelve Monkeys (1995)                                        392
Silence of the Lambs, The (1991)                             390
Jerry Maguire (1996)     

In [19]:
def favoritemovies(activeuser, N):
    topmovies = pd.DataFrame.sort_values(data[data.userid == activeuser], 
                                        ['rating'], ascending=[0])[:N]
    return list(topmovies.title)

In [20]:
favoritemovies(5, 3)

['Men in Black (1997)',
 'Blade Runner (1982)',
 'Empire Strikes Back, The (1980)']

In [77]:
useritemmatrix = pd.pivot_table(data, values='rating', index = ['userid'],
                               columns= 'itemid')
useritemmatrix.head()

itemid,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [48]:
from scipy.spatial.distance import correlation
def similarity(user1, user2):
    user1 = np.array(user1)-np.nanmean(user1)
    user2 = np.array(user2)-np.nanmean(user2)
    commonitemids = [ i for i in range(len(user1)) if user1[i] > 0 and user2[i] > 0]
    if len(commonitemids) == 0:
        return 0
    else:
        user1=np.array([user1[i] for i in commonitemids])
        user2=np.array([user2[i] for i in commonitemids])
        return correlation(user1, user2)    

In [49]:
def nearestneighborratings(activeuser, k):
    similaritymatrix=pd.DataFrame(index=useritemmatrix.index, columns=['similarity'])
    for i in useritemmatrix.index:
        similaritymatrix.loc[i] = similarity(useritemmatrix.loc[activeuser],
                                            useritemmatrix.loc[i])
    similaritymatrix=pd.DataFrame.sort_values(similaritymatrix, ['similarity'], ascending=[0])
    nearestneighbors = similaritymatrix[:k]
    neighboritemratings=useritemmatrix.loc[nearestneighbors.index]
    predicteditemrating=pd.DataFrame(index=useritemmatrix.columns, columns=['rating'])
    for i in useritemmatrix.columns:
        predictedrating = np.nanmean(useritemmatrix.loc[activeuser])
        for j in neighboritemratings.index:
            if useritemmatrix.loc[j,i] > 0:
                predictedrating += (useritemmatrix.loc[j,i]-
                                    np.nanmean(useritemmatrix.loc[j]))*nearestneighbors.loc[j, 'similarity']
        predicteditemrating.loc[i, 'rating'] = predictedrating
    return predicteditemrating           
    

In [50]:
def topnrecommendations(activeuser, N):
    predictitemrating=nearestneighborratings(activeuser, 10)
    moviesalreadywatched=list(useritemmatrix.loc[activeuser].
                              loc[useritemmatrix.loc[activeuser]>0].index)
    predictitemrating=predictitemrating.drop(moviesalreadywatched)
    toprecommendation=pd.DataFrame.sort_values(predictitemrating, ['rating'], ascending=[0])[:N]
    toprecommendationtitles = (movieinfo.loc[movieinfo.itemid.isin(toprecommendation.index)])
    return list(toprecommendationtitles.title)

In [55]:
import warnings
warnings.filterwarnings('ignore')
topnrecommendations(5, 5)

['Truth About Cats & Dogs, The (1996)',
 'Sense and Sensibility (1995)',
 'Scream (1996)',
 'L.A. Confidential (1997)',
 'First Wives Club, The (1996)']

In [56]:
(movieinfo.loc[movieinfo.itemid.isin(toprecommendation.index)])

Unnamed: 0,itemid,title
110,111,"Truth About Cats & Dogs, The (1996)"
274,275,Sense and Sensibility (1995)
287,288,Scream (1996)
301,302,L.A. Confidential (1997)
475,476,"First Wives Club, The (1996)"


In [72]:
def matrixfactorization(R, K, steps=10, gamma=0.001, lamda=0.02 ):
    N = len(R.index)
    M=len(R.columns)
    P=pd.DataFrame(np.random.rand(N,K), index=R.index)
    Q=pd.DataFrame(np.random.rand(M,K), index=R.columns)
    for step in range(steps):
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    eij=R.loc[i,j]-np.dot(P.loc[i],Q.loc[j])
                    P.loc[i]=P.loc[i]+gamma*(eij*Q.loc[j]-lamda*P.loc[i])
                    Q.loc[j]=Q.loc[j]+gamma*(eij*P.loc[i]-lamda*Q.loc[j])
        e=0
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    e=e+pow(R.loc[i,j]-np.dot(P.loc[i],Q.loc[j]), 2) + lamda*(pow(np.linalg.norm(P.loc[i]), 2)+pow(np.linalg.norm(Q.loc[j]), 2))
        if e<0.001:
            break
        print(step)
    return P,Q

In [74]:
N = len(useritemmatrix.index)

In [75]:
M=len(useritemmatrix.columns)

In [79]:
useritemmatrix[:10]

itemid,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
6,4.0,,,,,,2.0,4.0,4.0,,...,,,,,,,,,,
7,,,,5.0,,,5.0,5.0,5.0,4.0,...,,,,,,,,,,
8,,,,,,,3.0,,,,...,,,,,,,,,,
9,,,,,,5.0,4.0,,,,...,,,,,,,,,,
10,4.0,,,4.0,,,4.0,,4.0,,...,,,,,,,,,,


In [81]:
(P,Q) = matrixfactorization(useritemmatrix.iloc[:100, :100], K=2, gamma=0.001, lamda=0.02, steps=10)

0
1
2
3
4
5
6
7
8
9


In [86]:
toprecommendations=pd.DataFrame.sort_values(predictitemrating, ['rating'], ascending=[0])

In [91]:
toprecommendationtitles

Unnamed: 0,itemid,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6,7,Twelve Monkeys (1995)
7,8,Babe (1995)
8,9,Dead Man Walking (1995)
9,10,Richard III (1995)


In [None]:
(P,Q) = matrixfactorization(useritemmatrix[:100, :100], K=2, gamma=0.001, lamda=0.02, steps=10)

In [93]:
activeuser=5
predictitemrating=pd.DataFrame(np.dot(P.loc[activeuser],Q.T), index=Q.index, columns=['rating'])
toprecommendations=pd.DataFrame.sort_values(predictitemrating, ['rating'], ascending=[0])[:3]
toprecommendationtitles=movieinfo.loc[movieinfo.itemid.isin(toprecommendations.index)]
print(list(toprecommendationtitles.title))


['Star Wars (1977)', 'Shawshank Redemption, The (1994)', 'Fargo (1996)']


In [71]:
for i in xrange(1, 20):
    print(i)

NameError: name 'xrange' is not defined