In [5]:
import numpy as np
import pandas as pd

#generate movie recommednation, given movie they have already watched,
#and the ratings they gave for those movies

#pandas, a data analysis library, for most of the data preparation
#and analysis. We cna read the data from a csw, write to a csv,
#manipulate it into different shapres

dataFile = '/Users/sokalong/desktop/RecSys-fyp/ml-100k/u.data'
data=pd.read_csv(dataFile, sep="\t", header=None, 
                 names=['userId', 'itemId', 'rating', 'timestamp'])






In [6]:
#data is a panadas DataFrame object. There are many complex ways of indexing this
#head() will print the first few rows in the dataframe
data.head()

Unnamed: 0,userId,itemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [9]:
movieInfoFile  = '/Users/sokalong/desktop/RecSys-fyp/ml-100k/u.item'
moviedata = pd.read_csv(movieInfoFile,sep="|", header=None,index_col=False,names=['itemId','title'],usecols=[0,1],encoding='latin-1')
#here we are reading the movie data. We just care about the movieID
#and the title, we are only reading first two col
# ->  usecols
moviedata.head()

Unnamed: 0,itemId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [11]:
#merge the two data
data=pd.merge(data,moviedata,left_on='itemId',right_on="itemId")
data.head()

Unnamed: 0,userId,itemId,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [12]:
#all thevalues in a col can simply be indexed by the col name
userIds=data.userId #panda series object
userIds2=data[['userId']] #dataframe

In [13]:
#loc
#give it col and row index or use  boolean indexing
data.loc[0:10,['userId']] #fetching the first 11rows

Unnamed: 0,userId
0,196
1,63
2,226
3,154
4,306
5,296
6,34
7,271
8,201
9,209


In [15]:
#subset dataframe
toyStoryUsers = data[data.title=="Toy Story (1995)"]
toyStoryUsers.head()

Unnamed: 0,userId,itemId,rating,timestamp,title
3397,308,1,4,887736532,Toy Story (1995)
3398,287,1,5,875334088,Toy Story (1995)
3399,148,1,4,877019411,Toy Story (1995)
3400,280,1,4,891700426,Toy Story (1995)
3401,66,1,3,883601324,Toy Story (1995)


In [22]:
#sort value in the dataframe
data=pd.DataFrame.sort_values(data,['userId','itemId'],ascending=[0,1])

numUsers=max(data.userId)
numMovies=max(data.itemId)

moviesPerUser=data.userId.value_counts()
usersPerMovie=data.title.value_counts()


In [73]:
def favoriteMovies(activeUser, N):
    #1 subset the dataframe to have the rows corresponding to the active user
    #2 sort by rating  in descending order
    
    topMovies=pd.DataFrame.sort_values(
        data[data.userId==activeUser], ['rating'],ascending=[0])[:N]
    
    return  list(topMovies.itemId)

print (favoriteMovies(5,3))

[257, 89, 172]


In [75]:
#recommednation now

#neigbour based collaborative filtering model
# K nearest neighbours
# represent user in vector 

#user-item matrix
userItemRatingMatrix=pd.pivot_table(data, values='rating',
                                   index=['userId'], columns=['itemId'])

userItemRatingMatrix.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [43]:
# now each user has been represented using their ratings
# compute similarity between 2 users.
# correlation

from scipy.spatial.distance import correlation
def similarity(user1,user2):
    user1=np.array(user1)-np.nanmean(user1)  #nanmean -> calculating the  mean ignoring the NaN
    user2=np.array(user2)-np.nanmean(user2)
    
    #movie have in common
    commonItemIds = [i for i in range(len(user1))
                                      if user1[i]>0 and user2[i]>0]
    
    if len(commonItemIds)==0:
        return 0
    else:
        user1=np.array([user1[i] for i in commonItemIds])
        user2=np.array([user2[i] for i in commonItemIds])
        return correlation(user1,user2)

In [56]:
#using this similarity function with all the other user, let's  find  the  nearest
#neighbours of the active user
def nearestNeighbourRatings(activeUser, K):
    similarityMatrix=pd.DataFrame(index=userItemRatingMatrix.index,
                                 columns=['Similarity'])
    
    for i in userItemRatingMatrix.index:
        similarityMatrix.loc[i]=similarity(userItemRatingMatrix.loc[activeUser],
                                          userItemRatingMatrix.loc[i])
        #find the similarting and store into the similarity matrix
        
    similarityMatrix=pd.DataFrame.sort_values(similarityMatrix,['Similarity'],ascending=[0])
    #sort in descending order
    
    #Nearest K neighbours
    nearestNeighbours=similarityMatrix[:K]
    
    #now predict
    neighbourItemRatings=userItemRatingMatrix.loc[nearestNeighbours.index]
    
    # a placeholder for the predicted item ratings
    predictItemRating=pd.DataFrame(index=userItemRatingMatrix.columns, columns=['Rating'])
    
    for i in userItemRatingMatrix.columns:
        predictedRating=np.nanmean(userItemRatingMatrix.loc[activeUser])
        
        for j in neighbourItemRatings.index:
            if userItemRatingMatrix.loc[j,i]>0:
                
                predictedRating += (userItemRatingMatrix.loc[j,i]
                                    -np.nanmean(userItemRatingMatrix.loc[j]))*nearestNeighbours.loc[j,'Similarity']
        predictItemRating.loc[i,'Rating']=predictedRating
    return predictItemRating
    
    
def topNRecommendation(activeUser, N):
    predictItemRating=nearestNeighbourRatings(activeUser, 10)
    movieAlreadyWatched=list(userItemRatingMatrix.loc[activeUser].loc[userItemRatingMatrix.loc[activeUser]>0].index)
    predictItemsRating=predictItemRating.drop(movieAlreadyWatched)
    print(predictItemsRating)
    topRecommendations=pd.DataFrame.sort_values(predictItemRating,
                                               ['Rating'],ascending=[0])[:N]
    topRecommendationTitles=(moviedata.loc[moviedata.itemId.isin(topRecommendations.index)])
    return list(topRecommendationTitles.title)


In [77]:
activeUser = 12
print (favoriteMovies(activeUser, 6))
print("TOP recommednation")
print(topNRecommendation(activeUser,3))

[4, 143, 216, 204, 282, 15]
TOP recommednation


  dist = 1.0 - uv / np.sqrt(uu * vv)


         Rating
itemId         
1       4.39216
2       4.39216
3       4.39216
5       4.39216
6       4.39216
7       4.39216
8       4.39216
9       4.39216
10      4.39216
11      4.39216
12      4.39216
13      4.39216
14      4.39216
16      4.39216
17      4.39216
18      4.39216
19      4.39216
20      4.39216
21      4.39216
22      4.39216
23      4.39216
24      4.39216
25      4.39216
26      4.39216
27      4.39216
29      4.39216
30      4.39216
31      4.39216
32      4.39216
33      4.39216
...         ...
1653    4.39216
1654    4.39216
1655    4.39216
1656    4.39216
1657    4.39216
1658    4.39216
1659    4.39216
1660    4.39216
1661    4.39216
1662    4.39216
1663    4.39216
1664    4.39216
1665    4.39216
1666    4.39216
1667    4.39216
1668    4.39216
1669    4.39216
1670    4.39216
1671    4.39216
1672    4.39216
1673    4.39216
1674    4.39216
1675    4.39216
1676    4.39216
1677    4.39216
1678    4.39216
1679    4.39216
1680    4.39216
1681    4.39216
1682    

In [60]:
!jupyter nbconvert --to script recommendations1.ipynb

[NbConvertApp] Converting notebook recommendations1.ipynb to script
[NbConvertApp] Writing 5527 bytes to recommendations1.py
