# Collaborative Filtering Recommender Systems

In [2]:
import numpy as np 
import pandas as pd 

In [3]:
data=pd.read_csv('u.data',sep="\t",header=None,names=['userId','itemId','rating','timestamp'])

In [4]:
data.head()

Unnamed: 0,userId,itemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
data.shape

(100000, 4)

In [6]:
movieInfo=pd.read_csv('u.item',sep="|",encoding ="ISO-8859-1",header=None, index_col=False,
                     names=['itemId','title'], usecols=[0,1])
movieInfo.head()

Unnamed: 0,itemId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [7]:
movieInfo.shape

(1682, 2)

In [8]:
data=pd.merge(data,movieInfo,left_on='itemId',right_on="itemId")

In [9]:
data.head()

Unnamed: 0,userId,itemId,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [10]:
toyStoryUsers=data[data.title=="Toy Story (1995)"]
toyStoryUsers

Unnamed: 0,userId,itemId,rating,timestamp,title
3397,308,1,4,887736532,Toy Story (1995)
3398,287,1,5,875334088,Toy Story (1995)
3399,148,1,4,877019411,Toy Story (1995)
3400,280,1,4,891700426,Toy Story (1995)
3401,66,1,3,883601324,Toy Story (1995)
3402,5,1,4,875635748,Toy Story (1995)
3403,109,1,4,880563619,Toy Story (1995)
3404,181,1,3,878962392,Toy Story (1995)
3405,95,1,5,879197329,Toy Story (1995)
3406,268,1,3,875742341,Toy Story (1995)


In [11]:
data=pd.DataFrame.sort_values(data,['userId','itemId'],ascending=[0,1])

In [12]:
data.head()

Unnamed: 0,userId,itemId,rating,timestamp,title
23781,943,2,5,888639953,GoldenEye (1995)
65410,943,9,3,875501960,Dead Man Walking (1995)
35098,943,11,4,888639000,Seven (Se7en) (1995)
43773,943,12,5,888639093,"Usual Suspects, The (1995)"
57040,943,22,4,888639042,Braveheart (1995)


In [13]:
numUsers=max(data.userId)
numMovies=max(data.itemId)

# WE can also see how many movies were rated by each user, and the number of users
# that rated each movie 
moviesPerUser=data.userId.value_counts()
usersPerMovie=data.title.value_counts()

In [14]:
usersPerMovie

Star Wars (1977)                                             583
Contact (1997)                                               509
Fargo (1996)                                                 508
Return of the Jedi (1983)                                    507
Liar Liar (1997)                                             485
English Patient, The (1996)                                  481
Scream (1996)                                                478
Toy Story (1995)                                             452
Air Force One (1997)                                         431
Independence Day (ID4) (1996)                                429
Raiders of the Lost Ark (1981)                               420
Godfather, The (1972)                                        413
Pulp Fiction (1994)                                          394
Twelve Monkeys (1995)                                        392
Silence of the Lambs, The (1991)                             390
Jerry Maguire (1996)     

In [15]:
moviesPerUser

405    737
655    685
13     636
450    540
276    518
416    493
537    490
303    484
234    480
393    448
181    435
279    434
429    414
846    405
7      403
94     400
682    399
308    397
92     388
293    388
222    387
201    386
59     382
435    379
378    375
880    368
417    365
896    362
592    360
796    358
      ... 
418     20
242     20
558     20
685     20
812     20
364     20
300     20
631     20
873     20
809     20
166     20
740     20
36      20
866     20
34      20
143     20
824     20
888     20
926     20
441     20
93      20
732     20
475     20
571     20
596     20
147     20
19      20
572     20
636     20
895     20
Name: userId, Length: 943, dtype: int64

In [16]:
numMovies

1682

In [17]:
numUsers

943

### Let's write a function to find the top N favorite movies of a user

In [18]:
def favoriteMovies(activeUser,N):
    topMovies=pd.DataFrame.sort_values(
        data[data.userId==activeUser],['rating'],ascending=[0])[:N]
    return list(topMovies.title)

print(favoriteMovies(5,5)) 
# Print the top 5 favorite movies of user 5

['Men in Black (1997)', 'Blade Runner (1982)', 'Empire Strikes Back, The (1980)', 'Wrong Trousers, The (1993)', 'Blues Brothers, The (1980)']


1. subset the dataframe to have the rows corresponding to the active user

2. sort by the rating in descending order

3. pick the top N rows

4. return the title corresponding to the movies in topMovies

### Neighbour based collaborative filtering model

The idea behind neighbour based collaborative filtering model is to find the K Nearest neighbours of a user ( the one who are more similar to the user ) and  use their ratings to predict ratings of the active user for movies they haven't rated. 

In [19]:
userItemRatingMatrix=pd.pivot_table(data, values='rating',index=['userId'], columns=['itemId'])

In [20]:
userItemRatingMatrix.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [21]:
userItemRatingMatrix11=pd.pivot_table(data, values='rating',
                                    index=['userId'], columns=['title'])
userItemRatingMatrix11.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


### Compute similarity between 2 users

In [22]:
from scipy.spatial.distance import correlation 
def similarity(user1,user2):
    user1=np.array(user1)-np.nanmean(user1) 
    user2=np.array(user2)-np.nanmean(user2)
    commonItemIds=[i for i in range(len(user1)) if user1[i]>0 and user2[i]>0]
    if len(commonItemIds)==0:
        return 0
    else:
        user1=np.array([user1[i] for i in commonItemIds])
        user2=np.array([user2[i] for i in commonItemIds])
        return correlation(user1,user2)

Now to find the similarity between 2 users. We'll first subset each user to be represented only by the ratings for the movies the 2 users have in common .This is because if we try to find distance between 2 users and there are some Nan values in both vectors then the function we try to use correlation or cosine will fail. 

We need to find movies for which both users have non NaN ratings.If there are no movies in common then return 0.

If there are more than one movie then that both user have rated we will find similairyt between both users using correlation




### Similarity between active user and anyother user and finding nearest neighbour between active user

In [23]:
def nearestNeighbourRatings(activeUser,K):
    similarityMatrix=pd.DataFrame(index=userItemRatingMatrix.index,
                                  columns=['Similarity'])
    
    for i in userItemRatingMatrix.index:
        similarityMatrix.loc[i]=similarity(userItemRatingMatrix.loc[activeUser],
                                          userItemRatingMatrix.loc[i])
       
    similarityMatrix=pd.DataFrame.sort_values(similarityMatrix,
                                              ['Similarity'],ascending=[0])
    nearestNeighbours=similarityMatrix[:K]
    neighbourItemRatings=userItemRatingMatrix.loc[nearestNeighbours.index] 
    predictItemRating=pd.DataFrame(index=userItemRatingMatrix.columns, columns=['Rating'])
    for i in userItemRatingMatrix.columns:
        predictedRating=np.nanmean(userItemRatingMatrix.loc[activeUser])
        for j in neighbourItemRatings.index:
            if userItemRatingMatrix.loc[j,i]>0:
                predictedRating += (userItemRatingMatrix.loc[j,i]
                                    -np.nanmean(userItemRatingMatrix.loc[j]))*nearestNeighbours.loc[j,'Similarity']
        predictItemRating.loc[i,'Rating']=predictedRating
    return predictItemRating

This function will find the K Nearest neighbours of the active user, then use their ratings to predict the activeUsers ratings for other movies.

Creates an empty matrix whose row index is userIds, and the value will be similarity of that user to the active User.
This matrix is currently empty and we fill the values after computing similarity btwn each user and active user Iterate through each user id in the user item ratinf matrix index and compute similarity.

Find the similarity between user i and the active user and add it to the similarityMatrix.

Sort the similarity matrix in the descending order of similarity.

We'll now take the nearest neighbours and use their ratings to predict the active user's rating for every movie.

### Top Recommendation

In [24]:
def topNRecommendations(activeUser,N):
    predictItemRating=nearestNeighbourRatings(activeUser,N)
    # Use the 10 nearest neighbours to find the predicted ratings
    moviesAlreadyWatched=list(userItemRatingMatrix.loc[activeUser]
                              .loc[userItemRatingMatrix.loc[activeUser]>0].index)
    # find the list of items whose ratings which are not NaN
    predictItemRating=predictItemRating.drop(moviesAlreadyWatched)
    topRecommendations=pd.DataFrame.sort_values(predictItemRating,
                                                ['Rating'],ascending=[0])[:N]
    # This will give us the list of itemIds which are the top recommendations 
    # Let's find the corresponding movie titles 
    topRecommendationTitles=(movieInfo.loc[movieInfo.itemId.isin(topRecommendations.index)])
    return list(topRecommendationTitles.title)

In [45]:
activeUser=11
print("Favourite Movies of Particular User")
print(favoriteMovies(5,5))

Favourite Movies of Particular User
['Men in Black (1997)', 'Blade Runner (1982)', 'Empire Strikes Back, The (1980)', 'Wrong Trousers, The (1993)', 'Blues Brothers, The (1980)']


In [44]:
print("Recommended Movies of Particular User")
print(topNRecommendations(5,5))

Recommended Movies of Particular User


  dist = 1.0 - uv / np.sqrt(uu * vv)


['Truth About Cats & Dogs, The (1996)', 'Mirror Has Two Faces, The (1996)', 'Jerry Maguire (1996)', 'Scream (1996)', 'Matilda (1996)']


In [46]:
collab=pd.DataFrame({"Favourite_Movies":favoriteMovies(5,5),"Recommended_Movies":topNRecommendations(5,5)})
collab

  dist = 1.0 - uv / np.sqrt(uu * vv)


Unnamed: 0,Favourite_Movies,Recommended_Movies
0,Men in Black (1997),"Truth About Cats & Dogs, The (1996)"
1,Blade Runner (1982),"Mirror Has Two Faces, The (1996)"
2,"Empire Strikes Back, The (1980)",Jerry Maguire (1996)
3,"Wrong Trousers, The (1993)",Scream (1996)
4,"Blues Brothers, The (1980)",Matilda (1996)


### Breakdown of above code

In [27]:
userItemRatingMatrix.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [28]:
from scipy.spatial.distance import correlation 
def similarity(user1,user2):
    user1=np.array(user1)-np.nanmean(user1) 
    user2=np.array(user2)-np.nanmean(user2)
    commonItemIds=[i for i in range(len(user1)) if user1[i]>0 and user2[i]>0]
    if len(commonItemIds)==0:
        return 0
    else:
        user1=np.array([user1[i] for i in commonItemIds])
        user2=np.array([user2[i] for i in commonItemIds])
        return correlation(user1,user2)
print(similarity(userItemRatingMatrix.loc[5],userItemRatingMatrix.loc[11]))   

1.1348399724926483


The above code shows the correaltion between active user and some random user

### Creating an empty dataframe with user id and similarity values as nan. We can fill this nan in the next step

In [29]:
similarityMatrix=pd.DataFrame(index=userItemRatingMatrix.index,columns=['Similarity'])
similarityMatrix.head()

Unnamed: 0_level_0,Similarity
userId,Unnamed: 1_level_1
1,
2,
3,
4,
5,


#### Here i gave the active user as 5 and it will compute similarity score for everyother user and display it in descending order

In [30]:
for i in userItemRatingMatrix.index:
        similarityMatrix.loc[i]=similarity(userItemRatingMatrix.loc[5],
                                          userItemRatingMatrix.loc[i])
        similarityMatrix=pd.DataFrame.sort_values(similarityMatrix,['Similarity'],ascending=[0])
print(similarityMatrix)

  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
 

  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
 

  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
 

  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
 

  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)


       Similarity
userId           
172             2
509             2
316             2
209             2
607             2
672             2
470             2
630             2
596             2
434             2
186       1.92848
723         1.875
782       1.87039
718        1.8165
717       1.79057
208          1.75
619       1.73485
471       1.72761
30        1.70711
525       1.66667
518        1.6455
120       1.61237
426       1.59216
860       1.57735
731       1.57735
869       1.55902
55        1.55709
285       1.54772
141       1.54006
73        1.53882
...           ...
838           NaN
839           NaN
840           NaN
849           NaN
850           NaN
859           NaN
861           NaN
862           NaN
867           NaN
872           NaN
874           NaN
875           NaN
879           NaN
891           NaN
892           NaN
894           NaN
905           NaN
906           NaN
907           NaN
909           NaN
917           NaN
923           NaN
925       

#### We gave 5 nearest neighbour so it takes into consideration of 3 user id

In [31]:
nearestNeighbours=similarityMatrix[:5]
neighbourItemRatings=userItemRatingMatrix.loc[nearestNeighbours.index]
neighbourItemRatings

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
172,,,,,,,,,,,...,,,,,,,,,,
509,,,,,,,,,,,...,,,,,,,,,,
316,,,,,,,,,4.0,,...,,,,,,,,,,
209,5.0,,,,,,,,3.0,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,


#### Now create empty dataframe with item id and rating as nan. We will fill this shortly

In [32]:
predictItemRating=pd.DataFrame(index=userItemRatingMatrix.columns, columns=['Rating'])
predictItemRating.head()

Unnamed: 0_level_0,Rating
itemId,Unnamed: 1_level_1
1,
2,
3,
4,
5,


#### It will iterate through every column in active user 5 and gives the mean value

In [33]:
for i in userItemRatingMatrix.columns:
        predictedRating=np.nanmean(userItemRatingMatrix.loc[5])
print(predictedRating)

2.874285714285714


In [34]:
userItemRatingMatrix.loc[316,9]

4.0

316 is the user id and 9 is the movie for which the rating is 4.0

In [35]:
np.nanmean(userItemRatingMatrix.loc[316])

3.36

In [40]:
nearestNeighbours.loc[209,'Similarity']

2.0

Locate the similarity of user 209

### Predicted Rating ( PR ) Formula
PR = Mean of active user + (Rating of particular neighbour for a particular movie - Mean of particular user)* similarity between Active user and particular nearest neigbour

In [42]:
predictedRating += (userItemRatingMatrix.loc[209,9]
                                    -np.nanmean(userItemRatingMatrix.loc[209]))*nearestNeighbours.loc[209,'Similarity']

In [43]:
predictedRating

2.510649350649351

The above step will iterate through every user for every movie in this case we choose Movie id 8 and User 209.Later it will give the predicted rating for the particular movie. 

Now we need to use this predicted rating and remove all other watched movies for the active user and then sort out the top rated movies in descending order. In this way we can get the Top recommended movies

This is how the collaborative filter works. Now we will see content based filter

# Content Based Filter

#### It takes the user prefrences like what movies we like and ask us to rate some movies and based on the information given by the user . we will now recommend the movies to the user by 2 methods.

1. Description Based Recommender
2. Genre,Cast and Crew based Recommender

## Description Based Recommender

In [36]:
import pandas as pd
import numpy as np
df = pd.read_csv('metadata_clean.csv')
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995


In [37]:
orig_df = pd.read_csv('movies_metadata.csv', low_memory=False)
df['overview'], df['id'] = orig_df['overview'], orig_df['id']
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862


The goal of inverse document frequency is to account for less frequent words which are more informative and penalise the words which are more frequent and hardly convey information. 

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
df['overview'] = df['overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(df['overview'])
tfidf_matrix.shape

(45466, 75827)

In [6]:
tfidf_matrix

<45466x75827 sparse matrix of type '<class 'numpy.float64'>'
	with 1210882 stored elements in Compressed Sparse Row format>

In [38]:
df11=pd.DataFrame(tfidf_matrix)

In [39]:
df11.head(20)

Unnamed: 0,0
0,"(0, 17764)\t0.13483149538639247\n (0, 4388)..."
1,"(0, 14988)\t0.15233435064749146\n (0, 66484..."
2,"(0, 67150)\t0.09013326169468926\n (0, 30668..."
3,"(0, 8909)\t0.2524772113861553\n (0, 72417)\..."
4,"(0, 35872)\t0.18466136938635244\n (0, 27260..."
5,"(0, 71476)\t0.14820808261852306\n (0, 20957..."
6,"(0, 58416)\t0.2434561295245383\n (0, 9245)\..."
7,"(0, 21922)\t0.13684399233279837\n (0, 55900..."
8,"(0, 9879)\t0.19377340880702595\n (0, 23512)..."
9,"(0, 9112)\t0.22934215378177036\n (0, 55966)..."


In [40]:
df11.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45456,45457,45458,45459,45460,45461,45462,45463,45464,45465
0,"(0, 17764)\t0.13483149538639247\n (0, 4388)...","(0, 14988)\t0.15233435064749146\n (0, 66484...","(0, 67150)\t0.09013326169468926\n (0, 30668...","(0, 8909)\t0.2524772113861553\n (0, 72417)\...","(0, 35872)\t0.18466136938635244\n (0, 27260...","(0, 71476)\t0.14820808261852306\n (0, 20957...","(0, 58416)\t0.2434561295245383\n (0, 9245)\...","(0, 21922)\t0.13684399233279837\n (0, 55900...","(0, 9879)\t0.19377340880702595\n (0, 23512)...","(0, 9112)\t0.22934215378177036\n (0, 55966)...",...,"(0, 15006)\t0.38268724915273156\n (0, 15141...","(0, 6959)\t0.39206064971796467\n (0, 59862)...","(0, 57352)\t0.394738328013569\n (0, 3915)\t...","(0, 10060)\t0.18348394213427321\n (0, 32940...","(0, 70701)\t0.27540372405340346\n (0, 16087...","(0, 56405)\t0.6771266635239638\n (0, 22632)...","(0, 63819)\t0.474848181921328\n (0, 4251)\t...","(0, 6761)\t0.3715972018149557\n (0, 30135)\...","(0, 20273)\t0.1677505411397102\n (0, 15394)...","(0, 16520)\t0.32373307886945113\n (0, 7157)..."


In [7]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [8]:
cosine_sim

array([[1.        , 0.01504121, 0.        , ..., 0.        , 0.00595453,
        0.        ],
       [0.01504121, 1.        , 0.04681953, ..., 0.        , 0.02198641,
        0.00929411],
       [0.        , 0.04681953, 1.        , ..., 0.        , 0.01402548,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.00595453, 0.02198641, 0.01402548, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.00929411, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [9]:
cosine_sim.shape

(45466, 45466)

In [27]:
df1=pd.DataFrame(cosine_sim)
df1.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45456,45457,45458,45459,45460,45461,45462,45463,45464,45465
0,1.0,0.015041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005955,0.0
1,0.015041,1.0,0.04682,0.0,0.0,0.050188,0.0,0.0,0.102532,0.0,...,0.0,0.0,0.0,0.011295,0.0,0.0,0.066873,0.0,0.021986,0.009294
2,0.0,0.04682,1.0,0.0,0.025094,0.0,0.0,0.006402,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014025,0.0
3,0.0,0.0,0.0,1.0,0.0,0.007203,0.0,0.00895,0.0,0.0,...,0.0,0.0,0.0,0.021542,0.0,0.026445,0.0,0.0,0.009522,0.016413
4,0.0,0.0,0.025094,0.0,1.0,0.0,0.030264,0.0,0.032754,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007006,0.0
5,0.0,0.050188,0.0,0.007203,0.0,1.0,0.0,0.0,0.04722,0.0,...,0.0,0.0,0.0,0.0,0.0,0.025456,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.030264,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011525,0.0
7,0.0,0.0,0.006402,0.00895,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.068368,0.0,0.015574,0.0,0.0,0.0,0.0,0.005271,0.0
8,0.0,0.102532,0.0,0.0,0.032754,0.04722,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.009572,0.0,0.0,0.0,0.038268,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.030423,0.0,0.0,0.0,0.0,0.0,0.049855,0.0,0.0,0.0


In [41]:
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [11]:
indices

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Subdue                         45461
Century of Birthing            45462
Betrayal                       45463
Satan Triumphant               45464
Queerama                       45465
Length: 45466, dtype: int64

In [42]:
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

Above function takes in movie title as input and gives recommendations.

1. Obtain the index of the movie that matches the title

2. Get the pairwsie similarity scores of all movies with that movie and convert it into a list of tuples

3. Sort the movies based on the cosine similarity scores.

4. Get the scores of the 10 most similar movies. Ignore the first movie.

5. Get the movie indices

6. Return the top 10 most similar movie


In [43]:
content_recommender('The Lion King')

34682    How the Lion Cub and the Turtle Sang a Song
9353                                The Lion King 1½
9115                  The Lion King 2: Simba's Pride
42829                                           Prey
25654                                 Fearless Fagan
17041                                   African Cats
27933              Massaï, les guerriers de la pluie
6094                                       Born Free
37409                                     Sour Grape
3203                                The Waiting Game
Name: title, dtype: object

### Breakdown of above code

In [12]:
idx = indices["Toy Story"]

In [13]:
idx

0

In [16]:
cosine_sim[idx]

array([1.        , 0.01504121, 0.        , ..., 0.        , 0.00595453,
       0.        ])

In [14]:
sim_scores = list(enumerate(cosine_sim[idx]))

In [15]:
sim_scores

[(0, 0.9999999999999999),
 (1, 0.015041212918177515),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0385175177149662),
 (18, 0.0),
 (19, 0.0),
 (20, 0.009745675666045901),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (33, 0.018626912291190782),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0),
 (42, 0.006348755500557856),
 (43, 0.0),
 (44, 0.0),
 (45, 0.008909503207796014),
 (46, 0.0),
 (47, 0.0),
 (48, 0.0),
 (49, 0.013015526610178735),
 (50, 0.009127791929097795),
 (51, 0.010727589064134381),
 (52, 0.0),
 (53, 0.0),
 (54, 0.020101783171709495),
 (55, 0.0),
 (56, 0.025175693721243005),
 (57, 0.020768247412050172),
 (58, 0.0),
 (59, 0.033310391215322394),
 (60, 0.0),
 (61, 0.0),
 (62, 0.00763890374653748),
 (63

In [17]:
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

In [18]:
sim_scores

[(0, 0.9999999999999999),
 (15348, 0.53190918118087),
 (2997, 0.47198418619249105),
 (10301, 0.2750466635576917),
 (24523, 0.27306238874302075),
 (23843, 0.2354951321335196),
 (29202, 0.2238412542944943),
 (43427, 0.2176875375749772),
 (38476, 0.21595932807689333),
 (42721, 0.20197249497659434),
 (8327, 0.1988246580045149),
 (27206, 0.1823408627736072),
 (1071, 0.18203955618226225),
 (36094, 0.17299294129685333),
 (40261, 0.1704110225532426),
 (1932, 0.16487396614147073),
 (26304, 0.16321235898528033),
 (29369, 0.16229817818703215),
 (11399, 0.15786116759983826),
 (34589, 0.14475771180809027),
 (3057, 0.1440299458736206),
 (21359, 0.14364657307202588),
 (485, 0.13577858845811638),
 (23530, 0.13528652147453651),
 (32086, 0.13359458182229372),
 (17189, 0.13217518196922695),
 (11606, 0.13190212938581286),
 (1032, 0.1311571209760012),
 (3252, 0.13014993756305127),
 (7254, 0.12962359032245058),
 (35245, 0.12842588356096668),
 (2157, 0.1279357389483814),
 (39611, 0.1273596552134107),
 (27662

In [19]:
sim_scores = sim_scores[1:11]
sim_scores

[(15348, 0.53190918118087),
 (2997, 0.47198418619249105),
 (10301, 0.2750466635576917),
 (24523, 0.27306238874302075),
 (23843, 0.2354951321335196),
 (29202, 0.2238412542944943),
 (43427, 0.2176875375749772),
 (38476, 0.21595932807689333),
 (42721, 0.20197249497659434),
 (8327, 0.1988246580045149)]

In [20]:
for i in sim_scores:
      print(i[0])

15348
2997
10301
24523
23843
29202
43427
38476
42721
8327


In [21]:
movie_indices = [i[0] for i in sim_scores]
movie_indices

[15348, 2997, 10301, 24523, 23843, 29202, 43427, 38476, 42721, 8327]

In [22]:
df['title'].iloc[movie_indices]

15348                                     Toy Story 3
2997                                      Toy Story 2
10301                          The 40 Year Old Virgin
24523                                       Small Fry
23843                     Andy Hardy's Blonde Trouble
29202                                      Hot Splash
43427                Andy Kaufman Plays Carnegie Hall
38476    Superstar: The Life and Times of Andy Warhol
42721    Andy Peters: Exclamation Mark Question Point
8327                                        The Champ
Name: title, dtype: object

# Cast,Crew and Genre Based Recommender

In [44]:
cred_df = pd.read_csv('credits.csv')
key_df = pd.read_csv('keywords.csv')

In [45]:
cred_df.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [46]:
key_df.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [12]:
def clean_ids(x):
    try:
        return int(x)
    except:
        return np.nan

In [13]:
df['id'] = df['id'].apply(clean_ids)

df = df[df['id'].notnull()]

In [14]:
df['id'] = df['id'].astype('int')
key_df['id'] = key_df['id'].astype('int')
cred_df['id'] = cred_df['id'].astype('int')

df = df.merge(cred_df, on='id')
df = df.merge(key_df, on='id')

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id,cast,crew,keywords
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [15]:
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(literal_eval)

In [16]:
df.iloc[0]['crew'][0]

{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}

In [17]:
def get_director(x):
    for crew_member in x:
        if crew_member['job'] == 'Director':
            return crew_member['name']
    return np.nan

In [18]:
df['director'] = df['crew'].apply(get_director)

df['director'].head()

0      John Lasseter
1       Joe Johnston
2      Howard Deutch
3    Forest Whitaker
4      Charles Shyer
Name: director, dtype: object

#### Now we take Genre , Director , Top 3 cast ,Keywords and  remove any space between a particular names and then join this whole part. So we use this to find the cosine similarity function

In [19]:
def generate_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        if len(names) > 3:
            names = names[:3]
        return names

    return []

In [20]:
df['cast'] = df['cast'].apply(generate_list)
df['keywords'] = df['keywords'].apply(generate_list)

In [21]:
df['genres'] = df['genres'].apply(lambda x: x[:3])

In [22]:
df[['title', 'cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[animation, comedy, family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[adventure, fantasy, family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[romance, comedy]"
3,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devine]",Forest Whitaker,"[based on novel, interracial relationship, sin...","[comedy, drama, romance]"
4,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short]",Charles Shyer,"[baby, midlife crisis, confidence]",[comedy]


In [23]:
def sanitize(x):
    if isinstance(x, list):
        #Strip spaces and convert to lowercase
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [24]:
for feature in ['cast', 'director', 'genres', 'keywords']:
    df[feature] = df[feature].apply(sanitize)

In [25]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

In [26]:
df['soup'] = df.apply(create_soup, axis=1)

In [27]:
df.iloc[0]['soup']

'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [29]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [34]:
df = df.reset_index()
indices2 = pd.Series(df.index, index=df['title'])

In [37]:
content_recommender('The Lion King', cosine_sim2, df, indices2)

29607                                          Cheburashka
40904                   VeggieTales: Josh and the Big Wall
40913    VeggieTales: Minnesota Cuke and the Search for...
27768                                 The Little Matchgirl
15209             Spiderman: The Ultimate Villain Showdown
16613                            Cirque du Soleil: Varekai
24654                                  The Seventh Brother
29198                                      Superstar Goofy
30244                                              My Love
31179                Pokémon: Arceus and the Jewel of Life
Name: title, dtype: object