In [5]:
import numpy as np 
import pandas as pd
# Load the movie metadata and the user ratings from CSV files in the working directory,
# then print each frame's (rows, columns) as a quick sanity check.
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
print(movies.shape)
print(ratings.shape)

(10329, 3)
(105339, 4)


In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [6]:
# Count missing values per column of the movies table
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

# Content-based recommendation (similarity on genres)

In [7]:
def remove_danda(text):
    """Return `text` with every "|" separator replaced by a single space."""
    pieces = text.split("|")
    return " ".join(pieces)

In [8]:
# Turn the "|"-delimited genre lists into space-separated text, then preview the result
movies["genres"] = movies["genres"].map(remove_danda)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
# Bag-of-words encoder that is fitted on the genre strings in a later cell.
# NOTE(review): the fitted vocabulary has 23 entries; the default tokenisation presumably
# splits hyphenated genres such as "Sci-Fi" into two tokens, and stop_words='english' may
# drop genre words — confirm this is intended.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [11]:
# Learn the vocabulary from the genre strings and encode every movie as a dense
# row of token counts (one column per vocabulary entry)
vector = cv.fit_transform(movies['genres']).toarray()

In [12]:
vector[0]

array([0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0], dtype=int64)

In [13]:
vector.shape

(10329, 23)

In [15]:
len(cv.get_feature_names_out())

23

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
# Cosine similarity between every pair of rows of `vector`. The result is a dense
# square matrix with one row and one column per movie, so memory grows quadratically
# with the number of movies.
similarity = cosine_similarity(vector)

In [23]:
similarity[0]

array([1.        , 0.77459667, 0.31622777, ..., 0.4472136 , 0.        ,
       0.        ])

In [26]:
list(enumerate(similarity[0]))

[(0, 0.9999999999999999),
 (1, 0.7745966692414835),
 (2, 0.3162277660168379),
 (3, 0.25819888974716115),
 (4, 0.4472135954999579),
 (5, 0.0),
 (6, 0.3162277660168379),
 (7, 0.6324555320336758),
 (8, 0.0),
 (9, 0.25819888974716115),
 (10, 0.25819888974716115),
 (11, 0.3162277660168379),
 (12, 0.7745966692414835),
 (13, 0.0),
 (14, 0.25819888974716115),
 (15, 0.0),
 (16, 0.0),
 (17, 0.4472135954999579),
 (18, 0.4472135954999579),
 (19, 0.19999999999999998),
 (20, 0.25819888974716115),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.3162277660168379),
 (27, 0.0),
 (28, 0.36514837167011077),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.25819888974716115),
 (33, 0.3162277660168379),
 (34, 0.0),
 (35, 0.0),
 (36, 0.3162277660168379),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.5163977794943223),
 (42, 0.25819888974716115),
 (43, 0.0),
 (44, 0.0),
 (45, 0.39999999999999997),
 (46, 0.0),
 (47, 0.0),
 (48, 0.25819888974716115),
 (49, 0.3162277660168379),
 (50, 0.6

In [39]:
# Rank every movie by its similarity to row 0, highest score first;
# each item is a (row position, similarity score) pair
sorted(list(enumerate(similarity[0])),reverse=True,key = lambda x: x[1])

[(0, 0.9999999999999999),
 (1815, 0.9999999999999999),
 (2496, 0.9999999999999999),
 (2967, 0.9999999999999999),
 (3166, 0.9999999999999999),
 (3811, 0.9999999999999999),
 (6617, 0.9999999999999999),
 (6997, 0.9999999999999999),
 (7382, 0.9999999999999999),
 (7987, 0.9999999999999999),
 (9215, 0.9999999999999999),
 (9732, 0.9999999999999999),
 (10052, 0.9999999999999999),
 (3379, 0.9128709291752769),
 (6718, 0.9128709291752769),
 (7091, 0.9128709291752769),
 (7324, 0.9128709291752769),
 (8599, 0.9128709291752769),
 (8606, 0.9128709291752769),
 (8878, 0.9128709291752769),
 (9896, 0.9128709291752769),
 (1436, 0.8944271909999159),
 (1595, 0.8944271909999159),
 (1675, 0.8944271909999159),
 (1697, 0.8944271909999159),
 (1866, 0.8944271909999159),
 (2696, 0.8944271909999159),
 (3420, 0.8944271909999159),
 (3535, 0.8944271909999159),
 (3882, 0.8944271909999159),
 (4032, 0.8944271909999159),
 (4175, 0.8944271909999159),
 (4314, 0.8944271909999159),
 (4704, 0.8944271909999159),
 (4770, 0.894427

In [40]:
# Same ranking, keeping entries 1..5: entry 0 is skipped because for row 0 it is
# the movie itself (see the previous output)
sorted(list(enumerate(similarity[0])),reverse=True,key = lambda x: x[1])[1:6]

[(1815, 0.9999999999999999),
 (2496, 0.9999999999999999),
 (2967, 0.9999999999999999),
 (3166, 0.9999999999999999),
 (3811, 0.9999999999999999)]

In [24]:
len(similarity[0])

10329

In [18]:
similarity.shape

(10329, 10329)

In [22]:
# Index label of the first row whose title matches exactly
movies[movies['title'] == 'Toy Story (1995)'].index[0]

0

In [42]:
def recommend(movie, top_n=5):
    """Print the titles of the movies whose genre vectors are most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title as stored in ``movies['title']``.
    top_n : int, default 5
        Number of titles to print.

    Returns
    -------
    None
        The titles are printed, one per line, most similar first.

    Raises
    ------
    IndexError
        If no row of ``movies`` has this title.
    """
    # Use the row *position* rather than the index label: `similarity` is a plain
    # array addressed by position, and the two only coincide for a default index.
    matches = np.flatnonzero((movies['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in movies['title']")
    index = int(matches[0])

    scores = np.asarray(similarity[index])
    # Stable descending order: tied scores keep table order, exactly like
    # sorted(..., reverse=True) on the enumerated scores.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the queried movie by position instead of assuming it sorts first:
    # movies with identical genres tie at the top score, so an earlier row can
    # outrank the query and the query would otherwise be recommended to itself.
    ranked = ranked[ranked != index][:top_n]
    for position in ranked:
        print(movies['title'].iloc[position])

In [29]:
movies.iloc[[0]]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy


In [30]:
movies.iloc[0]

movieId                                              1
title                                 Toy Story (1995)
genres     Adventure Animation Children Comedy Fantasy
Name: 0, dtype: object

In [37]:
movies.iloc[0,:]

movieId                                              1
title                                 Toy Story (1995)
genres     Adventure Animation Children Comedy Fantasy
Name: 0, dtype: object

In [43]:
recommend('Toy Story (1995)')

Antz (1998)
Toy Story 2 (1999)
Adventures of Rocky and Bullwinkle, The (2000)
Emperor's New Groove, The (2000)
Monsters, Inc. (2001)


# Collaborative Filtering

In [45]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [46]:
# Number of ratings submitted by each user, most active users first
ratings['userId'].value_counts()

668    5678
575    2837
458    2086
232    1421
310    1287
       ... 
58       20
51       20
288      20
388      20
257      20
Name: userId, Length: 668, dtype: int64

In [48]:
# Flag the users who have rated more than 200 movies
# (boolean Series indexed by userId; True means the user passes the threshold)
x = ratings['userId'].value_counts() > 200

In [49]:
# How many users pass the threshold
x[x].shape

(132,)

In [51]:
# userIds of the users that pass the threshold
y= x[x].index
y

Int64Index([668, 575, 458, 232, 310, 475, 128, 224, 607,  63,
            ...
            574, 150,  44, 528, 601, 628, 578, 580,  60,  22],
           dtype='int64', length=132)

In [52]:
# Keep only the ratings made by the selected users.
# NOTE: this rebinds `ratings`, so the unfiltered frame is no longer available
# after this cell without re-reading the CSV.
ratings = ratings[ratings['userId'].isin(y)]
ratings.shape

(68646, 4)

In [53]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
1640,22,3,2.5,1111573615
1641,22,5,2.5,1111573645
1642,22,25,3.5,1111573597
1643,22,32,2.5,1111575931
1644,22,36,4.5,1111573574


In [54]:
# Attach each movie's title and genres to its ratings by joining on movieId
ratings_with_movies = ratings.merge(movies, on = "movieId")
print(ratings_with_movies.shape)
ratings_with_movies.head()

(68646, 6)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,22,3,2.5,1111573615,Grumpier Old Men (1995),Comedy Romance
1,32,3,4.0,904093241,Grumpier Old Men (1995),Comedy Romance
2,62,3,1.0,1267564348,Grumpier Old Men (1995),Comedy Romance
3,110,3,4.5,1077955601,Grumpier Old Men (1995),Comedy Romance
4,113,3,3.0,1209272944,Grumpier Old Men (1995),Comedy Romance


In [55]:
# Count how many ratings each title received from the retained users;
# reset_index() turns the grouped result back into a two-column frame
number_rating = ratings_with_movies.groupby('title')['rating'].count().reset_index()
number_rating.head()

Unnamed: 0,title,rating
0,'71 (2014),1
1,'Round Midnight (1986),1
2,'Til There Was You (1997),2
3,"'burbs, The (1989)",14
4,'night Mother (1986),1


In [57]:
# Give the count column a descriptive name, then preview the frame
number_rating = number_rating.rename(columns={'rating': 'num_of_rating'})
number_rating.head()

Unnamed: 0,title,num_of_rating
0,'71 (2014),1
1,'Round Midnight (1986),1
2,'Til There Was You (1997),2
3,"'burbs, The (1989)",14
4,'night Mother (1986),1


In [58]:
# Add the per-title rating count to every rating row by joining on title
final_rating = ratings_with_movies.merge(number_rating, on='title')
print(final_rating.shape)
final_rating.head()

(68646, 7)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres,num_of_rating
0,22,3,2.5,1111573615,Grumpier Old Men (1995),Comedy Romance,21
1,32,3,4.0,904093241,Grumpier Old Men (1995),Comedy Romance,21
2,62,3,1.0,1267564348,Grumpier Old Men (1995),Comedy Romance,21
3,110,3,4.5,1077955601,Grumpier Old Men (1995),Comedy Romance,21
4,113,3,3.0,1209272944,Grumpier Old Men (1995),Comedy Romance,21


In [59]:
# Keep only the movies that received at least 50 ratings

final_rating = final_rating[final_rating['num_of_rating'] >= 50]
# Random preview: no random_state is passed, so the rows shown differ between runs
final_rating.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,num_of_rating
16785,399,589,5.0,945152969,Terminator 2: Judgment Day (1991),Action Sci-Fi,99
35306,403,1573,3.5,1222682314,Face/Off (1997),Action Crime Drama Thriller,63
19032,583,1370,5.0,978506138,Die Hard 2 (1990),Action Adventure Thriller,61
19010,402,1370,2.5,1172356511,Die Hard 2 (1990),Action Adventure Thriller,61
11831,63,1610,4.0,1299619302,"Hunt for Red October, The (1990)",Action Adventure Thriller,75


In [63]:
# Build a title x userId table of ratings (one row per movie, one column per user);
# cells for movies a user has not rated are NaN
movie_pivot = final_rating.pivot_table(columns='userId', index='title', values= 'rating')
movie_pivot

userId,22,24,29,32,38,44,54,60,62,63,...,607,615,622,627,628,650,659,665,666,668
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001: A Space Odyssey (1968),,,4.0,4.0,2.5,,,,,,...,5.0,5.0,,,5.0,,,,,3.0
"Abyss, The (1989)",,4.0,4.0,,,,,,,,...,5.0,,,3.5,,,,3.5,4.0,3.0
Ace Ventura: Pet Detective (1994),,,3.0,,,,2.5,,0.5,3.5,...,3.0,,,3.0,,5.0,,3.0,,
Air Force One (1997),,,,4.0,,,3.0,,,4.0,...,3.0,,,2.5,,5.0,4.0,,,2.0
Airplane! (1980),,,,,,,,3.0,2.5,3.5,...,3.0,,,4.0,,,,,,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Who Framed Roger Rabbit? (1988),,,,,,,,,2.0,3.5,...,5.0,4.0,,4.0,3.5,5.0,,,3.0,2.0
Willy Wonka & the Chocolate Factory (1971),,,,4.0,3.5,,,4.0,1.5,,...,3.0,,,,4.0,5.0,4.5,4.0,,
"Wizard of Oz, The (1939)",2.0,,,5.0,3.5,,,,5.0,,...,3.0,4.0,,,,5.0,,,3.0,5.0
X-Men (2000),1.5,,,,3.5,,3.0,,3.0,3.5,...,5.0,,,4.0,3.0,,3.5,,4.0,3.0


In [64]:
movie_pivot.shape

(191, 132)

In [65]:
 # Replace every missing rating with 0, in place, so the table is fully numeric
 movie_pivot.fillna(0, inplace=True)

In [66]:
# Convert the pivot table into SciPy's compressed sparse row (CSR) format
from scipy.sparse import csr_matrix
movie_sparse = csr_matrix(movie_pivot)
type(movie_sparse)

scipy.sparse._csr.csr_matrix

In [67]:
# Import scikit-learn's NearestNeighbors: an unsupervised nearest-neighbour search
# estimator (not a clustering algorithm). algorithm='brute' selects brute-force search.
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm= 'brute')

In [69]:
model.fit(movie_sparse)

In [70]:
# Ask for the 6 rows nearest to row 55 of the pivot table. The row is reshaped to
# (1, n_users) because kneighbors takes a 2-D batch of queries; the outputs below
# show the query row itself coming back first at distance 0.
distance, suggestion = model.kneighbors(movie_pivot.iloc[55,:].values.reshape(1,-1), n_neighbors=6 )

In [71]:
distance

array([[ 0.        , 24.58149711, 25.53429067, 25.77304794, 25.98076211,
        26.11034278]])

In [72]:
suggestion

array([[ 55, 115, 153,  23, 155,  41]], dtype=int64)

In [73]:
# Show the titles for each row of neighbour positions returned by kneighbors
for neighbour_positions in suggestion:
    print(movie_pivot.index[neighbour_positions])

Index(['Donnie Darko (2001)', 'Minority Report (2002)',
       'South Park: Bigger, Longer and Uncut (1999)', 'Batman Begins (2005)',
       'Spider-Man (2002)', 'Cast Away (2000)'],
      dtype='object', name='title')


In [74]:
movie_pivot.index[55]

'Donnie Darko (2001)'

In [77]:
# Keep the movie titles (the pivot table's index) so a title can be looked up by position
movie_names = movie_pivot.index
movie_names[55]

'Donnie Darko (2001)'

In [79]:
# Position of a title within the pivot table's index (first match)
np.where(movie_pivot.index == 'Donnie Darko (2001)')[0][0]

55

In [83]:
def recommend_movie(movie_name, n_suggestions=5):
    """Print the movies whose user-rating profiles are nearest to `movie_name`.

    Parameters
    ----------
    movie_name : str
        Exact title as it appears in ``movie_pivot.index``.
    n_suggestions : int, default 5
        Maximum number of suggested titles to print.

    Returns
    -------
    None
        A header naming the searched movie is printed, followed by the
        suggested titles, nearest first.

    Raises
    ------
    IndexError
        If `movie_name` is not in ``movie_pivot.index``.
    """
    positions = np.flatnonzero(movie_pivot.index == movie_name)
    if positions.size == 0:
        raise IndexError(f"movie {movie_name!r} not found in movie_pivot.index")
    movie_id = int(positions[0])

    # kneighbors takes a 2-D batch of queries, hence the reshape to (1, n_users).
    query = movie_pivot.iloc[movie_id, :].values.reshape(1, -1)
    # Ask for one extra neighbour because the query row is itself part of the
    # fitted data; never ask for more neighbours than there are rows.
    n_neighbors = min(n_suggestions + 1, len(movie_pivot))
    _, suggestion = model.kneighbors(query, n_neighbors=n_neighbors)

    # Remove the query by position rather than relying on it being returned
    # first: rows at the same distance may come back in any order.
    neighbours = [pos for pos in suggestion[0] if pos != movie_id][:n_suggestions]

    # Always print the header first, regardless of where (or whether) the
    # searched movie appeared in the neighbour list.
    print(f"You searched '{movie_name}'\n")
    print("The suggestion movies are: \n")
    for pos in neighbours:
        print(movie_pivot.index[pos])

In [84]:
movie_name = "Donnie Darko (2001)"
recommend_movie(movie_name)

You searched 'Donnie Darko (2001)'

The suggestion movies are: 

Minority Report (2002)
South Park: Bigger, Longer and Uncut (1999)
Batman Begins (2005)
Spider-Man (2002)
Cast Away (2000)
