In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def combined_feature(row):
    return row['genres'] + " " + row['keywords'] + " "+ row['tagline'] + '  ' +row['overview']+ ' ' + row['cast'] 

In [3]:
df  = pd.read_csv('movie_dataset.csv')

In [4]:
features = ['keywords','cast', 'tagline', 'overview', 'genres' , 'director']

In [5]:
for feature in features:
    df[feature] = df[feature].fillna('')

In [6]:
df['combined_feature'] = df.apply(combined_feature,axis = 1)

In [24]:
df['combined_feature']

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai  When ambitious New...
4802    Documentary obsession camcorder crush dream gi...
Name: combined_feature, Length: 4803, dtype: object

In [8]:
cv = CountVectorizer()

In [9]:
count_matrix = cv.fit_transform(df["combined_feature"])

In [10]:
count_matrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
cosine_sim = cosine_similarity(count_matrix)

In [12]:
def get_index_from_title(title):
    return df.index[df.title == title][0]

In [13]:
cosine_sim = cosine_similarity(count_matrix)

In [14]:
cosine_sim

array([[1.        , 0.36779872, 0.23758222, ..., 0.29902518, 0.22063065,
        0.12766554],
       [0.36779872, 1.        , 0.33958109, ..., 0.397351  , 0.29783232,
        0.17625422],
       [0.23758222, 0.33958109, 1.        , ..., 0.24835223, 0.18637483,
        0.16609096],
       ...,
       [0.29902518, 0.397351  , 0.24835223, ..., 1.        , 0.28484065,
        0.17451526],
       [0.22063065, 0.29783232, 0.18637483, ..., 0.28484065, 1.        ,
        0.21850711],
       [0.12766554, 0.17625422, 0.16609096, ..., 0.17451526, 0.21850711,
        1.        ]])

In [15]:
movie_user_likes = "Avatar"

In [16]:
movie_index  = get_index_from_title(movie_user_likes)

In [17]:
similar_movies = list(enumerate(cosine_sim[movie_index]))

In [18]:
similar_movies

[(0, 1.0),
 (1, 0.36779872335409336),
 (2, 0.23758222260935266),
 (3, 0.30618621784789724),
 (4, 0.380794872403089),
 (5, 0.24896764872145558),
 (6, 0.25301215685249495),
 (7, 0.3907775864772995),
 (8, 0.15875015875023815),
 (9, 0.21751571003527428),
 (10, 0.3189075241015354),
 (11, 0.26796653928517),
 (12, 0.16237976320958228),
 (13, 0.29928258167535826),
 (14, 0.3276927682076163),
 (15, 0.36178730264621084),
 (16, 0.33170173600188874),
 (17, 0.2728525781658746),
 (18, 0.3160485832093548),
 (19, 0.3739507015890396),
 (20, 0.24734236946819071),
 (21, 0.27166196497538203),
 (22, 0.27219981986673664),
 (23, 0.2744106499742259),
 (24, 0.26227487442776054),
 (25, 0.30061412551478084),
 (26, 0.3572172541558802),
 (27, 0.4133635575290465),
 (28, 0.2063752063753096),
 (29, 0.32225169331774484),
 (30, 0.27500954910846337),
 (31, 0.34467938625677175),
 (32, 0.34296230297795194),
 (33, 0.25055741429289785),
 (34, 0.15453063444227516),
 (35, 0.3100868364730212),
 (36, 0.38420382304156153),
 (37, 

In [19]:
sorted_similar_movies = sorted(similar_movies,key = lambda x:x[1], reverse = True)

In [20]:
sorted_similar_movies

[(0, 1.0),
 (300, 0.440225453162812),
 (1214, 0.43804480458633743),
 (342, 0.43256281844410943),
 (150, 0.4282250998144681),
 (420, 0.4281744192888376),
 (3159, 0.4264301495131332),
 (111, 0.4254612640817522),
 (549, 0.42500252677639816),
 (85, 0.42275215008732747),
 (3185, 0.4212718577901197),
 (1532, 0.4195731958391368),
 (1985, 0.41796020285059776),
 (59, 0.41604179849863976),
 (1760, 0.41542523394709663),
 (48, 0.4148735928439117),
 (27, 0.4133635575290465),
 (2697, 0.41244501099755615),
 (1295, 0.41164990606253066),
 (311, 0.4112117362436776),
 (1276, 0.41099746826339323),
 (1960, 0.4106181579455729),
 (329, 0.4099561249575909),
 (1915, 0.4089564406759849),
 (1472, 0.4084608644384157),
 (3669, 0.40824829046386296),
 (3193, 0.4079362455581727),
 (847, 0.4073065399812784),
 (461, 0.40575133560034454),
 (3433, 0.4048191726900387),
 (156, 0.40411192956024355),
 (301, 0.40343576522993924),
 (4277, 0.4033736934140122),
 (239, 0.4029954131628585),
 (3900, 0.4027288109977578),
 (3777, 0.4

In [21]:
def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]

In [22]:
i  = 0

In [23]:
for movie in sorted_similar_movies:
    print(get_title_from_index(movie[0]))
    i = i+1
    if( i > 15):
        break

Avatar
Starship Troopers
Aliens vs Predator: Requiem
Men in Black
Men in Black II
Hellboy II: The Golden Army
Alien
Transformers
Sphere
Captain America: The Winter Soldier
The Ice Pirates
Moonraker
The Thief and the Cobbler
2012
The Right Stuff
Jack the Giant Slayer
