In [125]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [126]:
#READ CSV
df=pd.read_csv('movie_dataset.csv')
df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [127]:
#SELECT FEATURES
features=['keywords','cast','genres','director','vote_average','vote_count']


In [128]:
#CREATE A COL IN DF WHICH COMBINES ALL SELECTED FEATURES
for feature in features:
    df[feature] = df[feature].fillna('')
    
def combine_feature(row):
    try:
        return row['keywords']+' '+row['cast']+' '+row['genres']+' '+row['director']
    except:
        print("ERROR: ",row)

df["combined_features"]=df.apply(combine_feature,axis=1)

df["combined_features"].head()

0    culture clash future space war space colony so...
1    ocean drug abuse exotic island east india trad...
2    spy based on novel secret agent sequel mi6 Dan...
3    dc comics crime fighter terrorist secret ident...
4    based on novel mars medallion space travel pri...
Name: combined_features, dtype: object

In [129]:
#CREATE COUNT MATRIX FROM THIS NEW COMBINED COLUMN
cv=CountVectorizer()
count_matrix=cv.fit_transform(df['combined_features'])
count_matrix

<4803x14845 sparse matrix of type '<class 'numpy.int64'>'
	with 97547 stored elements in Compressed Sparse Row format>

In [130]:
#COMPUTER THE COSINE SIMILARITY BASED ON COUNT_MATRIX
cosine_sim=cosine_similarity(count_matrix)
cosine_sim

movie_user_likes="Avatar"

In [131]:
#HELPER FUNCTIONS
def get_index_from_title(title):
    return df[df.title==title]['index'].values[0]
def get_title_from_index(index):
    return df[df.index==index]['title'].values[0]

In [132]:
#GET INDEX OF THIS MOVE FROM ITS TITLE
movie_index=get_index_from_title(movie_user_likes)
similar_movies=cosine_sim[movie_index]
similar_movies=list(enumerate(similar_movies))

In [133]:
#GET A LIST OF SIMILAR MOVIES IN DESCENDING ORDER OF SIMILARITY SCORES
sorted_similar_movies=sorted(similar_movies,key=lambda x:x[1],reverse=True)
sorted_similar_movies

[(0, 1.0000000000000004),
 (94, 0.42339019740572564),
 (2403, 0.3774256780481986),
 (3208, 0.3464101615137755),
 (47, 0.34426518632954817),
 (56, 0.33596842045264647),
 (3158, 0.33333333333333337),
 (2198, 0.31426968052735443),
 (2696, 0.30792014356780045),
 (4401, 0.28867513459481287),
 (1531, 0.2858966759567453),
 (278, 0.2810913475705226),
 (1053, 0.2809003238667948),
 (239, 0.2765204519281134),
 (838, 0.2749859704614352),
 (61, 0.27498597046143514),
 (232, 0.2694301256218254),
 (4332, 0.2694301256218254),
 (661, 0.264197974633739),
 (4593, 0.264197974633739),
 (3730, 0.2592592592592593),
 (1650, 0.2501595914621521),
 (158, 0.24618298195866545),
 (461, 0.24618298195866545),
 (1083, 0.24618298195866542),
 (322, 0.24077170617153845),
 (228, 0.24077170617153842),
 (539, 0.24077170617153842),
 (577, 0.23570226039551587),
 (1990, 0.23570226039551587),
 (1652, 0.23570226039551584),
 (10, 0.23094010767585035),
 (400, 0.23094010767585035),
 (4, 0.23094010767585033),
 (2129, 0.23094010767585

In [134]:
len(sorted_similar_movies)


4803

In [135]:
#PRINT TITLES OF FIRST 50 MOVIE
similar_movie_titles=[]
i=0
for i in range(len(sorted_similar_movies)):
    similar_movie_titles.append(get_title_from_index(sorted_similar_movies[i][0]))
    i=i+1
    if i==50:
        break

In [136]:
similar_movie_titles

['Avatar',
 'Guardians of the Galaxy',
 'Aliens',
 'Star Wars: Clone Wars: Volume 1',
 'Star Trek Into Darkness',
 'Star Trek Beyond',
 'Alien',
 'Lockout',
 'Jason X',
 'The Helix... Loaded',
 'Moonraker',
 'Planet of the Apes',
 'Galaxy Quest',
 'Gravity',
 'Alien³',
 'Jupiter Ascending',
 'The Wolverine',
 'Silent Running',
 'Zathura: A Space Adventure',
 'Trekkies',
 'Cargo',
 'Wing Commander',
 'Star Trek',
 'Lost in Space',
 'Babylon A.D.',
 'The Fifth Element',
 'Oblivion',
 'Titan A.E.',
 'AVP: Alien vs. Predator',
 'The Empire Strikes Back',
 'Dragonball Evolution',
 'Superman Returns',
 'Divergent',
 'John Carter',
 'The Black Hole',
 'The Ice Pirates',
 'Memoirs of an Invisible Man',
 'Starship Troopers',
 "The Astronaut's Wife",
 'Machete Kills',
 'Soldier',
 'The Abyss',
 'Damnation Alley',
 'Men in Black',
 'Space Cowboys',
 'Space Dogs',
 'The Time Machine',
 'Sheena',
 'Captain America: Civil War',
 'Star Trek: Insurrection']