## Loading Dependencies

In [1]:
#Reading all the required libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

data = pd.read_csv("movie_dataset.csv")

In [3]:
data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [4]:
#Checking the shape of the dataset
data.shape

(4803, 24)

In [5]:
#Checking for the null values
data.isnull().sum()

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64

In [6]:
data = data[~data['release_date'].isnull()]

In [7]:
data["release_year"] = data["release_date"].apply(lambda x : int(x.split("-")[0]))

In [8]:
def get_title_from_index(data , index):
    return data[data.index == index]["title"].values[0]

def get_index_from_title(data , title):
    return data[data.title == title]["index"].values[0]

def get_year_from_index(data , index):
    return data[data.index == index]["release_year"].values[0]

def get_year_from_title(data , title):
    return data[data.title == title]["release_year"].values[0]

In [9]:
features = ['cast','genres','director']

for feature in features:
    data[feature] = data[feature].fillna('')

In [10]:
def combine_features(row):
    try:
        return row["cast"]+" "+row["genres"]+" "+row["director"]
    except:
        pass

data["combined_features"] = data.apply(combine_features,axis=1)

In [11]:
cv = CountVectorizer()

count_matrix = cv.fit_transform(data["combined_features"])

In [12]:
count_matrix.toarray().shape

(4802, 10959)

In [13]:
%time cosine_sim = cosine_similarity(count_matrix) 

Wall time: 1.09 s


In [14]:
movie_user_likes = "Interstellar"

movie_index = get_index_from_title(data , movie_user_likes)

similar_movies =  list(enumerate(cosine_sim[movie_index]))

In [15]:
similar_movies

[(0, 0.18190171877724973),
 (1, 0.06063390625908324),
 (2, 0.06454972243679027),
 (3, 0.4375),
 (4, 0.18190171877724973),
 (5, 0.0625),
 (6, 0.0),
 (7, 0.1720618004029213),
 (8, 0.12909944487358055),
 (9, 0.12909944487358055),
 (10, 0.18190171877724973),
 (11, 0.0625),
 (12, 0.0625),
 (13, 0.06063390625908324),
 (14, 0.24253562503633297),
 (15, 0.06454972243679027),
 (16, 0.1720618004029213),
 (17, 0.0625),
 (18, 0.18190171877724973),
 (19, 0.06454972243679027),
 (20, 0.06454972243679027),
 (21, 0.06454972243679027),
 (22, 0.0668153104781061),
 (23, 0.06454972243679027),
 (24, 0.12909944487358055),
 (25, 0.06454972243679027),
 (26, 0.1720618004029213),
 (27, 0.18190171877724973),
 (28, 0.1767766952966369),
 (29, 0.06454972243679027),
 (30, 0.06454972243679027),
 (31, 0.18190171877724973),
 (32, 0.1875),
 (33, 0.18190171877724973),
 (34, 0.0),
 (35, 0.25),
 (36, 0.25),
 (37, 0.06454972243679027),
 (38, 0.06454972243679027),
 (39, 0.1875),
 (40, 0.16770509831248423),
 (41, 0.181901718777