In [29]:
import pandas as pd
import numpy as np
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise  import cosine_similarity

# Data Collection and Pre-Processing

In [30]:
data = pd.read_csv("movies.csv")
data.shape

(4803, 24)

In [31]:
data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [32]:
features = ["genres", "keywords", "tagline", "cast", "director"]

for feature in features:
    data[feature] = data[feature].fillna('')

combined_features = data['genres']+' '+data['keywords']+' '+data['tagline']+' '+data['cast']+' '+data['director']

In [33]:
combined_features

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object

In [34]:
vectorizer = TfidfVectorizer()

feature_vectors = vectorizer.fit_transform(combined_features)


# Cosine Similarity

In [35]:
similarity = cosine_similarity(feature_vectors)
similarity

array([[1.        , 0.07219487, 0.037733  , ..., 0.        , 0.        ,
        0.        ],
       [0.07219487, 1.        , 0.03281499, ..., 0.03575545, 0.        ,
        0.        ],
       [0.037733  , 0.03281499, 1.        , ..., 0.        , 0.05389661,
        0.        ],
       ...,
       [0.        , 0.03575545, 0.        , ..., 1.        , 0.        ,
        0.02651502],
       [0.        , 0.        , 0.05389661, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.02651502, 0.        ,
        1.        ]])

In [36]:
movie_name = input('Enter your favourite movie: ')

In [37]:
list_of_all_movies = data['title'].to_list()
find_close_match = difflib.get_close_matches(movie_name, list_of_all_movies)
print(find_close_match)

['The Conjuring', 'The Conjuring 2']


In [38]:
close_match = find_close_match[0]
print(close_match)

The Conjuring


In [39]:
index_of_movie = data[data.title == close_match]['index'].values[0]
index_of_movie


2096

In [40]:
similarity_score = list(enumerate(similarity[index_of_movie]))
len(similarity_score)

4803

In [41]:
# sorting the movies based on their similarity score

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 
print(sorted_similar_movies)

[(2096, 1.0), (1160, 0.3490575448788334), (3270, 0.21622445704564247), (4178, 0.18182048392695896), (1590, 0.1672842780870894), (3583, 0.15891062169318193), (776, 0.1569923804754999), (451, 0.15274943900571658), (4415, 0.15240489508130894), (2570, 0.1494639865389323), (2004, 0.14417271985209257), (3163, 0.14321758433133192), (2904, 0.1424042058101468), (310, 0.14090782675789132), (3704, 0.14072108327000096), (1745, 0.14066788229516902), (1565, 0.1405625852982368), (1291, 0.1377628595857425), (4707, 0.13628481472776485), (3937, 0.13575183131022264), (1394, 0.13418099808952372), (1901, 0.13180421080567384), (4684, 0.12814601769065195), (4717, 0.12453870708289087), (3629, 0.12321950430105154), (2539, 0.12269686702549287), (2493, 0.12234242645271273), (1560, 0.12222545051751653), (2302, 0.1214808178094977), (841, 0.1207222493527451), (4076, 0.12002714845425365), (1158, 0.11974841141179908), (4224, 0.11895760065507043), (979, 0.11891833145260822), (1452, 0.11888565910746637), (1543, 0.11864

In [42]:
print("Movies suggested for you: \n")

i = 1
for movie in sorted_similar_movies:
    index = movie[0]
    title_from_index = data[data.index == index]['title'].values[0]
    if (i<30):
        print(i, '.',title_from_index)
        i+=1


Movies suggested for you: 

1 . The Conjuring
2 . The Conjuring 2
3 . Howards End
4 . Higher Ground
5 . Atonement
6 . Our Idiot Brother
7 . Winter's Tale
8 . The Haunting
9 . Snow White: A Deadly Summer
10 . Ramona and Beezus
11 . Marvin's Room
12 . Detention
13 . Heaven is for Real
14 . In the Heart of the Sea
15 . Salvador
16 . My Sister's Keeper
17 . Up in the Air
18 . The Time Traveler's Wife
19 . When the Lights Went Out
20 . Happiness
21 . Donnie Brasco
22 . The Greatest Game Ever Played
23 . American Beast
24 . Your Sister's Sister
25 . Bottle Shock
26 . Snitch
27 . The Immigrant
28 . 27 Dresses
29 . Won't Back Down


In [None]:
import streamlit as st

movie = st.text_area("Enter a movie")