# IMPORTING DEPENDENCIES

In [18]:
import numpy as np
import pandas as pd
import difflib #
from sklearn.feature_extraction.text import TfidfVectorizer# convert textual data to numeric values (Feature Engineering)
from sklearn.metrics.pairwise import cosine_similarity   # Similarity score for movies for recommending

# Data collection and Pre-Processing

In [6]:
# Load the movie metadata table from disk and preview its first rows
csv_path = 'movies.csv'
movies_data = pd.read_csv(csv_path)
movies_data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [7]:
# Dimensions of the loaded table as (rows, columns)
movies_data.shape

(4803, 24)

In [8]:
# FEATURE SELECTION
# Text columns whose content is used to describe a movie when computing recommendations.
selected_features = [
    'genres',
    'keywords',
    'tagline',
    'cast',
    'director',
]

In [11]:
# Replace missing values in the selected text columns with empty strings,
# so the columns can be concatenated as strings without producing NaN.
movies_data[selected_features] = movies_data[selected_features].fillna('')

In [14]:
# COMBINED DATA
# Build one space-separated text per movie by appending the columns left to right.
feature_order = ['director', 'genres', 'keywords', 'tagline', 'cast']
combined_features = movies_data[feature_order[0]]
for column in feature_order[1:]:
    combined_features = combined_features + ' ' + movies_data[column]

In [15]:
# Preview the per-movie combined text that is vectorized in a later cell
combined_features

0       James Cameron Action Adventure Fantasy Science...
1       Gore Verbinski Adventure Fantasy Action ocean ...
2       Sam Mendes Action Adventure Crime spy based on...
3       Christopher Nolan Action Crime Drama Thriller ...
4       Andrew Stanton Action Adventure Science Fictio...
                              ...                        
4798    Robert Rodriguez Action Crime Thriller united ...
4799    Edward Burns Comedy Romance  A newlywed couple...
4800    Scott Smith Comedy Drama Romance TV Movie date...
4801    Daniel Hsia   A New Yorker in Shanghai Daniel ...
4802    Brian Herzlinger Documentary obsession camcord...
Length: 4803, dtype: object

In [46]:
# Convert the data to feature vectors (numbers)
# The TfidfVectorizer is created with its default settings here; it is fitted
# on the combined text in a later cell.
vectorizer = TfidfVectorizer()
vectorizer

In [21]:
# Fit the vectorizer on the combined text and encode every movie as one TF-IDF row
feature_vectors = vectorizer.fit_transform(combined_features)

In [22]:
# Show the sparse-matrix summary (one row per movie, one column per vocabulary term)
feature_vectors

<4803x17318 sparse matrix of type '<class 'numpy.float64'>'
	with 124266 stored elements in Compressed Sparse Row format>

## COSINE SIMILARITY

In [23]:
# Getting similarity score using cosine similarity
# With a single argument, every row of feature_vectors is compared against
# every row of the same matrix, so the result is square.
similarity = cosine_similarity(feature_vectors)

In [24]:
# Inspect the pairwise similarity matrix
similarity

array([[1.        , 0.07219487, 0.037733  , ..., 0.        , 0.        ,
        0.        ],
       [0.07219487, 1.        , 0.03281499, ..., 0.03575545, 0.        ,
        0.        ],
       [0.037733  , 0.03281499, 1.        , ..., 0.        , 0.05389661,
        0.        ],
       ...,
       [0.        , 0.03575545, 0.        , ..., 1.        , 0.        ,
        0.02651502],
       [0.        , 0.        , 0.05389661, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.02651502, 0.        ,
        1.        ]])

In [25]:
similarity.shape   # (n_movies, n_movies): entry [i, j] is the similarity between movie i and movie j

(4803, 4803)

### Each movie is compared with every other movie, giving one similarity score per pair

# USER INPUT

In [64]:
# Ask the user for a title; the raw text is fuzzy-matched against the dataset titles in a later cell
movies_name = input('Enter your favourite Movie name: ')

Enter your favourite Movie name: Poseidon


In [65]:
# Collect every title from the dataset into a plain Python list,
# which is the candidate pool for fuzzy matching.
title_column = movies_data['title']
list_movies = list(title_column)
list_movies

['Avatar',
 "Pirates of the Caribbean: At World's End",
 'Spectre',
 'The Dark Knight Rises',
 'John Carter',
 'Spider-Man 3',
 'Tangled',
 'Avengers: Age of Ultron',
 'Harry Potter and the Half-Blood Prince',
 'Batman v Superman: Dawn of Justice',
 'Superman Returns',
 'Quantum of Solace',
 "Pirates of the Caribbean: Dead Man's Chest",
 'The Lone Ranger',
 'Man of Steel',
 'The Chronicles of Narnia: Prince Caspian',
 'The Avengers',
 'Pirates of the Caribbean: On Stranger Tides',
 'Men in Black 3',
 'The Hobbit: The Battle of the Five Armies',
 'The Amazing Spider-Man',
 'Robin Hood',
 'The Hobbit: The Desolation of Smaug',
 'The Golden Compass',
 'King Kong',
 'Titanic',
 'Captain America: Civil War',
 'Battleship',
 'Jurassic World',
 'Skyfall',
 'Spider-Man 2',
 'Iron Man 3',
 'Alice in Wonderland',
 'X-Men: The Last Stand',
 'Monsters University',
 'Transformers: Revenge of the Fallen',
 'Transformers: Age of Extinction',
 'Oz: The Great and Powerful',
 'The Amazing Spider-Man 2',

In [66]:
# Find the closest match for the movie name
# difflib.get_close_matches returns at most 3 candidates by default (n=3, cutoff=0.6),
# ordered best match first; the list is empty when no title is similar enough.

close_matches= difflib.get_close_matches(movies_name,list_movies)

In [67]:
# Candidate titles, ordered from most to least similar to the user's input
close_matches

['Poseidon', 'The Possession']

In [68]:
# Pick the best-ranked candidate title.
# get_close_matches returns an empty list when nothing in the dataset is similar
# enough to the input; fail with an explicit message instead of a bare IndexError.
if not close_matches:
    raise ValueError(
        f"No movie title in the dataset is close to {movies_name!r}; "
        "check the spelling and try again."
    )
close_match = close_matches[0]
close_match

'Poseidon'

#### A close title match only identifies the movie; to recommend more titles, we rank all movies by their similarity to it

In [69]:
# FINDING THE ROW POSITION OF THE MOVIE USING ITS TITLE
# The rows of `similarity` follow the row order of `movies_data`, so the lookup
# must produce a positional offset. The CSV's own 'index' column is deliberately
# not used: nothing keeps it aligned with row order if the frame is ever
# filtered or re-sorted. When several rows share the title, the first one is used.
title_matches = (movies_data['title'] == close_match).to_numpy()
movie_index = np.flatnonzero(title_matches)[0]
movie_index

104

In [70]:
# Pair every movie's row position with its similarity to the chosen movie
similarity_score = [
    (position, score)
    for position, score in enumerate(similarity[movie_index])
]

In [71]:
# One (row position, similarity score) pair per movie
similarity_score

[(0, 0.022992680220692015),
 (1, 0.03462715696134415),
 (2, 0.014964564742040993),
 (3, 0.017241547143084804),
 (4, 0.014242311968792987),
 (5, 0.023818616580633587),
 (6, 0.025179128556487792),
 (7, 0.031480756694674546),
 (8, 0.017229040056982482),
 (9, 0.014423789881247102),
 (10, 0.01645120723710284),
 (11, 0.01822399206889758),
 (12, 0.014934187331428928),
 (13, 0.013483496510633207),
 (14, 0.014530981482126554),
 (15, 0.0237510440880424),
 (16, 0.029850449209305727),
 (17, 0.014286283909590443),
 (18, 0.029888087461742135),
 (19, 0.03495975719520473),
 (20, 0.068733840012767),
 (21, 0.03339787026039325),
 (22, 0.030708996688699225),
 (23, 0.0071895907166423766),
 (24, 0.024749433369566087),
 (25, 0.009192125066407326),
 (26, 0.01523705188328164),
 (27, 0.01868768002515106),
 (28, 0.02051686734010145),
 (29, 0.021463698221797937),
 (30, 0.03160522595853556),
 (31, 0.014373951677301183),
 (32, 0.00769238101596817),
 (33, 0.020578059955052644),
 (34, 0.0),
 (35, 0.04169587194451399)

### The list above pairs each movie's index with its similarity score relative to the user's movie

In [72]:
# Number of (position, score) pairs: one per movie in the dataset
len(similarity_score)

4803

In [73]:
# Rank the (position, score) pairs from most to least similar.
def _score_of(pair):
    """Return the similarity score, i.e. the second element of a (position, score) pair."""
    return pair[1]


sorted_movies = sorted(similarity_score, key=_score_of, reverse=True)
sorted_movies

[(104, 1.0000000000000004),
 (212, 0.19683934765369426),
 (716, 0.16711706237024263),
 (1652, 0.1503603735950642),
 (3766, 0.1436969556788928),
 (2274, 0.14091597766374997),
 (387, 0.13898397559939696),
 (995, 0.13348451873880893),
 (852, 0.1328201765089543),
 (3473, 0.12949803955467973),
 (2179, 0.123795440026059),
 (3217, 0.12286556235710332),
 (214, 0.11695537135966304),
 (2001, 0.11156161206643456),
 (296, 0.11084371971521091),
 (746, 0.110120962993967),
 (4467, 0.10975209444196016),
 (929, 0.10957666375392931),
 (1365, 0.10920375440342735),
 (2860, 0.10723600531194662),
 (625, 0.10310974592865607),
 (3688, 0.1016760994839742),
 (162, 0.10131139967395637),
 (145, 0.10123903725162921),
 (1761, 0.09871198135341619),
 (448, 0.09835670673566718),
 (3779, 0.09828199310948924),
 (2476, 0.0971156104890837),
 (1918, 0.09552230845176835),
 (1657, 0.09481098508469457),
 (1798, 0.09419639364636667),
 (3896, 0.09323802404554556),
 (976, 0.09301901230688615),
 (834, 0.091926868800216),
 (102, 0

In [74]:
# PRINT THE NAMES OF THE MOST SIMILAR MOVIES
# Only the ten best-ranked entries are reported, so the ranked list is sliced
# up front instead of scanning the whole table once for every movie in the dataset.
# Titles are fetched by row position (.iloc) because the first element of each
# pair is a row offset into the similarity matrix, not an index label.
TOP_N = 10

print('You have watched ---> ',movies_name,'\n')
print("Here are the moveis recommended for you: \n")

for rank, (position, _score) in enumerate(sorted_movies[:TOP_N], start=1):
    title_from_index = movies_data['title'].iloc[position]
    print(rank,'-',title_from_index)
    

You have watched --->  Poseidon 

Here are the moveis recommended for you: 

1 - Poseidon
2 - The Day After Tomorrow
3 - Ladder 49
4 - Dragonball Evolution
5 - Four Rooms
6 - Survivor
7 - Air Force One
8 - Beautiful Creatures
9 - The Phantom of the Opera
10 - 200 Cigarettes
