## Import the Dependencies

In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


## Data Collection and Pre-Processing

In [2]:
# Read the movie catalogue from the CSV file into a DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.

movies_csv_path = './movies.csv'
movies_data = pd.read_csv(movies_csv_path, low_memory=False)

In [3]:
# Preview the last five rows to check which columns were loaded and how they are populated
movies_data.tail()

Unnamed: 0,index,genres,keywords,tagline,title,cast,director
6997,6997,Thriller,Add a Plot,,Heeriye,"Shatrughan Sinha, Reena Roy, Ajit Khan, Premna...",Subhash Ghai
6998,6998,"Drama, Musical, Romance",A renowned music teacher mentors a promising y...,,Sur: The Melody of Life,"Lucky Ali, Simone Singh, Achint Kaur, Ehsan Khan",Tanuja Chandra
6999,6999,"Musical, Romance",When a ballroom dancer's shot at a crucial tou...,,Time to Dance,"Sooraj Pancholi, Isabelle Kaif, Waluscha D'Sou...",Stanley D'Costa
7000,7000,"Drama, Family, Fantasy",After the tragic deaths of his son Ajit and da...,,Nigahen: Nagina Part II,"Sunny Deol, Sridevi, Anupam Kher, Gulshan Grover",Harmesh Malhotra
7001,7001,"Action, Comedy, Drama",Raj is a successful lawyer due to constant che...,,Kyo Kii... Main Jhuth Nahin Bolta,"Govinda, Sushmita Sen, Rambha, Anupam Kher",David Dhawan


In [4]:
# Number of rows and columns in the dataframe, as a (rows, columns) tuple

movies_data.shape

(7002, 7)

In [5]:
# Text columns that describe a movie's content; these are the only columns
# the recommender uses to compare movies with each other.

selected_features = [
    'genres',
    'keywords',
    'cast',
    'director',
]
print(selected_features)

['genres', 'keywords', 'cast', 'director']


In [6]:
# Replace missing values in the selected text columns with an empty string,
# so that the columns can be concatenated without producing NaN.

movies_data[selected_features] = movies_data[selected_features].fillna('')

In [7]:
# Build one text string per movie: genres, keywords, cast and director,
# joined in that order with single spaces.

combined_features = movies_data['genres']
for column in ('keywords', 'cast', 'director'):
    combined_features = combined_features + ' ' + movies_data[column]

In [8]:
# Inspect the combined text; pandas abbreviates a long Series, so only the first and last rows are shown
print(combined_features)

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
6997    Thriller Add a Plot Shatrughan Sinha, Reena Ro...
6998    Drama, Musical, Romance A renowned music teach...
6999    Musical, Romance When a ballroom dancer's shot...
7000    Drama, Family, Fantasy After the tragic deaths...
7001    Action, Comedy, Drama Raj is a successful lawy...
Length: 7002, dtype: object


In [9]:
# Creating the TF-IDF vectorizer with its default settings; the text is only
# converted to feature vectors once fit_transform is called on it

vectorizer= TfidfVectorizer()

In [10]:
# Learning the vocabulary and term weights from combined_features and storing
# the resulting TF-IDF matrix (one row per movie) in feature_vectors
feature_vectors= vectorizer.fit_transform(combined_features)

In [11]:
# Printing the sparse matrix lists each stored entry as (row, column) followed by its TF-IDF weight
print(feature_vectors)

  (0, 3423)	0.19119695543948215
  (0, 10312)	0.12942354023054423
  (0, 17690)	0.21343391681821647
  (0, 13641)	0.17829166558551757
  (0, 11798)	0.24739490972959716
  (0, 19978)	0.1681902498081409
  (0, 22777)	0.21777175266454715
  (0, 19270)	0.22555368809082663
  (0, 18113)	0.23773698167667004
  (0, 23465)	0.2214377402467426
  (0, 23129)	0.25705283778252425
  (0, 18151)	0.1663890459919386
  (0, 19536)	0.20782565518107574
  (0, 4402)	0.26100699479526396
  (0, 22690)	0.13944440730511737
  (0, 19697)	0.3793181182210651
  (0, 7766)	0.19752825003109759
  (0, 4231)	0.2319677051863774
  (0, 4982)	0.23378282466393033
  (0, 7312)	0.11997452095708566
  (0, 18547)	0.11984901975475071
  (0, 7118)	0.12334440404173615
  (0, 423)	0.10149411759056343
  (0, 316)	0.07647583584792739
  (1, 22327)	0.2301898357901241
  :	:
  (7001, 11233)	0.13355110978288
  (7001, 1112)	0.13926682999807027
  (7001, 6218)	0.160435069881696
  (7001, 18681)	0.13624047015037657
  (7001, 11956)	0.1781819463860982
  (7001, 10131

## Computing pairwise similarity scores using cosine similarity

In [12]:
# Cosine similarity between every pair of rows of feature_vectors;
# entry [i, j] is the similarity score between movie i and movie j
similarity = cosine_similarity(feature_vectors)

In [13]:
# Inspect the matrix: it is symmetric, with 1.0 on the diagonal (each movie compared with itself)
print(similarity)

[[1.         0.02766036 0.04529301 ... 0.         0.01117369 0.00421232]
 [0.02766036 1.         0.01471658 ... 0.         0.01474112 0.00850713]
 [0.04529301 0.01471658 1.         ... 0.         0.         0.00435244]
 ...
 [0.         0.         0.         ... 1.         0.03505647 0.03877713]
 [0.01117369 0.01474112 0.         ... 0.03505647 1.         0.11947789]
 [0.00421232 0.00850713 0.00435244 ... 0.03877713 0.11947789 1.        ]]


In [14]:
# The matrix is square: one row and one column per movie
print(similarity.shape)

(7002, 7002)


In [15]:
# Getting the movie name from the user; an approximate spelling is enough,
# because the closest title in the dataset is looked up afterwards

movie_name = input('Enter your favourite movie name: ')

Enter your favourite movie name: batman


In [16]:
# Collect every title in the dataset into a plain Python list, which is used
# as the set of candidates for fuzzy title matching, then display it

list_of_all_titles = movies_data.title.tolist()
print(list_of_all_titles)

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'Quantum of Solace', "Pirates of the Caribbean: Dead Man's Chest", 'The Lone Ranger', 'Man of Steel', 'The Chronicles of Narnia: Prince Caspian', 'The Avengers', 'Pirates of the Caribbean: On Stranger Tides', 'Men in Black 3', 'The Hobbit: The Battle of the Five Armies', 'The Amazing Spider-Man', 'Robin Hood', 'The Hobbit: The Desolation of Smaug', 'The Golden Compass', 'King Kong', 'Titanic', 'Captain America: Civil War', 'Battleship', 'Jurassic World', 'Skyfall', 'Spider-Man 2', 'Iron Man 3', 'Alice in Wonderland', 'X-Men: The Last Stand', 'Monsters University', 'Transformers: Revenge of the Fallen', 'Transformers: Age of Extinction', 'Oz: The Great and Powerful', 'The Amazing Spider-Man 2', 'TRON: Legacy', 'Cars 2', 'Green Lant

In [17]:
# Look up the titles that most resemble what the user typed.
# difflib tolerates small differences such as spelling mistakes; n=3 and
# cutoff=0.6 are its default values, written out so the limits are visible.

movie_name = str(movie_name)
find_close_match = difflib.get_close_matches(
    movie_name,
    list_of_all_titles,
    n=3,
    cutoff=0.6,
)
print(find_close_match)

['Batman', 'Batman', 'Catwoman']


In [18]:
# Take the first and best match.
# get_close_matches returns an empty list when no title is similar enough,
# so fail with an explicit message instead of a bare IndexError.

if not find_close_match:
    raise ValueError(f'No title in the dataset is close to {movie_name!r}')
close_match = find_close_match[0]
print(close_match)

Batman


In [19]:
# Find the row position of the movie using the title.
# The rows of `similarity` follow the row order of movies_data, so the
# position is taken from the DataFrame itself rather than from the CSV's
# 'index' column, which only agrees with the row order as long as the file
# is never filtered or reordered. If a title occurs more than once, the
# first matching row is used.

title_matches = (movies_data['title'] == close_match).to_numpy()
index_of_the_movie = int(title_matches.nonzero()[0][0])
print(index_of_the_movie)

1359


In [20]:
# Pair every movie's row position with its similarity to the chosen movie

scores_for_movie = similarity[index_of_the_movie]
similarity_score = [(position, score) for position, score in enumerate(scores_for_movie)]
print(similarity_score)

[(0, 0.020985577693062408), (1, 0.03443712619675433), (2, 0.01484285603767532), (3, 0.24838940062722598), (4, 0.005856438168495715), (5, 0.13174152731657304), (6, 0.0), (7, 0.005651181036911885), (8, 0.04488538106663393), (9, 0.1188760416567654), (10, 0.12428215895623405), (11, 0.014093645791706643), (12, 0.02037466855177474), (13, 0.005299085501793814), (14, 0.1497124294653725), (15, 0.014230785082219645), (16, 0.0055867061280851825), (17, 0.021430151431184905), (18, 0.032838046584505884), (19, 0.021118373329514068), (20, 0.020628821013728878), (21, 0.005602707137293466), (22, 0.01582112275383694), (23, 0.015789831706407544), (24, 0.03421582070746989), (25, 0.0), (26, 0.005970802161547372), (27, 0.005251715938696114), (28, 0.006212667537898782), (29, 0.006056188602303658), (30, 0.13166769642504209), (31, 0.0059232159239123085), (32, 0.10383758302441384), (33, 0.005512519787738734), (34, 0.0), (35, 0.03483648572595548), (36, 0.060102289396340554), (37, 0.01642690041782669), (38, 0.0205

In [21]:
# There is one (index, score) pair for every movie in the dataset
len(similarity_score)

7002

In [22]:
# Order the movies from most to least similar.
# Each element is an (index, score) tuple: pair[0] is the movie's row index
# and pair[1] is its similarity score, which is the sort key.

sorted_similar_movies = list(similarity_score)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)
print(sorted_similar_movies)

[(1359, 1.0000000000000002), (428, 0.5588666804822157), (210, 0.3402668649565017), (3, 0.24838940062722598), (119, 0.24162959634614373), (65, 0.2343668748417418), (1512, 0.17911570735478768), (473, 0.171969821812829), (2530, 0.1719598203152914), (753, 0.16877279141614504), (813, 0.16379438678483954), (2313, 0.15260125811259803), (14, 0.1497124294653725), (1035, 0.1489342882748746), (438, 0.14409074572738437), (1803, 0.14397207132249207), (1296, 0.14258538221427844), (41, 0.13785605103368898), (2655, 0.13726396961944834), (4183, 0.13692563517163503), (299, 0.1324769320243038), (163, 0.13192987058214972), (5, 0.13174152731657304), (30, 0.13166769642504209), (3854, 0.1303070052666739), (2858, 0.12939877758398832), (278, 0.1278039409476912), (3630, 0.12467434508697987), (1141, 0.12451352308283758), (10, 0.12428215895623405), (870, 0.12353625433592429), (2805, 0.12264129556941092), (72, 0.12219805169649046), (1076, 0.12003692874927382), (2108, 0.11979929725673374), (9, 0.1188760416567654), 

In [23]:
# Print the names of the 30 most similar movies.
# Only the top of the ranking is visited, and each title is read by row
# position; scanning every movie with a boolean mask per iteration did the
# same work thousands of times for entries that were never printed.

print('Movies suggested for you: \n')

for i, (index, _score) in enumerate(sorted_similar_movies[:30]):
    # The index comes from enumerate() over a row of `similarity`, so it is
    # a row position in movies_data.
    title_from_index = movies_data['title'].iloc[index]
    print(i,'.',title_from_index)

Movies suggested for you: 

0 . Batman
1 . Batman Returns
2 . Batman & Robin
3 . The Dark Knight Rises
4 . Batman Begins
5 . The Dark Knight
6 . A History of Violence
7 . Mars Attacks!
8 . Beetlejuice
9 . The Sentinel
10 . Superman
11 . The Mask
12 . Man of Steel
13 . Jonah Hex
14 . Something's Gotta Give
15 . Blood and Wine
16 . Superman III
17 . Green Lantern
18 . Dungeons & Dragons: Wrath of the Dragon God
19 . Hang 'em High
20 . Batman Forever
21 . Watchmen
22 . Spider-Man 3
23 . Spider-Man 2
24 . Batman: The Dark Knight Returns, Part 2
25 . The Postman Always Rings Twice
26 . Planet of the Apes
27 . Jekyll and Hyde ... Together Again
28 . 8 Mile
29 . Superman Returns


## Movie Recommendation System

In [24]:
# End-to-end recommender: read a title from the user, resolve it to the
# closest title in the dataset, rank all movies by similarity to it and
# print the best-ranked ones.

movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies_data['title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

# get_close_matches returns an empty list when nothing is similar enough;
# report that clearly instead of failing with a bare IndexError.
if not find_close_match:
    raise ValueError(f'No title in the dataset is close to {movie_name!r}')

close_match = find_close_match[0]

# The rows of `similarity` follow the row order of movies_data, so use the
# row position of the first matching title rather than the CSV's 'index'
# column, which only agrees with it while the file is never reordered.
title_matches = (movies_data['title'] == close_match).to_numpy()
index_of_the_movie = int(title_matches.nonzero()[0][0])

similarity_score = list(enumerate(similarity[index_of_the_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

print('Movies suggested for you : \n')

# Numbering starts at 1 and stops before 30, so 29 titles are listed.
max_suggestions = 29

# Visit only the entries that are printed and read each title by row
# position, instead of masking the whole DataFrame for every movie.
for i, (index, _score) in enumerate(sorted_similar_movies[:max_suggestions], start=1):
    title_from_index = movies_data['title'].iloc[index]
    print(i, '.',title_from_index)

 Enter your favourite movie name : batman
Movies suggested for you : 

1 . Batman
2 . Batman Returns
3 . Batman & Robin
4 . The Dark Knight Rises
5 . Batman Begins
6 . The Dark Knight
7 . A History of Violence
8 . Mars Attacks!
9 . Beetlejuice
10 . The Sentinel
11 . Superman
12 . The Mask
13 . Man of Steel
14 . Jonah Hex
15 . Something's Gotta Give
16 . Blood and Wine
17 . Superman III
18 . Green Lantern
19 . Dungeons & Dragons: Wrath of the Dragon God
20 . Hang 'em High
21 . Batman Forever
22 . Watchmen
23 . Spider-Man 3
24 . Spider-Man 2
25 . Batman: The Dark Knight Returns, Part 2
26 . The Postman Always Rings Twice
27 . Planet of the Apes
28 . Jekyll and Hyde ... Together Again
29 . 8 Mile


In [31]:
import pickle

# Write the three artifacts back-to-back into a single binary file.
# The order matters: they have to be read back in the same sequence
# (DataFrame, similarity matrix, TF-IDF vectorizer).
with open('movie_recommender.pkl', 'wb') as f:
    for artifact in (movies_data, similarity, vectorizer):
        pickle.dump(artifact, f)

In [32]:
import pickle

# Restore the artifacts in the order they were written: the DataFrame, the
# similarity matrix, then the TF-IDF vectorizer.
# NOTE(review): unpickling can execute arbitrary code, so only load a file
# that was produced by this notebook or comes from a trusted source.
with open('movie_recommender.pkl', 'rb') as f:
    movies_data, similarity, vectorizer = (pickle.load(f) for _ in range(3))

In [34]:
movie_name = "The Godfather"  # Replace with your desired movie title

# Find the closest match for the movie name; difflib returns an empty list
# when nothing is similar enough, so report that explicitly instead of
# failing with a bare IndexError
find_close_match = difflib.get_close_matches(movie_name, movies_data['title'].tolist())
if not find_close_match:
    raise ValueError(f"No title in the dataset is close to {movie_name!r}")
close_match = find_close_match[0]

# Get the row position of the movie. The rows of `similarity` follow the row
# order of movies_data, so the position of the first matching title is used
# rather than the CSV's 'index' column, which only agrees with it while the
# file is never filtered or reordered
title_matches = (movies_data['title'] == close_match).to_numpy()
index_of_the_movie = int(title_matches.nonzero()[0][0])

# Get similar movies based on similarity score, best first
similarity_score = list(enumerate(similarity[index_of_the_movie]))
sorted_similar_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)

# Print similar movies (you can modify this to display recommendations differently)
print("Movies similar to", close_match, ":")
for index, _score in sorted_similar_movies[:5]:  # Top 5; the best-scoring entry is normally the movie itself
    # Each index is a row position in `similarity`, so read the title by position
    title_from_index = movies_data['title'].iloc[index]
    print(title_from_index)

Movies similar to The Godfather :
The Godfather
The Godfather: Part III
The Godfather: Part II
Apocalypse Now
Closer
