<a href="https://colab.research.google.com/github/prof-Anurag/Movies-Recommendation-System/blob/main/Movies_Recommendation_System_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd  # For handling the data (CSV file) and manipulating data structures.
import numpy as np   # For working with arrays and mathematical operations
import difflib   # To find close matches for movie titles based on similarity.
# Converts text data into numerical vectors using TF-IDF (Term Frequency-Inverse Document Frequency) for feature extraction.
from sklearn.feature_extraction.text import TfidfVectorizer
# This measures the cosine similarity between vectors to evaluate how similar two items are.
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the movie metadata CSV into a DataFrame, then preview its first two rows.
data = pd.read_csv(filepath_or_buffer='/content/movies.csv')
data.head(n=2)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski


In [None]:
data.shape  # (number of rows, number of columns) of the loaded DataFrame.

(4803, 24)

In [None]:
# Per-column summary: dtype and non-null count (reveals which columns have gaps).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

In [None]:
# Count the missing (NaN) entries in every column.
data.isna().sum()

Unnamed: 0,0
index,0
budget,0
genres,28
homepage,3091
id,0
keywords,412
original_language,0
original_title,0
overview,3
popularity,0


In [None]:
# Text columns whose contents are merged into a single description per movie.
selected_features = [
  'genres',
  'keywords',
  'tagline',
  'cast',
  'director',
]
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [None]:
# Swap every missing (NaN) entry in the selected text columns for an empty
# string, so that string concatenation of these columns never yields NaN.
data[selected_features] = data[selected_features].fillna('')

In [None]:
# Build one space-separated text blob per movie out of the five feature columns.
# Columns are appended left to right: genres, keywords, tagline, cast, director.
combined_features = data['genres']
for column in ('keywords', 'tagline', 'cast', 'director'):
  combined_features = combined_features + ' ' + data[column]
print(combined_features)

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object


In [None]:
# Inspect the `name` attribute of the combined Series.
combined_features.name

In [None]:
# Make sure every entry is a plain string before vectorizing.
# Real missing values are filled with '' BEFORE the cast, so they never turn
# into the literal text 'nan'. The previous `.str.replace('nan', '')` removed the
# substring "nan" from inside legitimate words and names (e.g. "Fernando" ->
# "Ferdo"), which silently corrupted the tokens fed to TF-IDF.
combined_features = combined_features.fillna('').astype(str)


In [None]:
# Turn each movie's combined text into a TF-IDF feature vector.
# `fit_transform` is called on `combined_features`, so the vocabulary is learned
# from the same texts that are transformed; the printed result is a sparse matrix.
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_features)
print(feature_vectors)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 124266 stored elements and shape (4803, 17312)>
  Coords	Values
  (0, 201)	0.07860022416510505
  (0, 274)	0.09021200873707368
  (0, 5271)	0.11108562744414445
  (0, 13593)	0.1036413987316636
  (0, 5435)	0.1036413987316636
  (0, 3676)	0.21392179219912877
  (0, 3063)	0.22208377802661425
  (0, 5832)	0.1646750903586285
  (0, 14372)	0.33962752210959823
  (0, 16581)	0.12549432354918996
  (0, 3223)	0.24960162956997736
  (0, 14265)	0.21392179219912877
  (0, 4942)	0.24025852494110758
  (0, 15255)	0.07095833561276566
  (0, 16992)	0.1282126322850579
  (0, 11185)	0.09049319826481456
  (0, 11497)	0.27211310056983656
  (0, 13343)	0.15021264094167086
  (0, 17001)	0.23643326319898797
  (0, 17284)	0.20197912553916567
  (0, 13313)	0.2177470539412484
  (0, 14058)	0.20596090415084142
  (0, 16662)	0.19843263965100372
  (0, 14602)	0.15150672398763912
  (0, 8751)	0.22709015857011816
  :	:
  (4801, 403)	0.17727585190343229
  (4801, 4832)	0.247137650

***Cosine Similarity***

In [None]:
# Compute the pairwise cosine similarity between the feature vectors of all movies.
# The result is a square matrix: entry [i, j] is the similarity score between
# movie i and movie j (higher score = more similar; the diagonal is 1.0).
similarity = cosine_similarity(feature_vectors)
print(similarity)

[[1.         0.07219487 0.037733   ... 0.         0.         0.        ]
 [0.07219487 1.         0.03281499 ... 0.03575545 0.         0.        ]
 [0.037733   0.03281499 1.         ... 0.         0.05389661 0.        ]
 ...
 [0.         0.03575545 0.         ... 1.         0.         0.02651502]
 [0.         0.         0.05389661 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.02651502 0.         1.        ]]


In [None]:
# Dimensions of the similarity matrix (one row and one column per movie).
similarity.shape

(4803, 4803)

In [None]:
# Getting the movie name from the user.
# movie_name = input("Enter your favourite movie name :")

In [None]:
# Creating a list with all the movies names given in the dataset.
# list_of_all_titles = movies_data['title'].tolist()
# print(list_of_all_titles)

In [None]:
# Finding the close match for the movie name given by the user.
# find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
# print(find_close_match)

In [None]:
# close_match = find_close_match[0]
# print(close_match)

In [None]:
# Find the index of the movie with title.
# index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]
# print(index_of_the_movie)

In [None]:
# Getting the list of similar movies.
# similarity_score = list(enumerate(similarity[index_of_the_movie]))
# print(similarity_score)

In [None]:
# len(similarity_score)

In [None]:
# Sorting the movies based on similarity score.
# sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)
# print(sorted_similar_movies)

In [None]:
# Print the name of similar movies based on the index.
# print("Movies Suggested for you :\n")

# i=0
# for movie in sorted_similar_movies:
#   index = movie[0]
#   title_from_index = movies_data[movies_data.index == index]['title'].values[0]
#   if(i<5):  # we can increase the number of recommended movies by changing the integer value.
#     print(i,'.',title_from_index)
#     i+=1

## ***Movie Recommendation System.***

In [None]:
# Ask the user for a title; it only needs to be approximately right.
movie_name = input("Enter your favourite movie name :")

# A list of all movie titles in the dataset (same order as the rows of `data`).
list_of_all_titles = data['title'].tolist()

# Fuzzy-match the entered name against the known titles. The result is ordered
# best match first and is an EMPTY list when nothing is similar enough.
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

if not find_close_match:
  # Without this guard, `find_close_match[0]` raised IndexError on unknown titles.
  print("No close match found for '{}'. Please check the spelling and try again.".format(movie_name))
else:
  close_match = find_close_match[0]

  # Row position of the matched movie. The similarity matrix was built from the
  # rows of `data` in order, so it must be indexed by position; the first
  # occurrence is used when a title appears more than once.
  index_of_the_movie = list_of_all_titles.index(close_match)

  # A list of (row position, similarity score) tuples relative to the selected movie.
  similarity_score = list(enumerate(similarity[index_of_the_movie]))

  # Sorted in descending order by similarity score (most similar first).
  # Note: the selected movie itself has the top score, so it is listed first.
  sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

  print("Movies Suggested for you :\n")

  # Only the top entries are looked up and printed; change the slice bound to
  # recommend more (or fewer) movies.
  for i, movie in enumerate(sorted_similar_movies[:5]):
    index = movie[0]
    title_from_index = data['title'].iloc[index]
    print(i,'.',title_from_index)

Enter your favourite movie name :teri
Movies Suggested for you :

0 . Water
1 . Earth
2 . Faith Connections
3 . Bride Wars
4 . The Bridge of San Luis Rey


# *END*

In [None]:
# The website for this model has not been built yet.

In [None]:
import pickle

# Persist the similarity matrix so it can be reused without recomputing it.
filename = 'movies_recomm_model.sav'
# The `with` block guarantees the file is flushed and closed, even on error;
# the previous bare `open(...)` left the handle open.
with open(filename, 'wb') as model_file:
  pickle.dump(similarity, model_file)

In [None]:
# Reload the similarity matrix saved to 'movies_recomm_model.sav'.
# SECURITY NOTE: unpickling can execute arbitrary code, so only load a file
# that this notebook produced itself; never a file from an untrusted source.
# The `with` block closes the file handle once loading finishes.
with open('movies_recomm_model.sav', 'rb') as model_file:
  loaded_model = pickle.load(model_file)

In [None]:
# Same recommendation flow as before, but driven by the similarity matrix
# reloaded from disk (`loaded_model`).
movie_name = input("Enter your favourite movie name :")

# All titles, in the same order as the rows of `data`.
list_of_all_titles = data['title'].tolist()

# Best fuzzy matches first; an EMPTY list when nothing is similar enough.
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

if not find_close_match:
  # Without this guard, `find_close_match[0]` raised IndexError on unknown titles.
  print("No close match found for '{}'. Please check the spelling and try again.".format(movie_name))
else:
  close_match = find_close_match[0]

  # Row position of the matched movie (first occurrence); the matrix is indexed
  # by row position because it was built from the rows of `data` in order.
  index_of_the_movie = list_of_all_titles.index(close_match)

  # (row position, similarity score) pairs, most similar first. The selected
  # movie itself has the top score, so it is listed first.
  similarity_score = list(enumerate(loaded_model[index_of_the_movie]))
  sorted_similar_movies = sorted(similarity_score, key=lambda x:x[1], reverse=True)

  print("Here are some Recommended movies for you :\n")

  # Only the top entries are looked up and printed; change the slice bound to
  # recommend more (or fewer) movies.
  for i, movie in enumerate(sorted_similar_movies[:5]):
    index = movie[0]
    title_from_index = data['title'].iloc[index]
    print(i,'.',title_from_index)

Enter your favourite movie name :iron man
Here are some Recommended movies for you :

0 . Iron Man
1 . Iron Man 2
2 . Iron Man 3
3 . Avengers: Age of Ultron
4 . The Avengers


In [None]:
# END