# Movie Recommendation Engine 

<b>- Importing the libraries</b>

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

<b>- Importing the dataset</b>

In [2]:
# Read the movie dataset from the CSV file and preview its first five rows
data = pd.read_csv(filepath_or_buffer="csv/imdb_english_movies.csv")
data.head(5)

Unnamed: 0,movie_id,title,movie_link,year,duration,genre,description,director,actors,rating
0,1,The Blues Brothers,https://www.imdb.com/title/tt0080455/,1980,133.0,"Action, Adventure, Comedy","Jake Blues, just released from prison, puts to...",John Landis,"John Belushi, Dan Aykroyd, Cab Calloway, John ...",7.9
1,2,The Shining,https://www.imdb.com/title/tt0081505/,1980,146.0,"Drama, Horror",A family heads to an isolated hotel for the wi...,Stanley Kubrick,"Jack Nicholson, Shelley Duvall, Danny Lloyd, S...",8.4
2,3,The Blue Lagoon,https://www.imdb.com/title/tt0080453/,1980,104.0,"Adventure, Drama, Romance","In the Victorian period, two children are ship...",Randal Kleiser,"Brooke Shields, Christopher Atkins, Leo McKern...",5.8
3,4,Cannibal Holocaust,https://www.imdb.com/title/tt0078935/,1980,95.0,"Adventure, Horror",During a rescue mission into the Amazon rainfo...,Ruggero Deodato,"Robert Kerman, Francesca Ciardi, Perry Pirkane...",5.9
4,5,Star Wars: Episode V - The Empire Strikes Back,https://www.imdb.com/title/tt0080684/,1980,124.0,"Action, Adventure, Fantasy",After the Rebels are brutally overpowered by t...,Irvin Kershner,"Mark Hamill, Harrison Ford, Carrie Fisher, Bil...",8.7


<p style="color:red">Remove the rows where the <u>genre</u> / <u>actors</u> / <u>director</u> / <u>title</u> column is null. Here the data has already been cleaned and is ready for building the recommendation model.</p>

<b>- Merging the required columns into one column [features]</b>

In [3]:
def get_features(data):
    """Build one combined text string per row of ``data``.

    For every row, the values of the ``genre``, ``actors``, ``director`` and
    ``title`` columns are joined, in that order, with single spaces.

    Rows are walked by position rather than by index label, so the function
    also works when the frame's index is not ``0..n-1`` (for example after
    rows have been dropped without resetting the index).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing string columns ``genre``, ``actors``, ``director``
        and ``title``.

    Returns
    -------
    list of str
        One combined string per row, in row order. Empty for an empty frame.

    Raises
    ------
    KeyError
        If one of the four columns is missing.
    TypeError
        If any of the four values in a row is not a string (e.g. NaN).
    """
    columns = (data["genre"], data["actors"], data["director"], data["title"])
    # zip iterates the column values positionally, ignoring index labels
    return [' '.join(parts) for parts in zip(*columns)]

# Build the combined text for every movie, keep it in a new "features" column,
# then display that column
combined_text = get_features(data)
data["features"] = combined_text
data["features"]

0       Action, Adventure, Comedy John Belushi, Dan Ay...
1       Drama, Horror Jack Nicholson, Shelley Duvall, ...
2       Adventure, Drama, Romance Brooke Shields, Chri...
3       Adventure, Horror Robert Kerman, Francesca Cia...
4       Action, Adventure, Fantasy Mark Hamill, Harris...
                              ...                        
4841    Drama, Romance, Sci-Fi Owen Wilson, Salma Haye...
4842    Action, Drama Tom Cruise, Jennifer Connelly, M...
4843    Horror, Thriller Alicia Sanz, Adan Canto, Will...
4844    Comedy, Fantasy, Romance Kathryn Newton, Kyle ...
4845    Drama, Romance Victoria Justice, Matthew Dadda...
Name: features, Length: 4846, dtype: object

In [4]:
# Turn each movie's feature text into a row of token counts
cv = CountVectorizer()
text_matrix = cv.fit_transform(data["features"])

# Cosine similarity between every pair of rows of text_matrix
# (a single argument compares the matrix against itself)
cosine_sim = cosine_similarity(text_matrix)

# Copy of the data with a fresh index; the previous index is kept as a column
metadata = data.reset_index(drop=False)

# Lookup series: movie title -> index value of that movie in metadata
indices = pd.Series(data=metadata.index, index=metadata["title"])

<b>- CountVectorizer</b>

In [5]:
# Total number of columns (distinct tokens) learned by the vectorizer.
# vocabulary_ maps each token to its column, so its size is the column count;
# get_feature_names() is no longer available in recent scikit-learn releases.
len(cv.vocabulary_)

13852

In [6]:
# Combined feature text of the row labelled 0 in the features column
data.loc[0, 'features']

'Action, Adventure, Comedy John Belushi, Dan Aykroyd, Cab Calloway, John Candy John Landis The Blues Brothers'

In [7]:
# Tokens that have a non-zero count in the first row of the count matrix
cv.inverse_transform(X=text_matrix[0])

[array(['action', 'adventure', 'aykroyd', 'belushi', 'blues', 'brothers',
        'cab', 'calloway', 'candy', 'comedy', 'dan', 'john', 'landis',
        'the'], dtype='<U17')]

In [8]:
# Token counts of the first movie as a dense 1-D array.
# The row is sliced before calling toarray() so only that single row is
# densified instead of the whole matrix.
text_matrix[0].toarray()[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

<b>- Enter the exact English movie name, with no spelling mistakes or extra white space</b>

In [9]:
# Enter the movie name (surrounding white space and letter case are ignored)
query = input("Enter the movie name: ").strip()

# Row positions of every movie whose title equals the query, ignoring case
all_titles = data['title'].astype(str).str.strip().str.casefold()
matches = np.flatnonzero((all_titles == query.casefold()).to_numpy())

if matches.size == 0:
    # Unknown title: report it instead of failing with a KeyError
    print("\nNo movie named", repr(query), "was found in the dataset")
else:
    # Titles are not unique in this dataset (the sample output lists "Tron"
    # twice), so use the first matching row as the reference movie
    idx = int(matches[0])
    title = data['title'].iloc[idx]

    # Pairwise similarity scores of all movies with that movie
    scores = np.asarray(cosine_sim[idx])

    # Rank movies from most to least similar; the stable sort keeps the
    # original row order between movies with equal scores
    ranked = np.argsort(-scores, kind="stable")

    # Keep the 10 most similar movies, dropping the reference movie itself
    # explicitly rather than assuming it is ranked first
    sim_scores = [(int(i), scores[i]) for i in ranked if i != idx][:10]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Show the top 10 most similar movies
    print("\nTop 10 Similar Movies of", title)
    for i in movie_indices:
        print("-", data['title'].iloc[i])

Enter the movie name: iron man

Top 10 Similar Movies of Iron Man
- Iron Man 2
- Iron Man 3
- Spider-Man: Homecoming
- Chef
- The Avengers
- Tron
- Tron
- Avengers: Age Of Ultron
- Captain America: Civil War
- Avengers: Infinity War


<h3 style="color:blue">Hooray!</h3>