# Movie Recommendation System with Python

In [1]:
#importing needed libraries

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#loading data
# The CSV is referenced by a relative path, so it must be in the notebook's working directory.

df = pd.read_csv('IMDB-Movie-Data.csv')

In [3]:
#checking head of dataset
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [4]:
#adding an ID to each movie
# IDs run 0..n-1 and are derived from the actual row count, so the cell keeps
# working if the dataset is not exactly 1000 rows long.
df['Movie_id'] = range(len(df))

In [5]:
#checking the head again
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,3
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,4


In [6]:
#checking basic information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Genre               1000 non-null   object 
 3   Description         1000 non-null   object 
 4   Director            1000 non-null   object 
 5   Actors              1000 non-null   object 
 6   Year                1000 non-null   int64  
 7   Runtime (Minutes)   1000 non-null   int64  
 8   Rating              1000 non-null   float64
 9   Votes               1000 non-null   int64  
 10  Revenue (Millions)  872 non-null    float64
 11  Metascore           936 non-null    float64
 12  Movie_id            1000 non-null   int32  
dtypes: float64(3), int32(1), int64(4), object(5)
memory usage: 97.8+ KB


In [7]:
#we have 1000 rows and 13 columns in the dataset
df.shape

(1000, 13)

In [8]:
#creating a list of important columns for building a recommendation system

columns = ['Actors', 'Director', 'Genre', 'Title']

In [9]:
#dataset with important columns
df[columns]

Unnamed: 0,Actors,Director,Genre,Title
0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",James Gunn,"Action,Adventure,Sci-Fi",Guardians of the Galaxy
1,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",Ridley Scott,"Adventure,Mystery,Sci-Fi",Prometheus
2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",M. Night Shyamalan,"Horror,Thriller",Split
3,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",Christophe Lourdelet,"Animation,Comedy,Family",Sing
4,"Will Smith, Jared Leto, Margot Robbie, Viola D...",David Ayer,"Action,Adventure,Fantasy",Suicide Squad
...,...,...,...,...
995,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",Billy Ray,"Crime,Drama,Mystery",Secret in Their Eyes
996,"Lauren German, Heather Matarazzo, Bijou Philli...",Eli Roth,Horror,Hostel: Part II
997,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",Jon M. Chu,"Drama,Music,Romance",Step Up 2: The Streets
998,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",Scot Armstrong,"Adventure,Comedy",Search Party


In [10]:
#checking if there are any missing values in the dataset

df[columns].isnull().sum()

Actors      0
Director    0
Genre       0
Title       0
dtype: int64

In [11]:
# creating a function in order to create a column with all important features

def get_important_features(data):
    """Build one combined text string per row from the key descriptive columns.

    For every row the values of the 'Actors', 'Director', 'Genre' and 'Title'
    columns are joined, in that order, with a single space between them.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing string columns 'Actors', 'Director', 'Genre' and
        'Title'.

    Returns
    -------
    list of str
        One combined string per row, in row order.

    Raises
    ------
    KeyError
        If one of the four columns is missing.
    TypeError
        If a value in one of the columns is not a string (e.g. NaN).
    """
    feature_columns = ('Actors', 'Director', 'Genre', 'Title')
    # Iterate the columns positionally (zip) rather than looking rows up by the
    # integer label i, so the function also works when the frame's index is not
    # a default 0..n-1 RangeIndex (e.g. after filtering or sorting).
    return [
        ' '.join(row_values)
        for row_values in zip(*(data[column] for column in feature_columns))
    ]

In [12]:
df['important_features'] = get_important_features(df)

df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id,important_features
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S..."
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1,"Noomi Rapace, Logan Marshall-Green, Michael Fa..."
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar..."
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,3,"Matthew McConaughey,Reese Witherspoon, Seth Ma..."
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,4,"Will Smith, Jared Leto, Margot Robbie, Viola D..."


In [13]:
#converting the text to a matrix of token counts
# NOTE: CountVectorizer is used with its default settings, which (per the scikit-learn
# docs) lowercase the text and split it into word tokens, so a multi-word name
# contributes one token per word rather than a single token for the whole name.

cm = CountVectorizer().fit_transform(df['important_features'])

In [14]:
#getting the cosine similarity matrix from the count matrix (cm)
# Only one matrix is passed, so the rows of cm are compared pairwise with each other.

cs = cosine_similarity(cm)

In [15]:
#each row and column represents a movie
cs

array([[1.        , 0.1767767 , 0.06085806, ..., 0.0571662 , 0.06537205,
        0.        ],
       [0.1767767 , 1.        , 0.        , ..., 0.        , 0.06933752,
        0.        ],
       [0.06085806, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.0571662 , 0.        , 0.        , ..., 1.        , 0.06726728,
        0.        ],
       [0.06537205, 0.06933752, 0.        , ..., 0.06726728, 1.        ,
        0.07161149],
       [0.        , 0.        , 0.        , ..., 0.        , 0.07161149,
        1.        ]])

In [16]:
#printing shape of the cosine similarity matrix 

cs.shape

(1000, 1000)

In [17]:
#function to get top 5 similar movies

def get_top_5_similar_movies(movie_name, data=None, similarity=None, top_n=5):
    """Return the titles of the movies most similar to ``movie_name``.

    Parameters
    ----------
    movie_name : str
        Exact title of the movie to find recommendations for.
    data : pandas.DataFrame, optional
        Frame with 'Title' and 'Movie_id' columns. Defaults to the
        notebook-level ``df``.
    similarity : 2-D array-like, optional
        Square similarity matrix. 'Movie_id' values are used directly as
        row/column positions in it. Defaults to the notebook-level ``cs``.
    top_n : int, optional
        Maximum number of titles to return (default 5).

    Returns
    -------
    list of str
        Up to ``top_n`` titles ordered from most to least similar. The queried
        movie itself is never included.

    Raises
    ------
    IndexError
        If no row in ``data`` has the title ``movie_name``.
    """
    if data is None:
        data = df
    if similarity is None:
        similarity = cs

    #getting the ID of the movie (first match if the title occurs more than once)
    matching_ids = data.loc[data['Title'] == movie_name, 'Movie_id'].values
    if len(matching_ids) == 0:
        raise IndexError(f'movie title not found: {movie_name!r}')
    movie_id = matching_ids[0]

    #pairing every movie ID with its similarity score, highest score first
    ranked = sorted(enumerate(similarity[movie_id]), key=lambda pair: pair[1], reverse=True)

    top_titles = []
    for other_id, _score in ranked:
        if len(top_titles) >= top_n:
            break
        # Skip the queried movie by ID instead of dropping the first sorted entry:
        # another movie with an identical score may sort ahead of it, in which
        # case dropping position 0 would discard that movie and recommend the
        # queried movie to itself.
        if other_id == movie_id:
            continue
        top_titles.append(data.loc[data['Movie_id'] == other_id, 'Title'].values[0])
    return top_titles

In [18]:
get_top_5_similar_movies('Split')

['Morgan',
 'The Conjuring',
 'The Visit',
 'Victor Frankenstein',
 'The VVitch: A New-England Folktale']

In [19]:
get_top_5_similar_movies('The Amazing Spider-Man')

['The Amazing Spider-Man 2',
 'Inferno',
 'The Host',
 'Spider-Man 3',
 'The Man from U.N.C.L.E.']