# MovieMatch AI: A Movie Recommendation System Based on the TMDB Dataset 

### Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
import pickle

### Importing the .csv files

In [2]:
movies = pd.read_csv('Dataset/tmdb_5000_movies.csv')
credits = pd.read_csv('Dataset/tmdb_5000_credits.csv')

### Data Analysis

In [3]:
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [4]:
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [5]:
movies.describe()

Unnamed: 0,budget,id,popularity,revenue,runtime,vote_average,vote_count
count,4803.0,4803.0,4803.0,4803.0,4801.0,4803.0,4803.0
mean,29045040.0,57165.484281,21.492301,82260640.0,106.875859,6.092172,690.217989
std,40722390.0,88694.614033,31.81665,162857100.0,22.611935,1.194612,1234.585891
min,0.0,5.0,0.0,0.0,0.0,0.0,0.0
25%,790000.0,9014.5,4.66807,0.0,94.0,5.6,54.0
50%,15000000.0,14629.0,12.921594,19170000.0,103.0,6.2,235.0
75%,40000000.0,58610.5,28.313505,92917190.0,118.0,6.8,737.0
max,380000000.0,459488.0,875.581305,2787965000.0,338.0,10.0,13752.0


In [6]:
credits.describe()

Unnamed: 0,movie_id
count,4803.0
mean,57165.484281
std,88694.614033
min,5.0
25%,9014.5
50%,14629.0
75%,58610.5
max,459488.0


In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [8]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In [9]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [10]:
credits.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

### Removing unnecessary features

In [11]:
movies = movies.merge(credits, on='title')

In [12]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [13]:
movies.shape

(4809, 7)

### Data Preprocessing

#### Function to retrieve all the names from a given input text

In [14]:
def convert(text):
    list_ = []
    counter = 0
    for i in ast.literal_eval(text):
        if counter < 3:
            list_.append(i['name'])
        counter+=1
    return list_

#### Function to retrieve the director's name from a given input text

In [15]:
def fetch_director(text):
    list_ = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            list_.append(i['name'])
    return list_

#### Function to remove all the spaces from the extracted strings

In [16]:
def collapse(L):
    list_ = []
    for i in L:
        list_.append(i.replace(" ",""))
    return list_

In [17]:
movies.dropna(inplace=True)

In [18]:
temp = ['genres', 'keywords', 'cast']
for column in temp:
    movies[column] = movies[column].apply(convert)
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy]","[culture clash, future, space war]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island]","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent]","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama]","[dc comics, crime fighter, terrorist]","[Christian Bale, Michael Caine, Gary Oldman]","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion]","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [19]:
movies.sample(5)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
4379,13132,All the Real Girls,"In a small North Carolina town, Paul, a womani...","[Drama, Romance]","[southern usa, small town, virgin]","[Paul Schneider, Zooey Deschanel, Danny McBride]","[{""credit_id"": ""52fe45439251416c7505082d"", ""de..."
2381,245703,Midnight Special,A father and son go on the run after the dad l...,"[Adventure, Drama, Science Fiction]","[father son relationship, helicopter, fbi]","[Michael Shannon, Jaeden Lieberher, Joel Edger...","[{""credit_id"": ""5723e69592514122a9001073"", ""de..."
2730,613,Downfall,"In April of 1945, Germany stands at the brink ...","[Drama, History, War]","[berlin, poison, dictator]","[Bruno Ganz, Alexandra Maria Lara, Corinna Har...","[{""credit_id"": ""5754589bc3a368346e001042"", ""de..."
4480,8416,The Conformist,A weak-willed Italian man becomes a fascist fl...,[Drama],"[paris, italy, assassin]","[Jean-Louis Trintignant, Stefania Sandrelli, G...","[{""credit_id"": ""52fe44a6c3a36847f80a2425"", ""de..."
1349,2155,Reindeer Games,After assuming his dead cellmate's identity to...,[Thriller],"[prison, robbery, role reversal]","[Ben Affleck, Gary Sinise, Charlize Theron]","[{""credit_id"": ""52fe433bc3a36847f804473f"", ""de..."


In [20]:
temp_ = ['genres', 'keywords', 'cast', 'crew']
for column in temp_:
    movies[column] = movies[column].apply(collapse)
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy]","[cultureclash, future, spacewar]","[SamWorthington, ZoeSaldana, SigourneyWeaver]","[[, {, "", c, r, e, d, i, t, _, i, d, "", :, , ""..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland]","[JohnnyDepp, OrlandoBloom, KeiraKnightley]","[[, {, "", c, r, e, d, i, t, _, i, d, "", :, , ""..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent]","[DanielCraig, ChristophWaltz, LéaSeydoux]","[[, {, "", c, r, e, d, i, t, _, i, d, "", :, , ""..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama]","[dccomics, crimefighter, terrorist]","[ChristianBale, MichaelCaine, GaryOldman]","[[, {, "", c, r, e, d, i, t, _, i, d, "", :, , ""..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion]","[TaylorKitsch, LynnCollins, SamanthaMorton]","[[, {, "", c, r, e, d, i, t, _, i, d, "", :, , ""..."


In [21]:
movies['overview'] = movies['overview'].apply(lambda x: x.split())

### Create a new column 'tags' and append the contents from all the other columns

In [22]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

### Initializing a new dataframe

In [23]:
new = movies.drop(columns=['overview','genres','keywords','cast','crew'])

In [24]:
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


### Model Training

In [25]:
cv = CountVectorizer(max_features=4806, stop_words='english')

In [26]:
vector = cv.fit_transform(new['tags']).toarray()

In [27]:
vector.shape

(4806, 4806)

In [28]:
similarity = cosine_similarity(vector)
similarity

array([[1.        , 0.15430335, 0.08753762, ..., 0.        , 0.        ,
        0.        ],
       [0.15430335, 1.        , 0.08104409, ..., 0.03407991, 0.        ,
        0.        ],
       [0.08753762, 0.08104409, 1.        , ..., 0.02900074, 0.        ,
        0.        ],
       ...,
       [0.        , 0.03407991, 0.02900074, ..., 1.        , 0.07808688,
        0.05066946],
       [0.        , 0.        , 0.        , ..., 0.07808688, 1.        ,
        0.05407381],
       [0.        , 0.        , 0.        , ..., 0.05066946, 0.05407381,
        1.        ]])

In [29]:
def recommend(movie):
    index = new[new['title'] == movie].index[0]
    distance = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda vector:vector[1])
    for i in distance[1:6]:
        print(new.iloc[i[0]].title)

In [30]:
recommend('Batman')

Batman
Batman & Robin
The Dark Knight Rises
Batman Forever
Batman Begins


In [31]:
pickle.dump(new, open('movie_list.pkl','wb'))
pickle.dump(similarity, open('similarity.pkl','wb'))