In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies = movies.merge(credits, on='title');

In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [5]:
movies=movies[['genres', 'movie_id', 'keywords', 'overview', 'title', 'cast', 'crew']]

In [6]:
movies.isnull().sum()

genres      0
movie_id    0
keywords    0
overview    3
title       0
cast        0
crew        0
dtype: int64

In [7]:
movies.dropna(inplace=True)

In [8]:
movies.duplicated().sum()

0

In [9]:
import ast

In [10]:
def convert(obj):
    L=[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L        

In [11]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [12]:
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

In [13]:
def fetch_director(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            L.append(i['name'])
            break
    return L   

In [14]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [15]:
def convert2(obj):
    L=[]
    c=0;
    for i in ast.literal_eval(obj):
        L.append(i['name'])
        c+=1
        if c==5:
            break
    return L  

In [16]:
movies['cast'] = movies['cast'].apply(convert2)

In [17]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [18]:
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ", "") for i in x])

In [19]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [20]:
new_df = movies[['movie_id' , 'title', 'tags']]

In [21]:
new_df

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."
...,...,...,...
4804,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui..."
4805,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended..."
4806,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,..."
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is..."


In [22]:
import warnings
warnings.filterwarnings('ignore')

In [23]:
new_df['tags'] = new_df['tags'].apply(lambda x:[i.lower() for i in x])

In [24]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))
new_df.reset_index(inplace=True)

In [25]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [26]:
def stem_text(text):
    v = []
    for i in text.split():
        v.append(ps.stem(i))
    return " ".join(v)        

In [27]:
new_df['tags'] = new_df['tags'].apply(stem_text)

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [29]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [30]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [31]:
def recommend_movie(movie):
    movie_index=new_df[new_df['title']==movie].index[0]
    distances = similarity[movie_index]
    top_ten = sorted(list(enumerate(distances)), key=lambda x:x[1], reverse=True)[1:11]
    print("Top Recommendations:")
    for i in top_ten:
        print(new_df.iloc[i[0]]['title'])      

In [32]:
[i for i in new_df.title]

['Avatar',
 "Pirates of the Caribbean: At World's End",
 'Spectre',
 'The Dark Knight Rises',
 'John Carter',
 'Spider-Man 3',
 'Tangled',
 'Avengers: Age of Ultron',
 'Harry Potter and the Half-Blood Prince',
 'Batman v Superman: Dawn of Justice',
 'Superman Returns',
 'Quantum of Solace',
 "Pirates of the Caribbean: Dead Man's Chest",
 'The Lone Ranger',
 'Man of Steel',
 'The Chronicles of Narnia: Prince Caspian',
 'The Avengers',
 'Pirates of the Caribbean: On Stranger Tides',
 'Men in Black 3',
 'The Hobbit: The Battle of the Five Armies',
 'The Amazing Spider-Man',
 'Robin Hood',
 'The Hobbit: The Desolation of Smaug',
 'The Golden Compass',
 'King Kong',
 'Titanic',
 'Captain America: Civil War',
 'Battleship',
 'Jurassic World',
 'Skyfall',
 'Spider-Man 2',
 'Iron Man 3',
 'Alice in Wonderland',
 'X-Men: The Last Stand',
 'Monsters University',
 'Transformers: Revenge of the Fallen',
 'Transformers: Age of Extinction',
 'Oz: The Great and Powerful',
 'The Amazing Spider-Man 2',

In [33]:
recommend_movie("A Christmas Carol")

Top Recommendations:
Should've Been Romeo
Flying By
The Diving Bell and the Butterfly
1982
The Adventures of Elmo in Grouchland
Charlie St. Cloud
How to Fall in Love
The R.M.
Gran Torino
The Truman Show


In [34]:
recommend_movie("The Legend of Hercules")

Top Recommendations:
The Adventures of Elmo in Grouchland
1982
The R.M.
Juliet and Alfa Romeo
The Diving Bell and the Butterfly
Flying By
Me You and Five Bucks
Krrish
Should've Been Romeo
Charlie St. Cloud


In [35]:
recommend_movie("Gravity")

Top Recommendations:
Moonraker
The Right Stuff
The Martian
Space Pirate Captain Harlock
Apollo 13
Deep Impact
The Astronaut's Wife
Armageddon
The Astronaut Farmer
Capricorn One


In [37]:
recommend_movie("Baby's Day Out")

Top Recommendations:
Superbabies: Baby Geniuses 2
Baby Geniuses
Three Men and a Baby
Neighbors
Tsotsi
Jimmy Neutron: Boy Genius
30 Minutes or Less
L!fe Happens
Son of the Mask
Instructions Not Included


In [38]:
import pickle

In [40]:
pickle.dump(new_df.to_dict(), open('movie_dict.pkl', 'wb'))

In [41]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))