In [1]:
import numpy as np 
import pandas as pd
import ast
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')

In [2]:
# Load the two input tables; both CSV files are expected in the notebook's working directory.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
# Peek at the first credits row to see its columns (movie_id, title, cast, crew).
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [4]:
# Peek at the first movies row to see its columns.
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [5]:
# Join on the TMDB id rather than on the title: titles are not guaranteed to be
# unique, and a title join pairs every movie with the credits of every other
# movie sharing its name. `title` is dropped from `credits` first so the merged
# frame keeps a single, unsuffixed `title` column (the one from `movies`).
df = movies.merge(
    credits.drop(columns=['title']),
    left_on='id',
    right_on='movie_id',
)
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [6]:
# Keep only the columns used to build the tags. `.copy()` makes the result an
# independent frame, so the in-place edits and column assignments in later
# cells do not operate on a slice of the merged frame.
df = df[['movie_id','keywords','title','genres','overview','cast','crew']].copy()
df.head(1)

Unnamed: 0,movie_id,keywords,title,genres,overview,cast,crew
0,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [7]:
# Count missing values per column.
df.isnull().sum()

movie_id    0
keywords    0
title       0
genres      0
overview    3
cast        0
crew        0
dtype: int64

In [8]:
# Drop rows with missing values and renumber the index 0..n-1. Later cells build
# `vectors`/`similarity` by row position, so index labels must match positions.
df = df.dropna().reset_index(drop=True)

In [9]:
# Count fully duplicated rows.
df.duplicated().sum()

0

In [10]:
def convert(obj):
    """Extract the 'name' of every entry in a stringified list of dicts.

    Args:
        obj: String holding a Python-literal list of dicts, each with a
            'name' key (parsed with ``ast.literal_eval``).

    Returns:
        List of the 'name' values, in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]


In [11]:
# Replace each stringified genre list with a plain list of genre names.
df['genres'] = df['genres'].map(convert)

In [12]:
# Replace each stringified keyword list with a plain list of keyword names.
df['keywords'] = df['keywords'].map(convert)

In [13]:
def convert1(obj, limit=3):
    """Extract the 'name' of the first `limit` entries of a stringified list.

    Args:
        obj: String holding a Python-literal list of dicts, each with a
            'name' key (parsed with ``ast.literal_eval``).
        limit: Maximum number of names to return. Defaults to 3, which is
            the cut-off this function previously had hard-coded.

    Returns:
        List of at most `limit` names, in their original order. Shorter
        when the parsed list has fewer entries; empty for an empty list or
        a non-positive `limit`.
    """
    # zip() stops once range(limit) is exhausted, so entries beyond the
    # limit are never inspected.
    return [entry['name'] for _, entry in zip(range(limit), ast.literal_eval(obj))]


In [14]:
# Keep only the first three names of each cast list (see convert1).
df['cast'] = df['cast'].map(convert1)

In [15]:
# Preview the first row after genres, keywords and cast were converted to lists.
df.head(1)

Unnamed: 0,movie_id,keywords,title,genres,overview,cast,crew
0,19995,"[culture clash, future, space war, space colon...",Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [16]:
def fetch_dir(obj):
    """Return the name of the first crew member whose job is 'Director'.

    Args:
        obj: String holding a Python-literal list of dicts, each with 'job'
            and 'name' keys (parsed with ``ast.literal_eval``).

    Returns:
        A one-element list with that member's name, or an empty list when
        no entry has the job 'Director'.
    """
    for member in ast.literal_eval(obj):
        if member['job'] != 'Director':
            continue
        # Only the first match is wanted; later directors are ignored.
        return [member['name']]
    return []


In [17]:
# Reduce each crew list to its director (see fetch_dir) and rename the column to match.
df = (
    df
    .assign(crew=df['crew'].map(fetch_dir))
    .rename(columns={'crew': 'director'})
)

In [18]:
# Preview the first row now that `crew` has become `director`.
df.head(1)

Unnamed: 0,movie_id,keywords,title,genres,overview,cast,director
0,19995,"[culture clash, future, space war, space colon...",Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [19]:
# Turn each overview string into a list of whitespace-separated words so it
# can be concatenated with the other list columns.
df['overview'] = df['overview'].apply(str.split)

In [20]:
# Preview the first row after `overview` was split into words.
df.head(1)

Unnamed: 0,movie_id,keywords,title,genres,overview,cast,director
0,19995,"[culture clash, future, space war, space colon...",Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [21]:
# Remove the spaces inside every multi-word entry ("Sam Worthington" ->
# "SamWorthington") so each name stays a single token once the tags are
# joined with spaces. One loop replaces four copy-pasted statements.
for column in ['genres', 'keywords', 'cast', 'director']:
    df[column] = df[column].apply(lambda names: [name.replace(" ", "") for name in names])
df.head(1)

Unnamed: 0,movie_id,keywords,title,genres,overview,cast,director
0,19995,"[cultureclash, future, spacewar, spacecolony, ...",Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [22]:
# Concatenate the five list columns into one `tags` list per movie, then
# discard the source columns.
df['tags'] = (
    df['overview']
    + df['genres']
    + df['keywords']
    + df['cast']
    + df['director']
)
df = df.drop(columns=['overview', 'keywords', 'cast', 'director', 'genres'])

In [24]:
# Collapse each list of tags into one space-separated string.
df['tags'] = df['tags'].apply(" ".join)

In [25]:
# Lower-case the tag strings.
df['tags'] = df['tags'].apply(str.lower)

In [26]:
# Preview the final frame: movie_id, title and the combined tags string.
df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


# Vectorisation

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
# Count vectoriser capped at 5000 features, using the built-in English stop-word list.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [30]:
# Fit the vectoriser on the tags and convert the result to a dense array, one row per row of `df`.
# NOTE(review): this uses `df['tags']` as it is at this point, i.e. before the stemming cell further down.
vectors = cv.fit_transform(df['tags']).toarray()

In [34]:
# steming - actions,actions -> action
!pip install nltk



In [35]:
from nltk.stem.porter import PorterStemmer
# Single shared stemmer instance, used by `stem` below.
ps = PorterStemmer()

In [36]:
def stem(text):
    """Stem every whitespace-separated token of `text`.

    Each token is passed through the module-level stemmer's ``ps.stem`` and
    the results are rejoined with single spaces.

    Args:
        text: The string to process.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [37]:
df['tags'] = df['tags'].apply(stem)
# Re-vectorise after stemming: `vectors` was first built from the un-stemmed
# tags, so without this the stemming would never reach the similarity matrix.
vectors = cv.fit_transform(df['tags']).toarray()

In [41]:
from sklearn.metrics.pairwise import cosine_similarity

In [44]:
# Pairwise cosine similarity between all rows of `vectors`; row/column i refers to the i-th row of `vectors`.
similarity = cosine_similarity(vectors)

# Main Function

In [47]:
def recommend(movie):
    """Print the titles of the five movies most similar to `movie`.

    Reads the module-level `df` (for titles) and `similarity` (the pairwise
    similarity matrix whose rows follow the row order of `df`).

    Args:
        movie: Exact title to look up in ``df['title']``. If several rows
            share the title, the first one is used.

    Raises:
        IndexError: If no row of `df` has that title.
    """
    # `similarity` is addressed by row position, whereas df.index holds labels
    # that stop matching positions once rows have been dropped. Look the movie
    # up by position so both stay in agreement.
    matches = np.flatnonzero((df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in df['title']")
    movie_pos = int(matches[0])

    distances = similarity[movie_pos]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

    # Exclude the query by position instead of assuming it is ranked first;
    # another movie with an identical score could otherwise push it into the results.
    top_positions = [pos for pos, _ in ranked if pos != movie_pos][:5]

    for pos in top_positions:
        print(df.iloc[pos].title)