In [60]:
import numpy as np
import pandas as pd
import ast
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [6]:
movies = movies.merge(credits, on='title') #Merging the datasets

In [7]:
movies = movies[['movie_id', 'title', 'cast', 'crew', 'keywords', 'genres','overview']]

In [9]:
movies.isnull().sum()

movie_id    0
title       0
cast        0
crew        0
keywords    0
genres      0
overview    3
dtype: int64

In [12]:
movies.dropna(inplace=True)

movies.duplicated().sum()

0

## Preprocessing Data

In [13]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [17]:
def convert(objs):
    cleaned_objs = []
    for obj in ast.literal_eval(objs):
        cleaned_objs.append(obj['name'])
    return cleaned_objs

In [18]:
convert('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [20]:
movies['genres'] = movies['genres'].apply(convert)

In [21]:
movies['keywords'] = movies['keywords'].apply(convert)

In [25]:
def convert_cast(objs):
    cleaned_objs = []
    counter = 0
    for obj in ast.literal_eval(objs):
        while counter<4:
            cleaned_objs.append(obj['name'])
            counter +=1
    return cleaned_objs

In [None]:
movies['cast']=movies['cast'].apply(convert_cast)

In [34]:
def fetch_director(objs):
    cleaned_objs = []
    for obj in ast.literal_eval(objs):
        if obj['job'] == 'Director':
            cleaned_objs.append(obj['name'])
    return cleaned_objs

In [36]:
movies['crew']=movies['crew'].apply(fetch_director)

In [38]:
movies['overview'] =movies['overview'].apply(lambda x:x.split())

In [41]:
#Removing space between words so that Sam Smith and Sam Unsmith are not the same Sam
movies['genres']=movies['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies['cast']=movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew']=movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords']=movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])

In [43]:
movies.head(1)

Unnamed: 0,movie_id,title,cast,crew,keywords,genres,overview
0,19995,Avatar,"[SamWorthington, SamWorthington, SamWorthingto...",[JamesCameron],"[cultureclash, future, spacewar, spacecolony, ...","[Action, Adventure, Fantasy, ScienceFiction]","[In, the, 22nd, century,, a, paraplegic, Marin..."


In [44]:
movies['tags'] = movies['overview']+movies['genres']+movies['cast']+movies['crew']

In [45]:
movies['tags'] = movies['tags'].apply(lambda x:" ".join(x))

In [47]:
movies['tags'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction SamWorthington SamWorthington SamWorthington SamWorthington JamesCameron'

In [48]:
new_df = movies[['movie_id', 'title', 'tags']]

In [49]:
new_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."


In [50]:
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


In [51]:
new_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."


# Text Vectorization

In [71]:
ps = PorterStemmer()

def stem(text):
    y=  []

    for i in text.split():
        y.append(ps.stem(i))
    
    return " ".join(y)

In [72]:
new_df['tags']= new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']= new_df['tags'].apply(stem)


In [87]:
cv = CountVectorizer(max_features=5000, stop_words='english')

In [88]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [89]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [102]:
def recommend(movie):
    movie_index = new_df[new_df['title']== movie].index[0]
    distances = similarity[movie_index]
    sim_movies = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]
    
    for movie in sim_movies:
        print(new_df.iloc[movie[0]].title)


In [104]:
recommend("Transformers")

Transformers: Revenge of the Fallen
Transformers: Dark of the Moon
The Greatest Game Ever Played
Eagle Eye
The Battle of Shaker Heights
