In [43]:
# imports
import numpy as np 
import pandas as pd

In [44]:
# reading csv dataset file
movies = pd.read_csv("dataset/tmdb_5000_movies.csv")
credits = pd.read_csv("dataset/tmdb_5000_credits.csv")

In [45]:
# merge csv files
movies = movies.merge(credits, on="title" )

In [46]:
# selecting relevent columns from dataset and removing all other columns

## original_language column have more than 90 % movies in eng only( highly imbalanced ) so wont take 
## original_title can be in different languages ( e.g chinese characters ) so wont take 
## popularity , release_date , revenue ,  is not selected since creating tags in content based system  wont take numerical columns
## tagline can be confusing

## irrelevent fields are budget , homepage ,  production_companies , status , spoken_languages

# movie_id is for getting movies posters from tmdb website
## finally selected columns are movie_id , genres , keywords , title ( always in eng character ) , overview , cast , crew

movies = movies[['movie_id','title','overview','genres', 'keywords','cast' , 'crew']]

In [47]:
# Overview has 3 missing values which is very less so we drop these records
movies.dropna( inplace = True )

In [48]:
# check duplicated data
movies.duplicated().sum()
## do nothing since there are no duplicates

0

# We create new dataframe with columns : movie_id , title , tags
## To create tags add genre , keywords , top 3 from cast and director from crew to  overview to get a paragraph

In [49]:
import ast 

# convert to list of genres for a movie record from python dictionary 
def convertToList(obj):
    genres = []
    for i in ast.literal_eval(obj):
        genres.append(i['name'])
    return genres

In [50]:
movies['genres']  = movies['genres'].apply(convertToList)

In [51]:
movies['genres']

0       [Action, Adventure, Fantasy, Science Fiction]
1                        [Adventure, Fantasy, Action]
2                          [Action, Adventure, Crime]
3                    [Action, Crime, Drama, Thriller]
4                [Action, Adventure, Science Fiction]
                            ...                      
4804                        [Action, Crime, Thriller]
4805                                [Comedy, Romance]
4806               [Comedy, Drama, Romance, TV Movie]
4807                                               []
4808                                    [Documentary]
Name: genres, Length: 4806, dtype: object

In [52]:
movies['keywords'] = movies['keywords'].apply(convertToList)

In [53]:
# extract top 3 casts
def castToList(obj):
    casts = []
    counter = 1
    for i in ast.literal_eval(obj):
        if counter > 3:
            break ;
        casts.append(i['name'])
        counter += 1
    return casts

In [54]:
movies['cast'] = movies['cast'].apply(castToList)

In [55]:
# get director
def get_director(obj):
    L = []
    for i in ast.literal_eval(obj) :
        if i['job'] == 'Director':
            L.append(i['name'])
            break;
    return L

In [56]:
movies['crew'] = movies['crew'].apply(get_director)

In [57]:
# convert movies overview into array to join with other list columns
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [58]:
# remove white spaces for tag since Sam Worthington is 2 tags and Sam Mendes is also two tags sam is common 
# but it refers to different person both should be different tags so SamWorthington will be single tag and 
# SamMendes will be another tag

movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

In [59]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [60]:
new_df = movies[['movie_id', 'title' ,'tags']]

In [61]:
# convert list to string
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))


In [62]:
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


In [63]:
# apply stemming to text data 
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

In [64]:
# function to apply stemming to paragraph
def stem(text):
    words = []
    for word in text.split():
        words.append(ps.stem(word))
    return " ".join(words)

In [65]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [66]:
# convert the text to vector form 
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words="english")

word_vectors = cv.fit_transform(new_df['tags']).toarray()

In [67]:
# using cosine similarity to find similarity between word vectors
from sklearn.metrics.pairwise import cosine_similarity
similarity_metrix = cosine_similarity(word_vectors)

In [118]:
# gives 5 movies given a movie title



def recommend(movie_title):
    movie_title = movie_title
    movie_idx = new_df[new_df['title'] == movie_title].index[0]
    distances = similarity_metrix[movie_idx]
    movies_list = sorted(list(enumerate(distances)), reverse=True , key=lambda x: x[1] )[1: 6]
    
    for i in movies_list:
        print(new_df.iloc[i[0]].title)
        
recommend("John Carter")


Riddick
Krrish
The Other Side of Heaven
The Legend of Hercules
Get Carter


In [109]:
new_df.iloc[17]

movie_id                                                 1865
title             Pirates of the Caribbean: On Stranger Tides
tags        captain jack sparrow cross path with a woman f...
Name: 17, dtype: object