In [1]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
credits_df = pd.read_csv('./data/credits.csv')
movies_df = pd.read_csv('./data/movies.csv')

##### Cleaning Data

In [3]:
movies_df = movies_df.merge(credits_df,on='title')
movies_df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [4]:
movies_df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [5]:
movies_df = movies_df[['movie_id','title','release_date','vote_average','overview','genres','keywords','cast','crew']]

In [6]:
movies_df.isnull().sum()

movie_id        0
title           0
release_date    1
vote_average    0
overview        3
genres          0
keywords        0
cast            0
crew            0
dtype: int64

In [7]:
movies_df.dropna(inplace=True)

In [8]:
def convert(obj):
    L=[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

movies_df['genres'] = movies_df['genres'].apply(convert)
movies_df['keywords'] = movies_df['keywords'].apply(convert)

In [9]:
def convert3(obj):
    L=[]
    counter=0
    for i in ast.literal_eval(obj):
        if counter!=3:
            L.append(i['name'])
            counter=counter+1
        else:
            break
    return L
    
movies_df['cast'] = movies_df['cast'].apply(convert3)

In [10]:
def fetch_director(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            L.append(i['name'])
            break
    return L

movies_df['crew']=movies_df['crew'].apply(fetch_director)

In [11]:
movies_df['overview'] = movies_df['overview'].apply(lambda x:x.split())

In [12]:
movies_df['genres'] = movies_df['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies_df['cast'] = movies_df['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies_df['crew'] = movies_df['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [13]:
movies = movies_df[['movie_id','title','release_date','vote_average','overview','keywords']].copy()
movies['tags'] = movies_df['overview']+movies_df['genres']+movies_df['keywords']+movies_df['cast']+movies_df['crew']
movies['tags'] = movies['tags'].apply(lambda x:' '.join(x))
movies['tags'] = movies['tags'].apply(lambda X:X.lower())


In [14]:
pd.set_option('display.max_columns',None)
movies

Unnamed: 0,movie_id,title,release_date,vote_average,overview,keywords,tags
0,19995,Avatar,10-12-2009,7.2,"[In, the, 22nd, century,, a, paraplegic, Marin...","[cultureclash, future, spacewar, spacecolony, ...","in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,19-05-2007,6.9,"[Captain, Barbossa,, long, believed, to, be, d...","[ocean, drugabuse, exoticisland, eastindiatrad...","captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,26-10-2015,6.3,"[A, cryptic, message, from, Bond’s, past, send...","[spy, basedonnovel, secretagent, sequel, mi6, ...",a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,16-07-2012,7.6,"[Following, the, death, of, District, Attorney...","[dccomics, crimefighter, terrorist, secretiden...",following the death of district attorney harve...
4,49529,John Carter,07-03-2012,6.1,"[John, Carter, is, a, war-weary,, former, mili...","[basedonnovel, mars, medallion, spacetravel, p...","john carter is a war-weary, former military ca..."
...,...,...,...,...,...,...,...
4803,9367,El Mariachi,04-09-1992,6.6,"[El, Mariachi, just, wants, to, play, his, gui...","[unitedstates–mexicobarrier, legs, arms, paper...",el mariachi just wants to play his guitar and ...
4804,72766,Newlyweds,26-12-2011,5.9,"[A, newlywed, couple's, honeymoon, is, upended...",[],a newlywed couple's honeymoon is upended by th...
4805,231617,"Signed, Sealed, Delivered",13-10-2013,7.0,"[""Signed,, Sealed,, Delivered"", introduces, a,...","[date, loveatfirstsight, narration, investigat...","""signed, sealed, delivered"" introduces a dedic..."
4806,126186,Shanghai Calling,03-05-2012,5.7,"[When, ambitious, New, York, attorney, Sam, is...",[],when ambitious new york attorney sam is sent t...


In [15]:
movies.to_csv('./data/processed.csv', index=False)