### Required Libraries

In [1]:
import pandas as pd
import requests
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

### Preprocessing stages

In [2]:
# read the merged movie dataset from disk and preview its first two rows
merged_dataset_path = './dataset/mergedDataset.csv'
df = pd.read_csv(merged_dataset_path)
df.head(2)

Unnamed: 0,genres,id,keywords,original_language,overview,popularity,production_companies,release_date,runtime,tagline,title_x,vote_average,vote_count,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",2009-12-10,162.0,Enter the World of Pandora.,Avatar,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",2007-05-19,169.0,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [3]:
# discard every row that is missing an overview, a release date or a runtime
required_columns = ['overview', 'release_date', 'runtime']
df = df.dropna(subset=required_columns)

In [4]:
# function for extracting the 'name' of every record in a serialized list (spaces removed)
def extract(val):
    """Return the ``name`` of every record in a serialized list, with spaces removed.

    Parameters
    ----------
    val : str
        Text holding a list of dict records, each with a ``'name'`` key.
        JSON is tried first; a Python-literal representation is accepted
        as a fallback.

    Returns
    -------
    list of str
        One entry per record, in input order, e.g. ``'Science Fiction'``
        becomes ``'ScienceFiction'``.

    Raises
    ------
    ValueError, SyntaxError
        If ``val`` is neither valid JSON nor a valid Python literal.
    """
    import ast
    import json

    # The text comes from a data file, so it is parsed as data instead of
    # being handed to eval(): eval would execute arbitrary expressions and
    # also chokes on JSON literals such as null/true/false.
    try:
        records = json.loads(val)
    except ValueError:
        records = ast.literal_eval(val)
    return [record['name'].replace(' ', '') for record in records]

# function for extracting top 3 cast
def extract_top_3_cast(casts):
    """Return the names of the first three cast records, with spaces removed.

    Parameters
    ----------
    casts : str
        Text holding a list of dict records, each with a ``'name'`` key.
        JSON is tried first; a Python-literal representation is accepted
        as a fallback.

    Returns
    -------
    list of str
        At most three names, in input order; fewer when the list is shorter.

    Raises
    ------
    ValueError, SyntaxError
        If ``casts`` is neither valid JSON nor a valid Python literal.
    """
    import ast
    import json
    from itertools import islice

    # Parse as data rather than eval(): the text originates from a data file
    # and eval would execute whatever expression it contains.
    try:
        members = json.loads(casts)
    except ValueError:
        members = ast.literal_eval(casts)
    # islice stops after three records without needing a manual counter
    return [member['name'].replace(' ', '') for member in islice(members, 3)]

# function to extract director 
def extract_director(crew):
    """Return a one-element list with the first director's name, or ``[]``.

    Parameters
    ----------
    crew : str
        Text holding a list of dict records. A record counts as a director
        when its ``'job'`` value equals ``'Director'``. JSON is tried first;
        a Python-literal representation is accepted as a fallback.

    Returns
    -------
    list of str
        ``[name]`` of the first matching record (name left unchanged,
        spaces included), or an empty list when no record matches.

    Raises
    ------
    ValueError, SyntaxError
        If ``crew`` is neither valid JSON nor a valid Python literal.
    """
    import ast
    import json

    # Parse as data rather than eval(): the text originates from a data file
    # and eval would execute whatever expression it contains.
    try:
        members = json.loads(crew)
    except ValueError:
        members = ast.literal_eval(crew)

    for member in members:
        # .get() so a record lacking a 'job' key is skipped instead of raising
        if member.get('job') == 'Director':
            return [member['name']]
    return []

In [5]:
# genres, keywords and production companies are serialized record lists:
# reduce each of them to a list of space-free names
for record_column in ('genres', 'keywords', 'production_companies'):
    df[record_column] = df[record_column].apply(extract)

# split the free-text columns on single spaces; a missing tagline is
# treated as an empty string before splitting
df['overview'] = df['overview'].apply(lambda text: text.split(' '))
df['tagline'] = df['tagline'].fillna('').apply(lambda text: text.split(' '))

# keep only the first three cast members
df['cast'] = df['cast'].apply(extract_top_3_cast)

# pull the director out of the crew records, then discard the raw crew column
df['director'] = df['crew'].apply(extract_director)
df = df.drop(columns=['crew'])

# the title column is named 'title_x' in the merged file; rename it to 'title'
df = df.rename(columns={'title_x': 'title'})
df.head(3)

Unnamed: 0,genres,id,keywords,original_language,overview,popularity,production_companies,release_date,runtime,tagline,title,vote_average,vote_count,cast,director
0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...",en,"[In, the, 22nd, century,, a, paraplegic, Marin...",150.437577,"[IngeniousFilmPartners, TwentiethCenturyFoxFil...",2009-12-10,162.0,"[Enter, the, World, of, Pandora.]",Avatar,7.2,11800,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[James Cameron]
1,"[Adventure, Fantasy, Action]",285,"[ocean, drugabuse, exoticisland, eastindiatrad...",en,"[Captain, Barbossa,, long, believed, to, be, d...",139.082615,"[WaltDisneyPictures, JerryBruckheimerFilms, Se...",2007-05-19,169.0,"[At, the, end, of, the, world,, the, adventure...",Pirates of the Caribbean: At World's End,6.9,4500,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[Gore Verbinski]
2,"[Action, Adventure, Crime]",206647,"[spy, basedonnovel, secretagent, sequel, mi6, ...",en,"[A, cryptic, message, from, Bond’s, past, send...",107.376788,"[ColumbiaPictures, Danjaq, B24]",2015-10-26,148.0,"[A, Plan, No, One, Escapes]",Spectre,6.3,4466,"[DanielCraig, ChristophWaltz, LéaSeydoux]",[Sam Mendes]


In [6]:
# director names still contain spaces at this point; squash them so each
# name becomes a single token like the other name columns
director_tokens = df['director'].apply(
    lambda names: [name.replace(' ', '') for name in names]
)

# concatenate the per-row token lists in a fixed column order ...
content_tokens = (
    df['genres']
    + df['keywords']
    + df['overview']
    + df['production_companies']
    + df['tagline']
    + df['cast']
    + director_tokens
)

# ... and flatten every list into one space-separated string
df['content'] = content_tokens.apply(' '.join)
df.head(3)

Unnamed: 0,genres,id,keywords,original_language,overview,popularity,production_companies,release_date,runtime,tagline,title,vote_average,vote_count,cast,director,content
0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...",en,"[In, the, 22nd, century,, a, paraplegic, Marin...",150.437577,"[IngeniousFilmPartners, TwentiethCenturyFoxFil...",2009-12-10,162.0,"[Enter, the, World, of, Pandora.]",Avatar,7.2,11800,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[James Cameron],Action Adventure Fantasy ScienceFiction cultur...
1,"[Adventure, Fantasy, Action]",285,"[ocean, drugabuse, exoticisland, eastindiatrad...",en,"[Captain, Barbossa,, long, believed, to, be, d...",139.082615,"[WaltDisneyPictures, JerryBruckheimerFilms, Se...",2007-05-19,169.0,"[At, the, end, of, the, world,, the, adventure...",Pirates of the Caribbean: At World's End,6.9,4500,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[Gore Verbinski],Adventure Fantasy Action ocean drugabuse exoti...
2,"[Action, Adventure, Crime]",206647,"[spy, basedonnovel, secretagent, sequel, mi6, ...",en,"[A, cryptic, message, from, Bond’s, past, send...",107.376788,"[ColumbiaPictures, Danjaq, B24]",2015-10-26,148.0,"[A, Plan, No, One, Escapes]",Spectre,6.3,4466,"[DanielCraig, ChristophWaltz, LéaSeydoux]",[Sam Mendes],Action Adventure Crime spy basedonnovel secret...


In [7]:
# further processing the content
nlp = spacy.load('en_core_web_sm')


def process_content(text):
    """Normalize a content string for downstream text comparison.

    The text is lower-cased, every character listed in
    ``string.punctuation`` is removed, the result is run through the
    module-level spaCy pipeline ``nlp``, tokens whose text appears in
    ``STOP_WORDS`` are dropped, and the lemmas of the remaining tokens are
    joined with single spaces.

    Parameters
    ----------
    text : str
        Raw content string.

    Returns
    -------
    str
        Space-separated lemmas of the kept tokens.
    """
    # lower-case first, then delete all ASCII punctuation in one pass
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    # collect the lemma of every token that is not a stop-word
    lemmas = []
    for token in nlp(cleaned):
        if token.text in STOP_WORDS:
            continue
        lemmas.append(token.lemma_)

    return ' '.join(lemmas)


In [8]:
# run every row's content string through process_content and store the
# normalized text back in the same column
processed_content = df['content'].apply(process_content)
df['content'] = processed_content

In [9]:
# preview the first three rows to inspect the processed 'content' column
df.head(3)

Unnamed: 0,genres,id,keywords,original_language,overview,popularity,production_companies,release_date,runtime,tagline,title,vote_average,vote_count,cast,director,content
0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...",en,"[In, the, 22nd, century,, a, paraplegic, Marin...",150.437577,"[IngeniousFilmPartners, TwentiethCenturyFoxFil...",2009-12-10,162.0,"[Enter, the, World, of, Pandora.]",Avatar,7.2,11800,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[James Cameron],action adventure fantasy sciencefiction cultur...
1,"[Adventure, Fantasy, Action]",285,"[ocean, drugabuse, exoticisland, eastindiatrad...",en,"[Captain, Barbossa,, long, believed, to, be, d...",139.082615,"[WaltDisneyPictures, JerryBruckheimerFilms, Se...",2007-05-19,169.0,"[At, the, end, of, the, world,, the, adventure...",Pirates of the Caribbean: At World's End,6.9,4500,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[Gore Verbinski],adventure fantasy action ocean drugabuse exoti...
2,"[Action, Adventure, Crime]",206647,"[spy, basedonnovel, secretagent, sequel, mi6, ...",en,"[A, cryptic, message, from, Bond’s, past, send...",107.376788,"[ColumbiaPictures, Danjaq, B24]",2015-10-26,148.0,"[A, Plan, No, One, Escapes]",Spectre,6.3,4466,"[DanielCraig, ChristophWaltz, LéaSeydoux]",[Sam Mendes],action adventure crime spy basedonnovel secret...


In [10]:
# these columns are no longer needed as separate fields
# (all but original_language were merged into 'content')
consumed_columns = [
    'genres',
    'keywords',
    'overview',
    'production_companies',
    'tagline',
    'cast',
    'original_language',
]
df = df.drop(columns=consumed_columns)
df.head(3)

Unnamed: 0,id,popularity,release_date,runtime,title,vote_average,vote_count,director,content
0,19995,150.437577,2009-12-10,162.0,Avatar,7.2,11800,[James Cameron],action adventure fantasy sciencefiction cultur...
1,285,139.082615,2007-05-19,169.0,Pirates of the Caribbean: At World's End,6.9,4500,[Gore Verbinski],adventure fantasy action ocean drugabuse exoti...
2,206647,107.376788,2015-10-26,148.0,Spectre,6.3,4466,[Sam Mendes],action adventure crime spy basedonnovel secret...


In [11]:
# getting the movie posters link
def get_poster_url(movie_id):
    """Look up the poster image URL of a movie through the TMDB API.

    Parameters
    ----------
    movie_id : int or str
        Identifier appended to the TMDB ``/3/movie`` endpoint.

    Returns
    -------
    str
        ``https://image.tmdb.org/t/p/w500`` followed by the movie's
        ``poster_path``, or ``''`` when the request fails, the response
        status is not 200, the body is not valid JSON, or the movie has no
        poster path.

    Notes
    -----
    The API key is read from the ``TMDB_API_KEY`` environment variable when
    it is set. NOTE(review): the hardcoded fallback key is kept only so the
    notebook keeps running unchanged; it should be rotated and removed from
    the source.
    """
    import os

    base_api_url = 'https://api.themoviedb.org/3/movie'
    poster_base_url = 'https://image.tmdb.org/t/p/w500'

    params = {
        'api_key': os.environ.get('TMDB_API_KEY', '75c0ce739c7e34b95ae543dea4da50cb'),
    }

    # A timeout keeps one stalled connection from hanging the whole
    # DataFrame.apply; a network error yields '' like any other failed
    # lookup instead of aborting the run.
    try:
        response = requests.get(f'{base_api_url}/{movie_id}', params=params, timeout=10)
    except requests.RequestException:
        return ''

    if response.status_code != 200:
        return ''

    try:
        # poster_path may be None (-> TypeError on concatenation) or absent
        # (-> KeyError); response.json() raises a ValueError subclass on a
        # non-JSON body. The original `except TypeError or KeyError` only
        # caught TypeError.
        return poster_base_url + response.json()['poster_path']
    except (TypeError, KeyError, ValueError):
        return ''

In [13]:
# resolve a poster URL for every movie id (one get_poster_url call per row)
poster_urls = df['id'].apply(get_poster_url)
df['posters'] = poster_urls
df.head()

Unnamed: 0,id,popularity,release_date,runtime,title,vote_average,vote_count,director,content,posters
0,19995,150.437577,2009-12-10,162.0,Avatar,7.2,11800,[James Cameron],action adventure fantasy sciencefiction cultur...,https://image.tmdb.org/t/p/w500/kyeqWdyUXW608q...
1,285,139.082615,2007-05-19,169.0,Pirates of the Caribbean: At World's End,6.9,4500,[Gore Verbinski],adventure fantasy action ocean drugabuse exoti...,https://image.tmdb.org/t/p/w500/jGWpG4YhpQwVmj...
2,206647,107.376788,2015-10-26,148.0,Spectre,6.3,4466,[Sam Mendes],action adventure crime spy basedonnovel secret...,https://image.tmdb.org/t/p/w500/672kUEMtTHcaVY...
3,49026,112.31295,2012-07-16,165.0,The Dark Knight Rises,7.6,9106,[Christopher Nolan],action crime drama thriller dccomic crimefight...,https://image.tmdb.org/t/p/w500/85cWkCVftiVs0B...
4,49529,43.926995,2012-03-07,132.0,John Carter,6.1,2124,[Andrew Stanton],action adventure sciencefiction basedonnovel m...,https://image.tmdb.org/t/p/w500/lCxz1Yus07QCQQ...


In [14]:
# write the preprocessed frame to disk, leaving the index out of the file
preprocessed_dataset_path = './dataset/preProcessed.csv'
df.to_csv(preprocessed_dataset_path, index=False)