In [1]:
import pandas as pd
import numpy as np
from tmdbv3api import TMDb
from tmdbv3api import Movie
from dotenv import load_dotenv
import os
import requests
from datetime import datetime

In [2]:
# Load variables from a local .env file into the process environment so the
# later os.getenv('API_KEY') lookups can find the TMDb key.
load_dotenv()

True

In [3]:
# Configure the TMDb client object from tmdbv3api.
tmdb = TMDb()
# The key is read from the environment; os.getenv returns None when API_KEY
# is not set, so a missing key is not reported here.
tmdb.api_key = os.getenv('API_KEY')
tmdb.language = 'en'
# NOTE(review): debug mode is left switched on — confirm this is intended
# outside of development.
tmdb.debug = True

In [4]:
# Movie API wrapper; used further down for title searches (movie.search).
movie = Movie()

In [5]:
# Scrape every HTML table on the Wikipedia page; pd.read_html returns a list
# of DataFrames (one per table), which is indexed by position below.
# Requires network access, and the table positions depend on the live page.
data = pd.read_html('https://en.wikipedia.org/wiki/List_of_American_films_of_2019')

In [9]:
# Accumulator for the movie titles extracted from the scraped HTML tables.
names = []

In [10]:
# Tables 2-5 of the scraped page are the ones whose 'Title' column is read
# (presumably one table per quarter of 2019 — verify against the page layout).
# The table index gets its own name so it is not shadowed by the titles being
# iterated, and list.extend replaces the element-by-element append loop.
for table_idx in range(2, 6):
    names.extend(data[table_idx]['Title'])


In [12]:
# (title, tmdb_id) pairs for titles that were matched on TMDb.
movie_id = []
# Titles for which no matching TMDb entry was found.
movie_not_found = []

In [13]:
# Getting the TMDb id of each movie from its name.
# A search can return several films with the same title, so the first result
# whose release date falls inside 2019 is taken as the match.
date_format = "%Y-%m-%d"
start_date = datetime.strptime("2019-01-01", date_format)
end_date = datetime.strptime("2019-12-31", date_format)
for name in names:
    search = movie.search(name)
    for res in search:
        if res['release_date']:
            # Parse the date once and reuse it for both bounds.
            release_date = datetime.strptime(res['release_date'], date_format)
            if start_date <= release_date <= end_date:
                movie_id.append((name, res['id']))
                break
        else:
            # This search result has an empty release date.
            print("Does not exist")
    else:
        # for/else: reached only when no result triggered the `break`, i.e.
        # no 2019 release was found for this title. The title is only recorded
        # here; the actual removal from `names` happens in a later cell.
        print("Removed " + name + " from the names list")
        movie_not_found.append(name)

Removed Replicas from the names list
Does not exist
Does not exist
Does not exist
Removed Piercing from the names list
Removed Dragged Across Concrete from the names list
Removed Unicorn Store from the names list
Removed Under the Silver Lake from the names list
Does not exist
Does not exist
Does not exist
Removed Family from the names list
Does not exist
Removed Buffaloed from the names list
Removed The Current War from the names list


In [17]:
# Drop every title that had no 2019 match on TMDb.
# Filtering against a set (instead of calling names.remove once per title)
# keeps this cell safe to re-run: list.remove raises ValueError when the title
# has already been removed. Slice assignment mutates the existing list object,
# as the original loop did.
not_found = set(movie_not_found)
names[:] = [title for title in names if title not in not_found]

In [20]:
# Number of titles left after the unmatched ones were removed.
len(names)

238

In [22]:
# Accumulator for one detail record (dict) per matched movie.
details = []

In [23]:
def get_genres(obj):
    """Return the ``'name'`` value of every entry in ``obj``, in order.

    ``obj`` is an iterable of mappings that each carry a ``'name'`` key; in
    this notebook it is the ``'genres'`` field of a movie-details response.
    """
    return [genre['name'] for genre in obj]

In [24]:
# Fetch title, overview and genres for every matched movie from the TMDb
# movie-details endpoint and collect one record per movie in `details`.
api_key = os.getenv("API_KEY")  # read once instead of on every iteration
for _, tmdb_id in movie_id:
    response = requests.get(
        f'https://api.themoviedb.org/3/movie/{tmdb_id}',
        params={'api_key': api_key},
        timeout=10,  # requests never times out by default; don't hang the notebook
    )
    # Fail with the real HTTP error (bad key, unknown id, rate limit) rather
    # than a KeyError on 'title' a few lines later.
    response.raise_for_status()
    data_json = response.json()
    details.append({
        'name': data_json['title'],
        'id': tmdb_id,
        'overview': data_json['overview'],
        'genres': get_genres(data_json['genres']),
    })


In [26]:
# Build the frame in one go from the list of per-movie dicts.
df = pd.DataFrame(details)

In [27]:
# Preview the first rows of the freshly built frame.
df.head()

Unnamed: 0,name,id,overview,genres
0,Escape Room,522681,Six strangers find themselves in circumstances...,"[Horror, Thriller, Mystery]"
1,Rust Creek,561362,When an overachieving college senior makes a w...,"[Thriller, Drama, Action, Crime]"
2,American Hangman,567738,An unidentified man posts a live feed on socia...,[Thriller]
3,A Dog's Way Home,508763,"The adventure of Bella, a dog who embarks on a...","[Drama, Adventure, Family]"
4,The Upside,440472,Phillip is a wealthy quadriplegic who needs a ...,"[Comedy, Drama]"


In [28]:
def get_actors(id, limit=5):
    """Return the names of the first ``limit`` cast members of a TMDb movie.

    Parameters
    ----------
    id : int
        TMDb movie id. (The name shadows the builtin ``id``; it is kept so
        existing callers keep working.)
    limit : int, default 5
        Maximum number of names to return; expected to be non-negative.
        Fewer names are returned when the cast list is shorter.

    Returns
    -------
    list of str
        Cast member names in the order the credits endpoint lists them.

    Raises
    ------
    requests.HTTPError
        If the credits request returns an error status.
    """
    response = requests.get(
        f'https://api.themoviedb.org/3/movie/{id}/credits',
        params={'api_key': os.getenv("API_KEY")},
        timeout=10,  # requests never times out by default
    )
    response.raise_for_status()
    cast = response.json()['cast']
    # Slicing clamps to the list length, so no explicit length check is needed.
    return [member['name'] for member in cast[:limit]]

In [29]:
def get_directors(id):
    """Return the first credited director of a TMDb movie as a one-item list.

    Only the first crew entry whose ``'job'`` is ``'Director'`` is returned,
    so co-directed films yield a single name. An empty list is returned when
    no such entry exists.

    Parameters
    ----------
    id : int
        TMDb movie id. (The name shadows the builtin ``id``; it is kept so
        existing callers keep working.)

    Returns
    -------
    list of str
        ``[name]`` of the first director found, or ``[]``.

    Raises
    ------
    requests.HTTPError
        If the credits request returns an error status.
    """
    response = requests.get(
        f'https://api.themoviedb.org/3/movie/{id}/credits',
        params={'api_key': os.getenv("API_KEY")},
        timeout=10,  # requests never times out by default
    )
    response.raise_for_status()
    for member in response.json()['crew']:
        if member['job'] == 'Director':
            return [member['name']]
    return []

In [30]:
# One credits request per row: look up the cast names for every movie id.
df['actors'] = df['id'].apply(get_actors)

In [31]:
# Preview the frame with the new 'actors' column.
df.head()

Unnamed: 0,name,id,overview,genres,actors
0,Escape Room,522681,Six strangers find themselves in circumstances...,"[Horror, Thriller, Mystery]","[Taylor Russell, Logan Miller, Jay Ellis, Debo..."
1,Rust Creek,561362,When an overachieving college senior makes a w...,"[Thriller, Drama, Action, Crime]","[Hermione Corfield, Jay Paulson, Sean O'Bryan,..."
2,American Hangman,567738,An unidentified man posts a live feed on socia...,[Thriller],"[Donald Sutherland, Vincent Kartheiser, Oliver..."
3,A Dog's Way Home,508763,"The adventure of Bella, a dog who embarks on a...","[Drama, Adventure, Family]","[Ashley Judd, Jonah Hauer-King, Edward James O..."
4,The Upside,440472,Phillip is a wealthy quadriplegic who needs a ...,"[Comedy, Drama]","[Kevin Hart, Bryan Cranston, Nicole Kidman, Go..."


In [32]:
# One credits request per row: look up the director for every movie id.
df['director'] = df['id'].apply(get_directors)

In [33]:
# Preview the frame with the new 'director' column.
df.head()

Unnamed: 0,name,id,overview,genres,actors,director
0,Escape Room,522681,Six strangers find themselves in circumstances...,"[Horror, Thriller, Mystery]","[Taylor Russell, Logan Miller, Jay Ellis, Debo...",[Adam Robitel]
1,Rust Creek,561362,When an overachieving college senior makes a w...,"[Thriller, Drama, Action, Crime]","[Hermione Corfield, Jay Paulson, Sean O'Bryan,...",[Jen McGowan]
2,American Hangman,567738,An unidentified man posts a live feed on socia...,[Thriller],"[Donald Sutherland, Vincent Kartheiser, Oliver...",[Wilson Coneybeare]
3,A Dog's Way Home,508763,"The adventure of Bella, a dog who embarks on a...","[Drama, Adventure, Family]","[Ashley Judd, Jonah Hauer-King, Edward James O...",[Charles Martin Smith]
4,The Upside,440472,Phillip is a wealthy quadriplegic who needs a ...,"[Comedy, Drama]","[Kevin Hart, Bryan Cranston, Nicole Kidman, Go...",[Neil Burger]


In [34]:
def remove_space(obj):
    """Return a new list with every space character removed from each string in ``obj``.

    Only the plain space ``" "`` is removed; other whitespace is left alone.
    """
    return [text.replace(" ", "") for text in obj]

In [35]:
# Build the free-text 'tags' column used downstream.
# Person names are collapsed into single tokens (spaces removed) and the
# overview is split into a word list, so that all four columns hold lists and
# can be concatenated row by row.
# NOTE: this cell rewrites 'overview', 'actors' and 'director' in place, so it
# cannot be re-run on its own (the overview lists have no .split()).
for column in ('actors', 'director'):
    df[column] = df[column].apply(remove_space)
df['overview'] = df['overview'].map(lambda text: text.split())
tag_lists = df['overview'] + df['genres'] + df['actors'] + df['director']
df['tags'] = tag_lists.map(" ".join)

In [36]:
# Preview the frame with the tokenised columns and the new 'tags' column.
df.head()

Unnamed: 0,name,id,overview,genres,actors,director,tags
0,Escape Room,522681,"[Six, strangers, find, themselves, in, circums...","[Horror, Thriller, Mystery]","[TaylorRussell, LoganMiller, JayEllis, Deborah...",[AdamRobitel],Six strangers find themselves in circumstances...
1,Rust Creek,561362,"[When, an, overachieving, college, senior, mak...","[Thriller, Drama, Action, Crime]","[HermioneCorfield, JayPaulson, SeanO'Bryan, Mi...",[JenMcGowan],When an overachieving college senior makes a w...
2,American Hangman,567738,"[An, unidentified, man, posts, a, live, feed, ...",[Thriller],"[DonaldSutherland, VincentKartheiser, OliverDe...",[WilsonConeybeare],An unidentified man posts a live feed on socia...
3,A Dog's Way Home,508763,"[The, adventure, of, Bella,, a, dog, who, emba...","[Drama, Adventure, Family]","[AshleyJudd, JonahHauer-King, EdwardJamesOlmos...",[CharlesMartinSmith],"The adventure of Bella, a dog who embarks on a..."
4,The Upside,440472,"[Phillip, is, a, wealthy, quadriplegic, who, n...","[Comedy, Drama]","[KevinHart, BryanCranston, NicoleKidman, Golsh...",[NeilBurger],Phillip is a wealthy quadriplegic who needs a ...


In [37]:
# Keep only the columns that are written to the output file.
new_df = df[['id', 'name', 'tags']]

In [38]:
# Rename to the output schema (movie_id, title, tags).
# NOTE: this rebinds `df`, so the full frame built above is no longer
# reachable under that name after this cell runs.
df = new_df.rename(columns = {'id' : 'movie_id', 'name' : 'title'})

In [40]:
# Write the result to the working directory without the row-index column.
df.to_csv('data_2019.csv', index = False)