In [1]:
import pandas as pd
import numpy as np
from tmdbv3api import TMDb
from tmdbv3api import Movie
from dotenv import load_dotenv
import os
import requests
from datetime import datetime

In [2]:
# Load variables from a local .env file into the environment.
# API_KEY (read via os.getenv in later cells) is presumably defined there — confirm.
load_dotenv()

True

In [3]:
# Configure the shared tmdbv3api client.
tmdb = TMDb()
# The key is read from the environment so it never appears in the notebook;
# os.getenv returns None when API_KEY is not set.
tmdb.api_key = os.getenv('API_KEY')
tmdb.language = 'en'
# NOTE(review): debug mode is left enabled — confirm it is still wanted for a full run.
tmdb.debug = True

In [4]:
# tmdbv3api Movie wrapper; its search() method is used below to look up each title.
movie = Movie()

In [5]:
# pd.read_html returns a list of DataFrames, one per HTML table found on the page.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_American_films_of_2020'
data = pd.read_html(wiki_url)

In [8]:
# Film titles collected from the 'Title' column of the scraped tables.
names = []

### Extracting the names of the movies released in 2020

In [9]:
# Read the 'Title' column of the tables at positions 2-5 of the scraped page.
# NOTE(review): these positions depend on the Wikipedia page layout — confirm before re-running.
for table_index in range(2, 6):
    # A dedicated index name keeps the table position and the titles from
    # sharing one loop variable.
    names.extend(data[table_index]['Title'])

In [11]:
# (name, TMDB id) pairs appended by the search loop below when a 2020 release is found.
movie_id = []
# Titles recorded by the search loop below as having no 2020 match.
movie_not_found = []

- The TMDB search API returns every movie whose title matches a name from our `names` list, whatever its release year; we only want the ones released in 2020.

In [15]:
# For every scraped title, keep the first TMDB search hit released in 2020.
# Matches go to movie_id as (name, id); everything else goes to movie_not_found.
date_format = "%Y-%m-%d"
start_date = datetime.strptime("2020-01-01", date_format)
end_date = datetime.strptime("2020-12-31", date_format)

for name in names:
    matched_id = None

    for res in movie.search(name):
        # Iterating the search response can yield plain strings instead of
        # movie records; there is nothing to inspect, so stop looking.
        # The title still ends up in movie_not_found below instead of being
        # silently dropped from both lists.
        if isinstance(res, str):
            print("Movie data does not exist")
            break

        release_date = res['release_date']
        if not release_date:
            print("Does not exist")
            continue

        # Parse once; a malformed date on one result must not abort the whole run.
        try:
            released = datetime.strptime(release_date, date_format)
        except ValueError:
            continue

        if start_date <= released <= end_date:
            matched_id = res['id']
            break

    if matched_id is None:
        movie_not_found.append(name)
    else:
        movie_id.append((name, matched_id))

   

page
Does not exist
page
Does not exist


In [18]:
# One dict per movie (name, id, overview, genres); converted to a DataFrame later.
details = []

In [19]:
def get_genres(obj):
    """Return the ``name`` value of every genre entry in ``obj``, in order."""
    return [genre['name'] for genre in obj]

### Extracting the essential details from each API response and storing them in a list that is then converted to a DataFrame

In [20]:
# Fetch the TMDB details of every matched movie and keep the fields used later.
# The key is read once instead of on every iteration.
api_key = os.getenv("API_KEY")

for _, tmdb_id in movie_id:
    # The timeout stops a stalled connection from hanging the loop forever;
    # the key is passed via params so it is URL-encoded by requests.
    response = requests.get(
        f'https://api.themoviedb.org/3/movie/{tmdb_id}',
        params={'api_key': api_key},
        timeout=10,
    )

    # An error payload has no 'title'/'overview'/'genres' keys, so skip it
    # instead of failing with a KeyError part-way through the run.
    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        continue

    data_json = response.json()
    details.append({
        'name': data_json['title'],
        'id': tmdb_id,
        'overview': data_json['overview'],
        'genres': get_genres(data_json['genres']),
    })

In [22]:
# Build the working table: one row per dict in details.
df = pd.DataFrame(details)

In [23]:
df.head()

Unnamed: 0,name,id,overview,genres
0,Underwater,443791,After an earthquake destroys their underwater ...,"[Horror, Science Fiction, Action, Adventure]"
1,Like a Boss,526019,Two female friends with very different ideals ...,[Comedy]
2,The Murder of Nicole Brown Simpson,527534,In 1994 Nicole Brown Simpson and her friend Ro...,"[Thriller, Drama, Crime]"
3,Angels Fallen,662844,After the tragic loss of his wife battling the...,"[Horror, Fantasy, Action]"
4,Bad Boys for Life,38700,Marcus and Mike are forced to confront new thr...,"[Thriller, Action, Crime]"


In [24]:
def get_actors(id, limit=5):
    """Return the names of the first ``limit`` cast members TMDB lists for a movie.

    Parameters
    ----------
    id : TMDB movie id, interpolated into the ``/movie/{id}/credits`` URL.
        The name shadows the builtin but is kept so existing callers still work.
    limit : maximum number of names to return (default 5, the previous fixed cap).

    Returns
    -------
    list of str. Empty when the request does not return status 200 or the
    payload carries no ``cast`` entries.
    """
    url = f'https://api.themoviedb.org/3/movie/{id}/credits?api_key={os.getenv("API_KEY")}'
    # The timeout stops one stalled request from blocking the whole DataFrame.apply.
    response = requests.get(url, timeout=10)

    # Same failure handling as get_directors: report and return nothing.
    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return []

    cast = response.json().get('cast') or []
    # Slicing already copes with casts shorter than ``limit``.
    return [member['name'] for member in cast[:limit]]

In [31]:

def get_directors(movie_id):
    """Fetch a movie's TMDB credits and return the names of crew whose job is 'Director'.

    Prints a message and returns an empty list when the response status is
    not 200, or when reading the payload raises.
    """
    credits_url = f'https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={os.getenv("API_KEY")}'
    response = requests.get(credits_url)

    # Guard clause: bail out early on any non-200 reply.
    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return []

    try:
        directors = []
        for member in response.json()['crew']:
            if member['job'] == 'Director':
                directors.append(member['name'])
        return directors
    except Exception as err:
        print(f"Error: {err}")
        return []


In [27]:
# get_actors issues one TMDB credits request per row, so this cell is network-bound.
df['actors'] = df['id'].apply(get_actors)

In [28]:
df.head()

Unnamed: 0,name,id,overview,genres,actors
0,Underwater,443791,After an earthquake destroys their underwater ...,"[Horror, Science Fiction, Action, Adventure]","[Kristen Stewart, Vincent Cassel, Mamoudou Ath..."
1,Like a Boss,526019,Two female friends with very different ideals ...,[Comedy],"[Tiffany Haddish, Rose Byrne, Salma Hayek Pina..."
2,The Murder of Nicole Brown Simpson,527534,In 1994 Nicole Brown Simpson and her friend Ro...,"[Thriller, Drama, Crime]","[Mena Suvari, Nick Stahl, Taryn Manning, Agnes..."
3,Angels Fallen,662844,After the tragic loss of his wife battling the...,"[Horror, Fantasy, Action]","[Houston Rhines, Michael Teh, Nicola Posener, ..."
4,Bad Boys for Life,38700,Marcus and Mike are forced to confront new thr...,"[Thriller, Action, Crime]","[Will Smith, Martin Lawrence, Paola Nuñez, Van..."


In [32]:
# get_directors requests the same /credits URL that get_actors uses,
# so each movie's credits are fetched a second time here.
df['director'] = df['id'].apply(get_directors)

In [36]:
def remove_space(obj):
    """Return a new list with every space character removed from each string in ``obj``."""
    return [text.replace(" ", "") for text in obj]

#### Creating a single token for every name, e.g. "Robert Downey Jr" -> "RobertDowneyJr"

In [37]:
# Collapse multi-word names so that each person becomes a single token.
df['actors'] = df['actors'].apply(remove_space)
df['director'] = df['director'].apply(remove_space)

# Split the overview only while it is still a string, so re-running this cell
# does not fail on overviews that were already turned into word lists.
df['overview'] = df['overview'].apply(lambda x: x.split() if isinstance(x, str) else x)

# NOTE(review): genres are not passed through remove_space, so "Science Fiction"
# contributes two separate tokens — confirm this is intended.
# Each column holds a list per row; adding them concatenates the lists row-wise.
df['tags'] = (df['overview'] + df['genres'] + df['actors'] + df['director']).apply(" ".join)

In [38]:
df.head()

Unnamed: 0,name,id,overview,genres,actors,director,tags
0,Underwater,443791,"[After, an, earthquake, destroys, their, under...","[Horror, Science Fiction, Action, Adventure]","[KristenStewart, VincentCassel, MamoudouAthie,...",[WilliamEubank],After an earthquake destroys their underwater ...
1,Like a Boss,526019,"[Two, female, friends, with, very, different, ...",[Comedy],"[TiffanyHaddish, RoseByrne, SalmaHayekPinault,...",[MiguelArteta],Two female friends with very different ideals ...
2,The Murder of Nicole Brown Simpson,527534,"[In, 1994, Nicole, Brown, Simpson, and, her, f...","[Thriller, Drama, Crime]","[MenaSuvari, NickStahl, TarynManning, AgnesBru...",[DanielFarrands],In 1994 Nicole Brown Simpson and her friend Ro...
3,Angels Fallen,662844,"[After, the, tragic, loss, of, his, wife, batt...","[Horror, Fantasy, Action]","[HoustonRhines, MichaelTeh, NicolaPosener, LiJ...",[AliZamani],After the tragic loss of his wife battling the...
4,Bad Boys for Life,38700,"[Marcus, and, Mike, are, forced, to, confront,...","[Thriller, Action, Crime]","[WillSmith, MartinLawrence, PaolaNuñez, Vaness...","[AdilElArbi, BilallFallah]",Marcus and Mike are forced to confront new thr...


In [40]:
# Keep only the identifier, title and combined tags columns.
new_df = df[['id', 'name', 'tags']]

In [41]:
new_df

Unnamed: 0,id,name,tags
0,443791,Underwater,After an earthquake destroys their underwater ...
1,526019,Like a Boss,Two female friends with very different ideals ...
2,527534,The Murder of Nicole Brown Simpson,In 1994 Nicole Brown Simpson and her friend Ro...
3,662844,Angels Fallen,After the tragic loss of his wife battling the...
4,38700,Bad Boys for Life,Marcus and Mike are forced to confront new thr...
...,...,...,...
349,615677,We Can Be Heroes,When alien invaders capture Earth's superheroe...
350,581032,News of the World,A Texan traveling across the wild West bringin...
351,661914,One Night in Miami...,In the aftermath of Cassius Clay's defeat of S...
352,582014,Promising Young Woman,"A young woman, traumatized by a tragic event i..."


In [42]:
# Rebinds df to the trimmed, renamed frame. After this, df no longer has the
# 'id'/'name' columns that the new_df selection cell reads, so that cell cannot
# be re-run without rebuilding df first.
df = new_df.rename(columns = {'id' : 'movie_id', 'name' : 'title'})

In [43]:
df.head()

Unnamed: 0,movie_id,title,tags
0,443791,Underwater,After an earthquake destroys their underwater ...
1,526019,Like a Boss,Two female friends with very different ideals ...
2,527534,The Murder of Nicole Brown Simpson,In 1994 Nicole Brown Simpson and her friend Ro...
3,662844,Angels Fallen,After the tragic loss of his wife battling the...
4,38700,Bad Boys for Life,Marcus and Mike are forced to confront new thr...


In [44]:
# Write the final table to CSV; index=False leaves the row index out of the file.
df.to_csv('data_2020.csv', index = False)