In [2]:
import pandas as pd
import numpy as np
import pickle
import requests
import os
from tqdm import tqdm
import dotenv

pd.set_option('display.max_columns', None)

In [5]:
# Load the list of movie ids to fetch. The file is opened in a `with` block so
# the handle is closed as soon as the pickle has been read.
# NOTE(review): pickle.load can execute arbitrary code; only use it on a file
# you produced yourself.
with open('./data/ids.pkl', 'rb') as ids_file:
    ids = pickle.load(ids_file)

# Read variables from a local .env file into the process environment, then
# fetch the API key from it (raises KeyError if API_KEY is not set).
dotenv.load_dotenv()
api_key = os.environ['API_KEY']


### Plan

Create three tables:
1. movies_metadata -> [id, title, genres, overview, adult, release_year, poster_url]
2. credits         -> [id, cast, director]
3. keywords        -> [id, keywords]


### movies_metadata

In [None]:
def create_movies_single_row(movie_id, data, resp):
    """Append one movie's metadata to the column lists in ``data``.

    The movie is skipped (``data`` is left untouched) when ``resp`` has no
    ``'poster_path'`` entry or that entry is ``None``.

    Args:
        movie_id: Identifier stored in the ``'id'`` column.
        data: Dict of lists with the keys ``'id'``, ``'title'``, ``'genres'``,
            ``'overview'``, ``'adult'``, ``'release_year'`` and
            ``'poster_url'``. It is mutated in place.
        resp: Response dict for the movie (the parsed JSON of the request in
            this notebook).

    Raises:
        KeyError: If a poster is present but another required field is
            missing from ``resp``. No column is modified in that case.
    """
    poster_path = resp.get('poster_path')
    if poster_path is None:
        return

    # Build the complete row before touching `data`: if any field is missing
    # the exception fires here and the column lists keep equal lengths.
    genres = [g['name'] for g in resp['genres']]
    title = resp['title']
    overview = resp['overview']
    adult = resp['adult']
    # The year is the part before the first '-'; an empty or None
    # release_date yields an empty string.
    release_year = (resp['release_date'] or '').split('-')[0]
    poster_url = 'https://image.tmdb.org/t/p/w500' + poster_path

    data['genres'].append(genres)
    data['id'].append(movie_id)
    data['title'].append(title)
    data['overview'].append(overview)
    data['adult'].append(adult)
    data['release_year'].append(release_year)
    data['poster_url'].append(poster_url)


In [None]:
# Column-oriented accumulator: one list per output column, filled row by row
# by create_movies_single_row.
movies_data = {
    'id': [],
    'title': [],
    'genres': [],
    'overview': [],
    'adult': [],
    'release_year': [],
    'poster_url': [],
}

# `movie_id` (not `id`) so the builtin id() is not shadowed for the rest of
# the kernel session.
for movie_id in tqdm(ids):
    # HTTPS so the API key in the query string is not sent in clear text.
    url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    resp = requests.get(
        url,
        params={'api_key': api_key, 'language': 'en-US'},
        # Seconds; without a timeout a single stalled connection would hang
        # the whole crawl indefinitely.
        timeout=10,
    ).json()

    create_movies_single_row(movie_id, movies_data, resp)


100%|██████████| 33588/33588 [57:53<00:00,  9.67it/s]


In [None]:
# Turn the per-column lists into a table and preview its first five rows.
movies = pd.DataFrame(data=movies_data)
movies.head(5)


Unnamed: 0,id,title,genres,overview,adult,release_year,poster_url
0,862,Toy Story,"[Animation, Adventure, Family, Comedy]","Led by Woody, Andy's toys live happily in his ...",False,1995,https://image.tmdb.org/t/p/w500/uXDfjJbdP4ijW5...
1,8844,Jumanji,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,False,1995,https://image.tmdb.org/t/p/w500/v2XHtmVqpERPy0...
2,15602,Grumpier Old Men,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,False,1995,https://image.tmdb.org/t/p/w500/1FSXpj5e8l4KH6...
3,31357,Waiting to Exhale,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",False,1995,https://image.tmdb.org/t/p/w500/kJokIbVDkd6Ywp...
4,11862,Father of the Bride Part II,"[Comedy, Family]",Just when George Banks has recovered from his ...,False,1995,https://image.tmdb.org/t/p/w500/rj4LBtwQ0uGrpB...


In [2]:
%pip install fastparquet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fastparquet
  Downloading fastparquet-2023.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.6.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.6.2 fastparquet-2023.4.0


In [None]:
# Persist the movie table so the long download does not have to be repeated.
movies.to_parquet(path='./movies_data.parquet')


In [3]:
# Reload the saved movie table and keep its ids as an array; the credits
# download below iterates over them.
movies = pd.read_parquet(path='./movies_data.parquet')
movies_ids = movies.loc[:, 'id'].values

### credits

In [16]:
def create_credits_single_row(movie_id, data, resp):
    """Append one movie's director and top-billed cast to ``data``.

    The movie is skipped (``data`` is left untouched) when no usable director
    can be found: ``resp`` has no ``'crew'``, no crew member has the job
    ``'Director'``, or a crew entry lacks the fields needed to rank it.

    Args:
        movie_id: Identifier stored in the ``'id'`` column.
        data: Dict of lists with the keys ``'id'``, ``'cast'`` and
            ``'director'``. It is mutated in place.
        resp: Response dict with ``'crew'`` and ``'cast'`` lists (the parsed
            JSON of the request in this notebook).

    Raises:
        KeyError: If a director was found but ``'cast'`` (or a required field
            of a cast/director entry) is missing. No column is modified in
            that case.
    """
    try:
        # Take the most popular director. Ascending sort, last element wins,
        # so ties resolve the same way as the previous reversed-list lookup.
        directors = [c for c in resp['crew'] if c['job'] == 'Director']
        director = sorted(directors, key=lambda x: x['popularity'])[-1]
    except (KeyError, IndexError, TypeError):
        # Error payload, no director, or popularity values that cannot be
        # compared: skip this movie.
        return

    # Build the whole row before touching `data`, so a failure below cannot
    # leave 'cast'/'director' longer than 'id'.
    director_row = [(director['name'], director['profile_path'])]

    # Take the 5 most popular actors (descending popularity).
    top_cast = sorted(resp['cast'], key=lambda x: x['popularity'])[::-1][:5]
    cast_row = [(c['name'], c['profile_path']) for c in top_cast]

    data['cast'].append(cast_row)
    data['director'].append(director_row)
    data['id'].append(movie_id)


In [17]:
# Column-oriented accumulator filled by create_credits_single_row.
credits_data = {
    'id': [],
    'cast': [],
    'director': [],
}

# `movie_id` (not `id`) so the builtin id() is not shadowed for the rest of
# the kernel session.
for movie_id in tqdm(movies_ids):
    url = f"https://api.themoviedb.org/3/movie/{movie_id}/credits"
    resp = requests.get(
        url,
        params={'api_key': api_key, 'language': 'en-US'},
        # Seconds; without a timeout a single stalled connection would hang
        # the whole crawl indefinitely.
        timeout=10,
    ).json()

    create_credits_single_row(movie_id, credits_data, resp)


100%|██████████| 33089/33089 [1:18:23<00:00,  7.03it/s]


In [18]:
# Turn the per-column lists into a table and preview its first five rows.
credits_df = pd.DataFrame(data=credits_data)
credits_df.head(5)

Unnamed: 0,id,cast,director
0,862,"[(Tom Hanks, /xndWFsBlClOJFRdhSt4NBwiPq2o.jpg)...","[(John Lasseter, /gAVAZZHBa1v3gTcsWcBUwiHcyA0...."
1,8844,"[(Kirsten Dunst, /6RAAxI4oPnDMzXpXWgkkzSgnIAJ....","[(Joe Johnston, /fbGZo6CG9Z9zKFh8D5wHunyu7gJ.j..."
2,15602,"[(Sophia Loren, /1bBtumefzTEmhmglAICkfizUgwy.j...","[(Howard Deutch, /7mVbikJeAf2rJ23fFE2apzPJ7ch...."
3,31357,"[(Angela Bassett, /7Oz53NKdglRzAzI2MKjM3eQXwn....","[(Forest Whitaker, /fugyEeN6sisuYi07HSoxdVHm0C..."
4,11862,"[(Diane Keaton, /tnx7pJqisfAzvXOR5wHQsbnH9XH.j...","[(Charles Shyer, /stcwg4mJKGQo0UBfWcXVQy6fOdP...."


In [22]:
# Persist the credits table so the long download does not have to be repeated.
credits_df.to_parquet(path='./credits_data.parquet')


In [24]:
# Ids of the movies that made it into the credits table; the keywords
# download below is restricted to these.
credits_ids = credits_df.loc[:, 'id'].values
credits_ids

array(['862', '8844', '15602', ..., '111109', '67758', '227506'],
      dtype=object)

### keywords

In [28]:
def create_keywords_single_row(movie_id, data, resp):
    """Append one movie's keyword names to ``data``.

    The movie is skipped (``data`` is left untouched) when ``resp`` has no
    ``'keywords'`` entry or the keyword list is empty, so a payload without
    keywords does not abort the surrounding download loop.

    Args:
        movie_id: Identifier stored in the ``'id'`` column.
        data: Dict of lists with the keys ``'id'`` and ``'keywords'``. It is
            mutated in place.
        resp: Response dict whose ``'keywords'`` value is a list of dicts
            with a ``'name'`` key.
    """
    # `or []` covers both a missing key and an explicit None value.
    keywords = [k['name'] for k in resp.get('keywords') or []]
    if keywords:
        data['keywords'].append(keywords)
        data['id'].append(movie_id)

# Column-oriented accumulator filled by create_keywords_single_row.
keywords_data = {
    'id': [],
    'keywords': [],
}

# `movie_id` (not `id`) so the builtin id() is not shadowed for the rest of
# the kernel session.
for movie_id in tqdm(credits_ids):
    url = f"https://api.themoviedb.org/3/movie/{movie_id}/keywords"
    resp = requests.get(
        url,
        params={'api_key': api_key, 'language': 'en-US'},
        # Seconds; without a timeout a single stalled connection would hang
        # the whole crawl indefinitely.
        timeout=10,
    ).json()

    create_keywords_single_row(movie_id, keywords_data, resp)


100%|██████████| 33059/33059 [1:19:56<00:00,  6.89it/s]


In [29]:
# Turn the per-column lists into a table and preview its first five rows.
keywords_df = pd.DataFrame(data=keywords_data)
keywords_df.head(5)

Unnamed: 0,id,keywords
0,862,"[martial arts, jealousy, friendship, bullying,..."
1,8844,"[giant insect, board game, jungle, disappearan..."
2,15602,"[fishing, halloween, sequel, old man, best fri..."
3,31357,"[based on novel or book, interracial relations..."
4,11862,"[parent child relationship, baby, midlife cris..."


In [30]:
# Persist the keywords table so the long download does not have to be repeated.
keywords_df.to_parquet(path='./keywords_data.parquet')
