In [25]:
# TMDB Open API : TMDB API를 가져오는 파일
import requests
import json

In [26]:
# Read the TMDB credentials (key, token) from the local JSON file.
file_path = "./APIKey/TMDB.json"
with open(file_path, mode='r') as file:
    api = json.loads(file.read())

In [27]:
# Unpack both credentials from the loaded JSON mapping in one step.
key, token = api['key'], api['token']

In [28]:
movies = "616037"
# Fetch the detail record of a single movie.
# The API key goes through `params` so requests URL-encodes it, the timeout keeps
# the cell from hanging on a stalled connection, and raise_for_status() surfaces
# HTTP errors here instead of as a confusing KeyError in a later cell.
url = f"https://api.themoviedb.org/3/movie/{movies}"
req = requests.get(url, params={"api_key": key}, timeout=10)
req.raise_for_status()
api_info = req.json()

In [29]:
# Extract the fields of interest from the movie-detail payload.
# TODO: title (str) -> assign an integer id
# TODO: genres -> dict mapping id (int) to name (str)
# TODO: country -> only the first production country is used; review the others as well
title = api_info['title']
# production_countries can be an empty list (such rows are removed during cleaning below),
# so fall back to None instead of raising IndexError on [0].
country = next((entry['iso_3166_1'] for entry in api_info['production_countries']), None)
overview = api_info['overview']

In [30]:
# Echo the extracted fields, one print call per output line.
for fields in (
    ("title: ", title, ", country: ", country),
    ("overview: ", overview),
    ("genres: ", api_info['genres']),
):
    print(*fields)

title:  Thor: Love and Thunder , country:  US
overview:  After his retirement is interrupted by Gorr the God Butcher, a galactic killer who seeks the extinction of the gods, Thor enlists the help of King Valkyrie, Korg, and ex-girlfriend Jane Foster, who now inexplicably wields Mjolnir as the Mighty Thor. Together they embark upon a harrowing cosmic adventure to uncover the mystery of the God Butcher’s vengeance and stop him before it’s too late.
genres:  [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}]


In [31]:
# Fetch the cast and crew of the movie selected above.
# `params` URL-encodes the API key, the timeout avoids hanging forever, and
# raise_for_status() reports HTTP errors at the request instead of downstream.
url = f"https://api.themoviedb.org/3/movie/{movies}/credits"
req = requests.get(url, params={"api_key": key}, timeout=10)
req.raise_for_status()
api_people = req.json()

In [32]:
# Print every crew entry whose job is 'Director'.
# .get() tolerates entries without a 'job' key, which is the only failure the
# former bare `except: pass` needed to cover; other errors are no longer hidden.
for person in api_people['crew']:
    if person.get('job') == 'Director':
        print(person)

{'adult': False, 'gender': 2, 'id': 55934, 'known_for_department': 'Directing', 'name': 'Taika Waititi', 'original_name': 'Taika Waititi', 'popularity': 104.215, 'profile_path': '/tQeioTj98JxIXldV9yDSUXNt3KY.jpg', 'credit_id': '5d2e0eb4caab6d164099c274', 'department': 'Directing', 'job': 'Director'}


In [33]:
# Fetch one page of the discover listing (movie ids and titles).
# Noted by the original author: total_pages = 34599, total results = 691978.
page = 1
# `params` URL-encodes the query, the timeout avoids hanging forever, and
# raise_for_status() reports HTTP errors at the request instead of downstream.
url = "https://api.themoviedb.org/3/discover/movie"
req = requests.get(url, params={"page": page, "api_key": key}, timeout=10)
req.raise_for_status()
api_movie = req.json()

In [34]:
# List the id and title of every movie on the fetched discover page.
for movie in api_movie['results']:
    print(f"{movie['id']} {movie['title']}")

616037 Thor: Love and Thunder
507086 Jurassic World Dominion
438148 Minions: The Rise of Gru
766507 Prey
361743 Top Gun: Maverick
756999 The Black Phone
453395 Doctor Strange in the Multiverse of Madness
725201 The Gray Man
718789 Lightyear
762975 Purple Hearts
919355 Dragon Knight
759175 The Princess
924482 The Ledge
854467 Indemnity
634649 Spider-Man: No Way Home
961484 Last Seen Alive
614934 Elvis
728366 Borrego
675353 Sonic the Hedgehog 2
810693 Jujutsu Kaisen 0


- 영화 : 제목, 평가, 투표수, 장르, 국가, 개봉일, 감독, 출연, 등급, 상영시간, 줄거리, 이미지 경로
- 머신러닝 : 제목, 평가, 투표수, 장르, 국가, 감독, 출연

In [35]:
import numpy as np
from tqdm import tqdm
# Page numbers 1..500 (inclusive) to request from the discover endpoint.
pages = np.arange(1, 501)

In [36]:
# Crawl the discover listing page by page and collect one row per movie.
# Each movie costs two extra requests (details + credits); the recorded run over
# 500 pages took about 1h17m.


def _tmdb_get(path, **params):
    """GET a TMDB v3 endpoint and return its decoded JSON body.

    Args:
        path: Endpoint path below ``/3``, e.g. ``"/movie/616037"``.
        **params: Extra query-string parameters; ``api_key`` is added automatically.

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within 10 seconds.
    """
    resp = requests.get(
        f"https://api.themoviedb.org/3{path}",
        params={**params, "api_key": key},
        timeout=10,
    )
    # Fail at the request rather than with a KeyError on an error payload.
    resp.raise_for_status()
    return resp.json()


def _last_person(people, field, wanted):
    """Return ``{id: name}`` for the last entry whose ``field`` equals ``wanted``.

    Entries lacking ``field``, ``'id'`` or ``'name'`` are skipped. Returns the
    placeholder ``"-"`` when nothing matches, so a value from a previously
    processed movie can never leak into the current row.
    """
    picked = "-"
    for person in people:
        if person.get(field) == wanted and 'id' in person and 'name' in person:
            picked = {person['id']: person['name']}
    return picked


movie_list = []
for page in tqdm(pages):
    # int() turns the numpy integer into a plain int before it is URL-encoded.
    api_movie = _tmdb_get("/discover/movie", page=int(page))
    for movie in api_movie['results']:
        movie_id = movie['id']
        api_info = _tmdb_get(f"/movie/{movie_id}")
        api_people = _tmdb_get(f"/movie/{movie_id}/credits")

        title = {movie_id: api_info['title']}
        # poster_path may be missing or null; store "" in that case.
        poster = api_info.get('poster_path')
        poster_path = ("https://image.tmdb.org/t/p/original" + poster) if poster else ""
        release_date = api_info['release_date']
        budget = api_info['budget']
        genres = api_info["genres"]
        country = api_info['production_countries']
        vote_average = api_info['vote_average']
        vote_count = api_info['vote_count']
        runtime = api_info['runtime']
        overview = api_info['overview']

        # NOTE(review): order == 1 is kept from the original code. In the recorded
        # output this picks e.g. Christian Bale for "Thor: Love and Thunder", so it
        # looks like the second-billed actor - confirm whether order == 0 was intended.
        main_charactor = _last_person(api_people['cast'], 'order', 1)
        director = _last_person(api_people['crew'], 'job', 'Director')

        movie_list.append([title, poster_path, release_date, budget, genres, country,
                           main_charactor, director, runtime, overview, vote_average, vote_count])


100%|██████████| 500/500 [1:17:16<00:00,  9.27s/it]


In [37]:
import pandas as pd
# Persist the crawl results. Columns are named in the same order the rows are
# built, so the CSV header is self-describing instead of the positional "0..11".
# Readers that skip the header row and supply their own names keep working.
df = pd.DataFrame(
    movie_list,
    columns=[
        "title", "poster_path", "release_date", "budget", "genres", "country",
        "main_charactor", "director", "runtime", "overview", "vote_average", "vote_count",
    ],
)
df.to_csv("data/movie_data_original.csv", index=False)

In [38]:
# Reload the crawl results from disk: the header row of the file is skipped and
# replaced by the explicit column names below.
movie_df = pd.read_csv(
    "data/movie_data_original.csv",
    skiprows=1,
    names=[
        "title", "poster_path", "release_date", "budget", "genres", "country",
        "main_charactor", "director", "runtime", "overview", "vote_average", "vote_count",
    ],
)
print("total movie_data_original: ", movie_df.shape[0])
movie_df.head()

total movie_data_original:  10000


Unnamed: 0,title,poster_path,release_date,budget,genres,country,main_charactor,director,runtime,overview,vote_average,vote_count
0,{616037: 'Thor: Love and Thunder'},https://image.tmdb.org/t/p/original/pIkRyD18kl...,2022-07-06,250000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'iso_3166_1': 'US', 'name': 'United States o...",{3894: 'Christian Bale'},{55934: 'Taika Waititi'},119,After his retirement is interrupted by Gorr th...,6.8,1795
1,{507086: 'Jurassic World Dominion'},https://image.tmdb.org/t/p/original/kAVRgw7GgK...,2022-06-01,165000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...","[{'iso_3166_1': 'US', 'name': 'United States o...",{18997: 'Bryce Dallas Howard'},{930707: 'Colin Trevorrow'},147,"Four years after Isla Nublar was destroyed, di...",7.109,2674
2,{438148: 'Minions: The Rise of Gru'},https://image.tmdb.org/t/p/original/wKiOkZTN9l...,2022-06-29,85000000,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...","[{'iso_3166_1': 'US', 'name': 'United States o...",{124747: 'Pierre Coffin'},{8023: 'Kyle Balda'},87,A fanboy of a supervillain supergroup known as...,7.808,1237
3,{766507: 'Prey'},https://image.tmdb.org/t/p/original/ujr5pztc1o...,2022-08-02,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...","[{'iso_3166_1': 'US', 'name': 'United States o...",{3233354: 'Dakota Beavers'},{568322: 'Dan Trachtenberg'},100,"When danger threatens her camp, the fierce and...",8.235,1874
4,{361743: 'Top Gun: Maverick'},https://image.tmdb.org/t/p/original/62HCnUTziy...,2022-05-24,170000000,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...","[{'iso_3166_1': 'US', 'name': 'United States o...",{996701: 'Miles Teller'},{86270: 'Joseph Kosinski'},131,After more than thirty years of service as one...,8.34,1870


In [39]:
# Remove rows whose director is the '-' placeholder
# (121 rows in the recorded output: 10000 -> 9879).
NoDirector = movie_df.loc[movie_df['director'].eq('-')]
movie_df1 = movie_df.drop(index=NoDirector.index)
movie_df1.shape[0]

9879

In [40]:
# Remove rows with a budget of 0
# (4,788 rows in the recorded output: 9879 -> 5091).
NoBudget = movie_df1.loc[movie_df1['budget'].eq(0)]
movie_df2 = movie_df1.loc[~movie_df1.index.isin(NoBudget.index)]
movie_df2.shape[0]

5091

In [41]:
# Remove rows without genres: the column holds text, and a value of length 2 is
# presumably the serialized empty list "[]" (1 row in the recorded output: 5091 -> 5090).
movie_df2 = movie_df2.loc[movie_df2['genres'].map(len) != 2]
movie_df2.shape[0]

5090

In [42]:
# Remove rows without a production country: a value of length 2 is presumably the
# serialized empty list "[]" (10 rows in the recorded output: 5090 -> 5080).
movie_df2 = movie_df2.drop(index=movie_df2.index[movie_df2['country'].map(len).eq(2)])
movie_df2.shape[0]

5080

In [46]:
# Keep only the columns used for machine learning:
# title, budget, genres, country, main actor, director, rating, vote count.
movie_df2 = movie_df2.loc[
    :,
    [
        'title',
        'budget',
        'genres',
        'country',
        'main_charactor',
        'director',
        'vote_average',
        'vote_count',
    ],
]

In [47]:
# Write the cleaned dataset to the working directory, without the index column.
movie_df2.to_csv(path_or_buf="movie_data_final.csv", index=False)