In [None]:
#Dependencies
import json
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
#Read the title.akas TSV into a DataFrame.
#on_bad_lines="warn" replaces error_bad_lines=False (deprecated in pandas 1.3,
#removed in 2.0): malformed rows are still skipped and still reported as warnings.
id_df = pd.read_csv(
    "../moviedata/title.akas.tsv.gz",
    compression="gzip",
    sep="\t",
    quotechar='"',
    on_bad_lines="warn",
)

In [None]:
#Filters: keep only the rows whose region is "US" and whose language is "en"
is_us_region = id_df["region"].eq("US")
is_english = id_df["language"].eq("en")
id_filters_df = id_df[is_us_region & is_english]
id_filters_df

In [None]:
#Drop duplicate titleId's (the first row seen for each titleId is the one kept).
#Reassigning instead of using inplace=True avoids mutating a slice of id_df,
#which raises SettingWithCopyWarning, and keeps this cell safe to re-run.
id_filters_df = id_filters_df.drop_duplicates(subset=["titleId"])
id_filters_df

In [None]:
#Get budget and gross data from IMDB and save it to a DataFrame
HEADERS = {"User-Agent": "Chrome/111.0 (PC; Windows 10)"}
REQUEST_TIMEOUT = 30  #seconds to wait on one request before giving up on that title
REQUEST_DELAY = 1     #seconds to pause between consecutive requests

#Errors that mean "this one page could not be scraped": network/HTTP failures,
#a missing or malformed JSON payload (json.JSONDecodeError is a ValueError),
#or a payload that lacks the structure needed to read the title.
SCRAPE_ERRORS = (requests.RequestException, ValueError, KeyError, IndexError, TypeError)


def _lookup(data, path, label):
    """Walk `path` (a sequence of dict keys / list indexes) into `data`.

    Returns the value found at the end of the path. If any step is missing or
    the value at that step cannot be indexed, prints "No <label>" and returns "".
    """
    try:
        for step in path:
            data = data[step]
    except (KeyError, IndexError, TypeError):
        print(f"No {label}")
        return ""
    return data


def _release_date(above_fold):
    """Return the release date as "year-month-day", or "" when it is unavailable.

    Components are used in order and the string stops at the first one that is
    None, so a partially known date yields e.g. "2020" rather than "2020-None-None".
    """
    try:
        date = above_fold["releaseDate"]
        parts = [date["year"], date["month"], date["day"]]
    except (KeyError, TypeError):
        print("No date")
        return ""

    known = []
    for part in parts:
        if part is None:
            break
        known.append(str(part))

    if not known:
        print("No date")
        return ""
    return "-".join(known)


def _scrape_movie(movieid):
    """Fetch the IMDb page for `movieid` and return one record dict.

    Optional fields that are absent from the page are stored as "".
    Raises one of SCRAPE_ERRORS when the page cannot be fetched, has no
    __NEXT_DATA__ JSON payload, or the payload has no title.
    """
    url = f"https://www.imdb.com/title/{movieid}/"
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    money = soup.find("script", id="__NEXT_DATA__")
    if money is None:
        raise ValueError("no __NEXT_DATA__ script tag in page")
    json_object = json.loads(money.text)

    page_props = json_object["props"]["pageProps"]
    above_fold = page_props["aboveTheFoldData"]
    #May be absent; _lookup then reports each field read from it as missing.
    main_column = page_props.get("mainColumnData")

    #The title is mandatory: a page without one is skipped by the caller.
    title = above_fold["titleText"]["text"]

    mtype = _lookup(above_fold, ["titleType", "text"], "type")
    imdbbudget = _lookup(main_column, ["productionBudget", "budget", "amount"], "budget")
    wwgross = _lookup(main_column, ["worldwideGross", "total", "amount"], "gross")
    release = _release_date(above_fold)
    runtime = _lookup(above_fold, ["runtime", "seconds"], "runtime")
    genre = _lookup(above_fold, ["genres", "genres", 0, "text"], "genre")
    noms = _lookup(main_column, ["prestigiousAwardSummary", "nominations"], "noms")
    mscore = _lookup(above_fold, ["metacritic", "metascore", "score"], "mscore")
    imdbRating = _lookup(above_fold, ["ratingsSummary", "aggregateRating"], "rating")
    imdbVotes = _lookup(above_fold, ["ratingsSummary", "voteCount"], "votes")

    return {"Title":title,
            "Type":mtype,
            "Release":release,
            "Runtime":runtime,
            "Genre":genre,
            "Nominations":noms,
            "Metascore":mscore,
            "imdbRating":imdbRating,
            "imdbVotes":imdbVotes,
            "imdbID":movieid,
            "Budget":imdbbudget,
            "Gross":wwgross}


movie_data = []

movieid_list = id_filters_df["titleId"].tolist()

for record, movieid in enumerate(movieid_list, start=1):
    print(f"Movie: {record} of {len(movieid_list)}")

    try:
        movie_data.append(_scrape_movie(movieid))
    except SCRAPE_ERRORS as err:
        #Best effort: report the failure and move on to the next title.
        #KeyboardInterrupt is deliberately not caught so the run can be stopped.
        print(f"Skipping {movieid}: {err!r}")

    #Pause after every request, including failed ones, to stay polite to the server.
    time.sleep(REQUEST_DELAY)

In [None]:
#Export to csv
imdb_scrape = pd.DataFrame(movie_data)
#Create the output folder first so a missing directory cannot fail the export
#after the scrape has already finished.
os.makedirs("output_data", exist_ok=True)
imdb_scrape.to_csv("output_data/imdb_all_movies_data.csv",index=False)