In [2]:
import re
import json
import requests
from bs4 import BeautifulSoup
file_name = 'sams.txt'
data = {}

def extract_budget(string):
    """Return the integer formed by concatenating the digit groups in ``string``.

    Only digit runs delimited by word boundaries are picked up, so
    ``"$1,500,000"`` yields ``1500000`` while digits glued to letters
    (``"GBP100"``) are ignored.

    Returns:
        The parsed integer, or ``None`` when the digits cannot be converted
        (for example when no digit group is present).
    """
    digit_groups = re.findall(r'\b\d+\b', string)
    joined = "".join(digit_groups)
    try:
        return int(joined)
    except ValueError:
        return None

def extract_numbers(string):
    """Glue every word-delimited run of digits in ``string`` into one integer.

    Separators such as commas or spaces between the digit runs are dropped,
    so ``"$46,623"`` becomes ``46623``.

    Returns:
        The resulting integer, or ``None`` if the conversion fails (for
        example when ``string`` holds no standalone digit run).
    """
    pieces = []
    for match in re.finditer(r'\b\d+\b', string):
        pieces.append(match.group(0))
    try:
        number = int("".join(pieces))
    except ValueError:
        number = None
    return number

# Multipliers that turn an amount in the named currency into US dollars.
# Rates are a fixed snapshot taken on 8/1/2020.
usd_from_AUD = 0.71   # Australian dollar
usd_from_DKK = 0.16   # Danish krone
usd_from_EUR = 1.18   # Euro
usd_from_GBP = 1.31   # Pound sterling
usd_from_MXN = 0.045  # Mexican peso
usd_from_NOK = 0.11   # Norwegian krone

def find(soup, q_num):
    """Pull rating, budget and gross figures out of a parsed title page.

    The rating is taken from the second text line of each ``div.subtext``
    element (spaces removed). Budget, domestic gross and worldwide gross are
    read from the parent text of ``h4`` headings starting with ``'Budget'``,
    ``'Gross USA'`` and ``'Cumulative Worldwide'``. Budgets tagged with one of
    the supported currency codes are converted to USD with the module-level
    ``usd_from_*`` rates and truncated to an integer.

    If at least one of the four values is truthy, they are stored in the
    module-level ``data`` dict under ``q_num``; otherwise nothing is recorded.

    Args:
        soup: Parsed page exposing ``findAll`` (a BeautifulSoup document).
        q_num: Key under which the result is stored in ``data``.

    Returns:
        None. The result is a side effect on ``data``.
    """
    budget = None
    gross = None
    ww_gross = None
    rating = None

    # Built per call so the current values of the module-level rates are used.
    # Order matters: it is the priority in which currency codes are matched.
    currency_rates = (
        ('GBP', usd_from_GBP),
        ('NOK', usd_from_NOK),
        ('DKK', usd_from_DKK),
        ('AUD', usd_from_AUD),
        ('EUR', usd_from_EUR),
        ('MXN', usd_from_MXN),
    )

    # Find the rating
    for div in soup.findAll('div', class_='subtext'):
        lines = div.text.split('\n')
        # Skip blocks without a second line instead of raising IndexError.
        if len(lines) > 1:
            rating = lines[1].replace(' ', '')

    # Find the budget, gross, ww_gross in page
    for h4 in soup.findAll('h4'):
        label = h4.text
        if label.startswith('Budget'):
            text = h4.parent.text
            # The amount is expected in the first space-delimited token.
            amount_token = text.split(' ')[0]
            for code, rate in currency_rates:
                if code in text:
                    # The code must be removed first: digits glued to letters
                    # are not matched by extract_budget's word-boundary regex.
                    amount = extract_budget(amount_token.replace(code, ''))
                    if amount is None:
                        # No digits found; report it rather than crash on None * rate.
                        print('failed', text)
                        budget = None
                    else:
                        budget = int(amount * rate)
                    break
            else:
                if '$' in text:
                    budget = extract_budget(amount_token)
                else:
                    print('failed', text)

        elif label.startswith('Gross USA'):
            # Get the domestic gross (third space-delimited token)
            tokens = h4.parent.text.split(' ')
            if len(tokens) > 2:
                gross = extract_numbers(tokens[2])

        elif label.startswith('Cumulative Worldwide'):
            # Get the Worldwide gross (fourth space-delimited token)
            tokens = h4.parent.text.split(' ')
            if len(tokens) > 3:
                ww_gross = extract_numbers(tokens[3])

    if budget or gross or ww_gross or rating:
        # Update main dictionary with data
        data[q_num] = {
            'budget': budget,
            'gross': gross,
            'ww_gross': ww_gross,
            'rating': rating
        }

url = "https://www.imdb.com/title/"
def get_soup(q_num, timeout=30):
    """Download the page for a title ID and return it parsed.

    Args:
        q_num: Title identifier appended to the module-level ``url``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled connection.

    Returns:
        A ``BeautifulSoup`` document built with the ``'html.parser'`` backend.

    Raises:
        requests.RequestException: On connection errors or when the timeout
            expires.
    """
    req = requests.get(url + q_num, timeout=timeout)
    # Replace undecodable bytes rather than abort the whole scrape on one bad page.
    html = req.content.decode('utf-8', errors='replace')
    return BeautifulSoup(html, 'html.parser')

def save(data):
    """Write ``data`` to ``data.json`` as JSON, replacing any existing file.

    The payload is serialised in memory first and written to a temporary
    sibling file that is then moved over ``data.json``. A failure part-way
    through (unserialisable value, interrupted kernel) therefore leaves the
    previous checkpoint intact instead of a truncated file.

    Args:
        data: JSON-serialisable object to persist.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialise.
    """
    import os  # local import: os is not imported by this cell

    payload = json.dumps(data)
    tmp_path = 'data.json.tmp'
    with open(tmp_path, 'w') as f:
        f.write(payload)
    # os.replace overwrites the destination in a single step.
    os.replace(tmp_path, 'data.json')

def main():
    """Scrape every title ID listed in ``file_name`` and checkpoint the results.

    The file is read as one comma-separated list. Surrounding whitespace and
    backspace characters are stripped from each entry and empty entries are
    dropped, so a trailing comma or newline does not trigger a bogus request.

    Results accumulate in the module-level ``data`` dict via ``find``. They
    are written with ``save`` after every 100 titles and once more when the
    loop ends, including when it ends because of an exception or interrupt.
    A title whose download fails is reported and skipped.
    """
    with open(file_name, 'r') as f:
        raw_ids = f.read().split(',')

    q_list = [
        title_id
        for title_id in (raw.strip(' \t\r\n\b') for raw in raw_ids)
        if title_id
    ]

    try:
        for count, num in enumerate(q_list, start=1):
            print('analysing', num)
            try:
                find(get_soup(num), num)
            except requests.RequestException as exc:
                # One unreachable page should not abort the whole run.
                print('failed', num, exc)
            if count % 100 == 0:
                save(data)
    finally:
        # Always persist what has been collected so far.
        save(data)

In [3]:
# Query that builds the list of title IDs (tconst) to scrape
import pandasql as ps
def query():
    """Write the IDs of titles to scrape to ``qlist.txt`` as a comma-separated list.

    Selects ``tconst`` for every row of ``title_ratings`` joined to
    ``title_basics`` whose ``startYear`` is after 2009 and whose
    ``titleType`` is ``'movie'`` or ``'tvMovie'``.

    Both tables are looked up by name in the namespace handed to ``sqldf``.
    ``globals()`` is passed because this function's own locals contain no
    tables; ``title_ratings`` and ``title_basics`` are expected to be defined
    at notebook level before calling this.
    """
    # Needs access to title_ratings to run
    q1 = """
    SELECT tconst FROM title_ratings
    JOIN title_basics
    USING(tconst)
    WHERE startYear > 2009
    AND (titleType = 'movie' OR titleType = 'tvMovie')
    """
    df = ps.sqldf(q1, globals())

    # join() puts commas between IDs only, so no trailing separator is left over.
    qlist = ','.join(str(tconst) for tconst in df['tconst'])
    with open('qlist.txt', 'w') as f:
        f.write(qlist)

In [4]:
import os
import json
folder_name = 'all_data'
# Sorted so the merge order (and therefore which file wins on duplicate keys)
# is the same on every run; os.listdir returns entries in arbitrary order.
# Sub-directories such as notebook checkpoint folders are skipped because
# they cannot be opened as files.
file_list = sorted(
    entry
    for entry in os.listdir(folder_name)
    if os.path.isfile(os.path.join(folder_name, entry))
)

In [5]:
# Merge every JSON file in the folder into one dict; when a key appears in
# several files, the file read last wins.
data = {}
for filename in file_list:
    path = f'{folder_name}/{filename}'
    with open(path, 'r') as f:
        chunk = json.load(f)
    data.update(chunk)
len(data)

31696

In [6]:
import pandas as pd
# One row per title: the outer dict keys become the 'tconst' column and the
# inner dict keys become the remaining columns.
df = (
    pd.DataFrame.from_dict(data)
    .transpose()
    .reset_index()
    .rename(columns={'index': 'tconst'})
)


In [7]:
# Save DataFrame to csv
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('scraped_data', exist_ok=True)
df.to_csv('scraped_data/budget_ratings.csv')

In [8]:
import numpy as np
ratings = ['G', 'PG', 'PG-13', 'R']
# Any value outside the four ratings above (including missing ones) is
# bucketed as 'NotRated'.
df['rating'] = df['rating'].map(lambda rating: rating if rating in ratings else 'NotRated')


def _missing_to_nan(value):
    """Map ``None`` and zero to NaN; pass every other value through unchanged."""
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    return np.nan if value in [None, 0] else value


# The three monetary columns get the same treatment.
for money_col in ('budget', 'gross', 'ww_gross'):
    df[money_col] = df[money_col].map(_missing_to_nan)

In [9]:
# Display the frame after the rating and monetary columns were cleaned.
df

Unnamed: 0,tconst,budget,gross,ww_gross,rating
0,tt3564924,150000.0,,,PG-13
1,tt3565112,,,,NotRated
2,tt3565174,,,191365.0,NotRated
3,tt3565406,,,,NotRated
4,tt3565434,40000.0,,,NotRated
...,...,...,...,...,...
31691,tt1610492,,,,NotRated
31692,tt1610516,,46623.0,46623.0,NotRated
31693,tt1610525,,320725.0,502518.0,R
31694,tt1610996,70000.0,,8555.0,R


In [10]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
df = df.reset_index(drop=True)

In [11]:
# Display the frame again after the index reset.
df

Unnamed: 0,tconst,budget,gross,ww_gross,rating
0,tt3564924,150000.0,,,PG-13
1,tt3565112,,,,NotRated
2,tt3565174,,,191365.0,NotRated
3,tt3565406,,,,NotRated
4,tt3565434,40000.0,,,NotRated
...,...,...,...,...,...
31691,tt1610492,,,,NotRated
31692,tt1610516,,46623.0,46623.0,NotRated
31693,tt1610525,,320725.0,502518.0,R
31694,tt1610996,70000.0,,8555.0,R


In [12]:
import seaborn as sns
import matplotlib.pyplot as plt
# Ignore the domestic gross column, then keep only rows where every
# remaining column is populated.
df_drop_gross = df.drop(columns=['gross']).dropna()

# Scatter plot with a fitted regression line: budget vs. worldwide gross.
sns.lmplot(x='budget', y='ww_gross', data=df_drop_gross)
df_drop_gross


Unnamed: 0,tconst,budget,ww_gross,rating
12,tt3567288,5000000.0,98450062.0,PG-13
23,tt3569230,30000000.0,42972994.0,R
24,tt3569326,800000.0,3861.0,NotRated
25,tt3569356,800000.0,36026.0,NotRated
33,tt3569978,590000.0,1575.0,NotRated
...,...,...,...,...
31672,tt1609479,5000000.0,120823.0,NotRated
31675,tt1609488,350000.0,40645.0,R
31676,tt1609492,2944000.0,2701266.0,NotRated
31685,tt1610395,826000.0,114.0,NotRated
