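This notebook scrapes budget information from imdb.com for the 1,000 movies with the highest lifetime adjusted box-office grosses, as listed on boxofficemojo.com. It collects the results as a list of (IMDb ID, budget) tuples and writes them as JSON (an array of [IMDb ID, budget] pairs) to 'budget_list.txt'. Movies whose IMDb page has no parsable '$' budget entry are omitted from the output.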

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json

In [2]:
# Offsets appended to the chart URL: 0 up to (but not including) 1000, in steps of 200.
page_number_url_list = [*range(0, 1000, 200)]
page_number_url_list

[0, 200, 400, 600, 800]

In [3]:
# Base chart URL; it ends with "offset=" so an offset value can be appended directly.
mainpage_url = "https://www.boxofficemojo.com/chart/top_lifetime_gross_adjusted/?adjust_gross_to=2020&offset="

# One chart-page URL per offset in page_number_url_list.
mainpage_url_list = [mainpage_url + str(offset) for offset in page_number_url_list]

mainpage_url_list

['https://www.boxofficemojo.com/chart/top_lifetime_gross_adjusted/?adjust_gross_to=2020&offset=0',
 'https://www.boxofficemojo.com/chart/top_lifetime_gross_adjusted/?adjust_gross_to=2020&offset=200',
 'https://www.boxofficemojo.com/chart/top_lifetime_gross_adjusted/?adjust_gross_to=2020&offset=400',
 'https://www.boxofficemojo.com/chart/top_lifetime_gross_adjusted/?adjust_gross_to=2020&offset=600',
 'https://www.boxofficemojo.com/chart/top_lifetime_gross_adjusted/?adjust_gross_to=2020&offset=800']

In [4]:
# Collect the title ID (e.g. 'tt0031381') of every movie linked from the
# chart pages in mainpage_url_list.
movie_id_list = []

for chart_url in mainpage_url_list:
    # The timeout keeps a stalled connection from hanging the notebook;
    # raise_for_status() fails loudly instead of parsing an HTTP error page.
    mainpage = requests.get(chart_url, timeout=30)
    mainpage.raise_for_status()
    soup = BeautifulSoup(mainpage.content, 'html.parser')

    chart_table = soup.find('div', {'id': 'table'})
    if chart_table is None:
        # Without this check the failure is an opaque AttributeError on None.
        raise RuntimeError('Chart table not found on ' + chart_url)

    title_cells = chart_table.find_all('td', {'class': 'a-text-left mojo-field-type-title'})
    # Distinct loop variable: the original reused `i` for both loops.
    for title_cell in title_cells:
        href = title_cell.find('a', {'class': 'a-link-normal'}).get('href')
        # href looks like '/title/tt0031381/?ref_=...'. Keep only the path
        # segment right after '/title/', whether or not a '/?ref_' suffix follows.
        movie_id = href.split('/title/', 1)[1].split('/', 1)[0].split('?', 1)[0]
        movie_id_list.append(movie_id)

movie_id_list

['tt0031381',
 'tt0076759',
 'tt0059742',
 'tt0083866',
 'tt0120338',
 'tt0049833',
 'tt0073195',
 'tt0059113',
 'tt0070047',
 'tt0029583',
 'tt2488496',
 'tt0055254',
 'tt0080684',
 'tt0052618',
 'tt0499549',
 'tt4154796',
 'tt0086190',
 'tt0107290',
 'tt0120915',
 'tt0110357',
 'tt0070735',
 'tt0082971',
 'tt0061722',
 'tt0032455',
 'tt0068646',
 'tt0109830',
 'tt0058331',
 'tt0077631',
 'tt0848228',
 'tt0369610',
 'tt1825683',
 'tt0059800',
 'tt0468569',
 'tt0061852',
 'tt0053285',
 'tt4154756',
 'tt0087332',
 'tt0298148',
 'tt0145487',
 'tt0064115',
 'tt0066011',
 'tt0116629',
 'tt0099785',
 'tt2527336',
 'tt0032910',
 'tt0056937',
 'tt0086960',
 'tt0058150',
 'tt3606756',
 'tt0065377',
 'tt0069704',
 'tt0046247',
 'tt0383574',
 'tt0048960',
 'tt0034492',
 'tt0071230',
 'tt0096895',
 'tt0037536',
 'tt0167260',
 'tt0266543',
 'tt0072308',
 'tt3748528',
 'tt6105098',
 'tt0042332',
 'tt0316654',
 'tt0058385',
 'tt0044672',
 'tt0077975',
 'tt0335345',
 'tt0121766',
 'tt0088763',
 'tt01

In [6]:
# For each ID in movie_id_list, fetch the IMDb title page and extract the
# budget from the 'titleDetails' div. Only entries written as 'Budget:$...'
# are parsed; every other movie is recorded in movies_without_budget and
# left out of the exported list.
movie_budget_list = []
movies_without_budget = []  # IDs skipped because no 'Budget:$' entry could be parsed

imdb_url = 'https://www.imdb.com/title/'  # loop-invariant, so built once

for movie_id in movie_id_list:
    movie_url = imdb_url + movie_id
    # The timeout keeps one stalled request from blocking the whole scrape.
    mainpage = requests.get(movie_url, timeout=30)
    soup = BeautifulSoup(mainpage.content, 'html.parser')
    try:
        details_text = soup.find('div', {'id': 'titleDetails'}).find(string='Budget:').parent.parent.get_text()
        # Keep what follows 'Budget:$' up to the end of that line, then drop
        # the thousands separators. The budget stays a string, as before.
        first_slice = details_text.split('Budget:$', 1)[1]
        second_slice = first_slice.split('\n', 1)[0]
        budget = second_slice.replace(',', '')
    except (AttributeError, IndexError):
        # AttributeError: the details div or the 'Budget:' label is missing
        # (a find() returned None). IndexError: the budget is not written
        # with a '$' prefix. Anything else (e.g. KeyboardInterrupt) propagates.
        movies_without_budget.append(movie_id)
        continue
    print(movie_id, budget)
    movie_budget_list.append((movie_id, budget))

# json.dump serialises each (id, budget) tuple as a two-element JSON array.
with open('budget_list.txt', 'w') as outfile:
    json.dump(movie_budget_list, outfile)



tt0031381 3977000
tt0076759 11000000
tt0059742 8200000
tt0083866 10500000
tt0120338 200000000
tt0049833 13282712
tt0073195 7000000
tt0059113 11000000
tt0070047 11000000
tt0029583 1499000
tt2488496 245000000
tt0055254 4000000
tt0080684 18000000
tt0052618 15000000
tt0499549 237000000
tt4154796 356000000
tt0086190 32500000
tt0107290 63000000
tt0120915 115000000
tt0110357 45000000
tt0070735 5500000
tt0082971 18000000
tt0061722 3000000
tt0032455 2280000
tt0068646 6000000
tt0109830 55000000
tt0058331 6000000
tt0077631 6000000
tt0848228 220000000
tt0369610 150000000
tt1825683 200000000
tt0059800 9000000
tt0468569 185000000
tt0061852 4000000
tt0053285 6000000
tt4154756 321000000
tt0087332 30000000
tt0298148 150000000
tt0145487 139000000
tt0064115 6000000
tt0066011 2200000
tt0116629 75000000
tt0099785 18000000
tt2527336 317000000
tt0032910 2600000
tt0056937 44000000
tt0086960 14000000
tt0058150 3000000
tt3606756 200000000
tt0065377 10000000
tt0069704 777000
tt0046247 5000000
tt0383574 225000000

tt0071562 13000000
tt0381061 150000000
tt0110912 8000000
tt0086567 12000000
tt4630562 250000000
tt0097239 7500000
tt0398286 260000000
tt0098067 20000000
tt0110148 60000000
tt0275847 80000000
tt0479952 150000000
tt0816711 190000000
tt0974015 300000000
tt0181852 200000000
tt0169547 15000000
tt4912910 178000000
tt0112281 30000000
tt0063442 5800000
tt0143145 135000000
tt0076618 7500000
tt1981115 170000000
tt0107206 40000000
tt0074751 24000000
tt0398165 82000000
tt0079100 12500000
tt0117913 40000000
tt0090685 11000000
tt5095030 162000000
tt0099422 47000000
tt0119094 80000000
tt0295701 70000000
tt0113189 60000000
tt0105695 14400000
tt0072890 1800000
tt1872181 200000000
tt0454921 55000000
tt0110478 75000000
tt1661199 95000000
tt0107798 45000000
tt0087928 4500000
tt0097165 16400000
tt0120667 100000000
tt0102945 19000000
tt0458525 150000000
tt0831387 160000000
tt0073692 4000000
tt0075223 6500000
tt1727824 52000000
tt1216475 200000000
tt0304669 65000000
tt0116313 26000000
tt1517451 36000000
tt01

tt0081505 19000000
tt3450958 150000000
tt0112579 24000000
tt0094332 22000000
tt0369339 65000000
tt0092890 6000000
tt0113228 25000000
tt0113855 18000000
tt0361748 70000000
tt1130884 80000000
tt0338751 110000000
tt0120917 100000000
tt0206275 13000000
tt0119314 60000000
tt0083944 15000000
tt0120784 90000000
tt0064757 7000000
tt0289765 78000000
tt0115759 50000000
tt0436339 150000000
tt5884052 150000000
tt0096463 28000000
tt1599348 85000000
tt0079522 9000000
tt0083943 18000000
tt0075066 6000000
tt0252076 55000000
tt0074285 1800000
tt0120828 70000000
tt0099674 54000000
tt0119345 17000000
tt1650062 50000000
tt0397892 150000000
tt2357291 103000000
tt7131622 90000000
tt0107688 18000000
tt1606389 30000000
tt2180411 50000000
tt2802144 81000000
tt1091191 40000000
tt0405159 30000000
tt2872732 40000000
tt0277296 60000000
tt1192628 135000000
tt0089822 7600000
tt0313737 60000000
tt1446714 130000000
tt4972582 9000000
tt0066921 2200000
tt1229340 50000000
tt0071315 6000000
tt3794354 85000000
tt0454876 12