In [17]:
import re
import requests
import logging
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from tqdm.auto import trange

In [18]:
# Route log records to error.log (UTF-8).
# force=True replaces any handlers already attached to the root logger;
# without it, basicConfig silently does nothing when a handler is already
# present (for example one installed by the hosting environment), and the
# scrape errors would never reach the file.
logging.basicConfig(filename='error.log', encoding='utf-8', force=True)

In [19]:
# URL template for a single movie page; the `{}` placeholder is filled in
# with the movie id via `url.format(movie_id)` in the scraping loop.
url = 'https://www.boxofficeindia.com/movie.php?movieid={}'

In [20]:
def get_table_data(soup, link):
    """Return the cleaned value found next to the element linking to ``link``.

    Looks up the first element in ``soup`` whose ``href`` equals ``link``,
    steps up to its parent, and reads the text of the second ``td`` among the
    parent's following siblings. That text is passed through ``clean_data``.

    Raises:
        AttributeError: if no element with that ``href`` exists.
        IndexError: if fewer than two following ``td`` siblings exist.
    """
    anchor = soup.find(href=link)
    container = anchor.parent
    following_cells = container.find_next_siblings('td')
    raw_text = following_cells[1].text
    return clean_data(raw_text)

In [21]:
def clean_data(data):
    """Convert scraped cell text to an integer, or ``None`` when it is blank.

    Surrounding whitespace is ignored, a single leading ``$`` is dropped, and
    the placeholders ``'---'``, ``'--'`` and the empty string yield ``None``.
    Any other text is handed to ``to_int``.

    Args:
        data: Raw text of a table cell.

    Returns:
        The parsed integer, or ``None`` for a placeholder or empty cell.

    Raises:
        ValueError: propagated from ``to_int`` when the remaining text is not
            an integer.
    """
    # Text pulled out of parsed HTML may carry padding or newlines. Without
    # stripping, a padded placeholder such as ' --- ' would not match the
    # tuple below and would fall through to to_int and raise.
    data = data.strip()
    if data.startswith('$'):
        # Strip again so '$ 1,234' and a bare '$ ' are handled as well.
        data = data[1:].strip()
    if data in ('---', '--', ''):
        return None
    return to_int(data)

In [22]:
def to_int(string):
    """Parse an integer from text that may contain comma separators.

    Every ``,`` is removed before the remainder is passed to ``int``, so
    ``'1,234'`` becomes ``1234``.

    Raises:
        ValueError: if the text without commas is not a valid integer literal.
    """
    without_commas = string.replace(',', '')
    return int(without_commas)

In [23]:
# Empty frame that fixes the column order of the scraped dataset; the
# scraping loop adds one row per movie.
df = pd.DataFrame(
    columns=[
        'movie_id',
        'name',
        'release_date',
        'total_nett_gross',
        'first_week',
        'budget',
        'india_gross',
        'overseas_gross',
        'worldwide_gross',
        'all_time_rank',
        'footfalls',
        'adjusted_nett_gross',
    ]
)

In [24]:
# Scrape every movie page in the id range and collect one record per movie.
# Records are gathered in a plain list and turned into a DataFrame once after
# the loop: DataFrame.append was removed in pandas 2.0, and where it still
# exists it copies the whole frame on every call.
movie_records = []
for movie_id in trange(1, 6092):
    try:
        # The timeout keeps a single stalled connection from hanging the
        # whole run; a timeout error is logged below like any other
        # per-movie failure.
        page = requests.get(url.format(movie_id), timeout=30)
        soup = BeautifulSoup(page.content, "html.parser")
        # This marker text is treated as "no page for this id": log it and
        # move on to the next id.
        if 'Error while fetching worldwide weekend' in soup.body.div.text:
            logging.error(f'Movie id {movie_id}\'s page not found.')
            continue
        release_date_string = soup.select('div.movieboxssec:nth-child(2) > span:nth-child(1)')[0].string.strip()
        movie_records.append({
            'movie_id': movie_id,
            'name' : soup.select('.bl_tle_mvi > a:nth-child(1)')[0].text,
            'first_week' : get_table_data(soup, 'india-first-week.php'),
            'budget' : get_table_data(soup, 'budget.php'),
            'india_gross' : get_table_data(soup, 'india-total-gross.php'),
            'overseas_gross' : get_table_data(soup, 'overseas-total-gross.php'),
            'worldwide_gross' : get_table_data(soup, 'worldwide-total-gross.php'),
            'all_time_rank' : get_table_data(soup, 'all_time_rank.php?fm=1'),
            'footfalls' : get_table_data(soup, 'india-footfalls.php?fm=1'),
            'adjusted_nett_gross' : get_table_data(soup, 'india-adjusted-nett-gross.php?fm=1'),
            'total_nett_gross' : to_int(soup.find(href='net_box_office.php?fm=1').parent.contents[4]),
            'release_date' : datetime.strptime(release_date_string, '%d %b %Y')
        })
    except Exception:
        # Best-effort scrape: record the traceback together with the movie id
        # and carry on with the next id.
        logging.exception(f'Movie id: {movie_id}')

# Build all new rows in one step and add them to the existing frame.
# dtype=object keeps the scraped Python values (ints, None, datetimes) as they
# are instead of letting pandas turn integer columns containing None into
# floats.
new_rows = pd.DataFrame(movie_records, columns=df.columns, dtype=object)
df = pd.concat([df, new_rows], ignore_index=True)

  0%|          | 0/6091 [00:00<?, ?it/s]

In [26]:
# Hand pandas the path instead of a text-mode handle: pandas then opens the
# file itself with the newline handling the CSV writer expects (a handle
# opened without newline='' yields blank lines between rows on Windows), and
# the encoding is pinned to UTF-8 rather than the platform default.
df.to_csv('boxoffice.csv', index=False, encoding='utf-8')