In [67]:
import logging
import requests

import pandas as pd
from bs4 import BeautifulSoup
from tqdm.auto import trange

In [68]:
# Route root-logger records to the file error.log, written as UTF-8.
logging.basicConfig(
    encoding='utf-8',
    filename='error.log',
)

In [69]:
# URL templates for a person's page. `url` has one placeholder (the person id);
# `url_role` appends a second placeholder for the `role` query parameter.
url = 'https://www.boxofficeindia.com/actor.php' + '?actorid={}'
url_role = url + '&role={}'

In [70]:
# Values passed as the `role` query parameter, one per kind of filmography.
ACTOR_ROLE_ID, DIR_ROLE_ID, PROD_ROLE_ID = 24, 25, 26

In [71]:
def to_int(string):
    """Parse a comma-grouped integer such as ``'1,23,456'``.

    Leading and trailing whitespace is ignored.

    Args:
        string: Text to parse.

    Returns:
        The integer value, or ``None`` when the text is empty or one of the
        dash placeholders (``'--'``, ``'---'``).

    Raises:
        ValueError: If the remaining text is not a valid integer.
    """
    # Strip before comparing: a padded placeholder such as ' --- ' would
    # otherwise miss the placeholder check and make int() raise.
    text = string.strip()
    if text in ('---', '--', ''):
        return None
    return int(text.replace(',', ''))

In [72]:
def get_films(actor_id, actor_name, role, soup):
    """Extract one record per film from a parsed person page.

    Args:
        actor_id: Id of the person; copied unchanged into every record.
        actor_name: Name of the person; copied unchanged into every record.
        role: Role label; copied unchanged into every record.
        soup: Parsed page (a BeautifulSoup document) to read the films from.

    Returns:
        A list of ``[actor_id, actor_name, role, movie_id, name, gross, verdict]``
        lists. The list is empty when the page has no ``#yeartopim4`` element,
        that element has no table, or the table has no matching rows.
    """
    # A page without the container is treated like a page without a table:
    # "no films", rather than an IndexError that aborts the whole person.
    containers = soup.select('#yeartopim4')
    if not containers:
        return []
    table = containers[0].table
    if not table:
        return []
    # Only direct child rows; each row embeds its own nested table.
    rows = table.find_all('tr', class_='grayrow boi-listing-rows', recursive=False)
    if not rows:
        return []
    films = []
    for row in rows:
        # Title cell: second <td> of the first row of the nested table.
        first_col = row.table.tr.find_all('td')[1]
        name = first_col.text
        # The movie id is the text after the first '=' in the title link's href.
        movie_id = int(first_col.a['href'].split('=')[1])
        cols = row.find_all('td')
        # Of all <td> cells in the row, the last two hold gross and verdict.
        gross = to_int(cols[-2].text)
        verdict = cols[-1].text
        films.append([actor_id, actor_name, role, movie_id, name, gross, verdict])
    return films

In [73]:
# Scrape every person page in the id range and collect one record per
# (person, role, film) into `actors_data`. Failures for a single id are logged
# and the loop moves on to the next id.

# Seconds to wait on a request; without a timeout one stalled connection
# would block the whole run indefinitely.
REQUEST_TIMEOUT = 30
# Write a partial CSV every this many ids so a crash does not lose everything.
CHECKPOINT_EVERY = 1000
FILM_COLUMNS = ['actor_id', 'actor_name', 'role', 'movie_id', 'movie_name', 'nett_gross', 'verdict']
# Role label as it appears on the page -> value of the `role` query parameter.
# The dict order is the order in which a person's roles are scraped.
ROLE_IDS = {
    'Actor': ACTOR_ROLE_ID,
    'Producer': PROD_ROLE_ID,
    'Director': DIR_ROLE_ID,
}
# Inline style of the role link that is selected on the page that was fetched.
SELECTED_ROLE_STYLE = 'color: #FFFFFF; background-color: #cc3333; padding: 2px 6px; border-radius: 5px;'

actors_data = []
for actor_id in trange(1, 19000):
    try:
        page = requests.get(url.format(actor_id), timeout=REQUEST_TIMEOUT)
        if not page.content:
            continue
        soup = BeautifulSoup(page.content, 'html.parser')
        actor_name = soup.select('.centertext')[0].text
        role_box = soup.find(class_='movieboxssec')
        roles = {r.text for r in role_box.find_all('a')}
        current_role = role_box.find('a', attrs={'style': SELECTED_ROLE_STYLE}).text

        films = []
        for role, role_id in ROLE_IDS.items():
            if role not in roles:
                continue
            if role == current_role:
                # The page already fetched shows this role; no second request.
                role_soup = soup
            else:
                role_page = requests.get(url_role.format(actor_id, role_id), timeout=REQUEST_TIMEOUT)
                role_soup = BeautifulSoup(role_page.content, 'html.parser')
            films.extend(get_films(actor_id, actor_name, role, role_soup))

        # Added only after every role succeeded, so a person is all-or-nothing.
        actors_data.extend(films)
        if actor_id % CHECKPOINT_EVERY == 0:
            # Pass the path so pandas opens the file itself (newline handling
            # and UTF-8 encoding) instead of a text handle from open(..., 'w').
            pd.DataFrame(actors_data, columns=FILM_COLUMNS).to_csv('boxoffice_actors.csv', index=False)
    except Exception:
        logging.exception(f'Actor id: {actor_id}')

  0%|          | 0/18999 [00:00<?, ?it/s]

In [74]:
# Build the table from the collected records; one column per record field.
df = pd.DataFrame(
    actors_data,
    columns=[
        'actor_id',
        'actor_name',
        'role',
        'movie_id',
        'movie_name',
        'nett_gross',
        'verdict',
    ],
)

In [99]:
# Load the movie_id -> imdb_id mapping, indexed by movie_id. The first line of
# the file is skipped and the two columns are named explicitly.
imdb_ids = pd.read_csv(
    'boxoffice_to_imdb.csv',
    skiprows=1,
    names=['movie_id', 'imdb_id'],
    index_col='movie_id',
)

In [116]:
# Attach the IMDb id of each film; a movie_id missing from the mapping gets ''.
imdb_by_movie = imdb_ids['imdb_id']
df['imdb_id'] = df['movie_id'].apply(lambda movie_id: imdb_by_movie.get(movie_id, ''))

In [118]:
# Reorder the columns so that imdb_id sits directly after movie_id.
df = df.loc[
    :,
    [
        'actor_id',
        'actor_name',
        'role',
        'movie_id',
        'imdb_id',
        'movie_name',
        'nett_gross',
        'verdict',
    ],
]

In [121]:
# Write the final table. The path is passed directly, rather than a handle from
# open(..., 'w'), so pandas opens the file itself with its own newline handling
# and UTF-8 encoding. A plain text-mode handle can produce blank lines between
# rows on Windows and uses the locale's encoding.
df.to_csv('boxoffice_actors.csv', index=False)