In [None]:
import logging
import requests

import pandas as pd
from bs4 import BeautifulSoup
from tqdm.auto import trange

In [None]:
# Route root-logger output to a UTF-8 encoded file named error.log.
# No level is passed, so the root logger keeps its default threshold.
logging.basicConfig(
    encoding='utf-8',
    filename='error.log',
)

In [None]:
# Page for a single person; the placeholder is filled with the numeric actor id.
url = 'https://www.boxofficeindia.com/actor.php?actorid={}'
# Same page with a role filter appended; placeholders are actor id, then role id.
url_role = url + '&role={}'

In [None]:
# Numeric ids substituted into the `role` query parameter of `url_role`.
ACTOR_ROLE_ID, DIR_ROLE_ID, PROD_ROLE_ID = 24, 25, 26

In [None]:
def to_int(string):
    """Convert a comma-grouped number string such as ``'1,23,456'`` to ``int``.

    Leading and trailing whitespace is ignored, so text taken straight from
    an HTML cell (which often carries newlines or padding) is handled.

    Args:
        string: Text to convert, or ``None``.

    Returns:
        The integer value, or ``None`` when ``string`` is ``None``, blank,
        or one of the placeholders ``'---'`` / ``'--'``.

    Raises:
        ValueError: If the remaining text is not a valid integer.
    """
    if string is None:
        return None
    # Strip first so padded placeholders such as ' -- ' are still recognised
    # instead of falling through to int() and raising.
    text = string.strip()
    if text in ('---', '--', ''):
        return None
    return int(text.replace(',', ''))

In [None]:
def get_films(actor_id, actor_name, role, soup):
    """Collect one record per film listed on a role-filtered person page.

    Args:
        actor_id: Id of the person; copied unchanged into every record.
        actor_name: Name of the person; copied unchanged into every record.
        role: Role label; copied unchanged into every record.
        soup: Parsed page. The film table is looked up inside the first
            element matching the CSS selector ``#yeartopim4``.

    Returns:
        A list of ``[actor_id, actor_name, role, movie_id, name, gross,
        verdict]`` lists. The list is empty when the container element, its
        table, or the film rows are missing, so the result can always be
        passed to ``list.extend``.

    Raises:
        ValueError, IndexError, TypeError, AttributeError: If a film row does
            not have the expected cell/link structure.
    """
    containers = soup.select('#yeartopim4')
    if not containers:
        return []
    table = containers[0].table
    if table is None:
        return []

    films = []
    # Only direct children of the table are film rows; each row nests its own table.
    for row in table.find_all('tr', class_='grayrow boi-listing-rows', recursive=False):
        # The second cell of the nested table's first row holds the linked title.
        title_cell = row.table.tr.find_all('td')[1]
        name = title_cell.text
        # Presumably the href looks like '...?movieid=<id>'; the text after the
        # first '=' is parsed as the numeric movie id.
        movie_id = int(title_cell.a['href'].split('=')[1])
        cols = row.find_all('td')
        gross = to_int(cols[-2].text)  # second-to-last cell
        verdict = cols[-1].text        # last cell
        films.append([actor_id, actor_name, role, movie_id, name, gross, verdict])
    return films

In [None]:
# Seconds to wait on any single HTTP request; without a timeout one stalled
# connection would hang the whole crawl.
REQUEST_TIMEOUT = 30

# Role label as it appears among the profile page's role links, paired with
# the id used in the role-filtered URL. Order fixes the order of output rows.
ROLE_IDS = (
    ('Actor', ACTOR_ROLE_ID),
    ('Producer', PROD_ROLE_ID),
    ('Director', DIR_ROLE_ID),
)

# Flat list of film records across all people, in the row layout produced by get_films.
actors_data = []
for actor_id in trange(1, 10):
    try:
        page = requests.get(url.format(actor_id), timeout=REQUEST_TIMEOUT)
        if not page.content:
            continue
        soup = BeautifulSoup(page.content, 'html.parser')
        actor_name = soup.select('.centertext')[0].text
        roles = {r.text for r in soup.find(class_='movieboxssec').find_all('a')}

        # Collect into a per-person list first so a failure part-way through
        # the roles drops the whole person rather than leaving partial rows.
        films = []
        for role, role_id in ROLE_IDS:
            if role not in roles:
                continue
            role_page = requests.get(
                url_role.format(actor_id, role_id), timeout=REQUEST_TIMEOUT
            )
            role_soup = BeautifulSoup(role_page.content, 'html.parser')
            # `or []` guards against get_films returning None for an empty page.
            films.extend(get_films(actor_id, actor_name, role, role_soup) or [])
        actors_data.extend(films)
    except Exception:
        # Best-effort crawl: record the traceback in the log and move on to the next id.
        logging.exception(f'Actor id: {actor_id}')

In [None]:
# Tabulate the scraped records; column order mirrors the per-film row layout.
df = pd.DataFrame(
    actors_data,
    columns=[
        'actor_id',
        'actor_name',
        'role',
        'movie_id',
        'movie_name',
        'nett_gross',
        'verdict',
    ],
)

In [None]:
# Hand pandas the path instead of a text-mode file object: pandas then opens
# the file with the newline handling it needs (no blank rows on Windows), and
# the explicit encoding keeps non-ASCII names independent of the platform default.
df.to_csv('boxoffice_actors.csv', index=False, encoding='utf-8')