# IMDb Data Collection

Can we automate data collection from IMDb?

## Scrape PBS Masterpiece for Cast Information

In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the Grantchester cast page on PBS Masterpiece
homepage_url = "https://www.pbs.org/wgbh/masterpiece/shows/grantchester/cast/"

# request the page HTML; without a timeout requests may wait indefinitely
# on a stalled connection
response = requests.get(homepage_url, timeout=30)
# stop here on an HTTP error instead of silently parsing an error page
response.raise_for_status()

# parse the HTML so the cells below can query it
soup = BeautifulSoup(response.content, "html.parser")

### Get Actor and Actress Images

Scrape the Grantchester cast page for actor/actress images and save each one under the performer's name.

In [89]:
import re 

# pattern to match alt text shaped like "Actor <name> as <character>" or
# "Actress <name> as <character>"; group 2 is the performer name
pattern = r"(Actor|Actress)\s+(.*)\s+as\s+(.*)"

# find actor/actress images on the page
for img_tag in soup.find_all("img", class_="actor-img"):
    # the image URL is read from the data-src attribute; skip tags without one
    link = img_tag.get("data-src")
    if not link:
        continue

    # the alt attribute may be absent or may not follow the expected wording;
    # skip such images instead of crashing on a failed match
    alt_text = img_tag.get("alt") or ""
    match = re.search(pattern, alt_text)
    if match is None:
        continue

    # name the file after the performer; the name comes from scraped text, so
    # replace path separators to keep the file inside the target folder
    actor_actress = match.group(2).replace("/", "_").replace("\\", "_")
    image_name = f"{actor_actress}.jpg"

    # save the image to assets/masterpiece_images folder
    filepath = f"../assets/masterpiece_images/{image_name}"
    # use a dedicated name so the cast-page `response` is not overwritten
    image_response = requests.get(link, timeout=30)
    if image_response.status_code == 200:
        with open(filepath, "wb") as file:
            file.write(image_response.content)

### Get Actor and Actress Metadata 

Grab the other information that Masterpiece makes available on the website and store it in a list of dictionaries, one per cast member.

In [86]:
# get links for each actor/actress in cast list
elements = soup.find_all("a", class_="text-link")

# instantiate a list to hold metadata
metadata_dicts = []

# iterate through them and get the information we want
for element in elements:
    # pull the link to the about page
    about_link = element.get("href")
    # the "<actor> as <character>" text is read from the <h3 class="title"> tag
    title_tag = element.find("h3", class_="title")
    if not about_link or title_tag is None:
        # without both a link and a title there is nothing to record
        continue

    # split on the first " as " only, so extra occurrences cannot break the
    # two-way unpacking; skip entries that do not contain the separator
    actor, separator, character = title_tag.text.partition(" as ")
    if not separator:
        continue

    # get text from the about link and convert to soup
    about_response = requests.get(about_link, timeout=30)
    about_soup = BeautifulSoup(about_response.content, "html.parser")

    # keep the text of paragraphs 3 through 6
    # NOTE(review): presumably these are the bio paragraphs; confirm the
    # slice against the current page layout
    # TODO (SM): do we want to clean up this text format?
    about_text = [paragraph.text for paragraph in about_soup.find_all("p")[3:7]]

    # create a dictionary
    person_dict = {
        "actor_actress": actor,
        "character": character,
        "link": about_link,
        "about": about_text,
    }

    metadata_dicts.append(person_dict)

In [91]:
import json 

# destination of the serialized cast metadata
file_path = "../assets/masterpiece_metadata.json"

# render the list of dictionaries as indented JSON text and write it out
with open(file_path, "w") as json_file:
    json_file.write(json.dumps(metadata_dicts, indent=4))

print("JSON data has been written to:", file_path)

JSON data has been written to: ../assets/masterpiece_metadata.json


## Process the IMDb Cast Table 

In [2]:
import requests 
from bs4 import BeautifulSoup

# IMDb full-credits page holding the show's actor/actress information
url = "https://www.imdb.com/title/tt3747572/fullcredits/?ref_=tt_ql_1"

# send request to get HTML; the timeout keeps a stalled connection from
# blocking the notebook indefinitely
response = requests.get(url, timeout=30)
# stop here on an HTTP error instead of parsing an error page
response.raise_for_status()

# turn page into soup
soup = BeautifulSoup(response.content, "html.parser")

# find the cast table on the page
cast_table = soup.find("table", class_="cast_list")
# find() returns None when nothing matches; report that here rather than
# letting the next cell fail on `cast_table.find_all`
if cast_table is None:
    raise ValueError(f'no <table class="cast_list"> found at {url}')

In [3]:
import pandas as pd 

# instantiate a list to hold all data from the cast table
data = []

# for each row in the cast table
for row in cast_table.find_all("tr"):
    # extract the stripped text of every cell in the row
    row_data = [td.text.strip() for td in row.find_all("td")]
    # rows with one cell or none carry no cast entry
    if len(row_data) <= 1:
        continue

    # build an absolute URL for every anchor in the row; single quotes inside
    # the f-string keep this valid on Python versions before 3.12
    all_links = [
        f"https://www.imdb.com{anchor.get('href')}" for anchor in row.find_all("a")
    ]

    # compute both links fresh for every row so a row with too few anchors
    # gets None instead of inheriting the previous row's values
    actor_actress = all_links[0] if len(all_links) > 0 else None  # first anchor
    images = all_links[2] if len(all_links) > 2 else None  # third anchor, stored as photo_link
    data.append(row_data + [actor_actress, images])

# convert to a pandas data frame; the two "DROP" columns hold cells that are
# not used downstream
df = pd.DataFrame(data, columns=["DROP", "actor_actress", "DROP", "character", "actor_actress_link", "photo_link"])

In [4]:
# function to split out character name from available text
def extract_character(text):
    """Return the part of ``text`` that precedes its first newline.

    If ``text`` contains no newline, the whole string is returned.
    """
    first_line, _, _ = text.partition("\n")
    return first_line

# reduce every "character" cell to its first line, i.e. the character name
df["character"] = df["character"].map(extract_character)

In [5]:
# preview the first rows of the cast dataframe
df.head()

Unnamed: 0,DROP,actor_actress,DROP.1,character,actor_actress_link,photo_link
0,,Robson Green,...,Geordie Keating,https://www.imdb.com/name/nm0338292/,https://www.imdb.com/title/tt3747572/character...
1,,Tessa Peake-Jones,...,Mrs. Chapman,https://www.imdb.com/name/nm0668854/,https://www.imdb.com/title/tt3747572/character...
2,,Al Weaver,...,Leonard Finch,https://www.imdb.com/name/nm1632403/,https://www.imdb.com/title/tt3747572/character...
3,,Kacey Ainsworth,...,Cathy Keating,https://www.imdb.com/name/nm0014735/,https://www.imdb.com/title/tt3747572/character...
4,,Tom Brittney,...,Will Davenport,https://www.imdb.com/name/nm5938631/,https://www.imdb.com/title/tt3747572/character...


In [6]:
# preview the last rows of the cast dataframe
df.tail()

Unnamed: 0,DROP,actor_actress,DROP.1,character,actor_actress_link,photo_link
444,,Christopher Hegarty,...,Peter Delaney,https://www.imdb.com/name/nm13344317/,https://www.imdb.com/title/tt3747572/character...
445,,Marian Lorencik,...,Farm Hand,https://www.imdb.com/name/nm6631007/,https://www.imdb.com/title/tt3747572/character...
446,,Alan D West,...,Art Critic,https://www.imdb.com/name/nm13934688/,https://www.imdb.com/title/tt3747572/character...
447,,Chris Robb,...,Jerry,https://www.imdb.com/name/nm8949666/,https://www.imdb.com/title/tt3747572/character...
448,,Georgina Frances Hart,...,Villager,https://www.imdb.com/name/nm12713496/,https://www.imdb.com/title/tt3747572/character...


In [8]:
# display the cast dataframe itself; the rendered output elides the middle rows
df

Unnamed: 0,DROP,actor_actress,DROP.1,character,actor_actress_link,photo_link
0,,Robson Green,...,Geordie Keating,https://www.imdb.com/name/nm0338292/,https://www.imdb.com/title/tt3747572/character...
1,,Tessa Peake-Jones,...,Mrs. Chapman,https://www.imdb.com/name/nm0668854/,https://www.imdb.com/title/tt3747572/character...
2,,Al Weaver,...,Leonard Finch,https://www.imdb.com/name/nm1632403/,https://www.imdb.com/title/tt3747572/character...
3,,Kacey Ainsworth,...,Cathy Keating,https://www.imdb.com/name/nm0014735/,https://www.imdb.com/title/tt3747572/character...
4,,Tom Brittney,...,Will Davenport,https://www.imdb.com/name/nm5938631/,https://www.imdb.com/title/tt3747572/character...
...,...,...,...,...,...,...
444,,Christopher Hegarty,...,Peter Delaney,https://www.imdb.com/name/nm13344317/,https://www.imdb.com/title/tt3747572/character...
445,,Marian Lorencik,...,Farm Hand,https://www.imdb.com/name/nm6631007/,https://www.imdb.com/title/tt3747572/character...
446,,Alan D West,...,Art Critic,https://www.imdb.com/name/nm13934688/,https://www.imdb.com/title/tt3747572/character...
447,,Chris Robb,...,Jerry,https://www.imdb.com/name/nm8949666/,https://www.imdb.com/title/tt3747572/character...
