<a href="https://colab.research.google.com/github/rapp2043/comedian-mortality/blob/main/comedian_mortality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Web scraper program for list of comedians on Wikipedia
import requests
from bs4 import BeautifulSoup
import csv

def get_comedians_list(url, timeout=10):
    """Collect article links found in the list items of a Wikipedia page.

    For every ``<li>`` element on the page, only the first ``<a>`` tag inside
    it is inspected. If that tag has an ``href`` starting with ``/wiki/``, an
    entry is recorded for it.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A list of dicts, each with a ``'name'`` key (the link text) and a
        ``'url'`` key (the absolute ``https://en.wikipedia.org`` address).

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an unsuccessful status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page and silently
    # producing an empty or meaningless list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    comedians = []

    # Find the first <a> tag within each <li> tag
    for li_tag in soup.find_all('li'):
        a_tag = li_tag.find('a')
        if a_tag is None or 'href' not in a_tag.attrs:
            continue

        link = a_tag.attrs['href']
        if link.startswith('/wiki/'):
            # Site-relative link: prepend the host to get the full URL
            full_url = "https://en.wikipedia.org" + link
            comedians.append({'name': a_tag.text, 'url': full_url})

    return comedians

def save_to_csv(data, filename):
    """Write name/url records to a UTF-8 CSV file with a header row.

    Args:
        data: Iterable of dicts, each holding the keys ``'name'`` and
            ``'url'``.
        filename: Path of the file to create; an existing file is
            overwritten.
    """
    columns = ['name', 'url']
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        sink = csv.DictWriter(out_file, fieldnames=columns)
        sink.writeheader()
        sink.writerows(data)

# URL of the Wikipedia page that lists comedians
url = 'https://en.wikipedia.org/wiki/List_of_comedians'

# Scrape the page; each entry is a dict with 'name' and 'url' keys
comedians_list = get_comedians_list(url)

# Save the data to 'comedians.csv' (relative to the current working directory)
save_to_csv(comedians_list, 'comedians.csv')

# Confirmation message once the file has been written
print("CSV file has been created with the list of comedians.")

CSV file has been created with the list of comedians.


In [None]:
import csv
import requests
from bs4 import BeautifulSoup

# Function to extract bday and dday from a Wikipedia URL
def extract_dates(url, timeout=10):
    """Fetch a Wikipedia page and read its machine-readable birth/death dates.

    The dates are taken from the text of the first ``<span class="bday">``
    and ``<span class="dday">`` elements on the page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection, which would stall the whole scraping run.

    Returns:
        A ``(bday, dday)`` tuple. Each item is the span's text, or ``None``
        when the span is absent. ``(None, None)`` is returned when the
        request itself fails; the error is printed rather than raised.
    """
    # Keep the try body limited to the network call, the only part that can
    # raise a requests exception.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # raises HTTPError on an unsuccessful status code
        html = response.content
    except requests.exceptions.RequestException as e:
        print(f"Requests error for URL {url}: {e}")
        return None, None

    soup = BeautifulSoup(html, 'html.parser')

    bday_tag = soup.find('span', class_='bday')
    dday_tag = soup.find('span', class_='dday')

    bday = bday_tag.get_text() if bday_tag else None
    dday = dday_tag.get_text() if dday_tag else None

    return bday, dday

# Input and output CSV filenames.
# The list scraper writes 'comedians.csv' relative to the working directory,
# so read it from the same relative location instead of a Colab-only
# absolute '/content/...' path.
input_csv = 'comedians.csv'
output_csv = 'comedian_birth_death_dates2.csv'

# Read the input CSV row by row and write each row, extended with the
# extracted dates, to the output CSV.
with open(input_csv, mode='r', newline='', encoding='utf-8') as infile, \
     open(output_csv, mode='w', newline='', encoding='utf-8') as outfile:

    reader = csv.DictReader(infile)
    writer = csv.DictWriter(outfile, fieldnames=['name', 'url', 'birth_date', 'died_date'])
    writer.writeheader()

    for row in reader:
        # Progress output: one page is fetched per row
        print(f"Processing: {row['name']}")
        birth_date, died_date = extract_dates(row['url'])
        writer.writerow({'name': row['name'], 'url': row['url'], 'birth_date': birth_date, 'died_date': died_date})

In [None]:
# Extracting Birth and Death Dates from Individual Wikipedia Pages

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Define a function to extract birth and death dates from a Wikipedia page
def extract_dates(wiki_url, timeout=10):
    """Fetch a Wikipedia page and extract the subject's birth and death dates.

    The birth date is the text of the first ``<span class="bday">``. The
    death date is the text of the first ``<span class="dday">``; when that
    span is absent, the text of the ``<td>`` following a ``<th>`` containing
    'Died' is used instead.

    Args:
        wiki_url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection, which would stall the whole scraping run.

    Returns:
        A ``(birth_date, death_date)`` tuple of strings. An item is ``None``
        when it cannot be found. ``(None, None)`` is returned when the
        request itself fails; the error is printed rather than raised.
    """
    # Keep the try body limited to the network call, the only part that can
    # raise a requests exception.
    try:
        response = requests.get(wiki_url, timeout=timeout)
        response.raise_for_status()
        html = response.content
    except requests.RequestException as e:
        print(f"Error fetching {wiki_url}: {e}")
        return None, None

    # Parse the response content with BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')

    # Try to extract the birth date from the 'bday' class
    bday_tag = soup.find('span', class_='bday')
    birth_date = bday_tag.get_text() if bday_tag else None

    # Try to extract the death date from the 'dday' class or by looking for 'Died' text
    death_date = None
    dday_tag = soup.find('span', class_='dday')
    if dday_tag:
        death_date = dday_tag.get_text()
    else:
        # Search for the 'Died' table header and try to extract the following text
        died_cell = soup.find('th', string=lambda text: text and 'Died' in text)
        # find_next returns None when no <td> follows the header; guard it so
        # one oddly structured page cannot abort the caller's loop.
        value_cell = died_cell.find_next('td') if died_cell else None
        if value_cell is not None:
            death_date = value_cell.get_text().strip()

    return birth_date, death_date

# Read the input CSV using pandas.
# The list scraper writes 'comedians.csv' relative to the working directory,
# so read it from the same relative location instead of a Colab-only
# absolute '/content/...' path.
input_csv = 'comedians.csv'
output_csv = 'comedian_birth_death_dates2.csv'

comedian_list = pd.read_csv(input_csv)

# Prepare columns for the birth and death dates
comedian_list['birth_date'] = None
comedian_list['death_date'] = None

# Iterate over the dataframe rows; one page is fetched per row
for index, row in comedian_list.iterrows():
    print(f"Processing {row['name']}...")
    birth_date, death_date = extract_dates(row['url'])
    # Write through .at: the row yielded by iterrows() is a copy
    comedian_list.at[index, 'birth_date'] = birth_date
    comedian_list.at[index, 'death_date'] = death_date

# Write the updated dataframe to a new CSV file
comedian_list.to_csv(output_csv, index=False)

print(f"Finished processing. Birth and death dates can be found in '{output_csv}'")