In [1]:
#!/usr/bin/env python
# coding: utf-8

# RCP average over time
# Load Python tools and Jupyter config

import re
import json
import requests
import pandas as pd
import time
import random
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup

# Date stamp (YYYYMMDD) of this run
today = pd.Timestamp("today").strftime("%Y%m%d")

# Headers for requests
# NOTE(review): `headers` is not passed to any request in the cells visible
# here - confirm whether a later cell uses it.
headers = {
    "accept": "application/json",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
}

# Internet Archive CDX endpoint that lists the archived captures of the
# RealClearPolling Trump-vs-Harris page (JSON output)
url = "https://web.archive.org/cdx/search/cdx?url=https://www.realclearpolling.com/polls/president/general/2024/trump-vs-harris&output=json"

# Request the capture index. The explicit timeout keeps the cell from hanging
# indefinitely, and raise_for_status() reports an HTTP error as such instead of
# letting an error page surface later as a JSON decoding failure.
cdx_response = requests.get(url, timeout=60)
cdx_response.raise_for_status()
r = cdx_response.json()

# The first row of the CDX JSON is used as the column names below; an empty
# response has no such row, so fail with a clear message instead of IndexError.
if not r:
    raise RuntimeError(f"No captures returned by the CDX index: {url}")

# Read successful snapshots into a metadata dataframe
archive_src = pd.DataFrame(r, columns=r[0]).drop(0).query("statuscode == '200'")

In [2]:
# Clean up dates and url needed to request individual snapshots over time

# The CDX "timestamp" field is an all-digit YYYYMMDDhhmmss string. Parse it once
# with an explicit format (no reliance on format inference for digit-only
# strings) and derive the calendar date from the parsed column.
archive_src["datetime"] = pd.to_datetime(
    archive_src["timestamp"], format="%Y%m%d%H%M%S"
)
archive_src["date"] = archive_src["datetime"].dt.date

In [3]:
# Wayback URL for each capture: fixed prefix, then the capture timestamp and the
# original page address joined by the "if_/" modifier segment.
archive_src["url"] = "https://web.archive.org/web/" + (
    archive_src["timestamp"]
    .astype(str)
    .str.cat(archive_src["original"].astype(str), sep="if_/")
)

In [4]:
# "MM-YYYY" label for each capture, derived from its timestamp
archive_src["month_year"] = (
    archive_src["timestamp"].pipe(pd.to_datetime).dt.strftime("%m-%Y")
)

In [5]:
# Clean up the dataframe, limiting it to one snapshot per calendar day: rows are
# sorted chronologically and only the last capture of each "date" is kept.
# (The "month_year" column is not used for this de-duplication.)
archive_df = (
    archive_src.sort_values(["datetime", "date"], ascending=True)
    .drop_duplicates("date", keep="last")
    .drop(columns=["mimetype", "urlkey", "digest", "statuscode", "original"])
).reset_index(drop=True)

# Create list of archive urls
archive_urls = archive_df["url"].to_list()

In [6]:
# Captures taken before the 2024-07-26 cutoff.
# NOTE(review): the reason for this cutoff is not stated in the notebook - confirm.
archive_past_df = archive_df[archive_df["datetime"] < "2024-07-26"]

In [7]:
# Captures taken on or after the 2024-07-26 cutoff. ">=" (not ">") so that a
# capture stamped exactly 2024-07-26 00:00:00 is not dropped from both the
# "past" (strictly before) and the "recent" subsets.
archive_recent_df = archive_df.query('datetime >= "2024-07-26"')

In [8]:
# Plain Python lists of snapshot URLs for each period
archive_past_urls = archive_past_df["url"].tolist()
archive_recent_urls = archive_recent_df["url"].tolist()

In [9]:
# Function to extract candidate data from a single URL
def extract_candidate_data(url):
    """Scrape the candidate percentages shown on one archived page.

    Parameters
    ----------
    url : str
        Snapshot URL. The first eight characters of its fifth "/"-separated
        segment are read as the capture date (YYYYMMDD).

    Returns
    -------
    dict or None
        A dictionary with the fixed keys "id", "type", "polling_period",
        "polling_start_date", "polling_end_date", "spread_winner" and
        "fetch_date", plus one "<lower-cased name>_value" float per candidate
        block that has both a name and a numeric percentage. None when the
        request fails.
    """
    try:
        # Explicit timeout so one stalled snapshot cannot hang the whole loop;
        # a timeout is a RequestException and is handled below.
        response = requests.get(url, timeout=60)
        response.raise_for_status()  # Ensure the request was successful
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract the date from the URL
        date_str = url.split("/")[4][:8]
        fetch_date = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"

        # Initialize a dictionary to hold structured data for this page
        page_data = {
            "id": 7386,
            "type": "rcp_average",
            "polling_period": "",
            "polling_start_date": "",
            "polling_end_date": "",
            "spread_winner": "Trump",  # Assuming Trump is always the spread winner
            "fetch_date": fetch_date,
        }

        # Find all relevant candidate blocks
        candidate_blocks = soup.find_all("div", class_="flex items-center gap-1")

        for block in candidate_blocks:
            # Candidate name: <p class="text-body-2-bold"> inside the block
            name_tag = block.find("p", class_="text-body-2-bold")
            # Percentage: the next <p class="text-h1"> after the block start
            percentage_tag = block.find_next("p", class_="text-h1")

            # Skip blocks missing either piece. Without this guard the values
            # from the previous block would be reused (or a NameError raised
            # on the first block).
            if name_tag is None or percentage_tag is None:
                continue

            name = name_tag.get_text(strip=True)
            percentage = percentage_tag.get_text(strip=True).replace("%", "")

            try:
                page_data[f"{name.lower()}_value"] = float(percentage)
            except ValueError:
                # Non-numeric placeholder text: leave this candidate out
                continue

        return page_data

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None  # Return None if there was an error

# Accumulates one dictionary per successfully scraped snapshot
all_candidates_data = []

# Scrape every pre-cutoff snapshot, showing a progress bar
for url in tqdm(archive_past_urls):
    page_data = extract_candidate_data(url)

    # Failed fetches come back as None and are left out
    if page_data:
        all_candidates_data += [page_data]

    # Random 1-3 second pause between requests to prevent rate-limiting issues
    time.sleep(random.uniform(1, 3))

  0%|          | 0/23 [00:00<?, ?it/s]

In [13]:
# Processing functions
def clean_numeric(df):
    """Convert the "spread_value" and "marginError" columns to floats in place.

    A literal "+" is removed and surrounding whitespace stripped before
    conversion; anything that still cannot be parsed (missing values,
    placeholder text) becomes 0.

    Values are first converted with ``astype(str)``, so entries that are
    already numeric keep their value and the function can be run again on its
    own output.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "spread_value" and "marginError"; modified in place.

    Returns
    -------
    None
    """
    for column in ("spread_value", "marginError"):
        as_text = (
            df[column]
            .astype(str)
            # regex=False: "+" is a plain character here, whatever the
            # installed pandas version uses as the default for `regex`.
            .str.replace("+", "", regex=False)
            .str.strip()
        )
        df[column] = pd.to_numeric(as_text, errors="coerce").fillna(0)


def clean_polling_dates(df):
    """Add "polling_start_date" / "polling_end_date" as YYYY-MM-DD strings.

    The values are parsed from "data_start_date" and "data_end_date";
    entries that cannot be parsed end up as missing values. ``df`` is
    modified in place and nothing is returned.
    """
    source_for_target = (
        ("polling_start_date", "data_start_date"),
        ("polling_end_date", "data_end_date"),
    )
    for target, source in source_for_target:
        parsed = pd.to_datetime(df[source], errors="coerce")
        df[target] = parsed.dt.strftime("%Y-%m-%d")


def clean_polling_period(df, year=2024):
    """Split the "date" period column into start and end dates, in place.

    "date" is split on " - "; ``/<year>`` is appended to each half before it
    is parsed and re-formatted as YYYY-MM-DD into "polling_start_date" and
    "polling_end_date".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "date" column whose values split into exactly two
        parts on " - "; modified in place.
    year : int or str, default 2024
        Year appended to both halves. The default reproduces the previously
        hard-coded behaviour.

    Returns
    -------
    None

    Notes
    -----
    The same year is applied to both ends, so a period that crosses a year
    boundary is not handled.
    """
    year_suffix = f"/{year}"

    df[["polling_start_date", "polling_end_date"]] = df["date"].str.split(
        " - ", expand=True
    )

    # Convert to datetime and format as %Y-%m-%d
    for column in ("polling_start_date", "polling_end_date"):
        df[column] = pd.to_datetime(df[column] + year_suffix).dt.strftime(
            "%Y-%m-%d"
        )


# Function to extract JSON data directly
def _flatten_poll(individual_poll, fetch_date):
    """Turn one poll record into a flat dict, one "<name>_value" key per candidate."""
    # "spread" may be absent or null; treat both as "no spread information"
    spread = individual_poll.get("spread") or {}

    poll_info = {
        "id": individual_poll.get("id"),
        "type": individual_poll.get("type"),
        "pollster": individual_poll.get("pollster"),
        "date": individual_poll.get("date"),
        "data_start_date": individual_poll.get("data_start_date"),
        "data_end_date": individual_poll.get("data_end_date"),
        "sampleSize": individual_poll.get("sampleSize"),
        "marginError": individual_poll.get("marginError"),
        "link": individual_poll.get("link"),
        "spread_winner": spread.get("name"),
        "spread_value": spread.get("value"),
        "fetch_date": fetch_date,  # Set fetch_date based on archive date
    }

    # Extract nested candidate information
    for candidate in individual_poll.get("candidate") or []:
        candidate_name = candidate.get("name")
        if not candidate_name:
            # No name means no column to store the value under
            continue
        poll_info[f"{candidate_name.lower().replace(' ', '_')}_value"] = (
            candidate.get("value")
        )

    return poll_info


def extract_json_data(url):
    """Fetch the archived polling JSON for the capture date encoded in ``url``.

    Parameters
    ----------
    url : str
        Snapshot URL. The first eight characters of its fifth "/"-separated
        segment are read as the capture date (YYYYMMDD) and used both to
        build the JSON request and as "fetch_date".

    Returns
    -------
    list of dict or None
        One flat dictionary per entry under the payload's "poll" key, or None
        when the request fails or the response is not the expected JSON.
    """
    # Extract the date string from the URL for fetch_date
    date_str = url.split("/")[4][:8]
    fetch_date = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"  # YYYY-MM-DD

    # Construct the JSON URL using the date from the archive URL
    json_url = f"https://web.archive.org/web/{date_str}/https://www.realclearpolitics.com/poll/race/7386/polling_data.json"

    try:
        # Explicit timeout so one stalled request cannot hang the loop
        json_response = requests.get(json_url, timeout=60)
        json_response.raise_for_status()
        poll_data = json_response.json()["poll"]
    except requests.RequestException as e:
        print(f"Error fetching JSON from {url}: {e}")
        return None
    except (ValueError, KeyError, TypeError) as e:
        # Body was not JSON, or JSON without the expected "poll" entry
        print(f"Unexpected JSON payload from {json_url}: {e!r}")
        return None

    return [_flatten_poll(individual_poll, fetch_date) for individual_poll in poll_data]

# Accumulates the flattened poll rows from every recent snapshot
all_recent_data = []

# Fetch the JSON behind each post-cutoff snapshot
for url in archive_recent_urls:
    polls_data = extract_json_data(url)

    # Failed fetches (None) and empty results are skipped
    if polls_data:
        all_recent_data += polls_data

    # Random 1-3 second pause between requests to prevent rate-limiting issues
    time.sleep(random.uniform(1, 3))

# One row per poll per snapshot
recent_df = pd.DataFrame(all_recent_data)

# Both helpers modify recent_df in place
clean_numeric(recent_df)
clean_polling_dates(recent_df)

# Keep only the average rows for the timeseries; copy so the in-place cleaning
# below does not operate on a slice of recent_df
trend_df_recent = recent_df.query('type=="rcp_average"').copy()

# Derive start/end dates from the average's "date" period string
clean_polling_period(trend_df_recent)

# The raw start/end fields are no longer needed
drop_cols = ["data_start_date", "data_end_date"]
trend_df_recent = trend_df_recent.drop(columns=drop_cols)

In [14]:
# One row per scraped pre-cutoff snapshot
trend_df_past = pd.DataFrame(data=all_candidates_data)

In [23]:
# Stack the scraped averages on top of the JSON-derived ones and keep the shared
# columns. ignore_index=True gives the combined frame a fresh 0..n-1 index
# instead of repeating row labels from the two source frames. Missing values
# (e.g. "spread_value" for the scraped rows) are filled with 0.
full_archive_trend = pd.concat(
    [trend_df_past, trend_df_recent], ignore_index=True
)[
    [
        "id",
        "type",
        "spread_winner",
        "fetch_date",
        "trump_value",
        "harris_value",
        "spread_value",
    ]
].fillna(0)

In [26]:
# Recompute the spread for every row as Trump minus Harris
full_archive_trend["spread_value"] = (
    full_archive_trend["trump_value"]
    .astype(float)
    .sub(full_archive_trend["harris_value"].astype(float))
)

In [29]:
# Show the rows fetched before August 2024 (fetch_date is compared as text)
full_archive_trend[full_archive_trend["fetch_date"] < "2024-08-01"]

Unnamed: 0,id,type,spread_winner,fetch_date,trump_value,harris_value,spread_value
0,7386,rcp_average,Trump,2024-02-21,49.3,42.7,6.6
1,7386,rcp_average,Trump,2024-02-24,49.3,42.7,6.6
2,7386,rcp_average,Trump,2024-03-12,49.3,42.7,6.6
3,7386,rcp_average,Trump,2024-04-20,49.3,42.7,6.6
4,7386,rcp_average,Trump,2024-05-15,49.3,42.7,6.6
5,7386,rcp_average,Trump,2024-06-28,49.3,42.7,6.6
6,7386,rcp_average,Trump,2024-06-29,49.3,42.7,6.6
7,7386,rcp_average,Trump,2024-06-30,49.3,42.7,6.6
8,7386,rcp_average,Trump,2024-07-02,46.0,44.0,2.0
9,7386,rcp_average,Trump,2024-07-03,46.0,44.0,2.0
