In [1]:
# Basic libraries we need
import requests
import pandas as pd

In [2]:
# Send a desktop-browser User-Agent so the RealClearPolitics server sees the request as coming from a browser
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/126.0.0.0 Safari/537.36"
    ),
}

In [3]:
# Today's date in a text format, i.e. "August 2"
# The day number comes from .day rather than the "%-d" directive: "%-d" is a
# platform-specific strftime extension that is not available on Windows.
current_ts = pd.Timestamp('today')
today = f"{current_ts.strftime('%B')} {current_ts.day}"

In [4]:
# Numeric race identifier used in the RealClearPolitics URL path; change it to point
# the notebook at a different race's polling feed
RACE_ID = 7386

# URL of the poll json file
url = f'https://www.realclearpolitics.com/poll/race/{RACE_ID}/polling_data.json'

In [5]:
# Request the json file and get a response.
# The timeout keeps the cell from waiting forever if the server never answers, and
# raise_for_status() stops the notebook on an HTTP error (4xx/5xx) instead of
# handing an error body to the JSON parsing step that follows.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

In [6]:
# Parse the response body as json, then keep only the list stored under its "poll" key,
# leaving the meta and config entries behind.
# Each item in that list is a nested dictionary describing one poll -- one row of a spreadsheet, so to speak.
payload = response.json()
poll_data = payload['poll']

In [7]:
# Loop through the list of polls, extracting the items we want, and store them in a
# list of flat dictionaries (one per poll) for Pandas to read as a dataframe
flattened_polls = []

for individual_poll in poll_data:
    # "spread" is a nested object; fall back to an empty dict when the key is
    # missing or its value is null so the .get() calls below cannot fail
    spread_info = individual_poll.get("spread") or {}

    # Extract top-level items from the poll json (absent keys become None)
    poll_info = {
        "id": individual_poll.get("id"),
        "type": individual_poll.get("type"),
        "pollster": individual_poll.get("pollster"),
        "date": individual_poll.get("date"),
        "data_start_date": individual_poll.get("data_start_date"),
        "data_end_date": individual_poll.get("data_end_date"),
        "sampleSize": individual_poll.get("sampleSize"),
        "marginError": individual_poll.get("marginError"),
        "link": individual_poll.get("link"),
        "spread_winner": spread_info.get("name"),
        "spread_value": spread_info.get("value"),
    }

    # Extract nested candidate information into columns named "<candidate>_value",
    # with the candidate name lower-cased and spaces replaced by underscores
    for candidate in individual_poll.get("candidate") or []:
        candidate_name = candidate.get("name")
        poll_info[f"{candidate_name.lower().replace(' ', '_')}_value"] = (
            candidate.get("value")
        )

    # Store the dictionary for this poll in the list
    flattened_polls.append(poll_info)

# One big dataframe with all the polls, built once after the loop has finished.
# Constructing it inside the loop would rebuild the frame for every poll and
# leave all_df undefined whenever poll_data is empty.
all_df = pd.DataFrame(flattened_polls)

In [8]:
# Split the polls into two independent dataframes by their "type" column:
# rows typed "rcp_average" on one side, every other row on the other
just_average = all_df.loc[all_df['type'].eq("rcp_average")].copy()
just_polls = all_df.loc[all_df['type'].ne("rcp_average")].copy()

In [9]:
# Rows typed "poll_rcp_avg" are individual polls too; the tag marks them as the ones used to calculate the average.
# They are the most recent, distinct polls from pollsters RCP trusts enough to include in it.
just_polls.iloc[:20]

Unnamed: 0,id,type,pollster,date,data_start_date,data_end_date,sampleSize,marginError,link,spread_winner,spread_value,trump_value,harris_value
1,146390,poll_rcp_avg,Daily Kos/Civiqs,7/27 - 7/30,2024/07/27,2024/07/30,1123 RV,3.0,https://civiqs.com/documents/Civiqs_DailyKos_b...,Harris,4.0,45,49
2,146395,poll_rcp_avg,Rasmussen Reports,7/24 - 7/31,2024/07/24,2024/07/31,2163 LV,2.0,https://www.rasmussenreports.com/public_conten...,Trump,5.0,49,44
3,146332,poll_rcp_avg,Reuters/Ipsos,7/26 - 7/28,2024/07/26,2024/07/28,879 RV,3.5,https://www.reuters.com/world/us/harris-trump-...,Harris,1.0,42,43
4,146302,poll_rcp_avg,Harvard-Harris,7/26 - 7/28,2024/07/26,2024/07/28,2196 RV,2.1,https://harvardharrispoll.com/crosstabs-july-3/,Trump,4.0,52,48
5,146324,poll_rcp_avg,Morning Consult,7/26 - 7/28,2024/07/26,2024/07/28,11538 RV,1.0,https://pro.morningconsult.com/trackers/2024-p...,Harris,1.0,46,47
6,146279,poll_rcp_avg,Wall Street Journal,7/23 - 7/25,2024/07/23,2024/07/25,1000 RV,3.1,https://www.wsj.com/politics/elections/harris-...,Trump,2.0,49,47
7,146250,poll_rcp_avg,Forbes/HarrisX,7/22 - 7/25,2024/07/22,2024/07/25,3013 RV,1.8,https://www.realclearpolitics.com/docs/2024/Ha...,Trump,2.0,51,49
8,146229,poll_rcp_avg,NY Times/Siena,7/22 - 7/24,2024/07/22,2024/07/24,1142 LV,,https://www.nytimes.com/interactive/2024/07/25...,Trump,1.0,48,47
9,146217,poll,Rasmussen Reports,7/22 - 7/24,2024/07/22,2024/07/24,1074 LV,3.0,https://www.rasmussenreports.com/public_conten...,Trump,7.0,50,43
10,146230,poll,Morning Consult,7/22 - 7/24,2024/07/22,2024/07/24,11297 RV,1.0,https://pro.morningconsult.com/trackers/2024-p...,Harris,1.0,45,46


In [10]:
# Number of individual polls in this contest (row count of the dataframe)
just_polls.shape[0]

62

In [11]:
# Show the latest average. Only one row is expected here: the entry whose type is "rcp_average".
just_average

Unnamed: 0,id,type,pollster,date,data_start_date,data_end_date,sampleSize,marginError,link,spread_winner,spread_value,trump_value,harris_value
0,7386,rcp_average,rcp_average,7/22 - 7/31,,,,,,Trump,1.2,47.7,46.5


In [12]:
# State of the race, read from the first row of the average dataframe:
# the size of the spread as a plain Python float, and the name attached to it
spread_values = just_average['spread_value'].astype(float)
spread = float(spread_values.iat[0])
leading = just_average['spread_winner'].iat[0]

In [13]:
# Dynamic narrative from the data.
# A zero spread means nobody is ahead, so that case gets its own sentence rather than
# claiming that someone "leads ... by 0.0 percentage points".
if spread == 0:
    state = f"The general election race is tied nationally as of {today}, according to the RealClearPolitics average of polls."
else:
    state = f"{leading} leads the general election race nationally as of {today} by {spread} percentage points, according to the RealClearPolitics average of polls."

In [14]:
# Display the finished sentence
state

'Trump leads the general election race nationally as of August 2 by 1.2 percentage points, according to the RealClearPolitics average of polls.'