In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning in the FutureWarning category from this point on.
# Note: the filter is category-wide; it is not limited to one specific warning.
warnings.simplefilter("ignore", category=FutureWarning)

In [2]:
# Root shared by every RealClearPolling poll page.
base_url = "https://www.realclearpolling.com/polls/"

# Poll pages this notebook has been pointed at, keyed by a short name.
# Previously both paths were assigned to `sample_url` back to back, so the
# first assignment was silently discarded; selecting by key makes the choice
# explicit while keeping the alternative on record.
POLL_PATHS = {
    "republican-primary": "president/republican-primary/2024/national",
    "trump-vs-biden": "president/general/2024/trump-vs-biden",
}

# NOTE: the DataFrame cell further down reads the "Trump (R)" and "Biden (D)"
# columns, so only the "trump-vs-biden" page works end to end as written.
SELECTED_POLL = "trump-vs-biden"

sample_url = POLL_PATHS[SELECTED_POLL]
url = base_url + sample_url

In [3]:
# Seconds to pause after navigation so dynamically loaded content has time to
# appear before the page source is captured. Adjust if the table is missing.
PAGE_LOAD_WAIT_SECONDS = 5

# Load the page in a Chrome session and capture the HTML after the wait.
driver = webdriver.Chrome()
try:
    driver.get(url)
    time.sleep(PAGE_LOAD_WAIT_SECONDS)
    html_content = driver.page_source
finally:
    # Close the browser even when navigation fails, so an error does not
    # leave a Chrome process running in the background.
    driver.quit()

# Parse the captured HTML.
soup = BeautifulSoup(html_content, 'html5lib')

tables = soup.find_all('table')
if not tables:
    # Fail with an explanation instead of a bare IndexError further down.
    raise ValueError(
        f"No <table> elements found at {url}; "
        "the page may not have finished loading or its layout has changed."
    )

# With exactly two tables the second one is used; otherwise the first.
# NOTE(review): presumably the first of two tables is not the poll list - confirm.
table = tables[1] if len(tables) == 2 else tables[0]

# One list of stripped cell texts per table row (header and data cells alike).
table_data = [
    [cell.text.strip() for cell in row.find_all(['td', 'th'])]
    for row in table.find_all('tr')
]

In [12]:
# Convert the scraped rows into a DataFrame of individual polls.
# table_data[0] supplies the column names. table_data[1] is skipped as well:
# per the original note it is RCP summary data rather than a poll.
FIRST_POLL_ROW = 2
LAST_POLL_ROW = 150  # exclusive upper bound on the scraped rows that are kept

# A month jump larger than this between consecutive rows is treated as
# crossing a year boundary (e.g. January -> December); smaller jumps are
# treated as ordinary ordering noise.
YEAR_ROLLOVER_MONTH_JUMP = 6

now = datetime.now()
current_year = str(now.year)
# Not used below; kept so any other cell that refers to it keeps working.
prev_year = str(now.year - 1)

df = pd.DataFrame(table_data[FIRST_POLL_ROW:LAST_POLL_ROW], columns=table_data[0])

# Positive values mean Trump is ahead, negative values mean Biden is ahead.
df["Difference"] = df["Trump (R)"].astype(float) - df["Biden (D)"].astype(float)

# "sample" looks like "1028 RV": size first, voter type second.
sample_parts = df["sample"].str.split(" ")
df["Type of Voter"] = sample_parts.str[1]
df["Sample Size"] = sample_parts.str[0]

# "date" looks like "1/22 - 1/24" and carries no year. Taking the LAST piece
# gives the end date and also copes with a date that has no "-" in it.
df["End Date"] = df["date"].str.split("-").str[-1].str.strip()
df["Poll Month"] = df["End Date"].str.split("/").str[0].astype(int)

# Infer the year of each poll. Rows are assumed to be ordered newest first
# (as in the scraped table), with the newest poll falling in the current year.
# Walking down the rows, each large upward jump in month means we crossed into
# the previous calendar year. Unlike looking up the first December row, this
# works when no poll ends in December and when the rows span several years.
year_rollovers = (df["Poll Month"].diff() > YEAR_ROLLOVER_MONTH_JUMP).cumsum()
df["Year"] = (now.year - year_rollovers).astype(str)

df["End Date"] = pd.to_datetime(df["End Date"] + "/" + df["Year"], format="%m/%d/%Y")
df

Unnamed: 0,pollster,date,sample,moe,Trump (R),Biden (D),spread,Difference,Type of Voter,Sample Size,End Date,Poll Month,Year
0,Reuters/Ipsos,1/22 - 1/24,1028 RV,—,43,38,Trump+5,5.0,RV,1028,2024-01-24,1,2024
1,Economist/YouGov,1/21 - 1/23,1497 RV,2.8,44,43,Trump+1,1.0,RV,1497,2024-01-23,1,2024
2,The Messenger/HarrisX,1/17 - 1/21,3034 RV,1.8,53,47,Trump+6,6.0,RV,3034,2024-01-21,1,2024
3,Morning Consult,1/19 - 1/21,6417 RV,1.0,45,40,Trump+5,5.0,RV,6417,2024-01-21,1,2024
4,Harvard-Harris,1/17 - 1/18,2346 RV,—,53,47,Trump+6,6.0,RV,2346,2024-01-18,1,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,Marquette,5/8 - 5/18,833 RV,4.1,52,47,Trump+5,5.0,RV,833,2023-05-18,5,2023
144,Yahoo News**,5/5 - 5/8,1060 RV,3.0,43,45,Biden+2,-2.0,RV,1060,2023-05-08,5,2023
145,ABC News/Wash Post,4/28 - 5/3,900 RV,4.0,45,39,Trump+6,6.0,RV,900,2023-05-03,5,2023
146,Economist/YouGov,4/29 - 5/2,1357 RV,3.0,46,46,Tie,0.0,RV,1357,2023-05-02,5,2023


In [13]:
import plotly.express as px

# Scatter of each poll's Trump-minus-Biden margin against its end date,
# coloured by voter type, with an OLS trend line.
# The `labels` mapping must be keyed by real column names; the previous
# placeholder keys ('value_column', 'datetime_column') matched nothing and
# therefore had no effect on the axis titles.
fig = px.scatter(
    df,
    x='End Date',
    y='Difference',
    color='Type of Voter',
    trendline="ols",
    title='Trump Biden Spread',
    labels={
        'End Date': 'Poll end date',
        'Difference': 'Trump (R) minus Biden (D)',
    },
    hover_data=['pollster', 'Sample Size'],
    template='seaborn',
    width=1000,
    height=600,
)
fig.show()

## LV vs. RV Time Series Analysis

Also adjust for which pollster is conducting the poll (pollster bias), plus a recency-weighted bias adjustment.
Predict the spread from voter type (LV vs. RV), date, and polling company, using an ARIMA model with dummy variables for the polling companies.
The baseline prediction could be the RealClear spread at that time; we can then compare whether that spread is better or worse than our prediction.

Look for differences between Joe Biden voters and liberal voters, as well as between Trump voters and conservative voters. Has anything changed?

Examine the difference between overall favorability and job approval.

Compare who respondents say they are likely to vote for with how they voted in the past.

In [6]:
#Time Series Modeling (ARIMA)

#Spread = Poll Company + Type of Voter + Time + Sample Size?

