# Class for Scraping Last Saturday's Registrations

In [121]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from datetime import datetime, timedelta

class WebDataScraper:
    def __init__(self):
        self.saturdays_in_2024 = self.saturdays(2024)
        self.json_data = ""

    # Generate a list of all Saturdays in the year up to the current date.
    @staticmethod
    def saturdays(year : int):
        SAT = 5
        START_MONTH = 1
        START_DAY = 1
        start_date = datetime(year, START_MONTH, START_DAY) # starting on Jan 1
        start_day_of_the_week = start_date.weekday()
        days_until_saturday = SAT - start_day_of_the_week
        if days_until_saturday < 0:
            days_until_saturday += 7
        upcoming_sat = start_date + timedelta(days = days_until_saturday)
        
        saturdays = []
        saturdays.append(start_date.strftime('%m/%d/%Y')) # data starts at jan 1
        current_sat = upcoming_sat
        
        while current_sat < datetime.now():
            saturdays.append(current_sat.strftime('%m/%d/%Y'))
            current_sat += timedelta(weeks=1)

        return saturdays
    
    # Fetch data from the URL for the given Saturday and store the JSON content.
    def fetch_data(self, saturday_date : str):
        url = f'https://vt.ncsbe.gov/RegStat/Results/?date={saturday_date}'
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        scripts = soup.find_all('script')

        self.json_data = ""  # Clear previous JSON data

        for script in scripts:
            if script.string and 'SetupGrid' in script.string:
                script_content = script.string
                start_index = script_content.find("var data = ")
                end_index = script_content.find("// initialize the igGrid control")
                if start_index != -1 and end_index != -1:
                    self.json_data = script_content[start_index + len("var data = "):end_index].strip()
                    break

        if self.json_data:
            self.json_data = self.json_data.rstrip(',')
        else:
            raise ValueError("JSON data not found.")
        
    # Parse the JSON data into a DataFrame and add a 'Week Ending' column.
    def parse_json(self, saturday_date : str):
        if not self.json_data:
            raise ValueError("No JSON data to parse.")

        data = json.loads(self.json_data)
        df = pd.DataFrame(data)

        # Drop the 'AppVersion' column
        if 'AppVersion' in df.columns:
            df = df.drop(columns=['AppVersion'])

        # Add the 'Date' column with the Saturday's date
        df['Week Ending'] = saturday_date


        # Capitalize the first letter of each county name
        df['CountyName'] = df['CountyName'].str.capitalize()
        
        return df
    
    # Fetch and parse data for all Saturdays in the year, returning a list of DataFrames.
    def sat_dataframes(self):
        dfs = []
        for sat in self.saturdays_in_2024:
            self.fetch_data(sat) # NO!!
            df = self.parse_json(sat)
            dfs.append(df)
        return dfs

In [122]:
# Create an instance of the WebDataScraper class
scraper = WebDataScraper()

# Fetch and process the data, returning a DataFrame
sat_dfs = scraper.sat_dataframes()

# Merge FIPS with scraped dataframe
df_fips = pd.read_csv('FIPS.csv')
df_fips['FIPS'] = df_fips['FIPS'].astype(str).str.zfill(3)

voter_registration_by_fips_dfs = []

for df, sat_date in zip(sat_dfs, scraper.saturdays_in_2024):
    merged_df = pd.merge(df, df_fips, left_on='CountyName', right_on='County', how='inner')
    
    # Drop the 'CountyName' column
    merged_df = merged_df.drop(columns=['CountyName'])

    # Capitalize the first letter of each county name
    merged_df['County'] = merged_df['County'].str.capitalize()

    # Reorder columns to put 'County' and 'FIPS_Code' at the front
    columns_order = ['County', 'FIPS'] + [col for col in merged_df.columns if col not in ['County', 'FIPS']]
    merged_df = merged_df[columns_order]
    
    voter_registration_by_fips_dfs.append(merged_df)

# Concatenate all dataframes into one
combined_df = pd.concat(voter_registration_by_fips_dfs, ignore_index=True)
combined_df.to_csv('combined_data.csv', index=False)




In [120]:
import pandas as pd
from datetime import datetime
import plotly.express as px

# Assuming 'voter_registration_by_fips_dfs' is a list of DataFrames with registration data
def calculate_weekly_changes(dfs):
    all_dfs = pd.concat(dfs)
    all_dfs['Week Ending'] = pd.to_datetime(all_dfs['Week Ending'], format='%m/%d/%Y')

    # Ensure data is sorted by 'County' and 'Week Ending'
    all_dfs = all_dfs.sort_values(by=['County', 'Week Ending'])

    # Calculate the weekly change for each county
    all_dfs['Weekly Change'] = all_dfs.groupby('County')['Total'].diff().fillna(0)
    
    return all_dfs


In [108]:
def plot_weekly_changes(df):
    fig = px.line(df,
                  x='Week Ending',
                  y='Weekly Change',
                  color='County',
                  title='Weekly Changes in Voter Registrations',
                  labels={'Weekly Change': 'Change in Registrations'},
                  line_shape='linear')

    fig.update_layout(xaxis_title='Date', yaxis_title='Change in Registrations')
    fig.update_traces(mode='lines+markers')

    fig.show()

df_with_changes = calculate_weekly_changes(voter_registration_by_fips_dfs)
df_with_changes[df_with_changes['County'] == 'Alamance']
df_with_changes.head(30)
# plot_weekly_changes(df_with_changes)


Unnamed: 0,County,FIPS,Democrats,Republicans,Libertarians,Green,NoLabels,Constitution,JusticeForAll,WeThePeople,...,Multiracial,Undesignated,Other,Hispanic,Male,Female,UnDisclosedGender,Total,Week Ending,Weekly Change
0,Alamance,1,37085,36406,697,24,33,0,0,0,...,671,9579,4467,5873,47037,57081,8680,112798,2024-01-01,0.0
0,Alamance,1,37115,36495,697,24,36,0,0,0,...,670,9571,4501,5894,47142,57217,8680,113039,2024-01-13,241.0
0,Alamance,1,37217,36669,704,24,40,0,0,0,...,670,9575,4637,6010,47468,57529,8690,113687,2024-01-20,648.0
0,Alamance,1,37222,36709,702,24,43,0,0,0,...,669,9578,4672,6035,47551,57607,8712,113870,2024-01-27,183.0
0,Alamance,1,37215,36744,703,24,49,0,0,0,...,669,9583,4687,6048,47591,57650,8724,113965,2024-02-03,95.0
0,Alamance,1,37104,36697,701,25,51,0,0,0,...,669,9589,4704,6060,47517,57577,8736,113830,2024-02-10,-135.0
0,Alamance,1,37095,36697,700,25,51,0,0,0,...,670,9585,4712,6062,47517,57570,8739,113826,2024-02-17,-4.0
0,Alamance,1,37084,36682,700,25,51,0,0,0,...,671,9575,4712,6062,47500,57555,8731,113786,2024-02-24,-40.0
0,Alamance,1,37082,36672,699,25,51,0,0,0,...,672,9575,4711,6061,47496,57526,8731,113753,2024-03-02,-33.0
0,Alamance,1,37074,36673,698,25,51,0,0,0,...,672,9575,4711,6059,47490,57528,8731,113749,2024-03-09,-4.0


In [74]:
for df in voter_registration_by_fips_dfs:
    # print(type(df))
    print(df[['County', 'Total', 'Week Ending']])
    print('')  

       County   Total Week Ending
0    Alamance  112798  01/01/2024
1   Alexander   24990  01/01/2024
2   Alleghany    7907  01/01/2024
3       Anson   16127  01/01/2024
4        Ashe   20088  01/01/2024
..        ...     ...         ...
94      Wayne   74143  01/01/2024
95     Wilkes   43847  01/01/2024
96     Wilson   54840  01/01/2024
97     Yadkin   24761  01/01/2024
98     Yancey   14226  01/01/2024

[99 rows x 3 columns]

       County   Total Week Ending
0    Alamance  113039  01/13/2024
1   Alexander   25207  01/13/2024
2   Alleghany    7940  01/13/2024
3       Anson   16218  01/13/2024
4        Ashe   20220  01/13/2024
..        ...     ...         ...
94      Wayne   74626  01/13/2024
95     Wilkes   44099  01/13/2024
96     Wilson   55220  01/13/2024
97     Yadkin   24901  01/13/2024
98     Yancey   14317  01/13/2024

[99 rows x 3 columns]

       County   Total Week Ending
0    Alamance  113687  01/20/2024
1   Alexander   25211  01/20/2024
2   Alleghany    7945  01/20/2024


In [92]:
import pandas as pd

# Combine all weekly DataFrames into one
combined_weekly_df = pd.concat(voter_registration_by_fips_dfs, ignore_index=True)

# Ensure 'Week Ending' is in datetime format
combined_weekly_df['Week Ending'] = pd.to_datetime(combined_weekly_df['Week Ending'], format='%m/%d/%Y')

# Extract Year and Month from 'Week Ending'
combined_weekly_df['Year'] = combined_weekly_df['Week Ending'].dt.year
combined_weekly_df['Month'] = combined_weekly_df['Week Ending'].dt.month

# Group by County, Year, and Month, and sum the total registrations
monthly_totals_df = combined_weekly_df.groupby(['County', 'Year', 'Month']).agg({'Total': 'sum'}).reset_index()

# Sort by County, Year, and Month to ensure chronological order
monthly_totals_df.sort_values(by=['County', 'Year', 'Month'], inplace=True)

# Calculate the monthly change by comparing with the previous month
monthly_totals_df['Monthly Change'] = monthly_totals_df.groupby('County')['Total'].diff().fillna(0)

# The change for January (the first month) should be set to 0
monthly_totals_df.loc[monthly_totals_df['Month'] == 1, 'Monthly Change'] = 0

# Optionally, save the monthly changes to a CSV file
monthly_totals_df.to_csv('monthly_changes.csv', index=False)

# Display the first few rows of the monthly changes DataFrame
print(monthly_totals_df.head(30))


       County  Year  Month   Total  Monthly Change
0    Alamance  2024      1  453394             0.0
1    Alamance  2024      2  455407          2013.0
2    Alamance  2024      3  568740        113333.0
3    Alamance  2024      4  454302       -114438.0
4    Alamance  2024      5  453574          -728.0
5    Alamance  2024      6  573806        120232.0
6    Alamance  2024      7  460385       -113421.0
7    Alamance  2024      8  230451       -229934.0
8   Alexander  2024      1  100620             0.0
9   Alexander  2024      2  100865           245.0
10  Alexander  2024      3  125952         25087.0
11  Alexander  2024      4  100540        -25412.0
12  Alexander  2024      5  100435          -105.0
13  Alexander  2024      6  126715         26280.0
14  Alexander  2024      7  101287        -25428.0
15  Alexander  2024      8   50709        -50578.0
16  Alleghany  2024      1   31742             0.0
17  Alleghany  2024      2   31862           120.0
18  Alleghany  2024      3   39