In [None]:
# Cell 1: Import Libraries and Load Initial Data

import pandas as pd
import matplotlib.pyplot as plt

# Read the two CSV inputs used by this notebook from the data/ directory.
polls = pd.read_csv('data/house_polls_merged.csv')
elections = pd.read_csv('data/1976-2022-house.csv')

# Print the column names of each frame, polls first.
for label, frame in (
    ("Polls Data Columns:", polls),
    ("Elections Data Columns:", elections),
):
    print(label, frame.columns.tolist())


In [None]:
# Cell 2: Load and Prepare Elections Data (2018-2022)

# Read the 2018-2022 results and give the frame a fresh 0..n-1 integer index.
elections_2018_2022 = pd.read_csv('data/2018-2022-house.csv').reset_index(drop=True)

# One group per distinct (year, state, district) combination.
race_keys = ['year', 'state', 'district']
election_groupby = elections_2018_2022.groupby(race_keys)

# Report how many groups were formed.
group_count = len(election_groupby)
print(f"Total Groups: {group_count}")


In [None]:
# Distinct values found in the 'stage' column of the 2018-2022 results.
stage_values = set(elections_2018_2022['stage'])
print(stage_values)

In [None]:
# Cell 4: Process Elections Data into Structured DataFrame
#
# Collapses every (year, state, district) group into one row holding the two
# rows with the most 'candidatevotes', then saves the result as CSV.
#
# NOTE(review): groups are ranked exactly as they come out of the groupby; no
# rows are filtered out first (for example by the 'stage' column). Confirm the
# input only contains the rows that should compete for the top-two slots.


def _summarize_top_two(group_key, group_df):
    """Summarize one (year, state, district) group as a single output row.

    Rows are ranked by 'candidatevotes', highest first. The first row is
    reported both as c1 and as the winner; the second row, when the group has
    one, is reported as c2.

    Args:
        group_key: (year, state, district) tuple yielded by the groupby.
        group_df: the rows of the elections frame that belong to that group.

    Returns:
        list in output-column order: year, state, district, c1, c2, c1_party,
        c2_party, c1_votes, c2_votes, winner, winner_party. The three c2
        fields are None when the group has a single row.
    """
    year, state, district = group_key
    ranked = group_df.sort_values(by='candidatevotes', ascending=False)

    top = ranked.iloc[0]
    if len(ranked) > 1:
        runner_up = ranked.iloc[1]
        c2_name = runner_up['candidate']
        c2_party = runner_up['party']
        c2_votes = runner_up['candidatevotes']
    else:
        c2_name, c2_party, c2_votes = None, None, None

    return [
        year,
        state,
        district,
        top['candidate'],
        c2_name,
        top['party'],
        c2_party,
        top['candidatevotes'],
        c2_votes,
        top['candidate'],  # winner is the top vote-getter
        top['party'],      # winner_party
    ]


# The earlier per-group guard on 'totalvotes' was dropped: it was identical on
# every iteration, and 'totalvotes' is never read here. Only 'candidate',
# 'party' and 'candidatevotes' are used, and a missing one of those raises
# KeyError from pandas rather than silently producing an empty output file.
election_rows = [
    _summarize_top_two(group_key, group_df)
    for group_key, group_df in election_groupby
]

# Create the elections DataFrame from the collected rows
election_df = pd.DataFrame(
    election_rows,
    columns=[
        'year', 'state', 'district',
        'c1', 'c2', 'c1_party', 'c2_party',
        'c1_votes', 'c2_votes',
        'winner', 'winner_party'
    ]
)

# Display the first few rows
print("Elections DataFrame Preview:")
print(election_df.head())

# Save the processed elections data to CSV
election_df.to_csv('data/elections-2018-2022.csv', index=False)
print("Processed elections data saved to 'data/elections-2018-2022.csv'")

In [None]:
# Writes election_df to the same path, with the same arguments, as the save at
# the end of the previous cell; running this cell on its own simply refreshes
# the file on disk from the current election_df.
election_df.to_csv('data/elections-2018-2022.csv', index=False)

In [None]:
# Build poll_df: one row per race (state, seat_number, cycle) holding the two
# highest-'pct' candidates from the first poll encountered for that race.
polls = pd.read_csv('data/house_polls_merged.csv')
# .str.upper() leaves missing values as NaN, whereas calling x.upper() on each
# element raises AttributeError on the first missing candidate or state.
polls['candidate_name'] = polls['candidate_name'].str.upper()
polls['state'] = polls['state'].str.upper()

# print(polls.columns)
groupby = polls.groupby(['state', 'seat_number', 'cycle', 'poll_id'])

groups = list(groupby)


# Attributes that could be useful (beyond the obvious):
# poll_id, pollster_id, sponsor_id, pollster_rating_id, pollscore, methodology, transparency score, sample_size, partisan


def _top_two_by_pct(poll_rows):
    """Pick the two highest-'pct' rows of a single poll.

    Args:
        poll_rows: the rows of `polls` belonging to one
            (state, seat_number, cycle, poll_id) group.

    Returns:
        [c1, c2, c1_party, c2_party, c1_pct, c2_pct] where c1 is the row with
        the largest pct and c2 the next largest. The three c2 values are None
        when no second row has a pct value. Returns None when no row in the
        poll has a pct value at all.
    """
    pct = poll_rows['pct'].dropna()
    if pct.empty:
        return None

    c1_idx = pct.idxmax()
    c1 = poll_rows.at[c1_idx, 'candidate_name']
    c1_party = poll_rows.at[c1_idx, 'party']
    c1_pct = pct[c1_idx]

    # Explicit "no runner-up" check instead of catching every exception.
    remaining_pct = pct.drop(index=c1_idx)
    if remaining_pct.empty:
        c2, c2_party, c2_pct = None, None, None
    else:
        c2_idx = remaining_pct.idxmax()
        c2 = poll_rows.at[c2_idx, 'candidate_name']
        c2_party = poll_rows.at[c2_idx, 'party']
        c2_pct = remaining_pct[c2_idx]

    return [c1, c2, c1_party, c2_party, c1_pct, c2_pct]


poll_records = []
elections_done = []   # race keys in first-seen order, as [state, seat_number, cycle]
seen_races = set()    # same keys as tuples, for constant-time membership tests
skipped_polls = 0

# NOTE(review): only the first poll met for each race is used. groupby sorts
# its keys by default, so that is the lowest poll_id for the race - confirm
# this is the intended choice rather than, say, the latest poll or an average.
for group_key, poll_group in groups:
    state, seat_number, cycle = group_key[:3]
    race_key = (state, seat_number, cycle)
    if race_key in seen_races:
        skipped_polls += 1
        continue

    top_two = _top_two_by_pct(poll_group)
    if top_two is None:
        # No usable pct in this poll; leave the race open for a later poll.
        continue

    poll_records.append([cycle, state, seat_number] + top_two)
    seen_races.add(race_key)
    elections_done.append(list(race_key))

# Build the frame in one step rather than growing it a row at a time.
poll_df = pd.DataFrame(
    poll_records,
    columns=['year', 'state', 'district', 'c1', 'c2', 'c1_party', 'c2_party', 'c1_pct', 'c2_pct'],
)

print(f"Skipped {skipped_polls} polls for races that already had a row")
print(elections_done)
print(len(poll_df))

In [None]:
# Show the column index of each frame that feeds the merge.
print(poll_df.columns)
print(election_df.columns)

# Number of rows in poll_df.
print(len(poll_df))

In [None]:
# Join election results to poll summaries on the race key and save the result.
# (An earlier index-based outer .join() attempt was replaced by the merge below.)
merge_keys = ['year', 'state', 'district']

# Row counts of both inputs, for comparison with the joined row count.
print(len(election_df))
print(len(poll_df))

# Race-key columns of the raw polls frame.
polls_key = polls[['state', 'seat_number', 'cycle']]

# Inner join: only races present in both frames survive. Non-key columns that
# exist in both inputs (c1, c2, c1_party, c2_party) receive pandas' default
# '_x' (election_df) and '_y' (poll_df) suffixes.
joined_df = election_df.merge(poll_df, on=merge_keys, how='inner')

print(len(joined_df))

# winner_party keeps its original labels; no numeric encoding is applied here.

# Count the distinct (state, year, district) combinations in the joined frame.
joined_races = set(zip(joined_df['state'], joined_df['year'], joined_df['district']))
print(len(joined_races))

joined_df.to_csv('data/joined_polls_elections.csv', index=False)

# Bar chart of how many joined rows each winning party accounts for.
joined_df['winner_party'].value_counts().plot.bar()