In [1]:
import pandas as pd

## All results

In [2]:
# Load the results file, keeping only its first six columns (selected by position).
all_results = pd.read_csv("data.csv", usecols=list(range(6)))

In [3]:
# Summarize the loaded frame: column names, non-null counts and dtypes.
all_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13703 entries, 0 to 13702
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   county_name            13703 non-null  object 
 1   precinct_name          13703 non-null  object 
 2   office_title           13703 non-null  object 
 3   candidate_ballot_name  11161 non-null  object 
 4   yes_votes              13703 non-null  int64  
 5   no_votes               3199 non-null   float64
dtypes: float64(1), int64(1), object(4)
memory usage: 642.5+ KB


In [4]:
# Preview the first five rows of the full results.
all_results.head(n=5)

Unnamed: 0,county_name,precinct_name,office_title,candidate_ballot_name,yes_votes,no_votes
0,Adair,SOUTHWEST 1,State Senator - District 18,Cindy O'Laughlin,283,
1,Adair,SOUTHEAST 2,State Senator - District 18,Cindy O'Laughlin,635,
2,Adair,SOUTHEAST 3,State Senator - District 18,Cindy O'Laughlin,194,
3,Adair,SOUTHEAST 4,State Senator - District 18,Cindy O'Laughlin,325,
4,Adair,NORTHEAST 5,State Senator - District 18,Cindy O'Laughlin,396,


The plan:

- Split legislative race results and constitutional amendment results into separate DataFrames
- How many distinct precincts are there in the constitutional amendment results?
- In the legislative results, add computed columns for office type and district
- Split the senate results from the house results
- Find any precincts in house results with results for multiple districts
- Find any precincts in senate results with results for multiple districts

In [5]:
# Tally how many rows carry each office title and save the tally for inspection.
(
    all_results['office_title']
    .value_counts()
    .reset_index()
    .to_csv('office_title_counts.csv', index=False)
)

In [6]:
# Count the non-null office_title values within each county/precinct pair
# and write the counts out for inspection.
(
    all_results
    .groupby(['county_name', 'precinct_name'])['office_title']
    .count()
    .reset_index()
    .to_csv('office_titles_per_precinct.csv', index=False)
)

## Constitutional Amendment 1 results

- Filter to rows where `office_title` equals `'Constitutional Amendment No. 1'`
- Reset the index
- Drop the `office_title` and `candidate_ballot_name` columns
- Convert the `no_votes` column to an integer data type

In [7]:
# Boolean mask: True for rows that belong to the amendment question.
is_amend = all_results['office_title'].eq('Constitutional Amendment No. 1')

In [8]:
# Keep only the amendment rows, renumber them from zero, and discard the
# two columns that are not carried forward for this subset.
amend_results = (
    all_results
    .loc[is_amend]
    .reset_index(drop=True)
    .drop(columns=['office_title', 'candidate_ballot_name'])
)

In [9]:
# no_votes was read as float64 in the full frame; store it as int64 here.
amend_results['no_votes'] = amend_results['no_votes'].astype('int64')

In [10]:
# Check the amendment subset: remaining columns, non-null counts and dtypes.
amend_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3199 entries, 0 to 3198
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   county_name    3199 non-null   object
 1   precinct_name  3199 non-null   object
 2   yes_votes      3199 non-null   int64 
 3   no_votes       3199 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 100.1+ KB


In [11]:
# Preview the first five amendment rows.
amend_results.head(n=5)

Unnamed: 0,county_name,precinct_name,yes_votes,no_votes
0,Adair,SOUTHWEST 1,342,159
1,Adair,SOUTHEAST 2,688,394
2,Adair,SOUTHEAST 3,374,156
3,Adair,SOUTHEAST 4,500,212
4,Adair,NORTHEAST 5,418,235


### How many voters voted for or against Amendment 1?

In [12]:
# Total ballots cast on the amendment: sum each vote column, then add the two sums.
amend_voters_count = amend_results[['yes_votes', 'no_votes']].sum().sum()

In [13]:
# Display the combined yes + no vote total computed above.
amend_voters_count

2368729

### How many precincts had voters who voted for or against Amendment 1?

In [14]:
# Number of rows in the amendment subset.
# NOTE(review): this equals the number of precincts only if each county/precinct
# pair appears exactly once in amend_results — TODO confirm.
amend_precincts_count = amend_results.shape[0]

In [15]:
# Display the amendment row count computed above.
amend_precincts_count

3199

## Senate results

- Filter to rows where the `office_title` contains `'Senator'`
- Reset the index
- Add a column that contains the district number
- Drop the `office_title` column
- Drop the `no_votes` column

In [16]:
# Boolean mask: True where the office title includes the literal text 'Senator'.
contains_senator = all_results['office_title'].str.contains('Senator', regex=False)

In [17]:
# Keep only the senate rows and renumber them from zero.
senate_results = all_results.loc[contains_senator].reset_index(drop=True)

In [18]:
# Spot-check on a single row: cutting off the fixed title prefix should leave
# just the district number.
senate_results['office_title'].iloc[1][len('State Senator - District '):]

'18'

In [19]:
# Derive the district by dropping the fixed-length title prefix from every row.
# The result stays a string column.
# NOTE(review): this assumes every senate office_title starts with exactly this
# prefix — TODO confirm against office_title_counts.csv.
senate_results['district'] = senate_results['office_title'].str.slice(
    start=len('State Senator - District ')
)

In [20]:
# Check that no_votes holds nothing worth keeping for senate rows before it is
# dropped: a sum of 0.0 means every value is missing or zero.
senate_results['no_votes'].sum()

0.0

In [23]:
# Remove the columns that are no longer needed now that district is its own column.
# Re-running this cell on its own will fail because the columns are already gone.
senate_results = senate_results.drop(['office_title', 'no_votes'], axis='columns')

In [24]:
# Check the senate subset: remaining columns, non-null counts and dtypes.
senate_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3480 entries, 0 to 3479
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   county_name            3480 non-null   object
 1   precinct_name          3480 non-null   object
 2   candidate_ballot_name  3480 non-null   object
 3   yes_votes              3480 non-null   int64 
 4   district               3480 non-null   object
dtypes: int64(1), object(4)
memory usage: 136.1+ KB


In [25]:
# Preview the first five senate rows.
senate_results.head(n=5)

Unnamed: 0,county_name,precinct_name,candidate_ballot_name,yes_votes,district
0,Adair,SOUTHWEST 1,Cindy O'Laughlin,283,18
1,Adair,SOUTHEAST 2,Cindy O'Laughlin,635,18
2,Adair,SOUTHEAST 3,Cindy O'Laughlin,194,18
3,Adair,SOUTHEAST 4,Cindy O'Laughlin,325,18
4,Adair,NORTHEAST 5,Cindy O'Laughlin,396,18


## House results

- Filter to rows where the `office_title` contains `'Representative'`
- Reset the index
- Add a column that contains the district number
- Drop the `office_title` column
- Drop the `no_votes` column

In [27]:
# Boolean mask: True where the office title includes the literal text 'Representative'.
contains_rep = all_results['office_title'].str.contains('Representative', regex=False)

In [28]:
# Keep only the house rows and renumber them from zero.
rep_results = (
    all_results
    .loc[contains_rep]
    .reset_index(drop=True)
)

In [29]:
# Spot-check on a single row: cutting off the fixed title prefix should leave
# just the district number.
rep_results['office_title'].iloc[1][len('State Representative - District '):]

'3'

In [30]:
# Derive the district by dropping the fixed-length title prefix from every row.
# The result stays a string column.
# NOTE(review): this assumes every house office_title starts with exactly this
# prefix — TODO confirm against office_title_counts.csv.
rep_results['district'] = rep_results['office_title'].str.slice(
    start=len('State Representative - District ')
)

In [31]:
# Check that no_votes holds nothing worth keeping for house rows before it is
# dropped: a sum of 0.0 means every value is missing or zero.
rep_results['no_votes'].sum()

0.0

In [33]:
# Remove the columns that are no longer needed now that district is its own column.
# Re-running this cell on its own will fail because the columns are already gone.
rep_results = rep_results.drop(['office_title', 'no_votes'], axis='columns')

In [34]:
# Check the house subset: remaining columns, non-null counts and dtypes.
rep_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7024 entries, 0 to 7023
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   county_name            7024 non-null   object
 1   precinct_name          7024 non-null   object
 2   candidate_ballot_name  7024 non-null   object
 3   yes_votes              7024 non-null   int64 
 4   district               7024 non-null   object
dtypes: int64(1), object(4)
memory usage: 274.5+ KB


In [35]:
# Preview the first five house rows.
rep_results.head(n=5)

Unnamed: 0,county_name,precinct_name,candidate_ballot_name,yes_votes,district
0,Adair,SOUTHWEST 1,Danny Busick,258,3
1,Adair,SOUTHEAST 2,Danny Busick,620,3
2,Adair,SOUTHEAST 3,Danny Busick,187,3
3,Adair,SOUTHEAST 4,Danny Busick,306,3
4,Adair,NORTHEAST 5,Danny Busick,373,3


## Sanity check: The sum of rows in the sub-results should equal the number of rows in the full results

In [36]:
# The three subsets must together account for every row of the full results:
# a mismatch means some row was left out of, or matched by more than one, filter.
assert sum(map(len, (amend_results, senate_results, rep_results))) == len(all_results)

In [37]:
# Save the amendment subset without the row index.
amend_results.to_csv(path_or_buf='amend_results.csv', index=False)

In [38]:
# Save the senate subset without the row index.
senate_results.to_csv(path_or_buf='senate_results.csv', index=False)

In [39]:
# Save the house subset without the row index.
rep_results.to_csv(path_or_buf='rep_results.csv', index=False)

In [None]:
# NOTE(review): unfinished draft cell with no execution count. Its only grouping
# key is an empty string, which is not one of the columns listed by
# all_results.info(), so it will presumably raise a KeyError as written.
# Fill in the intended column names, or remove the cell, before a Run All.
all_results.groupby(['', ])