## Import Modules

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras import models, layers
import matplotlib.pyplot as plt

## Get the Paths

In [None]:
# Folder that holds the election dataset (this will vary for your specific folder setup).
path = "../input/2016-us-election/"
# List the folder's contents so the available files are visible in the cell output.
os.listdir(path)

## Load all the CSVs

In [None]:
# Read the three CSV tables from the dataset folder, in the order they are unpacked.
county_facts, county_dict, primaries = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        "county_facts.csv",
        "county_facts_dictionary.csv",
        "primary_results.csv",
    )
)
# Folder with the county shapefiles, kept as a string with a trailing slash.
SHAPEPATH = path + "county_shapefiles/"

In [None]:
# Show the shapefile folder path.
SHAPEPATH

In [None]:
# Show the column labels of the county facts table.
county_facts.columns

In [None]:
# Preview the first rows of the county facts table.
county_facts.head()

# Voting Analysis

## Processing

In [None]:
# Work on a copy of the county facts ordered alphabetically by area name.
counties = county_facts.sort_values(by="area_name")

In [None]:
# Keep only the rows that carry a state abbreviation.
has_state = counties["state_abbreviation"].notna()
counties = counties.loc[has_state]

In [None]:
# Drop the trailing word of every area name (the text after the last space).
# `n` is passed by keyword because pandas 2.0 made every `str.rsplit` argument
# except `pat` keyword-only; the positional form raises a TypeError there.
# Taking element 0 of the split also leaves a name without any space unchanged
# instead of turning it into an empty string, and `assign` builds a new frame
# rather than writing into a filtered slice.
counties = counties.assign(
    area_name=counties["area_name"].str.rsplit(" ", n=1).str[0]
)

In [None]:
NaN = np.nan
# Add four result columns to `counties`, all initialised to NaN.
for placeholder_col in ("party", "candidate", "votes", "fraction"):
    counties[placeholder_col] = NaN

In [None]:
# Show the column labels of the primary results table.
primaries.columns

## Create new Dataframes with Winners

In [None]:
# Column labels of the winner tables, applied in order to the first eight
# columns of `primaries`.
WINNER_COLUMNS = ['State', 'State_Abbreviation', 'County_Name', 'fips', 'party', 'Winning Candidate', 'Votes', 'Percentage']


def _county_winners(results, party):
    """Return one row per county: the `party` candidate with the most votes there.

    Parameters
    ----------
    results : pandas.DataFrame
        Primary results with a `party` column; its first eight columns are
        relabelled with WINNER_COLUMNS.
    party : str
        Value of the `party` column to keep, e.g. "Democrat".

    Returns
    -------
    pandas.DataFrame
        Columns WINNER_COLUMNS, one row per (State, County_Name) pair, with a
        fresh 0..n-1 index.

    Notes
    -----
    A county is identified by state *and* name, so same-named counties in
    different states are no longer merged into one. A county with no rows for
    `party` is simply absent from the result rather than being represented by
    a row of the other party. Ties go to the first row in `results`.
    """
    party_rows = results.loc[results.party == party].iloc[:, :len(WINNER_COLUMNS)].copy()
    party_rows.columns = WINNER_COLUMNS
    # Rows without a vote count can never win; a unique index is needed for idxmax/loc.
    party_rows = party_rows.dropna(subset=['Votes']).reset_index(drop=True)
    if party_rows.empty:
        return party_rows
    top_rows = party_rows.groupby(['State', 'County_Name'], sort=False)['Votes'].idxmax()
    return party_rows.loc[top_rows].reset_index(drop=True)


dem_winners = _county_winners(primaries, "Democrat")
rep_winners = _county_winners(primaries, "Republican")

Bar plot with democrat vs republican by 10% bins

In [None]:
# Move the current index of each winners table into a regular column and renumber the rows.
dem_winners, rep_winners = dem_winners.reset_index(), rep_winners.reset_index()

In [None]:
# Store the party label of both winners tables as plain strings.
for winners_table in (dem_winners, rep_winners):
    winners_table["party"] = winners_table["party"].astype(str)

In [None]:
# Inspect the column dtypes of the Republican winners table.
rep_winners.dtypes

## Start defining bins and values for the graph

In [None]:
# Upper edge of each bin; a value is counted in the first bin whose edge it does not exceed.
bins = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

dem_vals = [0] * len(bins)
dem_p = dem_winners['Percentage'].values

for fraction in dem_p:
    for slot, upper_edge in enumerate(bins):
        if fraction <= upper_edge:
            dem_vals[slot] += 1
            break  # values above the last edge (or NaN) fall through uncounted

In [None]:
# Count the Republican winners per bin: each value goes to the first bin whose
# upper edge it does not exceed; values matching no bin are not counted.
rep_p = rep_winners['Percentage'].values
rep_vals = [0] * len(bins)

for fraction in rep_p:
    slot = next((k for k, upper_edge in enumerate(bins) if fraction <= upper_edge), None)
    if slot is not None:
        rep_vals[slot] += 1

In [None]:
# Show the per-bin counts for the Democratic winners.
dem_vals

In [None]:
# Show the per-bin counts for the Republican winners.
rep_vals

In [None]:
# Show the bin edges.
bins

In [None]:
# One integer position per bin; the bar chart uses these as x coordinates and tick positions.
x = np.arange(len(bins))

## Make a Plot for the Percentage of the vote required to win a county for both democrats and republicans

Typically, Republicans require a smaller percentage of the vote to win a county than Democrats do. This could be attributed to many causes; perhaps the Republican vote is more evenly split.

In [None]:
width = 0.2  # width of one bar; every tick gets a Democrat and a Republican bar side by side

plt.style.use("dark_background")


def _annotate_bar_heights(axis, rects):
    """Write the height of every bar in `rects` just above that bar on `axis`."""
    for rect in rects:
        height = rect.get_height()
        axis.annotate('{}'.format(height),
                      xy=(rect.get_x() + rect.get_width() / 2, height),
                      xytext=(-6, 4),  # nudge the text 6 points left and 4 points up
                      textcoords="offset points", color="#FFFFFF",
                      ha='center', va='bottom')


fig, ax = plt.subplots(figsize=(10, 6))
# The third positional argument of `ax.bar` is the bar width. Passing `bins`
# there gave the bars widths of 0.1 ... 1.0 so they overlapped their
# neighbours; `width` is what the +/- width/2 offsets are built around.
rects1 = ax.bar(x - width/2, dem_vals, width, label='Democrat', color="#0048ff")
rects2 = ax.bar(x + width/2, rep_vals, width, label='Republican', color="#eb090d")

labels = ["0-10%", "10-20%", "20-30%", "30-40%", "40-50%", "50-60%", "60-70%", "70-80%", "80-90%", "90-100%"]

ax.set_xlabel('Vote Percentages to Win County')
ax.set_ylabel('Number of Counties')
ax.set_title('Democratic and Republican Counties vs Vote Percentage to win Respective Primaries')
ax.set_xticks(x)
ax.set_xticklabels(labels)
# Ticks every 200 counties: 200..1200 as before, extended upwards only when a
# bar is taller than 1200 so that no bar ends above the last tick.
tallest = int(np.max([1200, *dem_vals, *rep_vals]))
ax.set_yticks(np.arange(200, tallest + 200, 200))
ax.legend()

_annotate_bar_heights(ax, rects1)
_annotate_bar_heights(ax, rects2)

fig.tight_layout()

for side in ('bottom', 'top', 'right', 'left'):
    ax.spines[side].set_color('#dddddd')


plt.show()

Looking at the graph, we can clearly see that Democrats tend to need a higher vote percentage to win a county. This may be because the Democrats had fewer candidates than the Republicans.

In [None]:
# Split the primary results into the Democratic rows (d) and the Republican rows (r).
d, r = (
    primaries.loc[primaries.party == affiliation]
    for affiliation in ("Democrat", "Republican")
)

Average the vote percentages for each candidate in county, then compare dem vs rep

In [None]:
# Total votes per Republican candidate, in order of first appearance, plus the grand total.
r_arr = [r.loc[r.candidate == name].votes.sum() for name in r.candidate.unique()]
r_sum = sum(r_arr)

In [None]:
# Total votes per Democratic candidate, in order of first appearance, plus the grand total.
d_arr = []
for name in d.candidate.unique():
    d_arr.append(d.votes[d.candidate == name].sum())
d_sum = sum(d_arr)

In [None]:
# Turn each candidate's vote total into a percentage of the party's total votes.
r_arr = [candidate_votes * 100 / r_sum for candidate_votes in r_arr]
d_arr = [candidate_votes * 100 / d_sum for candidate_votes in d_arr]

In [None]:
# Candidate names per party, in the same order as the share lists.
reps = r.candidate.unique().tolist()
dems = d.candidate.unique().tolist()

In [None]:
# Pair every Republican share with its candidate name as [share, name].
rep_combined = [[share, reps[position]] for position, share in enumerate(r_arr)]

rep_combined

In [None]:
# Pair every Democratic share with its candidate name as [share, name].
dem_combined = [[share, dems[position]] for position, share in enumerate(d_arr)]

dem_combined

In [None]:
# Fold every Republican candidate below 1% into one trailing "Other" entry.
other = sum(entry[0] for entry in rep_combined if entry[0] < 1.0)
rep_filtered = [entry for entry in rep_combined if not entry[0] < 1.0]
rep_filtered.append([other, "Other"])
rep_filtered

In [None]:
# Separate the filtered Republican entries into parallel lists of shares and names.
r_vals = [share for share, _name in rep_filtered]
r_candidates = [name for _share, name in rep_filtered]

In [None]:
# Separate the Democratic entries into parallel lists of shares and names.
# Note: this rebinds `dem_vals`, which earlier held the per-bin winner counts.
dem_vals = [share for share, _name in dem_combined]
dem_candidates = [name for _share, name in dem_combined]

In [None]:
# Wedge offsets for the Republican pie: 0.2 below 10%, 0.075 above 30%, 0.05 otherwise.
explode = [
    0.2 if share < 10 else 0.075 if share > 30 else 0.05
    for share in r_vals
]

In [None]:
# Pie chart of the Republican vote split; `plt` comes from the import cell at the top.
plt.style.use("dark_background")

labels, sizes = r_candidates, r_vals

# The last wedge is the combined "Other" entry; give it the small offset.
explode[-1] = 0.05

fig1, ax1 = plt.subplots(figsize=(10, 10))
ax1.pie(
    sizes,
    explode=explode,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
    textprops={"backgroundcolor": "black", "color": "white"},
    colors=["red", "green", "blue",  "purple", "#615643", "cyan"],
)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

ax1.legend(labels)
ax1.set_title("Republican Vote Split")

plt.show()


This graph highlights the vote split of republicans. As we can see, it is relatively evenly distributed around the candidates, with only a few outliers.

In [None]:
# Wedge offsets for the Democratic pie: 0.2 for shares below 1%, 0.05 otherwise.
explode = [0.2 if share < 1 else 0.05 for share in dem_vals]

In [None]:
# Pie chart of the Democratic vote split; `plt` comes from the import cell at the top.
plt.style.use("dark_background")

labels, sizes = dem_candidates, dem_vals

# Give the last wedge the small offset regardless of its share.
explode[-1] = 0.05

fig1, ax1 = plt.subplots(figsize=(10, 10))
ax1.pie(
    sizes,
    explode=explode,
    autopct='%0.3f%%',
    shadow=True,
    startangle=90,
    textprops={"backgroundcolor": "black", "color": "white"},
    colors=["red", "green", "blue",  "purple", "cyan"],
)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

ax1.legend(labels)
ax1.set_title("Democratic Vote Split")

plt.show()


Here, the democrat vote split is displayed. We can see how 2 candidates take the majority of the vote.

As predicted, the Republicans had more candidates and the votes were more evenly distributed among them. Therefore, a smaller raw margin was required to win. The Democrats only had 2 major contenders, so while the vote was evenly split, the winning candidate required more votes.

# Ethnicity vs Voter Turnout and percentages
Arnav

In [None]:
# Show the county facts dictionary table.
county_dict

In [None]:
# Keep the first row for every distinct area name.
county_filtered = county_facts.drop_duplicates(subset=["area_name"])

In [None]:
# Keep only the text before the last space of every area name.
# `n` is passed by keyword because pandas 2.0 made every `str.rsplit` argument
# except `pat` keyword-only; `assign` returns a new frame instead of setting an
# attribute on a frame derived from `county_facts`.
county_filtered = county_filtered.assign(
    area_name=county_filtered["area_name"].str.rsplit(" ", n=1).str[0]
)

In [None]:
# Attach the county facts to every primary result by matching county name to area name.
# NOTE(review): the join key is the name alone, so same-named counties in
# different states share one set of facts - confirm whether a state or fips key should be added.
merged = pd.merge(
    primaries,
    county_filtered,
    left_on="county",
    right_on="area_name",
)

In [None]:
# Show the column labels of the merged table.
merged.columns

In [None]:
# Rebind r and d to the Republican and Democratic rows of the merged table.
r, d = (
    merged.loc[merged.party == affiliation]
    for affiliation in ("Republican", "Democrat")
)

In [None]:
# Scatter the RHI125214 column (x) against the POP060210 column (y) of the county facts.
plt.scatter(x=county_facts["RHI125214"], y=county_facts["POP060210"])
plt.title("Percentage of White Population vs Population per Square Mile")
plt.xlabel("Percentage")
plt.ylabel("Population")
plt.show()

No correlation in this graph.

In [None]:
# Scatter the RHI225214 column (x) against the POP060210 column (y) of the county facts.
plt.scatter(county_facts.RHI225214, county_facts.POP060210)
# Title typo fixed: "Arican" -> "African".
plt.title("Percentage of African American Population vs Population per Square Mile")
plt.xlabel("Percentage")
plt.ylabel("Population")
plt.show()

No correlation in this graph.

In [None]:
# Scatter the RHI325214 column (x) against the POP060210 column (y) of the county facts.
plt.scatter(x=county_facts["RHI325214"], y=county_facts["POP060210"])
plt.title("Percentage of American Indian Population vs Population per Square Mile")
plt.xlabel("Percentage")
plt.ylabel("Population")
plt.show()

Correlation between a lower percentage of American Indians and a higher population density. This suggests that Native Americans may tend to live in areas of low population density.

In [None]:
# Scatter the RHI425214 column (x) against the POP060210 column (y) of the county facts.
plt.scatter(x=county_facts["RHI425214"], y=county_facts["POP060210"])
plt.title("Percentage of Asian Population vs Population per Square Mile")
plt.xlabel("Percentage")
plt.ylabel("Population")
plt.show()

Correlation between more Asians in a county and a lower overall percentage. Asians may tend to stay in larger, more populated counties.

In [None]:
# Scatter the RHI525214 column (x) against the POP060210 column (y) of the county facts.
plt.scatter(x=county_facts["RHI525214"], y=county_facts["POP060210"])
plt.title("Percentage of Native Hawaiian vs Population per Square Mile")
plt.xlabel("Percentage")
plt.ylabel("Population")
plt.show()

Strong correlation between a higher population of Native Hawaiians and a low percentage overall. They likely stay in highly populated areas.

In [None]:
# Scatter the RHI725214 column (x) against the POP060210 column (y) of the county facts.
plt.scatter(x=county_facts["RHI725214"], y=county_facts["POP060210"])
plt.title("Percentage of Hispanic vs Population per Square Mile")
plt.xlabel("Percentage")
plt.ylabel("Population")
plt.show()

No Correlation

In [None]:
# Scatter the RHI825214 column (x) against the POP060210 column (y) of the county facts.
plt.scatter(x=county_facts["RHI825214"], y=county_facts["POP060210"])
plt.title("Percentage of White Alone (Nonhispanic/Latino) vs Population per Square Mile")
plt.xlabel("Percentage")
plt.ylabel("Population")
plt.show()

No Correlation

# Foreign Born Persons, Persons Per Household
Arnav

In [None]:
# Show the county facts dictionary table again for reference.
county_dict

In [None]:
#foreign born persons with bin percentages vs candidates (bar plot with bins)

In [None]:
# Show the POP645213 column of the merged table (the later chart labels it as the percent of foreign-born people).
merged.POP645213

In [None]:
#iterate through each candidate and create an array with the percent of votes they got vs the population bins. get the avg for each bin and plot

In [None]:
#bin into 10 arrs selected by foreign born percentages (10% intervals)
#average candidate vote percentage in those array
#plot

In [None]:
# Show the column labels of the merged table.
merged.columns

In [None]:
# Summary statistics of the POP645213 column.
merged.POP645213.describe()

In [None]:
# Check what kind of object `merged` is.
type(merged)

In [None]:
# One row per matched county: keep only the rows of a single candidate.
trumpet = merged[merged.candidate == "Donald Trump"]
trump_arr = []

# Count the rows falling into each 10-point band of POP645213
# ([0, 10), [10, 20), ... [50, 60)). The chart built from this list is
# labelled "Number of Counties", so the rows are counted; the previous code
# summed the percentage values themselves. Using >= on the lower edge also
# keeps values that sit exactly on a band boundary, which the strict
# comparisons on both sides dropped.
for i in range(1, 7):
    lower, upper = (i - 1) * 10, i * 10
    in_band = (trumpet.POP645213 >= lower) & (trumpet.POP645213 < upper)
    trump_arr.append(int(in_band.sum()))

trump_arr

In [None]:
# Compare the lengths of `x` and `trump_arr`.
len(x), len(trump_arr)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('dark_background')

x = ['0-10%', '10-20%', '20-30%', '30-40%', '40-50%', '50-60%']

x_pos = [i for i, _ in enumerate(x)]

plt.bar(x_pos, trump_arr, color='cyan')
plt.xlabel("Percent of Foreign Born People")
plt.ylabel("Number of Counties")
plt.title("Counties vs Foreign Born People")

plt.xticks(x_pos, x)


plt.show()

The majority of counties have very few foreign-born people. The maximum share is 60%, and most counties appear to fall in the lowest (0-10%) range.

# Median Household Income vs Candidate Performance


Setting up data and bins

In [None]:
NaN = np.nan
# Rows with a state abbreviation, reduced to (area name, INC110213 value) tuples.
county_list = county_facts[county_facts['state_abbreviation'].notnull()]
county_name_list = county_list['area_name'].values.tolist()
county_inc_list = county_list['INC110213'].values.tolist()
county_inc_data = list(zip(county_name_list, county_inc_list))

# Largest income, starting the search from 0.
max_inc = 0
for area, income in county_inc_data:
    if income > max_inc:
        max_inc = income

# Smallest income, starting the search from the maximum just found.
min_inc = max_inc
for area, income in county_inc_data:
    if income < min_inc:
        min_inc = income

print(min_inc, max_inc)

# Upper edges of the income brackets; this rebinds `bins` from the vote-share section.
bins = [30000, 45000, 60000, 75000, 90000, 105000, 120000, 135000]

Formatting candidate performance data

In [None]:
# One (candidate, votes, county) tuple per row of the primary results.
cand_list = primaries['candidate'].values.tolist()
votes_list = primaries['votes'].values.tolist()
c_votes_list = primaries['county'].values.tolist()
c_prim_data = list(zip(cand_list, votes_list, c_votes_list))

# For every county name keep the single row with the most votes, in one pass.
#
# This replaces a nested scan over all pairs of rows that, for each row,
# picked the best of the *other* candidates in the county, followed by a
# clean-up loop that removed items from `best_cand` while iterating over it
# and indexed `best_cand[i+1]` past the end of the list (IndexError). Counties
# with a single row produced `None` entries that the clean-up could not index.
#
# Counties are still keyed by name alone because the later income lookup
# matches on the name only. On a tie the row seen first is kept.
top_by_county = {}
for row in c_prim_data:
    county_name = row[2]
    current = top_by_county.get(county_name)
    if current is None or row[1] > current[1]:
        top_by_county[county_name] = row

# Same shape as before: a list of (candidate, votes, county) tuples, one per county name.
best_cand = list(top_by_county.values())

Organizing performance data into bins according to income 

In [None]:
# Inclusive upper bound of each income bracket, lowest first.
income_upper_bounds = [30000, 45000, 60000, 75000, 90000, 105000, 120000, 135000]
org_counties = [[] for _ in income_upper_bounds]

# Winning candidate by county name. When a name occurs more than once in
# `best_cand` the last entry wins, as in the full scan this lookup replaces.
winner_by_county = {}
for entry in best_cand:
    if entry is not None:
        winner_by_county[entry[2]] = entry[0]

# Put every county that has a known winner into the first bracket whose upper
# bound its income does not exceed. Incomes above the last bound (or NaN) match
# no bracket and are left out. The stripped name is kept in its own variable
# rather than in `str`, which shadowed the builtin.
for area_name, income in county_inc_data:
    county_name = area_name.replace(' County', '')
    if county_name not in winner_by_county:
        continue
    record = [winner_by_county[county_name], income, county_name]
    for bracket, upper_bound in zip(org_counties, income_upper_bounds):
        if income <= upper_bound:
            bracket.append(record)
            break

# The individual bracket names stay available; they are the same list objects as in org_counties.
bin1, bin2, bin3, bin4, bin5, bin6, bin7, bin8 = org_counties


def get_c_data(county_name):
    """Return the first entry of `best_cand` whose county equals `county_name`, or None if there is none."""
    for c in best_cand:
        if county_name == c[2]:
            return c


inc_labels = ['Up to $30,000', '$30,000 - $45,000', '$45,000 - $60,000', '$60,000 - $75,000', '$75,000 - $90,000', '$90,000 - $105,000', '$105,000 - $120,000', '$120,000 - $135,000']

# Distinct winning candidates in order of first appearance.
cand_list = []
for entry in best_cand:
    if entry is not None and entry[0] not in cand_list:
        cand_list.append(entry[0])

In [None]:
# Reduce every record in every bracket to its first element (the candidate name), in place.
for bracket in org_counties:
    bracket[:] = [record[0] for record in bracket]

In [None]:
def CountFrequency(my_list):
    """Return a dict mapping each distinct item of `my_list` to how often it occurs.

    Keys appear in order of first occurrence; an empty input gives an empty dict.
    """
    tally = {}
    for entry in my_list:
        if entry in tally:
            tally[entry] += 1
        else:
            tally[entry] = 1
    return tally

# Replace every bracket's list of names with its name -> count mapping, keeping the same outer list.
org_counties[:] = [CountFrequency(bracket) for bracket in org_counties]
    

Setting up data for graphs

In [None]:
# Per income bracket, turn the candidate -> county-count mapping into percentage shares.
# The loop variables no longer use the names `dict` and `max`: rebinding those
# builtins at notebook level broke any later re-run of the cells that call
# `dict.fromkeys`. The result lists are sized from `org_counties` instead of
# being hard-coded to eight slots.
top_cands = []
all_cands = [[] for _ in org_counties]
for bracket_idx, counts in enumerate(org_counties):
    total = 0
    highest = 0
    for n_counties in counts.values():
        total = total + n_counties
        if n_counties > highest:
            highest = n_counties
    # An empty bracket has no items, so the division below is never reached with total == 0.
    for name, n_counties in counts.items():
        share = n_counties / total * 100
        if n_counties == highest:
            top_cands.append((name, share))
        all_cands[bracket_idx].append((name, share))

# Parallel per-bracket lists: candidate names and their shares.
cand_list = [[name for name, _share in shares] for shares in all_cands]
chunks = [[share for _name, share in shares] for shares in all_cands]

Creating graphs

In [None]:
# One pie chart per income bracket; `plt` comes from the import cell at the top.
# Wedges with a share below 1 are offset by 0.2, all others by 0.05.
explode = [[0.2 if share < 1 else 0.05 for share in bracket] for bracket in chunks]

plt.style.use("dark_background")
for i in range(len(all_cands)):
    fig1, ax1 = plt.subplots(figsize=(10, 10))
    ax1.pie(
        chunks[i],
        autopct='%0.3f%%',
        explode=explode[i],
        shadow=True,
        startangle=90,
        textprops={"backgroundcolor": "black", "color": "white"},
        colors=["red", "green", "blue",  "purple", "cyan", "yellow"],
    )
    ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

    ax1.legend(cand_list[i])
    ax1.set_title("County Average Income: " + inc_labels[i])

    plt.show()