# **EDA of US Election Turnout**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
all_input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in all_input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Load the dataset**

In [None]:
# Read the turnout-rate table from the Kaggle input directory.
turnout_csv = "/kaggle/input/2020-us-general-election-turnout-rates/2020 November General Election - Turnout Rates.csv"
df = pd.read_csv(turnout_csv)

# Keep the rows whose State is 'United States' in their own frame.
is_national_row = df['State'] == 'United States'
df_usa = df.loc[is_national_row]

# Report (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

## **% of NA values in each column**

In [None]:
# Per-column share of missing values, as a percentage of the row count.
df.isna().sum().div(len(df)).mul(100)

## **Pre-processing of the data**

In [None]:
# Remove the columns that the rest of the notebook does not use.
# `errors='ignore'` makes the cell safe to re-run: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
df = df.drop(
    columns=['Source', 'Official/Unofficial', 'Vote for Highest Office (President)',
             'Overseas Eligible', 'State Abv'],
    errors='ignore',
)

# Remove the row with index label 0 so only the remaining rows are analysed.
# NOTE(review): presumably this is the nationwide 'United States' aggregate row
# (the loading cell filters on that value) — confirm against the CSV.
# Again tolerant of re-runs: if label 0 was already dropped nothing happens.
df = df.drop(index=[0], errors='ignore')

In [None]:
# Columns holding counts formatted with thousands separators (e.g. "1,234").
list_count = ['Total Ballots Counted (Estimate)', 'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)',
              'Prison', 'Probation', 'Parole', 'Total Ineligible Felon']
# Columns holding percentages formatted with a trailing percent sign (e.g. "66.2%").
list_perc = ['VEP Turnout Rate', '% Non-citizen']

# Strip the formatting characters and cast to numeric dtypes.
# - `astype(str)` first, so the cell can be re-run after the columns are already
#   numeric (the `.str` accessor raises AttributeError on non-string columns).
# - `regex=False` treats ',' and '%' as literal text regardless of the pandas
#   version's default for `regex`.
# - 'int64' is requested explicitly so the width does not depend on the platform.
for col in list_count:
    df[col] = df[col].astype(str).str.replace(',', '', regex=False).astype('int64')
for col in list_perc:
    df[col] = df[col].astype(str).str.replace('%', '', regex=False).astype('float')

# **Visualisation**

### **Heatmap (correlation) among various features**

In [None]:
# Pairwise correlation of the numeric features, drawn as a lower-triangle heatmap.
df_temp = df.set_index('State')

# Restrict to numeric columns before correlating: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (older versions silently skipped them).
corr = df_temp.select_dtypes(include='number').corr()

# Boolean mask that is True on and above the diagonal, so each pair of features
# is shown once and the trivial self-correlations are hidden.
mask = np.triu(np.ones(corr.shape, dtype=bool))

plt.figure(figsize=(10,10))
sns.heatmap(corr, mask=mask, center=0, annot=True,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()

### **Top 15 US states with the highest VEP Turnout Rate**

In [None]:
# Bar chart of the 15 states with the largest 'VEP Turnout Rate' values.
df_temp = df.set_index('State')

# State names ordered from highest to lowest turnout rate; keep the first 15.
turnout_desc = df_temp['VEP Turnout Rate'].sort_values(ascending=False)
top_turnout_states = turnout_desc.index[:15]

sns.set(style='darkgrid')
plt.figure(figsize=(12,6))
sns.barplot(data=df, x='State', y='VEP Turnout Rate', order=top_turnout_states)

plt.xlabel('State', weight='bold')
plt.ylabel('VEP Turnout Rate (%)', weight='bold')
plt.xticks(rotation=90)
plt.show()

### **Top 15 US states by VAP, comparing VAP and VEP**

In [None]:
# Overlaid bar charts of VAP (orange) and VEP (green) for the 15 states with the
# largest voting-age population.
df_temp = df.set_index('State')

# Both layers share the same state order: descending by VAP, first 15.
top_vap_states = df_temp['Voting-Age Population (VAP)'].sort_values(ascending=False).index[:15]

sns.set(style='darkgrid')
plt.figure(figsize=(12,6))

# (column to plot, legend label, bar colour) — drawn in this order so the VEP
# bars are painted on top of the VAP bars.
bar_layers = [
    ('Voting-Age Population (VAP)', 'Voting Age Population (VAP)', 'orange'),
    ('Voting-Eligible Population (VEP)', 'Voting Eligible Population (VEP)', 'green'),
]
for value_col, legend_text, bar_color in bar_layers:
    sns.barplot(x='State', y=value_col, data=df, label=legend_text,
                order=top_vap_states, color=bar_color)

plt.xticks(rotation=90)
plt.xlabel('State', weight='bold')
plt.ylabel('Population', weight='bold')

# Show plain numbers on the y axis instead of scientific notation.
y_formatter = plt.gcf().axes[0].yaxis.get_major_formatter()
y_formatter.set_scientific(False)

plt.legend(shadow=True, facecolor='lightyellow')
plt.show()

### **Top 10 US states by prison, probation, parole and total ineligible felon population**

In [None]:
# One pie chart per category, each showing the 10 states with the largest value
# in that column.
df_temp = df.set_index('State')
category_cols = ['Prison','Probation','Parole','Total Ineligible Felon']

# 2x2 grid of axes; the grid keeps its own name so the loop variable below does
# not shadow it.
fig, axes = plt.subplots(2, 2, figsize=(16,16))
for col, ax in zip(category_cols, axes.flat):
    top_ten = df_temp[col].sort_values(ascending=False).head(10)
    ax.pie(x=top_ten.tolist(), labels=list(top_ten.index), shadow=True, startangle=90)
    ax.set_title(col, weight='bold')
plt.show()

### **Prison, probation, parole and total ineligible felon populations in the 10 US states with the highest VEP**

In [None]:
# Grouped bar chart of the four felon-related counts for the 10 states with the
# largest voting-eligible population.
df_temp = (
    df.set_index('State')
      .sort_values('Voting-Eligible Population (VEP)', ascending=False)
      .head(10)
)
felon_cols = ['Prison','Probation','Parole','Total Ineligible Felon']

sns.set(style='darkgrid')
df_temp[felon_cols].plot(kind="bar", figsize=(14,8), width=0.7)

plt.xlabel('State', weight='bold')
plt.ylabel('Population', weight='bold')
plt.xticks(rotation=90)

# Show plain numbers on the y axis instead of scientific notation.
y_formatter = plt.gcf().axes[0].yaxis.get_major_formatter()
y_formatter.set_scientific(False)

plt.legend(shadow=True, facecolor='lightyellow')
plt.show()

# **Feel free to Upvote and provide Feedback**