# Supervised Learning Project

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
votes = pd.read_csv('data/votes.csv')
results_12_16 = pd.read_csv('data/US_County_Level_Presidential_Results_12-16.csv')
county = pd.read_csv('data/county_facts.csv')
county_dict = pd.read_csv('data/county_facts_dictionary.csv')

## Part 1: EDA

In [3]:
display(votes.shape)
display(results_12_16.shape)
display(county.shape)
display(county_dict.shape)

(3112, 82)

(3141, 21)

(3195, 54)

(51, 2)

In [4]:
display(votes.head(2))

Unnamed: 0.1,Unnamed: 0,X,combined_fips,votes_dem_2016,votes_gop_2016,total_votes_2016,Clinton,Trump,diff_2016,per_point_diff_2016,...,AFN120207,BPS030214,LND110210,Density,Clinton_Obama,Trump_Romney,Trump_Prediction,Clinton_Prediction,Trump_Deviation,Clinton_Deviation
0,30,29,1001,5908,18110,24661,0.239569,0.734358,12202,-0.494789,...,88157,131,594.44,91.8,-0.026189,0.008021,0.620859,0.340493,-0.113499,0.100924
1,31,30,1003,18409,72780,94090,0.195653,0.773515,54371,-0.577862,...,436955,1384,1589.78,114.6,-0.020013,-0.000383,0.586749,0.359502,-0.186766,0.163849


In [5]:
votes = votes.drop('Unnamed: 0', axis=1)

In [6]:
votes.head(2)

Unnamed: 0,X,combined_fips,votes_dem_2016,votes_gop_2016,total_votes_2016,Clinton,Trump,diff_2016,per_point_diff_2016,state_abbr,...,AFN120207,BPS030214,LND110210,Density,Clinton_Obama,Trump_Romney,Trump_Prediction,Clinton_Prediction,Trump_Deviation,Clinton_Deviation
0,29,1001,5908,18110,24661,0.239569,0.734358,12202,-0.494789,AL,...,88157,131,594.44,91.8,-0.026189,0.008021,0.620859,0.340493,-0.113499,0.100924
1,30,1003,18409,72780,94090,0.195653,0.773515,54371,-0.577862,AL,...,436955,1384,1589.78,114.6,-0.020013,-0.000383,0.586749,0.359502,-0.186766,0.163849


In [7]:
county_dict.head()

Unnamed: 0,column_name,description
0,PST045214,"Population, 2014 estimate"
1,PST040210,"Population, 2010 (April 1) estimates base"
2,PST120214,"Population, percent change - April 1, 2010 to ..."
3,POP010210,"Population, 2010"
4,AGE135214,"Persons under 5 years, percent, 2014"


In [8]:
county_dict.shape

(51, 2)

In [9]:
display(county.head())
display(county.shape)

Unnamed: 0,fips,area_name,state_abbreviation,PST045214,PST040210,PST120214,POP010210,AGE135214,AGE295214,AGE775214,...,SBO415207,SBO015207,MAN450207,WTN220207,RTN130207,RTN131207,AFN120207,BPS030214,LND110210,POP060210
0,0,United States,,318857056,308758105,3.3,308745538,6.2,23.1,14.5,...,8.3,28.8,5319456312,4174286516,3917663456,12990,613795732,1046363,3531905.43,87.4
1,1000,Alabama,,4849377,4780127,1.4,4779736,6.1,22.8,15.3,...,1.2,28.1,112858843,52252752,57344851,12364,6426342,13369,50645.33,94.4
2,1001,Autauga County,AL,55395,54571,1.5,54571,6.0,25.2,13.8,...,0.7,31.7,0,0,598175,12003,88157,131,594.44,91.8
3,1003,Baldwin County,AL,200111,182265,9.8,182265,5.6,22.2,18.7,...,1.3,27.3,1410273,0,2966489,17166,436955,1384,1589.78,114.6
4,1005,Barbour County,AL,26887,27457,-2.1,27457,5.7,21.2,16.5,...,0.0,27.0,0,0,188337,6334,0,8,884.88,31.0


(3195, 54)

### Rename columns in votes dataset

In [10]:
def clean_description(description):
    # Remove numbers
    # cleaned = ''.join([c for c in description if not c.isdigit()])
    # Remove special characters
    cleaned = ''.join([c for c in description if c.isalnum() or c.isspace()])
    # Remove the years 2014, 2009, and 2013
    cleaned = cleaned.replace('2014', '').replace('2009', '').replace('2013', '').replace('2007', '').replace('2010', '')
    # Replace spaces with underscores
    cleaned = cleaned.replace(' ', '_')
    # Replace "percent" with "pct"
    cleaned = cleaned.replace('percent', 'pct')
    # Replace "language" with "lang"
    cleaned = cleaned.replace('language', 'lang')
    # Convert to lowercase
    cleaned = cleaned.lower()
    # Replace "population" with "pop"
    cleaned = cleaned.replace('population', 'pop')
    # Remove the word "persons"
    # cleaned = cleaned.replace('persons', '')
    # Remove consecutive underscores
    cleaned = cleaned.replace('__', '_')
    # Remove underscores at the very end
    cleaned = cleaned.rstrip('_')
    return cleaned

county_dict['description_clean'] = np.vectorize(clean_description)(county_dict['description'])

In [11]:
county_dict['description_clean'].unique()

array(['pop_estimate', 'pop_april_1_estimates_base',
       'pop_pct_change_april_1_to_july_1', 'pop',
       'persons_under_5_years_pct', 'persons_under_18_years_pct',
       'persons_65_years_and_over_pct', 'female_persons_pct',
       'white_alone_pct', 'black_or_african_american_alone_pct',
       'american_indian_and_alaska_native_alone_pct', 'asian_alone_pct',
       'native_hawaiian_and_other_pacific_islander_alone_pct',
       'two_or_more_races_pct', 'hispanic_or_latino_pct',
       'white_alone_not_hispanic_or_latino_pct',
       'living_in_same_house_1_year_over_pct', 'foreign_born_persons_pct',
       'language_other_than_english_spoken_at_home_pct_age_5',
       'high_school_graduate_or_higher_pct_of_persons_age_25',
       'bachelors_degree_or_higher_pct_of_persons_age_25', 'veterans',
       'mean_travel_time_to_work_minutes_workers_age_16', 'housing_units',
       'homeownership_rate', 'housing_units_in_multiunit_structures_pct',
       'median_value_of_owneroccupied_ho

In [12]:
# Create a dictionary for faster column mapping
county_dict = county_dict.set_index('column_name').to_dict()['description_clean']

# Automate the column renaming process
votes.rename(columns=county_dict, inplace=True)

In [13]:
votes.columns

Index(['X', 'combined_fips', 'votes_dem_2016', 'votes_gop_2016',
       'total_votes_2016', 'Clinton', 'Trump', 'diff_2016',
       'per_point_diff_2016', 'state_abbr', 'county_name', 'FIPS',
       'total_votes_2012', 'votes_dem_2012', 'votes_gop_2012', 'county_fips',
       'state_fips', 'Obama', 'Romney', 'diff_2012', 'per_point_diff_2012',
       'fips', 'area_name', 'state_abbreviation', 'population2014',
       'population2010', 'population_change', 'pop',
       'persons_under_5_years_pct', 'persons_under_18_years_pct', 'age65plus',
       'female_persons_pct', 'White', 'Black',
       'american_indian_and_alaska_native_alone_pct', 'asian_alone_pct',
       'native_hawaiian_and_other_pacific_islander_alone_pct',
       'two_or_more_races_pct', 'Hispanic',
       'white_alone_not_hispanic_or_latino_pct',
       'living_in_same_house_1_year_over_pct', 'foreign_born_persons_pct',
       'NonEnglish', 'Edu_highschool', 'Edu_batchelors', 'veterans',
       'mean_travel_time_to_work_m

In [14]:
votes.shape

(3112, 81)