In [80]:
pip install xlrd

In [135]:
#---------------------------------------------------------------------------------------------------------------
# Nom du projet     : Classification
# Nom du programme  : SDA_Projet_Classification                                                                         
# Description       : L’objectif est de prédire le parti gagnant de l’élection présidentielle de 2020 aux
#                   : États-Unis à partir de données socio-démographiques
# Auteur            : Yaya KONE,Papa Moussa THIOUNE                                                                         
# Date de création  : Novembre 2022                       
#---------------------------------------------------------------------------------------------------------------

# Librairies utiles pour le data management
import pandas as pd 
import numpy as np
import datetime as dt           
import os, glob, re  

#---------------------------------------------------------------------------------------------------------------
# Partie 1 : Constitution des données
#---------------------------------------------------------------------------------------------------------------

# Load the county-level presidential results (2020, and 2008 to 2016).
df_results_2020 = pd.read_csv('data/2020_US_County_Level_Presidential_Results.csv')
df_results_0816 = pd.read_csv('data/US_County_Level_Presidential_Results_08-16.csv')

# Binary target on the 2020 results: 1 when 'diff' is strictly positive, 0 otherwise.
# Missing values compare False and therefore map to 0, exactly as the row-wise lambda did.
df_results_2020['target'] = (df_results_2020['diff'] > 0).astype('int64')


def _read_socio_file(path):
    """Read one socio-demographic .xls workbook and index it by its first column.

    The header row depends on the file name: row 2 when the name contains
    'Population', row 4 for every other workbook.

    Parameters
    ----------
    path : str
        Path of the .xls file to read.

    Returns
    -------
    pandas.DataFrame
        The sheet content, indexed by its first column (presumably the county
        FIPS code, per the original notebook comment -- verify against the files).
    """
    header_row = 2 if 'Population' in os.path.basename(path) else 4
    table = pd.read_excel(path, header=header_row)
    return table.set_index(table.columns[0])


# List the .xls workbooks found in the data folder.
# NOTE: glob returns paths in a filesystem-dependent order, so the column order of
# df_socio can differ between machines. The header rule is applied to EVERY file,
# including the first one, so the result no longer depends on which file glob lists first.
list_file_xls = glob.glob("data/*.xls")
if not list_file_xls:
    raise FileNotFoundError("No .xls file found in the 'data' directory")

# Read every workbook and align them side by side on their index in a single concat.
df_socio = pd.concat([_read_socio_file(path) for path in list_file_xls], axis=1)

# Build df_votes: the 2008-2016 results joined with the socio-demographic columns.
# The inner join keeps only the index values present on both sides.
df_votes = pd.concat(
    [df_results_0816.set_index('fips_code'), df_socio],
    axis=1,
    join="inner",
)

In [136]:
# Display the merged table: 2008-2016 results joined with the socio-demographic columns.
df_votes

Unnamed: 0,county,total_2008,dem_2008,gop_2008,oth_2008,total_2012,dem_2012,gop_2012,oth_2012,total_2016,...,"Percent of adults completing some college or associate's degree, 2000","Percent of adults with a bachelor's degree or higher, 2000","Less than a high school diploma, 2015-19","High school diploma only, 2015-19","Some college or associate's degree, 2015-19","Bachelor's degree or higher, 2015-19","Percent of adults with less than a high school diploma, 2015-19","Percent of adults with a high school diploma only, 2015-19","Percent of adults completing some college or associate's degree, 2015-19","Percent of adults with a bachelor's degree or higher, 2015-19"
26041,Delta County,19064,9974,8763,327,18043,8330,9533,180,18467,...,33.2,17.1,1563.0,8942.0,10417.0,5546.0,5.905244,33.784191,39.356960,20.953604
48295,Lipscomb County,1256,155,1093,8,1168,119,1044,5,1322,...,26.3,18.9,399.0,728.0,668.0,440.0,17.852348,32.572708,29.888144,19.686800
1127,Walker County,28652,7420,20722,510,28497,6551,21633,313,29243,...,23.3,9.1,8141.0,16796.0,14765.0,5080.0,18.179178,37.506142,32.970837,11.343843
48389,Reeves County,3077,1606,1445,26,2867,1649,1185,33,3184,...,13.8,8.0,3177.0,3433.0,2599.0,1291.0,30.257143,32.695236,24.752380,12.295238
56017,Hot Springs County,2546,619,1834,93,2495,523,1894,78,2535,...,30.1,17.9,184.0,1023.0,1433.0,822.0,5.314847,29.549393,41.392258,23.743502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17115,Macon County,51216,25487,24948,781,48742,22688,25249,805,47283,...,28.2,16.9,6968.0,25742.0,23933.0,16058.0,9.584462,35.408043,32.919765,22.087729
29215,Texas County,10851,3410,7215,226,10764,2871,7618,275,10935,...,20.9,10.8,3003.0,7089.0,5780.0,2470.0,16.372261,38.649002,31.512377,13.466361
46051,Grant County,3830,1786,1951,93,3606,1493,2034,79,3562,...,22.2,14.8,462.0,1901.0,1524.0,1088.0,9.286432,38.211056,30.633165,21.869347
17103,Lee County,16318,7765,8258,295,15275,6932,8046,297,15215,...,30.4,13.2,2804.0,8446.0,9181.0,4547.0,11.225879,33.813755,36.756344,18.204020


In [96]:
# Each socio-demographic workbook is paired with the row that holds its column headers.
socio_sources = {
    'data/Education.xls': 4,
    'data/PopulationEstimates.xls': 2,
    'data/PovertyEstimates.xls': 4,
    'data/Unemployment.xls': 4,
}

# Read the workbooks in the order listed above and bind each one to its own name.
df_education, df_pop, df_pov, df_unemp = [
    pd.read_excel(workbook, header=header_row)
    for workbook, header_row in socio_sources.items()
]

In [137]:
# Display the 2008-2016 county-level results as loaded from the CSV file.
df_results_0816

Unnamed: 0,fips_code,county,total_2008,dem_2008,gop_2008,oth_2008,total_2012,dem_2012,gop_2012,oth_2012,total_2016,dem_2016,gop_2016,oth_2016
0,26041,Delta County,19064,9974,8763,327,18043,8330,9533,180,18467,6431,11112,924
1,48295,Lipscomb County,1256,155,1093,8,1168,119,1044,5,1322,135,1159,28
2,1127,Walker County,28652,7420,20722,510,28497,6551,21633,313,29243,4486,24208,549
3,48389,Reeves County,3077,1606,1445,26,2867,1649,1185,33,3184,1659,1417,108
4,56017,Hot Springs County,2546,619,1834,93,2495,523,1894,78,2535,400,1939,196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3107,17115,Macon County,51216,25487,24948,781,48742,22688,25249,805,47283,18211,26782,2290
3108,29215,Texas County,10851,3410,7215,226,10764,2871,7618,275,10935,1728,8875,332
3109,46051,Grant County,3830,1786,1951,93,3606,1493,2034,79,3562,970,2381,211
3110,17103,Lee County,16318,7765,8258,295,15275,6932,8046,297,15215,5499,8597,1119
