In [None]:
# The election data can be found at https://www.tse.jus.br/eleicoes/estatisticas/repositorio-de-dados-eleitorais-1
# Specifically, the data used to predict the mayoral elections is https://cdn.tse.jus.br/estatistica/sead/odsele/consulta_cand/consulta_cand_2020.zip

# The data consists of information about the candidates in the 2020 municipal elections.


In [None]:
#Import packages
from pathlib import Path
from zipfile import ZipFile

import wget
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.linear_model import PassiveAggressiveClassifier

In [None]:
######IMPORTING DATA

# Everything is resolved relative to the notebook's working directory so the
# cell runs on any machine (the previous absolute C:\Users\... path did not
# match the folder the archive is extracted into).
DATA_URL = 'https://cdn.tse.jus.br/estatistica/sead/odsele/consulta_cand/consulta_cand_2020.zip'
ZIP_PATH = Path('consulta_cand_2020.zip')
EXTRACT_DIR = Path('consulta_cand_2020')
CSV_PATH = EXTRACT_DIR / 'consulta_cand_2020_BRASIL.csv'

# Download and extract only when the files are missing, so that
# "Restart & Run All" does not fetch the archive again on every run.
if not ZIP_PATH.exists():
    wget.download(DATA_URL, out=str(ZIP_PATH))

if not CSV_PATH.exists():
    # The context manager guarantees the archive handle is closed.
    with ZipFile(ZIP_PATH, 'r') as archive:
        archive.extractall(EXTRACT_DIR)

# The TSE CSVs are ';'-separated, use ',' as the decimal mark and are
# distributed in Latin-1; the encoding is set explicitly so accented values
# such as 'NÃO ELEITO' decode the same way on every platform.
eleicao2020 = pd.read_csv(CSV_PATH, engine='python', sep=';', decimal=',',
                          encoding='latin-1')

#Check the data
head = eleicao2020.head()
head

In [None]:
# Keep only rows for the mayor post ('PREFEITO') whose DS_SIT_TOT_TURNO is
# either 'ELEITO' or 'NÃO ELEITO'. Executive elections use a majority vote,
# so the target has just two states, which makes the prediction easier.
is_mayor = eleicao2020['DS_CARGO'] == 'PREFEITO'
has_binary_outcome = eleicao2020['DS_SIT_TOT_TURNO'].isin(['ELEITO', 'NÃO ELEITO'])

# Both conditions are applied in one pass; the original row labels are kept.
eleicao2020 = eleicao2020[is_mayor & has_binary_outcome]

In [None]:
# Restrict the frame to a handful of features. Per the original note these
# include the campaign budget, the party number and whether the candidate
# was running for re-election, among others.
cols = [
    'NR_CANDIDATO',
    'NM_UE',
    'TP_AGREMIACAO',
    'DS_COMPOSICAO_COLIGACAO',
    'VR_DESPESA_MAX_CAMPANHA',
    'ST_REELEICAO',
]

df = eleicao2020[cols]

In [None]:
#CREATING THE RESPONSE VARIABLE

# df_base holds the raw (not yet encoded) features plus the outcome column.
df_base = df.copy()

# df is a column subset of eleicao2020 and therefore shares its row labels.
# Assigning the outcome column directly lets pandas align it label by label.
# The previous reset_index(drop=True) renumbered the outcomes 0..n-1 while
# df_base kept the filtered labels, so outcomes were attached to the wrong
# rows (or became NaN).
df_base['RESULTADOS'] = eleicao2020['DS_SIT_TOT_TURNO']

# Persist the base table for later inspection (written as CSV, index included).
df_base.to_csv('eleicoes_DB')


In [None]:
#CREATING LABELS FOR OBJECT TYPE COLUMNS

# Work on a copy so the selected feature frame `df` stays untouched.
df1 = df.copy()

le = LabelEncoder()

# Replace every object-typed column with integer codes. The builtin `object`
# is compared against because the `np.object` alias was removed in
# NumPy 1.24 and raises AttributeError there.
for col in df1.columns:
    if df1[col].dtype == object:
        # The encoder is re-fitted per column; after the loop `le` holds the
        # classes of the last encoded column only.
        df1[col] = le.fit_transform(df1[col])


In [None]:
#APPLYING STANDARDIZATION TO THE DATA. MIN MAX WAS USED

padr = MinMaxScaler()

# Scale the feature columns only. Selecting df.columns explicitly keeps the
# cell re-runnable: on a second run df1 already contains 'RESULTADOS', which
# previously got scaled as a feature and then made the column assignment
# fail with a length mismatch.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set ranges influence the scaling.
features_scaled = padr.fit_transform(df1[df.columns])

# fit_transform returns a plain array, so the frame is rebuilt with the
# original column names and a fresh 0..n-1 index.
df1 = pd.DataFrame(features_scaled, columns=df.columns)

# Binary target: 1 for 'ELEITO', 0 otherwise. The outcomes are taken
# positionally because df1's rows are in the same order as eleicao2020's
# but no longer share its row labels.
outcomes = eleicao2020['DS_SIT_TOT_TURNO'].to_numpy()
df1['RESULTADOS'] = np.where(outcomes == 'ELEITO', 1, 0)

In [None]:
#SPLITTING DATA

# Features are every column except the target.
X = df1.drop(columns='RESULTADOS')
y = df1['RESULTADOS']

# 70/30 hold-out split. A fixed random_state makes the split (and every
# score computed from it) reproducible across runs; stratify=y keeps the
# elected/not-elected ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Fit the whole LazyClassifier model zoo on the training split and score
# every model on the test split; `models` is the score table and
# `predictions` holds the per-model predictions (predictions=True).
reg = LazyClassifier(
    verbose=2,
    ignore_warnings=False,
    custom_metric=None,
    predictions=True,
)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

# Plot each model's balanced accuracy, with the model names on the x axis.
fig, ax = plt.subplots(figsize=(20, 12))
chart = sns.lineplot(x=models.index, y=models['Balanced Accuracy'], ax=ax)
plt.xticks(rotation=45)

In [None]:
#TESTING INDIVIDUALLY ONE OF THE BESTS

# random_state pins the estimator's internal shuffling so the
# cross-validation scores are reproducible between runs.
model = PassiveAggressiveClassifier(C=0.001, class_weight='balanced',
                                    n_iter_no_change=20, tol=1e-03,
                                    random_state=42)

# 2-fold cross-validation on the full table, scored with balanced accuracy.
# cross_validate is imported in the notebook's import cell.
cv_result = cross_validate(model,
                           df1.drop(columns='RESULTADOS'),
                           df1['RESULTADOS'],
                           cv=2,
                           scoring='balanced_accuracy')

# One score per fold. The previous `PRINT(...)` raised NameError because
# Python names are case-sensitive.
print(cv_result['test_score'])

