In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [2]:
df = pd.read_csv('dados.csv')

def returnGender(g):
    if(g == 1):
        return "Masculino"
    else:
        return "Feminino"
    
def returnUF(u):
    return {
        11: "Rondônia",
        12: "Acre",
        13: "Amazonas",
        14: "Roraima",
        15: "Pará",
        16: "Amapá",
        17: "Tocantis",
        21: "Maranhão",
        22: "Piauí",
        23: "Ceará",
        24: "Rio Grande do Norte",
        25: "Paraíba",
        26: "Pernambuco",
        27: "Alagoas",
        28: "Sergipe",
        29: "Bahia",
        31: "Minas Gerais",
        32: "Espírito Santo",
        33: "Rio de Janeiro",
        35: "São Paulo",
        41: "Paraná",
        42: "Santa Catarina",
        43: "Rio Grande do Sul",
        50: "Mato Grosso do Sul",
        51: "Mato Grosso",
        52: "Goias",
        53: "Distrito Federal"
    }[u]

def returnEthnicGroup(e):
    return {
        0: "Indígena",
        2: "Branco",
        4: "Negro",
        6: "Asiático",
        8: "Pardo",
        9: "Sem Declaração"
    }[e]

def returnGruposDeIdade(i):
    if(i <= 18):
        return "Menor de 18 anos"
    elif(i <= 25):
        return "Entre 18 e 25 anos"
    elif(i <= 40):
        return "Entre 25 e 40 anos"
    elif (i <= 60):
        return "Entre 40 e 60 anos"
    else:
        return "Maior de 60 anos"

def returnAnosDeEstudo(e):
    if(e == 17):
        return "Não Informado"
    return e - 1

def returnClasseSocial(r):
    salario_minimo = 1100
    if(r <= salario_minimo):
        return "E"
    elif(r <= salario_minimo*3):
        return "D"
    elif(r <= salario_minimo*5):
        return "C"
    elif(r <= salario_minimo*15):
        return "B"
    else:
        return "A"


df['Sexo'] = df['Sexo'].apply(lambda x: returnGender(x))
df['UF'] = df['UF'].apply(lambda x: returnUF(x))
df['Cor'] = df['Cor'].apply(lambda x: returnEthnicGroup(x))
df['Idade'] = df['Idade'].apply(lambda x: returnGruposDeIdade(x))
df['Anos de Estudo'] = df['Anos de Estudo'].apply(lambda x: returnAnosDeEstudo(x))
df['Renda'] = df['Renda'].apply(lambda x: returnClasseSocial(x))

In [3]:
def calcularEntropia(df, coluna):
    values = df[coluna].unique()
    entropia = 0
    for i in values:
        pi = len(df[df[coluna].eq(i)])/len(df)
        entropia += pi*np.log2(pi)
        
    return -entropia
    
def calcularGanho(df, resultado, coluna):
    values = df[coluna].unique()
    ganho = 0
    for i in values:
        pi = len(df[df[coluna].eq(i)])/len(df)
        ganho = pi * calcularEntropia(df[df[coluna].eq(i)], resultado)
    ganho = calcularEntropia(df, resultado) - ganho
    return ganho

def calcularIndiceDeGiniParaValor(df, resultado, valor, coluna):
    gini = 0
    for value in df[resultado].unique():
        pi = len(df[df[coluna].eq(valor) & df[resultado].eq(value)])/len(df[df[coluna].eq(valor)])
        gini += pi * pi
    gini = 1 - gini
    return gini
    
def calcularIndiceDeGini(df, resultado, coluna):
    values = df[coluna].unique()
    gini = 0
    for i in values:
        pi = len(df[df[coluna].eq(i)])/len(df)
        gini += calcularIndiceDeGiniParaValor(df, coluna, i, resultado) * pi
    
    return gini

In [4]:
class No:
    def __init__(self, atributo, galhos):
        self.atributo = atributo
        self.galhos = galhos
        
def construirArvoreDeDecisao(df, colunas, resultado, resultadoAnterior=0, funcao=calcularGanho, dfOriginal=df):
    if(len(df[resultado]) == 0):
        return No(resultadoAnterior, '.')
    elif(len(df[resultado].unique()) == 1):
        return No(df[resultado].iloc[0], '.')
    elif(len(colunas) == 0):
        return No(df[resultado].value_counts()[:1].index[0], '.')
    
    maiorGanhoDeColunas = -np.inf
    for i in colunas:
        colunaEscolhida = i if maiorGanhoDeColunas < funcao(df, resultado, i) else maiorGanhoDeColunas
    
    galhos = {}
    for i in dfOriginal[colunaEscolhida].unique():
        galhos[i] = construirArvoreDeDecisao(df[df[colunaEscolhida].eq(i)], list(set(colunas) - set([colunaEscolhida])), resultado, df[resultado].value_counts().index[0], funcao, dfOriginal)
        
    return No(colunaEscolhida, galhos)

In [5]:
def buscarNaArvore(arvore, individuo):
    if(arvore.galhos == '.'):
        return arvore.atributo
    else:
        return buscarNaArvore(arvore.galhos[individuo[arvore.atributo]], individuo)

In [6]:
def kFoldValidation(k, df, funcao=calcularGanho):
    resto = len(df) % k
    resultadosExperimentos = []
    resultadosBusca = {}
    
    for i in range(0, k):
        df = df.sample(frac=1)
        teste = df.iloc[int((i/k)*len(df)):int(((i+1)/k)*len(df) + 1)]
        treinamento = df.iloc[:int((i/k)*len(df))].append(df.iloc[int(((i+1)/k)*len(df) + 1):])
        arvore = construirArvoreDeDecisao(treinamento, ['UF', 'Sexo', 'Idade', 'Cor', 'Anos de Estudo'], 'Renda', funcao)
        for r in df['Renda'].unique():
            resultadosBusca[r] = {}
            for f in df['Renda'].unique():
                resultadosBusca[r][f] = 0
        for e in teste.values:
            v = buscarNaArvore(arvore, {'UF': e[0], 'Sexo': e[1], 'Idade': e[2], 'Cor': e[3], 'Anos de Estudo': e[4]})
            resultadosBusca[e[5]][v] += 1
        
        dfAux = pd.DataFrame.from_dict(resultadosBusca)
        dfAux = dfAux.sort_index(ascending=True).sort_index(axis=1,ascending=True)
        dfAux['Total'] = dfAux.sum(axis=1)
        dfAux.loc['Total'] = dfAux.sum(axis=0)
        dfAux['Precisão'] = np.divide(np.diag(dfAux), dfAux['Total'])
        dfAux['Precisão']['Total'] = np.sum(np.diag(dfAux)[:-1])/dfAux['Total'].loc['Total']
        dfAux['Reconhecimento'] = np.divide(np.diag(dfAux), dfAux.loc['Total'][:-1])
        dfAux['Reconhecimento']['Total'] = np.sum(np.diag(dfAux)[:-1])/dfAux['Total'].loc['Total']
        resultadosExperimentos.append(dfAux.copy())
    
    return resultadosExperimentos

kfoldResultsEntropia = kFoldValidation(5, df)
kfoldResultsIndiceDeGini = kFoldValidation(5, df, calcularIndiceDeGini)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfAux['Precisão']['Total'] = np.sum(np.diag(dfAux)[:-1])/dfAux['Total'].loc['Total']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfAux['Reconhecimento']['Total'] = np.sum(np.diag(dfAux)[:-1])/dfAux['Total'].loc['Total']


In [7]:
pd.DataFrame.from_dict(kfoldResultsEntropia[0])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,0,0,1,1,1,3,0.0,0.0
B,22,133,80,85,22,342,0.388889,0.169211
C,1,25,18,47,29,120,0.15,0.0161
D,65,571,845,4123,2294,7898,0.522031,0.664571
E,7,57,174,1948,4820,7006,0.687982,0.672621
Total,95,786,1118,6204,7166,15369,0.591711,0.591711


In [8]:
pd.DataFrame.from_dict(kfoldResultsIndiceDeGini[0])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,0,0,0,1,1,2,0.0,0.0
B,39,139,98,99,36,411,0.3382,0.170343
C,0,28,17,59,29,133,0.12782,0.014991
D,55,589,849,3925,2315,7733,0.507565,0.642074
E,5,60,170,2029,4826,7090,0.680677,0.669627
Total,99,816,1134,6113,7207,15369,0.579543,0.579543


In [9]:
pd.DataFrame.from_dict(kfoldResultsEntropia[1])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,0,0,0,0,0,0,,0.0
B,27,135,95,101,23,381,0.354331,0.168329
C,0,22,21,48,40,131,0.160305,0.018851
D,66,588,848,4086,2298,7886,0.518133,0.661808
E,2,57,150,1939,4823,6971,0.691866,0.671353
Total,95,802,1114,6174,7184,15369,0.589824,0.589824


In [10]:
pd.DataFrame.from_dict(kfoldResultsIndiceDeGini[1])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,0,1,0,1,1,3,0.0,0.0
B,30,124,81,77,28,340,0.364706,0.157161
C,4,26,26,67,30,153,0.169935,0.023029
D,60,585,860,3962,2297,7764,0.510304,0.6411
E,5,53,162,2073,4816,7109,0.677451,0.6715
Total,99,789,1129,6180,7172,15369,0.58091,0.58091


In [11]:
pd.DataFrame.from_dict(kfoldResultsEntropia[2])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,0,1,0,2,0,3,0.0,0.0
B,27,129,73,104,42,375,0.344,0.165173
C,0,31,26,75,34,166,0.156627,0.023875
D,65,555,810,4093,2289,7812,0.523938,0.648447
E,5,65,180,2038,4725,7013,0.673749,0.666432
Total,97,781,1089,6312,7090,15369,0.583838,0.583838


In [12]:
pd.DataFrame.from_dict(kfoldResultsIndiceDeGini[2])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,0,0,0,3,1,4,0.0,0.0
B,40,143,98,108,38,427,0.334895,0.177861
C,4,17,25,63,26,135,0.185185,0.021834
D,67,583,866,3951,2282,7749,0.509872,0.643485
E,3,61,156,2015,4819,7054,0.683158,0.672481
Total,114,804,1145,6140,7166,15369,0.58156,0.58156


In [13]:
pd.DataFrame.from_dict(kfoldResultsEntropia[3])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,0,0,0,1,2,3,0.0,0.0
B,26,127,82,87,32,354,0.358757,0.162821
C,1,25,28,76,28,158,0.177215,0.025712
D,73,569,832,4132,2315,7921,0.521651,0.657543
E,4,59,147,1988,4735,6933,0.682966,0.665776
Total,104,780,1089,6284,7112,15369,0.587026,0.587026


In [14]:
pd.DataFrame.from_dict(kfoldResultsIndiceDeGini[3])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,0,2,0,0,2,4,0.0,0.0
B,40,145,88,112,27,412,0.351942,0.190289
C,2,24,22,63,29,140,0.157143,0.02011
D,72,527,831,4004,2301,7735,0.517647,0.647477
E,3,64,153,2005,4853,7078,0.685646,0.672906
Total,117,762,1094,6184,7212,15369,0.587156,0.587156


In [15]:
pd.DataFrame.from_dict(kfoldResultsEntropia[4])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,0,1,0,1,0,2,0.0,0.0
B,26,112,95,87,28,348,0.321839,0.140881
C,0,34,24,66,27,151,0.15894,0.021277
D,70,583,843,4068,2222,7786,0.522476,0.651714
E,7,65,166,2020,4823,7081,0.681118,0.679296
Total,103,795,1128,6242,7100,15368,0.587389,0.587389


In [16]:
pd.DataFrame.from_dict(kfoldResultsIndiceDeGini[4])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,0,2,0,0,2,4,0.0,0.0
B,40,135,84,99,39,397,0.34005,0.166461
C,2,28,15,62,28,135,0.111111,0.014205
D,56,581,791,4124,2391,7943,0.519199,0.659523
E,4,65,166,1968,4686,6889,0.680215,0.655751
Total,102,811,1056,6253,7146,15368,0.58303,0.58303


In [17]:
sklearnDf = pd.get_dummies(df[['Sexo','UF','Cor','Idade', 'Anos de Estudo']])
sklearnDf

Unnamed: 0,Sexo_Feminino,Sexo_Masculino,UF_Acre,UF_Alagoas,UF_Amapá,UF_Amazonas,UF_Bahia,UF_Ceará,UF_Distrito Federal,UF_Espírito Santo,...,Anos de Estudo_7,Anos de Estudo_8,Anos de Estudo_9,Anos de Estudo_10,Anos de Estudo_11,Anos de Estudo_12,Anos de Estudo_13,Anos de Estudo_14,Anos de Estudo_15,Anos de Estudo_Não Informado
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76835,0,1,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
76836,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
76837,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
76838,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
clf = DecisionTreeClassifier(criterion="entropy")

clf = clf.fit(sklearnDf[:15368*4], df.Renda[:15368*4])

In [19]:
def kFoldValidationScikit(k, df, criterio="entropy"):
    resto = len(df) % k
    resultadosExperimentos = []
    resultadosBusca = {}
    
    for i in range(0, k):
        df = df.sample(frac=1)
        teste = df.iloc[int((i/k)*len(df)):int(((i+1)/k)*len(df) + 1)]
        treinamento = df.iloc[:int((i/k)*len(df))].append(df.iloc[int(((i+1)/k)*len(df) + 1):])
        arvore = DecisionTreeClassifier(criterion=criterio).fit(pd.get_dummies(treinamento[['Sexo','UF','Cor','Idade', 'Anos de Estudo']]), treinamento['Renda'])
        for r in df['Renda'].unique():
            resultadosBusca[r] = {}
            for f in df['Renda'].unique():
                resultadosBusca[r][f] = 0
        prediction = clf.predict(pd.get_dummies(teste[['Sexo','UF','Cor','Idade', 'Anos de Estudo']]))       
        for e in range(len(prediction)):
            resultadosBusca[teste['Renda'].iloc[e]][prediction[e]] += 1 
        
        dfAux = pd.DataFrame.from_dict(resultadosBusca)
        dfAux = dfAux.sort_index(ascending=True).sort_index(axis=1,ascending=True)
        dfAux['Total'] = dfAux.sum(axis=1)
        dfAux.loc['Total'] = dfAux.sum(axis=0)
        dfAux['Precisão'] = np.divide(np.diag(dfAux), dfAux['Total'])
        dfAux['Precisão']['Total'] = np.sum(np.diag(dfAux)[:-1])/dfAux['Total'].loc['Total']
        dfAux['Reconhecimento'] = np.divide(np.diag(dfAux), dfAux.loc['Total'][:-1])
        dfAux['Reconhecimento']['Total'] = np.sum(np.diag(dfAux)[:-1])/dfAux['Total'].loc['Total']
        resultadosExperimentos.append(dfAux.copy())
    
    return resultadosExperimentos

kfoldResultsEntropia = kFoldValidationScikit(5, df, 'entropy')
kfoldResultsIndiceDeGini = kFoldValidationScikit(5, df, 'gini')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfAux['Precisão']['Total'] = np.sum(np.diag(dfAux)[:-1])/dfAux['Total'].loc['Total']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfAux['Reconhecimento']['Total'] = np.sum(np.diag(dfAux)[:-1])/dfAux['Total'].loc['Total']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfAux['Precisão']['Total'] = np.sum(np.diag(dfAux)[:-1])/dfAux['Total'].loc['Total']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pa

In [20]:
pd.DataFrame.from_dict(kfoldResultsEntropia[0])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,2,3,1,2,0,8,0.25,0.019608
B,49,248,133,182,43,655,0.378626,0.319588
C,7,30,124,98,27,286,0.433566,0.112319
D,42,433,668,3766,1434,6343,0.593725,0.603332
E,2,62,178,2194,5641,8077,0.698403,0.789503
Total,102,776,1104,6242,7145,15369,0.636411,0.636411


In [21]:
pd.DataFrame.from_dict(kfoldResultsIndiceDeGini[0])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,7,0,3,3,0,13,0.538462,0.063063
B,50,219,152,174,38,633,0.345972,0.298772
C,2,45,119,82,33,281,0.423488,0.105966
D,47,402,654,3798,1411,6312,0.601711,0.604008
E,5,67,195,2231,5632,8130,0.692743,0.791678
Total,111,733,1123,6288,7114,15369,0.636021,0.636021


In [22]:
pd.DataFrame.from_dict(kfoldResultsEntropia[1])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,3,1,1,0,1,6,0.5,0.029703
B,47,253,143,162,46,651,0.388633,0.319848
C,8,29,115,102,27,281,0.409253,0.098459
D,39,444,710,3774,1492,6459,0.584301,0.615059
E,4,64,199,2098,5607,7972,0.703337,0.781681
Total,101,791,1168,6136,7173,15369,0.634524,0.634524


In [23]:
pd.DataFrame.from_dict(kfoldResultsIndiceDeGini[1])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,6,3,0,3,0,12,0.5,0.052632
B,50,238,161,194,44,687,0.346434,0.297872
C,5,34,125,91,29,284,0.440141,0.108696
D,47,458,682,3647,1446,6280,0.580732,0.594361
E,6,66,182,2201,5651,8106,0.697138,0.788145
Total,114,799,1150,6136,7170,15369,0.628993,0.628993


In [24]:
pd.DataFrame.from_dict(kfoldResultsEntropia[2])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,6,2,0,3,2,13,0.461538,0.052632
B,64,244,146,166,44,664,0.36747,0.311622
C,3,24,114,90,33,264,0.431818,0.105166
D,40,456,646,3816,1450,6408,0.595506,0.610169
E,1,57,178,2179,5605,8020,0.698878,0.785674
Total,114,783,1084,6254,7134,15369,0.636671,0.636671


In [25]:
pd.DataFrame.from_dict(kfoldResultsIndiceDeGini[2])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,7,1,1,2,2,13,0.538462,0.071429
B,43,248,131,175,39,636,0.389937,0.32
C,6,39,118,83,29,275,0.429091,0.10461
D,37,425,678,3723,1444,6307,0.590296,0.606747
E,5,62,200,2153,5718,8138,0.70263,0.790653
Total,98,775,1128,6136,7232,15369,0.638558,0.638558


In [26]:
pd.DataFrame.from_dict(kfoldResultsEntropia[3])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,3,1,1,1,1,7,0.428571,0.029126
B,52,236,159,183,35,665,0.354887,0.302953
C,3,42,118,92,32,287,0.41115,0.100255
D,40,442,709,3786,1480,6457,0.58634,0.61035
E,5,58,190,2141,5559,7953,0.698982,0.782187
Total,103,779,1177,6203,7107,15369,0.631271,0.631271


In [27]:
pd.DataFrame.from_dict(kfoldResultsIndiceDeGini[3])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,3,0,0,1,0,4,0.75,0.031915
B,49,197,146,171,34,597,0.329983,0.255844
C,5,37,117,86,32,277,0.422383,0.101916
D,33,471,689,3757,1454,6404,0.586665,0.607437
E,4,65,196,2170,5652,8087,0.698899,0.788065
Total,94,770,1148,6185,7172,15369,0.632832,0.632832


In [28]:
pd.DataFrame.from_dict(kfoldResultsEntropia[4])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,3,0,1,2,1,7,0.428571,0.026316
B,62,276,164,167,48,717,0.384937,0.343711
C,1,42,105,88,27,263,0.39924,0.092756
D,40,428,669,3756,1471,6364,0.590195,0.605513
E,8,57,193,2190,5569,8017,0.694649,0.782603
Total,114,803,1132,6203,7116,15368,0.631767,0.631767


In [29]:
pd.DataFrame.from_dict(kfoldResultsIndiceDeGini[4])

Unnamed: 0,A,B,C,D,E,Total,Precisão,Reconhecimento
A,4,1,0,4,0,9,0.444444,0.037736
B,55,281,155,190,52,733,0.383356,0.362113
C,3,29,129,85,28,274,0.470803,0.111207
D,41,402,700,3661,1474,6278,0.583147,0.595963
E,3,63,176,2203,5629,8074,0.697176,0.783656
Total,106,776,1160,6143,7183,15368,0.631442,0.631442


In [30]:
df['Renda'].unique()

array(['E', 'D', 'C', 'B', 'A'], dtype=object)