In [1]:
# 
# KNN and Naive Bayes
# Hit Rate and Individual ROC and AUC 
#

In [1]:
# Load the libraries

import pandas as pd
import numpy as np
from sklearn import naive_bayes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

In [2]:
# Just to adjust our plots inside Jupyter

% matplotlib inline

In [3]:
# Load the CSV dataset into a DataFrame and preview its first five rows

df = pd.read_csv(filepath_or_buffer='domicilios-rio2010.csv')
df.head(n=5)

Unnamed: 0,Setor,num_Habitantes,AguaADEQUADA,EsgotoADEQUADO,Medidor_Eletric,LixoADEQUADO,Analfa10a14anos,Analfa15ouMais,RespRenda_Ate1Sal,RendaPerCapita,DomRendaMedia,Classe_Econ
0,60026,266.935012,100.0,100.0,100.0,100.0,2.325581,3.267045,20.848057,742.749412,2230.872792,D
1,60027,242.484728,100.0,100.0,98.795181,100.0,0.0,3.161398,30.522088,666.727031,2010.891566,D
2,60028,149.989149,100.0,100.0,98.378378,100.0,0.0,1.196172,25.405405,571.290323,1627.405405,D
3,60029,381.858078,100.0,100.0,99.693252,98.466258,4.166667,0.882029,25.153374,585.449177,1855.119632,D
4,60030,296.154714,100.0,100.0,99.689441,100.0,1.369863,3.552632,21.73913,642.935079,1906.841615,D


In [6]:
# Cleaning the data: discard every row that contains at least one missing value

df = df.dropna(axis=0, how='any')

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of df.
# NOTE(review): the saved output of this cell lists Classe_Econ_A..E columns,
# which only exist after pd.get_dummies runs in a later cell, so the notebook
# was executed out of order. Re-run top to bottom to refresh the output.
df.describe()

Unnamed: 0,Setor,num_Habitantes,AguaADEQUADA,EsgotoADEQUADO,Medidor_Eletric,LixoADEQUADO,Analfa10a14anos,Analfa15ouMais,RespRenda_Ate1Sal,RendaPerCapita,DomRendaMedia,Classe_Econ_A,Classe_Econ_B,Classe_Econ_C,Classe_Econ_D,Classe_Econ_E
count,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0
mean,214434.681227,403.73835,98.00345,93.990459,91.859698,99.100153,1.682762,2.951894,18.818448,1260.128118,3368.483338,0.024354,0.097712,0.223033,0.499704,0.155196
std,81666.00074,1685.929194,9.926592,17.779403,16.486015,5.851832,2.965583,3.252465,14.185975,1282.416576,3136.977805,0.154154,0.29694,0.416301,0.500025,0.362109
min,60001.0,0.02318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,107.232759,318.948718,0.0,0.0,0.0,0.0,0.0
25%,150207.25,139.773116,99.672131,98.626606,92.638701,100.0,0.0,0.739372,6.25,443.660926,1397.637662,0.0,0.0,0.0,0.0,0.0
50%,210717.5,223.189231,100.0,100.0,99.02439,100.0,0.0,1.782796,16.954108,727.959622,2168.869013,0.0,0.0,0.0,0.0,0.0
75%,270064.75,448.697772,100.0,100.0,100.0,100.0,2.564103,4.103595,28.378378,1484.602336,3996.806396,0.0,0.0,0.0,1.0,0.0
max,390227.0,157308.606454,100.0,100.0,100.0,100.0,60.0,45.241199,100.0,14730.559611,36879.677725,1.0,1.0,1.0,1.0,1.0


In [8]:
# One-hot encode the categorical column Classe_Econ into Classe_Econ_* indicator
# columns, then preview the first five rows of the encoded frame

df = pd.get_dummies(data=df)
df.head(n=5)

Unnamed: 0,Setor,num_Habitantes,AguaADEQUADA,EsgotoADEQUADO,Medidor_Eletric,LixoADEQUADO,Analfa10a14anos,Analfa15ouMais,RespRenda_Ate1Sal,RendaPerCapita,DomRendaMedia,Classe_Econ_A,Classe_Econ_B,Classe_Econ_C,Classe_Econ_D,Classe_Econ_E
0,60026,266.935012,100.0,100.0,100.0,100.0,2.325581,3.267045,20.848057,742.749412,2230.872792,0,0,0,1,0
1,60027,242.484728,100.0,100.0,98.795181,100.0,0.0,3.161398,30.522088,666.727031,2010.891566,0,0,0,1,0
2,60028,149.989149,100.0,100.0,98.378378,100.0,0.0,1.196172,25.405405,571.290323,1627.405405,0,0,0,1,0
3,60029,381.858078,100.0,100.0,99.693252,98.466258,4.166667,0.882029,25.153374,585.449177,1855.119632,0,0,0,1,0
4,60030,296.154714,100.0,100.0,99.689441,100.0,1.369863,3.552632,21.73913,642.935079,1906.841615,0,0,0,1,0


In [5]:
# Summary statistics of the one-hot encoded data.
# Classe_Econ is already encoded by the pd.get_dummies call in the preceding
# cell, so the encoding is not repeated here. The bare df.head() that used to
# sit in the middle of this cell was removed as well: only the last expression
# of a cell is displayed, so its result was silently discarded.

df.describe()

Unnamed: 0,Setor,num_Habitantes,AguaADEQUADA,EsgotoADEQUADO,Medidor_Eletric,LixoADEQUADO,Analfa10a14anos,Analfa15ouMais,RespRenda_Ate1Sal,RendaPerCapita,DomRendaMedia,Classe_Econ_A,Classe_Econ_B,Classe_Econ_C,Classe_Econ_D,Classe_Econ_E
count,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0,10142.0
mean,214434.681227,403.73835,98.00345,93.990459,91.859698,99.100153,1.682762,2.951894,18.818448,1260.128118,3368.483338,0.024354,0.097712,0.223033,0.499704,0.155196
std,81666.00074,1685.929194,9.926592,17.779403,16.486015,5.851832,2.965583,3.252465,14.185975,1282.416576,3136.977805,0.154154,0.29694,0.416301,0.500025,0.362109
min,60001.0,0.02318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,107.232759,318.948718,0.0,0.0,0.0,0.0,0.0
25%,150207.25,139.773116,99.672131,98.626606,92.638701,100.0,0.0,0.739372,6.25,443.660926,1397.637662,0.0,0.0,0.0,0.0,0.0
50%,210717.5,223.189231,100.0,100.0,99.02439,100.0,0.0,1.782796,16.954108,727.959622,2168.869013,0.0,0.0,0.0,0.0,0.0
75%,270064.75,448.697772,100.0,100.0,100.0,100.0,2.564103,4.103595,28.378378,1484.602336,3996.806396,0.0,0.0,0.0,1.0,0.0
max,390227.0,157308.606454,100.0,100.0,100.0,100.0,60.0,45.241199,100.0,14730.559611,36879.677725,1.0,1.0,1.0,1.0,1.0


In [None]:
# We will use eight features:
# We will compare 'Classe_Econ_C' vs. the Others

chosenColumns = ['AguaADEQUADA', 'EsgotoADEQUADO','Medidor_Eletric', 'LixoADEQUADO', 'Analfa10a14anos', 
                  'Analfa15ouMais','RespRenda_Ate1Sal','RendaPerCapita']

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0, where
# it raises AttributeError. Selecting the columns and reading .values gives the
# same 2-D NumPy arrays on old and new pandas alike.
myX = df[chosenColumns].values
# Double brackets keep myY two-dimensional (n_rows, 1), matching what
# as_matrix(columns=[...]) returned; later cells flatten it with .ravel().
myY = df[['Classe_Econ_C']].values

## Classification

In [None]:
# Splitting the dataset into train and test:
# 70% of the rows go to training, the remainder to testing; the fixed
# random_state makes the split reproducible between runs.

xTrain, xTest, yTrain, yTest = train_test_split(
    myX,
    myY,
    train_size=0.7,
    random_state=3,
)
testSize = len(yTest)

In [None]:
# Naive Bayes

nb = naive_bayes.GaussianNB()
nb.fit(xTrain, yTrain.ravel())
yPredNB = nb.predict(xTest) # predicting test data

# computing error
# Count the misclassified samples with a single vectorized comparison.
# The previous np.sum(<generator>) form is deprecated by NumPy, and because
# each yTest[i] is a length-1 array it produced a 1-element array rather than
# a plain integer. yTest is flattened so both sides are 1-D and align
# element by element.
errorNB = int(np.sum(yPredNB != yTest.ravel()))
# Percentages are truncated to whole numbers, as before.
errorNBPCT = int(100*errorNB/testSize)
hitRateNBPCT = 100 - errorNBPCT

print("----------Naive Bayes----------")
print(errorNB, "misclassified data out of", testSize)
print("Error PCT: ",errorNBPCT,'%')
print("Hit Rate:  ",hitRateNBPCT,'%')

In [None]:
# K Nearest Neighbors

myK = 5  # number of neighbours that vote on each prediction

knn =  KNeighborsClassifier(n_neighbors=myK)
knn.fit(xTrain, yTrain.ravel())
yPredKNN = knn.predict(xTest) # predicting test data

# computing error
# Count the misclassified samples with a single vectorized comparison.
# The previous np.sum(<generator>) form is deprecated by NumPy, and because
# each yTest[i] is a length-1 array it produced a 1-element array rather than
# a plain integer. yTest is flattened so both sides are 1-D and align
# element by element.
errorKNN = int(np.sum(yPredKNN != yTest.ravel()))
# Percentages are truncated to whole numbers, as before.
errorKNNPCT = int(100*errorKNN/testSize)
hitRateKNNPCT = 100 - errorKNNPCT

print("----------K Nearest Neighbors----------")
print(errorKNN, "misclassified data out of", testSize)
print("Error PCT: ",errorKNNPCT,'%')
print("Hit Rate:  ",hitRateKNNPCT,'%')

In [None]:
# Constructing the ROC curve and AUC for the KNN classifier

# roc_curve sweeps a decision threshold over a continuous score per sample.
# Feeding it the hard 0/1 labels from knn.predict() leaves a single operating
# point, so the "curve" is just two straight segments and the AUC is
# misleading. Use the predicted probability of the positive class instead.
# predict_proba orders its columns like knn.classes_ (sorted), so with 0/1
# labels column 1 is the probability of class 1 ('Classe_Econ_C').
yScoreKNN = knn.predict_proba(xTest)[:, 1]
fpr, tpr, thresh = roc_curve(yTest.ravel(), yScoreKNN)
rocAuc = auc(fpr, tpr)

In [None]:
# Plotting Everything: the ROC curve plus the diagonal of a random classifier

lineWidth = 2
fig, ax = plt.subplots()

# ROC curve, with the AUC shown in the legend entry
ax.plot(fpr, tpr, color='darkorange', lw=lineWidth,
        label='ROC curve (area = %0.2f)' % rocAuc)
# Chance-level reference line
ax.plot([0, 1], [0, 1], color='navy', lw=lineWidth, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")
plt.show()