In [None]:
import numpy as np 
import pandas as pd 
import time as tm

In [None]:
# Dataset utilizado: https://www.kaggle.com/adityakadiwal/water-potability?select=water_potability.csv

In [None]:
# Load the chosen dataset from the Kaggle input directory
water_potability = pd.read_csv(
    filepath_or_buffer="../input/water-potability/water_potability.csv"
)

In [None]:
# Preview the first rows; this is where missing values in the dataset first become visible
water_potability.head(n=5)

In [None]:
# Count the missing values in each column
water_potability.isna().sum()

In [None]:
# Handle the missing data by discarding every row that has at least one missing value
water_potability_filtered = water_potability.dropna(axis=0, how='any')
water_potability_filtered

In [None]:
# After filtering, every column should report zero missing values
water_potability_filtered.isna().sum()

In [None]:
# Finally, renumber the rows 0..n-1 so the index has no gaps left by dropna().
# reset_index() returns a new frame, so the result has to be assigned back;
# drop=True discards the old index instead of adding it as an 'index' column.
water_potability_filtered = water_potability_filtered.reset_index(drop=True)
water_potability_filtered

In [None]:
import seaborn as sns

In [None]:
# Plot the attributes pairwise, coloring each point by its Potability class.
# The whole frame is passed directly: selecting all of its own columns yields the same data.
sns.pairplot(data=water_potability_filtered, hue='Potability')

In [None]:
# Class labels as a plain list, then the size of each class and their ratio
classes = water_potability_filtered['Potability'].tolist()

n_class_0 = classes.count(0)
n_class_1 = classes.count(1)

print('Classe Não Potável:', n_class_0)
print('Class Potável:', n_class_1)
print('Proportion:', round(n_class_0 / n_class_1, 2), ': 1')

In [None]:
# Random under-sampling: keep all class-1 rows and draw an equally sized
# random subset of class-0 rows.

# Divide by class
df_class_0 = water_potability_filtered[water_potability_filtered['Potability'] == 0]
df_class_1 = water_potability_filtered[water_potability_filtered['Potability'] == 1]

# Class count, taken from the frames themselves rather than from the `classes`
# list, which other cells reassign; this keeps the cell correct when re-run.
count_class_0, count_class_1 = len(df_class_0), len(df_class_1)

# Fixed random_state so the drawn subset is the same on every run
df_class_0_under = df_class_0.sample(n=count_class_1, random_state=3)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)
print('Random under-sampling:')

# Report the class sizes of the under-sampled frame
classes = list(df_test_under['Potability'])
count_class_0, count_class_1 = classes.count(0), classes.count(1)
print(count_class_0)
print(count_class_1)

In [None]:
# Random over-sampling: draw class-1 rows with replacement until there are as
# many of them as class-0 rows, then make the result the working dataset.

# Divide by class
df_class_0 = water_potability_filtered[water_potability_filtered['Potability'] == 0]
df_class_1 = water_potability_filtered[water_potability_filtered['Potability'] == 1]

count_class_0, count_class_1 = len(df_class_0), len(df_class_1)

print(count_class_0)
print(count_class_1)

# Fixed random_state so the resampled rows are the same on every run
df_class_1_over = df_class_1.sample(n=count_class_0, replace=True, random_state=3)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
# Read the labels from the over-sampled frame built in this cell
# (previously this line read df_test_under, which belongs to a different cell).
classes = list(df_test_over['Potability'])

count_class_0, count_class_1 = classes.count(0), classes.count(1)

print(count_class_0)
print(count_class_1)

# NOTE(review): the over-sampled frame replaces water_potability_filtered
# before the train/test split is made further down, so copies of the same
# class-1 row can end up in both the training and the test set, which can
# inflate the reported scores. Consider splitting first and over-sampling
# only the training part.
water_potability_filtered = df_test_over.reset_index(drop=True)
water_potability_filtered

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Attribute column names: every column except the 'Potability' label
columns = list(water_potability_filtered.columns)
columns.remove('Potability')

# Feature matrix and label vector as NumPy arrays
X = water_potability_filtered[columns].to_numpy(copy=True)
y = water_potability_filtered['Potability'].to_numpy(copy=True)

# Resample with imblearn's RandomOverSampler using its default settings
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(X, y)

# NOTE(review): a later cell rebuilds X and y from water_potability_filtered
# before the train/test split, so this resampled pair is overwritten there.
X, y = X_ros, y_ros

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Feature matrix and label vector as NumPy arrays
# (`columns` is the attribute-name list built in the previous cell)
X = water_potability_filtered[columns].to_numpy(copy=True)
y = water_potability_filtered['Potability'].to_numpy(copy=True)

# Resample with imblearn's RandomUnderSampler using its default settings
rus = RandomUnderSampler()
X_rus, y_rus = rus.fit_resample(X, y)

# NOTE(review): a later cell rebuilds X and y from water_potability_filtered
# before the train/test split, so this resampled pair is overwritten there.
X, y = X_rus, y_rus

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.neural_network import MLPClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Separate the attribute columns from the classification column

# Every column except the 'Potability' label is an attribute
columns = list(water_potability_filtered.columns)
columns.remove('Potability')

# X holds the attributes, y holds the class labels
X = water_potability_filtered[columns].to_numpy(copy=True)
y = water_potability_filtered['Potability'].to_numpy(copy=True)

In [None]:
# Randomly split the dataset: 70% of the rows for training and 30% for testing.
# The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=3,
    test_size=0.30,
)

In [None]:
# Build the four classifiers that are compared below.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# SVM, MLP and KNN are sensitive to the scale of their inputs, so each one is
# wrapped in a pipeline that standardizes the features first. The scaler is
# fitted inside .fit(), i.e. on the training data only, and the pipeline still
# exposes .fit(), .predict() and .score() like a plain estimator.
clf_svm = make_pipeline(StandardScaler(), svm.SVC(kernel='sigmoid', C=1))
clf_mlp = make_pipeline(
    StandardScaler(),
    MLPClassifier(random_state=1, learning_rate_init=0.003, max_iter=10000),
)
clf_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))

# Gaussian Naive Bayes models each feature separately, so it is left unscaled
clf_nb  = GaussianNB()

In [None]:
# Containers for the results: one fitting time and one accuracy per algorithm,
# stored in the same order as the names in `algoritmos`
tempos_execucao, acuracias = [], []
algoritmos = [
    'SVM',
    'MLP',
    'KNN',
    'Naive Bayes',
]

In [None]:
# Fit every classifier on the training split and record how long each fit takes.
# A single loop replaces four copy-pasted cells that differed only in the
# classifier and the printed name.

# Classifiers listed in the same order as the names in `algoritmos`
classificadores = [clf_svm, clf_mlp, clf_knn, clf_nb]

# Empty the list in place first, so re-running this cell never leaves
# duplicated or stale timings behind (the bar chart needs exactly one per name).
tempos_execucao.clear()

for nome, clf in zip(algoritmos, classificadores):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time() can jump if the system clock is adjusted.
    init = tm.perf_counter()
    clf.fit(X_train, y_train)
    end = tm.perf_counter()

    tempo_exec = end - init
    tempos_execucao.append(tempo_exec)

    print(f"{nome} - Tempo de execução em segundos: ", tempo_exec)

In [None]:
# Compute and print the test-set accuracy of each classifier, in percent.
# The built-in round() is used because score() is not guaranteed to return a
# NumPy scalar (a plain Python float has no .round() method). Rounding after
# the multiplication also avoids artifacts such as 63.129999999999995.
svm_acuracia = round(float(clf_svm.score(X_test, y_test)) * 100, 2)
mlp_acuracia = round(float(clf_mlp.score(X_test, y_test)) * 100, 2)
knn_acuracia = round(float(clf_knn.score(X_test, y_test)) * 100, 2)
nb_acuracia  = round(float(clf_nb.score(X_test, y_test)) * 100, 2)

# Replace the list contents in place (same order as `algoritmos`) so that
# re-running this cell does not pile up duplicate entries.
acuracias[:] = [svm_acuracia, mlp_acuracia, knn_acuracia, nb_acuracia]

print("SVM - Acurácia: ", svm_acuracia, "%")
print("MLP - Acurácia: ", mlp_acuracia, "%")
print("KNN - Acurácia: ", knn_acuracia, "%")
print("NB  - Acurácia: ", nb_acuracia, "%")

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Print the detailed per-class results of each algorithm on the test set
class_names = ['Não Potável', 'Potável']
labelled_models = (
    ("SVM", clf_svm),
    ("MLP", clf_mlp),
    ("KNN", clf_knn),
    ("Naive Bayes", clf_nb),
)

for label, model in labelled_models:
    report = classification_report(y_test, model.predict(X_test), target_names=class_names)
    print(f"\n {label} - Resultados: \n\n", report)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of algorithm vs. recorded fitting time
fig, ax = plt.subplots()
ax.bar(algoritmos, tempos_execucao, color="blue")
ax.set_ylabel('Tempo de Execução (s)')
ax.set_xlabel('Algoritmo')
ax.set_title("Algoritmo x Tempo de Execução")
plt.show()

In [None]:
# Bar chart of algorithm vs. test-set accuracy
fig, ax = plt.subplots()
ax.bar(algoritmos, acuracias, color="blue")
ax.set_ylabel('Acurácia (%)')
ax.set_xlabel('Algoritmo')
ax.set_title("Algoritmo x Acurácia")
plt.show()