In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


In [None]:
# Read the water-potability CSV from the Kaggle input directory and preview it.
data_path = '../input/water-potability/water_potability.csv'
df = pd.read_csv(data_path)
df.head()

In [None]:
# Print a concise summary of the frame (column dtypes and non-null counts).
df.info()

In [None]:
# How many rows fall into each Potability class.
df['Potability'].value_counts()

In [None]:
# Pairwise covariance between the columns of df.
# NOTE(review): covariance depends on each column's scale, and nothing has been
# standardised at this point; df.corr() may be what was intended -- confirm.
df.cov()

In [None]:
# Class frequencies, ordered by label value so the bars are easier to read.
counts = df['Potability'].value_counts().sort_index()

# Draw one bar per class and relabel the y axis.
ax = sns.barplot(x=counts.index, y=counts)
ax.set_ylabel('counts')

In [None]:
# Missing values per column before imputation.
df.isna().sum()

In [None]:
# Fill the gaps in the three incomplete columns with that column's mean.
#
# - The mean is used as-is: wrapping it in int() truncated it (e.g. 7.08 -> 7)
#   and biased every imputed value downwards.
# - The result is assigned back to the frame instead of calling
#   df[col].fillna(..., inplace=True). That chained in-place form operates on a
#   temporary Series; it raises a FutureWarning in pandas 2.x and does not
#   modify df at all once Copy-on-Write is enabled.
#
# Re-running this cell is harmless: once the NaNs are gone, fillna is a no-op.
cols_with_gaps = ['ph', 'Sulfate', 'Trihalomethanes']
df[cols_with_gaps] = df[cols_with_gaps].fillna(df[cols_with_gaps].mean())

In [None]:
# Missing values per column, re-checked after the imputation cell.
df.isna().sum()

In [None]:
# Feature matrix: every column except the Potability label.
X = df.drop(columns=['Potability'])
X.shape

In [None]:
# Target vector: the Potability label column.
y = df.Potability
y.shape

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
st_scal = scaler.fit(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.1)

# Logistic Regression

In [None]:
# Fit a logistic-regression baseline on the training partition.
# max_iter is raised above the default of 100 so the solver has room to
# converge instead of stopping early with a ConvergenceWarning.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test rows with the logistic-regression model.
pred_log = log_reg.predict(X_test)

In [None]:
# Share of test rows predicted correctly. Arguments are passed in the
# documented (y_true, y_pred) order, matching the other accuracy cells.
acc_log = accuracy_score(y_test, pred_log)
acc_log

In [None]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
report_log = classification_report(y_test, pred_log)
print(report_log)

# Random Forest Classifier

In [None]:
# Fit a random forest on the training partition. The forest is randomised
# (bootstrap samples, feature sub-sampling), so the seed is fixed to make the
# reported scores reproducible from run to run.
model_rfc = RandomForestClassifier(random_state=42)
model_rfc.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test rows with the random forest.
pred_rfc = model_rfc.predict(X_test)

In [None]:
# Share of test rows predicted correctly. Arguments are passed in the
# documented (y_true, y_pred) order, matching the other accuracy cells.
acc_rfc = accuracy_score(y_test, pred_rfc)
acc_rfc

In [None]:
# Per-class precision / recall / F1 for the random forest.
# classification_report takes (y_true, y_pred). With the arguments reversed,
# precision and recall swap places and the support column counts predictions
# instead of true labels.
print(classification_report(y_test, pred_rfc))

In [None]:
# Confusion matrix for the random forest, shown as a share of all test rows
# (the cells sum to 100%).
cf_matrix_rfc = confusion_matrix(y_test, pred_rfc)
ax = sns.heatmap(cf_matrix_rfc / np.sum(cf_matrix_rfc), annot=True, fmt='0.2%')
# confusion_matrix puts true labels on rows and predicted labels on columns;
# label the axes so the figure can be read on its own.
ax.set(xlabel='Predicted label', ylabel='True label',
       title='Random Forest confusion matrix');

# KNeighbors Classifier

In [None]:
# k-nearest-neighbours with default settings: fit on the training partition
# (fit() returns the estimator itself), then predict the held-out rows.
model_neigh = KNeighborsClassifier().fit(X_train, y_train)
pred_neigh = model_neigh.predict(X_test)

In [None]:
# Share of test rows the k-nearest-neighbours model predicted correctly.
acc_neigh = accuracy_score(y_true=y_test, y_pred=pred_neigh)
acc_neigh

In [None]:
# Per-class precision / recall / F1 for k-nearest-neighbours.
# classification_report takes (y_true, y_pred). With the arguments reversed,
# precision and recall swap places and the support column counts predictions
# instead of true labels.
print(classification_report(y_test, pred_neigh))

In [None]:
# Confusion matrix for k-nearest-neighbours, shown as a share of all test rows
# (the cells sum to 100%).
cf_matrix_neigh = confusion_matrix(y_test, pred_neigh)
ax = sns.heatmap(cf_matrix_neigh / np.sum(cf_matrix_neigh), annot=True, fmt='0.2%')
# confusion_matrix puts true labels on rows and predicted labels on columns;
# label the axes so the figure can be read on its own.
ax.set(xlabel='Predicted label', ylabel='True label',
       title='KNeighbors confusion matrix');

# AdaBoost Classifier

In [None]:
# AdaBoost with a reduced learning rate: fit on the training partition, then
# predict the held-out rows. The seed is fixed so the fitted ensemble, and the
# scores derived from it, are reproducible from run to run.
model_adaB = AdaBoostClassifier(learning_rate=0.5, random_state=42)
model_adaB.fit(X_train, y_train)
pred_adaB = model_adaB.predict(X_test)

In [None]:
# Share of test rows the AdaBoost model predicted correctly.
acc_adaB = accuracy_score(y_true=y_test, y_pred=pred_adaB)
acc_adaB

In [None]:
# Per-class precision / recall / F1 for AdaBoost.
# classification_report takes (y_true, y_pred). With the arguments reversed,
# precision and recall swap places and the support column counts predictions
# instead of true labels.
print(classification_report(y_test, pred_adaB))

In [None]:
# Confusion matrix for AdaBoost, shown as a share of all test rows
# (the cells sum to 100%).
cf_matrix_adaB = confusion_matrix(y_test, pred_adaB)
ax = sns.heatmap(cf_matrix_adaB / np.sum(cf_matrix_adaB), annot=True, fmt='0.2%')
# confusion_matrix puts true labels on rows and predicted labels on columns;
# label the axes so the figure can be read on its own.
ax.set(xlabel='Predicted label', ylabel='True label',
       title='AdaBoost confusion matrix');

# SVC

In [None]:
# Linear-kernel support vector classifier: fit on the training partition, then
# predict the held-out rows. The former gamma='auto' argument is dropped: gamma
# is the coefficient of the 'rbf', 'poly' and 'sigmoid' kernels and has no
# effect with kernel='linear', so passing it only suggested tuning that never
# happened.
model_svc = SVC(kernel='linear')
model_svc.fit(X_train, y_train)
pred_svc = model_svc.predict(X_test)

In [None]:
# Share of test rows the support vector classifier predicted correctly.
acc_svc = accuracy_score(y_true=y_test, y_pred=pred_svc)
acc_svc

In [None]:
# Per-class precision / recall / F1 for the support vector classifier.
# classification_report takes (y_true, y_pred). With the arguments reversed,
# precision and recall swap places and the support column counts predictions
# instead of true labels.
print(classification_report(y_test, pred_svc))

In [None]:
# Confusion matrix for the support vector classifier, shown as a share of all
# test rows (the cells sum to 100%).
cf_matrix_svc = confusion_matrix(y_test, pred_svc)
ax = sns.heatmap(cf_matrix_svc / np.sum(cf_matrix_svc), annot=True, fmt='0.2%')
# confusion_matrix puts true labels on rows and predicted labels on columns;
# label the axes so the figure can be read on its own.
ax.set(xlabel='Predicted label', ylabel='True label',
       title='SVC confusion matrix');

In [None]:
# Collect the test accuracy of every model trained above into one table.
# The SVC score was computed earlier but had been left out of the comparison;
# it is included here so the summary covers all five models.
output = pd.DataFrame({
    "Model": [
        'Logistic Regression',
        'Random Forest Classifier',
        'KNeighbors Classifier',
        'Adaboost Classifier',
        'SVC',
    ],
    "Accuracy": [acc_log, acc_rfc, acc_neigh, acc_adaB, acc_svc],
})

In [None]:
# Display the model / accuracy comparison table.
output

In [None]:
# Horizontal bars: one per model, bar length = test accuracy.
sns.barplot(data=output, x='Accuracy', y='Model')