In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
# Silence all warnings for the rest of the notebook (this also hides
# deprecation and convergence warnings from the libraries used below).
warnings.filterwarnings(action="ignore")

# Newer Matplotlib releases register the bundled seaborn style sheets under a
# "seaborn-v0_8-" prefix and no longer accept the old name. Use whichever name
# this installation actually provides so the cell runs on both old and new
# versions.
_bright_style = (
    "seaborn-v0_8-bright"
    if "seaborn-v0_8-bright" in plt.style.available
    else "seaborn-bright"
)
plt.style.use([_bright_style, "dark_background"])

In [None]:
# Load the water-potability CSV (the path is relative to the notebook's
# working directory) and preview the first rows.
data = pd.read_csv("../input/water-potability/water_potability.csv")
data.head()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# (rows, columns) of the frame as loaded.
data.shape

In [None]:
# Drop every row containing at least one missing value; `data` is mutated in place.
data.dropna(inplace=True)

In [None]:
# (rows, columns) of `data` at this point, for comparison with the earlier shape.
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# One histogram per feature column, with bars coloured by the Potability label.
feature_columns = [col for col in data.columns if col != "Potability"]
for col in feature_columns:
    fig = px.histogram(
        data,
        x=col,
        color="Potability",
        height=400,
        width=700,
    )
    fig.show()

In [None]:
# Pairwise plot grid over the columns of `data`, coloured by the Potability label.
sns.pairplot(data=data, hue="Potability")
plt.show()

In [None]:
# Snapshot of the summary statistics at this point, kept for comparison with `after`.
before = data.describe()

In [None]:
# One box plot per feature column, split by the Potability label.
for col in data.columns:
    if col == "Potability":
        continue  # the label is used for colouring, not plotted itself
    fig = px.box(data, y=col, color="Potability", height=300, width=400)
    fig.show()

In [None]:
# Cap outliers in every feature column using the 1.5 * IQR rule: values below
# Q1 - 1.5*IQR are raised to that bound and values above Q3 + 1.5*IQR are
# lowered to it. Values inside the fences are left unchanged. The Potability
# label column is skipped.
for col in data.columns:
    if col == "Potability":
        continue
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    lwr = q1 - iqr * 1.5
    upr = q3 + iqr * 1.5
    # Vectorized clipping replaces the per-element Python callback; the
    # resulting values are the same as the explicit if/elif/else version.
    data[col] = data[col].clip(lower=lwr, upper=upr)

In [None]:
# Snapshot of the summary statistics at this point, kept for comparison with `before`.
after = data.describe()

In [None]:
# Display the earlier `describe()` snapshot.
before

In [None]:
# Display the later `describe()` snapshot.
after

In [None]:
# Parallel accumulators: correlation values and the feature names they belong to.
cor, feature = [], []

In [None]:
# Correlation of every feature column with the Potability label.
# The lists are rebuilt in place (slice assignment) instead of appended to, so
# re-running this cell does not duplicate entries in `feature` / `cor`.
feature[:] = [col for col in data.columns if col != "Potability"]
cor[:] = [data[col].corr(data["Potability"]) for col in feature]

In [None]:
# Two-column frame pairing each feature name with its correlation value.
dt = pd.DataFrame({"feature": feature, "corr": cor})

In [None]:
# Bar chart of the per-feature correlations, ordered from lowest to highest.
df = dt.sort_values("corr")
fig = px.bar(
    df,
    x="feature",
    y="corr",
    color="corr",
    title="Correlation Comparision With Potability",
)
fig.show()

In [None]:
# Number of rows for each value of the Potability label.
data["Potability"].value_counts()

In [None]:
# Separate the label column from the feature columns.
y = data["Potability"]
X = data.drop("Potability", axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=101,
)

In [None]:
# Re-attach the training labels to the training features, column-wise.
# Note that this rebinds `X` to the training frame including the label.
X = pd.concat((x_train, y_train), axis="columns")

In [None]:
# Per-class row counts of the training frame, looked up by label.
# `value_counts()` orders its result by frequency, so unpacking it positionally
# would swap the two names whenever class 1 is the more frequent class.
label_counts = X['Potability'].value_counts()
class_count_0 = label_counts.get(0, 0)
class_count_1 = label_counts.get(1, 0)

# Training rows of each class.
class_0 = X[X['Potability'] == 0]
class_1 = X[X['Potability'] == 1]
print('class 0:', class_0.shape)
print('class 1:', class_1.shape)

In [None]:
# Randomly under-sample class 0 to `class_count_1` rows.
# The sample is seeded so the balanced training set, and every metric computed
# from it, is the same on each run. `n` is capped at the number of class-0 rows
# because sampling without replacement cannot return more rows than exist.
class_0_under = class_0.sample(
    n=min(class_count_1, len(class_0)),
    random_state=101,
)

test_under = pd.concat([class_0_under, class_1], axis=0)
test_under["Potability"] = test_under["Potability"].astype("int")
print("total class of 1 and 0:\n",test_under['Potability'].value_counts())

# Plot the class counts after under-sampling.
test_under['Potability'].value_counts().plot(kind='bar', title='count (target)')
plt.show()  # render the figure instead of echoing the Axes repr

In [None]:
# Replace the training features/labels with the under-sampled frame.
y_train = test_under["Potability"]
x_train = test_under.drop("Potability", axis=1)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB,GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# (display name, estimator) pairs to compare.
# The tree-based and ensemble estimators use randomness while fitting, so they
# are seeded to make the reported metrics repeatable from run to run. The
# remaining estimators are left at their defaults.
models = [
    ("DecisionTreeClassifier", DecisionTreeClassifier(random_state=101)),
    ("RandomForestClassifier", RandomForestClassifier(random_state=101)),
    ("ExtraTreesClassifier", ExtraTreesClassifier(random_state=101)),
    ("GradientBoostingClassifier", GradientBoostingClassifier(random_state=101)),
    ("SVC", SVC()),
    ("BernoulliNB", BernoulliNB()),
    ("GaussianNB", GaussianNB()),
    ("KNeighborsClassifier", KNeighborsClassifier()),
    ("LogisticRegression", LogisticRegression()),
]

In [None]:
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score

In [None]:
# Parallel accumulators filled by the evaluation loop: one entry per model.
model_name, f1_Score, accuracy, precision_Score, recall = [], [], [], [], []

In [None]:
# Empty the result lists first so re-running this cell does not append a
# second copy of every model's scores.
for results in (model_name, accuracy, f1_Score, precision_Score, recall):
    results.clear()

# Fit each model on the training data and score it on the held-out test set.
for name, model in models:
    model.fit(x_train,y_train)
    pred = model.predict(x_test)

    acc = accuracy_score(y_test,pred)
    f1score = f1_score(y_test,pred)
    pre_score = precision_score(y_test,pred)
    rec_score = recall_score(y_test,pred)

    model_name.append(name)
    accuracy.append(acc)
    f1_Score.append(f1score)
    precision_Score.append(pre_score)
    recall.append(rec_score)

    # The arguments follow the order of the labels in the message (F1 first,
    # then accuracy); previously the two values were printed under each
    # other's label.
    print("For {} F1 score = {} Accuracy = {} Precision = {} Recall = {}\n".format(name,f1score,acc,pre_score,rec_score))

In [None]:
# One row per model, with its accuracy, F1, precision and recall.
d = pd.DataFrame(
    {
        "model": model_name,
        "accuracy": accuracy,
        "f1_score": f1_Score,
        "precision": precision_Score,
        "recall": recall,
    }
)

In [None]:
# Models ordered by accuracy, lowest first.
df = d.sort_values("accuracy")
fig = px.line(
    df,
    x="model",
    y="accuracy",
    title="Accuracy Comparision",
)
fig.show()

In [None]:
# Models ordered by F1 score, lowest first.
df = d.sort_values("f1_score")
fig = px.line(
    df,
    x="model",
    y="f1_score",
    title="F1 Score Comparision",
)
fig.show()

In [None]:
# Models ordered by precision, lowest first.
df = d.sort_values("precision")
fig = px.line(
    df,
    x="model",
    y="precision",
    title="Precision Comparision",
)
fig.show()

In [None]:
# Models ordered by recall, lowest first.
df = d.sort_values("recall")
fig = px.line(
    df,
    x="model",
    y="recall",
    title="Recall Comparision",
)
fig.show()