In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report , confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Load the water-potability dataset from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/water-potability/water_potability.csv')

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Print the frame summary: column names, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Share of missing values per column, in percent. The mean of the boolean
# null mask is the fraction of null rows, so this scales with the actual
# number of rows instead of dividing the raw count by a fixed 100.
df.isnull().mean() * 100

In [None]:
# Fill the gaps in each incomplete column with that column's mean.
# NOTE(review): the means are computed on the full frame, i.e. before the
# train/test split further down, so test rows influence the imputed values.
for column in ('ph', 'Sulfate', 'Trihalomethanes'):
    df[column] = df[column].fillna(df[column].mean())

In [None]:
# Re-check the per-column missing-value counts after imputation.
df.isna().sum()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Bar chart of how many rows fall into each Potability class.
sns.countplot(x='Potability', data=df)

In [None]:
# Annotated heatmap of the pairwise column correlations.
corr_matrix = df.corr()
plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Scatter-plot matrix over every pair of columns.
sns.pairplot(data=df)

In [None]:
# Separate the target column from the feature columns.
y = df['Potability']
X = df.drop(columns=['Potability'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
def print_cr(y_test, y_pred):
    """Print the scikit-learn classification report for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels to compare against ``y_test``.
    """
    report = classification_report(y_test, y_pred)
    print(report)
    

In [None]:
def plot_cm(y_test, y_pred):
    """Draw the confusion matrix of a binary classifier as an annotated heatmap.

    Rows are the actual classes and columns the predicted classes, with the
    "No" class first. The heatmap is drawn with seaborn on the current
    matplotlib axes; nothing is returned.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels to compare against ``y_test``.
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`,
    # confusion_matrix only covers the classes that actually occur, and the
    # fixed No/Yes axis names below would then not match its shape.
    # NOTE(review): assumes the classes are encoded as 0 (No) and 1 (Yes) — confirm.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    conf_matrix = pd.DataFrame(
        data=cm,
        columns=['Predicted:No', 'Predicted:Yes'],
        index=['Actual:No', 'Actual:Yes'],
    )
    # fmt='d' prints the cell counts as integers.
    sns.heatmap(conf_matrix, annot=True, fmt='d', cbar=False,
                linewidths=0.1, annot_kws={'size': 25})

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same transform
# to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

**LOGISTIC REGRESSION**

In [None]:
# Train logistic regression on the training split and predict the held-out rows.
lr = LogisticRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for logistic regression.
print_cr(y_test=y_test, y_pred=y_pred_lr)

In [None]:
# Confusion matrix for logistic regression.
plot_cm(y_test=y_test, y_pred=y_pred_lr)

In [None]:
# Test accuracy of logistic regression as a percentage rounded to two decimals.
# Computed once: a bare accuracy_score(...) call before the assignment would
# have its result discarded, since only the cell's last expression is shown.
acc_lr = round(accuracy_score(y_test, y_pred_lr) * 100, 2)
acc_lr

**DECISION TREE CLASSIFIER**

In [None]:
# Train a decision tree (seeded for reproducibility) and predict the held-out rows.
dt = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)


In [None]:
# Per-class precision / recall / F1 for the decision tree.
print_cr(y_test=y_test, y_pred=y_pred_dt)

In [None]:
# Confusion matrix for the decision tree.
plot_cm(y_test=y_test, y_pred=y_pred_dt)

In [None]:
# Test accuracy of the decision tree as a percentage rounded to two decimals.
# Computed once: a bare accuracy_score(...) call before the assignment would
# have its result discarded, since only the cell's last expression is shown.
acc_dt = round(accuracy_score(y_test, y_pred_dt) * 100, 2)
acc_dt

**RANDOM FOREST CLASSIFIER**

In [None]:
# Train a random forest (seeded for reproducibility) and predict the held-out rows.
rf = RandomForestClassifier(random_state=1).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the random forest.
print_cr(y_test=y_test, y_pred=y_pred_rf)

In [None]:
# Confusion matrix for the random forest.
plot_cm(y_test=y_test, y_pred=y_pred_rf)

In [None]:
# Test accuracy of the random forest as a percentage rounded to two decimals.
# Computed once: a bare accuracy_score(...) call before the assignment would
# have its result discarded, since only the cell's last expression is shown.
acc_rf = round(accuracy_score(y_test, y_pred_rf) * 100, 2)
acc_rf

**ADA BOOST CLASSIFIER**

In [None]:
# Train AdaBoost (seeded for reproducibility) and predict the held-out rows.
abc = AdaBoostClassifier(random_state=1).fit(X_train, y_train)
y_pred_abc = abc.predict(X_test)


In [None]:
# Per-class precision / recall / F1 for AdaBoost.
print_cr(y_test=y_test, y_pred=y_pred_abc)

In [None]:
# Confusion matrix for AdaBoost.
plot_cm(y_test=y_test, y_pred=y_pred_abc)

In [None]:
# Test accuracy of AdaBoost as a percentage rounded to two decimals.
# Computed once: a bare accuracy_score(...) call before the assignment would
# have its result discarded, since only the cell's last expression is shown.
acc_abc = round(accuracy_score(y_test, y_pred_abc) * 100, 2)
acc_abc

**GRADIENT BOOSTING CLASSIFIER**

In [None]:
# Train gradient boosting (seeded for reproducibility) and predict the held-out rows.
gbc = GradientBoostingClassifier(random_state=1).fit(X_train, y_train)
y_pred_gbc = gbc.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for gradient boosting.
print_cr(y_test=y_test, y_pred=y_pred_gbc)

In [None]:
# Confusion matrix for gradient boosting.
plot_cm(y_test=y_test, y_pred=y_pred_gbc)

In [None]:
# Test accuracy of gradient boosting as a percentage rounded to two decimals.
# Computed once: a bare accuracy_score(...) call before the assignment would
# have its result discarded, since only the cell's last expression is shown.
acc_gbc = round(accuracy_score(y_test, y_pred_gbc) * 100, 2)
acc_gbc

**XG BOOST CLASSIFIER**

In [None]:
# Train XGBoost (seeded for reproducibility) and predict the held-out rows.
xgb = XGBClassifier(random_state=1).fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for XGBoost.
print_cr(y_test=y_test, y_pred=y_pred_xgb)

In [None]:
# Confusion matrix for XGBoost.
plot_cm(y_test=y_test, y_pred=y_pred_xgb)

In [None]:
# Test accuracy of XGBoost as a percentage rounded to two decimals.
# Computed once: a bare accuracy_score(...) call before the assignment would
# have its result discarded, since only the cell's last expression is shown.
acc_xgb = round(accuracy_score(y_test, y_pred_xgb) * 100, 2)
acc_xgb

In [None]:
# Collect every model's test accuracy (in percent) into one table and show
# it ranked best-first.
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest',
              'ADA Boost', 'Gradient Boost', 'XG Boost'],
    'Accuracy': [acc_lr, acc_dt, acc_rf, acc_abc, acc_gbc, acc_xgb],
})
models.sort_values(by='Accuracy', ascending=False)