In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect every file path under the Kaggle input directory, then print one per line.
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category so library warnings do not clutter cell output.
# NOTE(review): this also hides deprecation warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# **DATA INPUT**

In [None]:
# Read the water-potability CSV from the Kaggle input directory and preview it.
DATA_PATH = "/kaggle/input/water-potability/water_potability.csv"
df = pd.read_csv(DATA_PATH)
df.head()

# **Checking For Null Values**

In [None]:
# Number of missing values in each column.
df.isna().sum()

3 features have null values: 
* ph = 491
* Sulfate = 781
* Trihalomethanes = 162

In [None]:
# Report the (rows, columns) shape, then show summary statistics with one row per column.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.describe().transpose()

In [None]:
# Data type of each column.
df.dtypes

# **Visualization**

**Histogram**

In [None]:
# Plot a 30-bin histogram for every numeric column to inspect each distribution.
# Binding the returned array of Axes keeps its long repr out of the cell output;
# plt.show() then renders the figure explicitly.
hist_axes = df.hist(bins=30, figsize=(18, 25))
plt.show()

**Correlation & Heatmap**

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
corr = df.corr()
fig, ax = plt.subplots(figsize=(16, 6))
sns.heatmap(corr, annot=True, ax=ax)

**Countplot for the dependent (target) variable**

In [None]:
# Class balance of the target: number of non-potable (0) vs. potable (1) samples.
ax = sns.countplot(data=df, x="Potability", saturation=0.7)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Non-Potable', 'Potable'])
plt.show()

**Pairplot based on Potability**

In [None]:
# Switch seaborn to the "ticks" theme (this affects all subsequent plots as well).
sns.set_theme(style="ticks")
# Scatter-plot matrix of every column pair, coloured by the Potability class.
sns.pairplot(df,hue="Potability")

**Density Curve**

In [None]:
# Draw the kernel-density estimate of every numeric column.
# sns.displot is figure-level and creates its own figure, so plt.figure() is not
# called here; doing so would only leave an empty extra figure per column.
for col in df.select_dtypes(include=['float64', 'int64']):
    sns.displot(df[col], kind='kde', height=3)
    plt.show()

# Distribution of the target. sns.distplot is deprecated; histplot with
# stat='density', kde=True and kde_kws=dict(cut=3) is its axes-level replacement.
fig, ax = plt.subplots(figsize=(5, 5))
sns.histplot(df['Potability'], kde=True, stat='density', kde_kws=dict(cut=3), ax=ax)
plt.show()

In [None]:
# Bivariate KDE of Hardness (x) against ph (y), with a separate density per Potability class.
sns.displot(df, x="Hardness", y="ph", kind="kde",hue="Potability")

**Pairwise plots - PairGrid**

In [None]:
# Pairwise grid over all columns: histograms above the diagonal, filled KDE
# contours below it, and per-column histograms with a KDE overlay on the diagonal.
g = sns.PairGrid(df)
g.map_upper(sns.histplot).map_lower(sns.kdeplot, fill=True)
g.map_diag(sns.histplot, kde=True)

# **Handling Null Values**

In [None]:
# Imputing missing values
# Mean-impute the three columns that contain missing values.
# NOTE(review): the means are computed over the full dataset, before the
# train/test split, so test rows influence the imputed values — consider
# fitting the imputation on the training split only.
for column in ('ph', 'Sulfate', 'Trihalomethanes'):
    df[column] = df[column].fillna(df[column].mean())

In [None]:
# Re-count missing values per column to confirm the imputation left none behind.
df.isnull().sum()

**Splitting data into independent and dependent variables**

In [None]:
# Separate the target column from the feature matrix and report both shapes.
y = df['Potability']
X = df.drop(columns='Potability')
print(X.shape, y.shape)

In [None]:
# Compare the distribution of every feature between potable and non-potable samples.
potable = df[y == 1]
nonpotable = df[y == 0]

# Skip the target itself: it is constant within each subset, so there is no
# density to estimate for it.
feature_cols = df.select_dtypes(include=['float64', 'int64']).columns.drop('Potability')

for col in feature_cols:
    fig, ax = plt.subplots(figsize=(4, 4))
    # sns.distplot is deprecated; histplot(stat='density', kde=True) is its
    # axes-level replacement. Colours are set explicitly so the classes differ.
    sns.histplot(potable[col], kde=True, stat='density', kde_kws=dict(cut=3),
                 color='C0', alpha=0.4, label='Potable', ax=ax)
    sns.histplot(nonpotable[col], kde=True, stat='density', kde_kws=dict(cut=3),
                 color='C1', alpha=0.4, label='Non Potable', ax=ax)
    ax.legend()
    plt.show()

# **Splitting data into Test and Train**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as a test set; the fixed seed keeps the split reproducible.
TARGET = 'Potability'
X, y = df.drop(columns=TARGET), df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4
)

# **Model Selection**
**Importing Models**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler

**Making Pipeline**

In [None]:
def _scaled(estimator):
    """Return a pipeline that robust-scales the features before fitting `estimator`.

    A fresh RobustScaler is created on every call. Pipelines do not clone their
    steps, so reusing one scaler object across several pipelines would make
    fitting any one of them refit the scaler inside all the others.
    """
    return make_pipeline(RobustScaler(), estimator)


# Kept so that any other cell referencing `preprocessor` keeps working.
preprocessor = make_pipeline(RobustScaler())

# NOTE(review): a float min_samples_leaf is a fraction of the training samples,
# so 0.16 forces every leaf to hold at least 16% of the rows — confirm this
# strong regularisation is intended.
RandomPipeline = _scaled(RandomForestClassifier(n_estimators=300, min_samples_leaf=0.16, random_state=42))
# The single tree and XGBoost are fitted on unscaled features.
DescisionPipeline = make_pipeline(DecisionTreeClassifier(max_depth=5, random_state=42))
xgPipeline = make_pipeline(XGBClassifier(max_depth=8, n_estimators=125, random_state=0, learning_rate=0.03, n_jobs=5))
# probability=True is required because the evaluation step calls predict_proba.
SVMPipeline = _scaled(SVC(kernel='rbf', random_state=42, probability=True))
KNNPipeline = _scaled(KNeighborsClassifier(n_neighbors=9, leaf_size=20))
LRPipeline = _scaled(LogisticRegression(max_iter=120, random_state=0, n_jobs=20))

**Model Dictionary**

In [None]:
# Model label -> pipeline. Iteration order is the order in which models are evaluated.
dict_of_models = {
    'RandomForest': RandomPipeline,
    'XGBoost': xgPipeline,
    'SVM': SVMPipeline,
    'KNN': KNNPipeline,
    # This pipeline wraps a single DecisionTreeClassifier, not a forest, so it is
    # labelled as a tree in the printed results.
    'DecisionTree': DescisionPipeline,
    'LR': LRPipeline,
}

# **Model Evaluation**

In [None]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import learning_curve

def evaluation(model):
    """Fit `model`, print its test-set metrics and plot its learning curve.

    Uses the notebook-level X_train, X_test, y_train and y_test.

    Parameters
    ----------
    model : estimator or Pipeline
        Must expose fit, predict_proba and classes_.

    Returns
    -------
    float
        Accuracy of the fitted model on the test split.
    """
    model.fit(X_train, y_train)
    # calculating the probabilities
    y_pred_proba = model.predict_proba(X_test)

    # argmax yields a column index into predict_proba's output; map it through
    # classes_ so the prediction is an actual class label even when the labels
    # are not exactly 0..k-1.
    y_pred = model.classes_[np.argmax(y_pred_proba, axis=1)]
    acc = accuracy_score(y_test, y_pred)
    print('Accuracy = ',acc)
    print('-')
    print(confusion_matrix(y_test,y_pred))
    print('-')
    print(classification_report(y_test,y_pred))
    print('-')

    # 4-fold cross-validated F1 at ten training-set sizes from 10% to 100%.
    N, train_score, val_score = learning_curve(model, X_train, y_train,
                                               cv=4, scoring='f1',
                                               train_sizes=np.linspace(0.1,1,10))

    # Title the figure with the final estimator's class name so that the curves
    # of different models can be told apart.
    final_estimator = model.steps[-1][1] if hasattr(model, 'steps') else model
    fig, ax = plt.subplots(figsize=(12,8))
    ax.plot(N, train_score.mean(axis=1), label='train score')
    ax.plot(N, val_score.mean(axis=1), label='validation score')
    ax.set_title('Learning curve - ' + type(final_estimator).__name__)
    ax.set_xlabel('Training set size')
    ax.set_ylabel('F1 score')
    ax.legend()
    # Render now so each figure appears next to its model's printed metrics.
    plt.show()
    return acc
# Evaluate every model in turn; `test` collects one {model name: accuracy} dict per model.
test = []
for name, model in dict_of_models.items():
    print('---------------------------------')
    print(name)
    t_acc = {name: evaluation(model)}
    print(t_acc)
    test.append(t_acc)

# **Model Accuracy Comparison**

In [None]:
# Human-readable labels for the model keys (both spellings of the decision-tree
# key are accepted); unknown keys fall back to the key itself.
display_names = {
    'LR': 'Logistic Regression',
    'DescisionForest': 'Decision Tree',
    'DecisionTree': 'Decision Tree',
    'RandomForest': 'Random Forest',
    'XGBoost': 'XGBoost',
    'KNN': 'KNeighbours',
    'SVM': 'SVM',
}

# `test` is a list of one-entry {model key: accuracy} dicts. Flatten it by key
# rather than by hard-coded list position so labels and scores cannot drift apart.
scores = {
    display_names.get(key, key): acc
    for entry in test
    for key, acc in entry.items()
}

# One row per model, best accuracy first.
models = (
    pd.Series(scores, name='Accuracy Score')
    .rename_axis('Model')
    .to_frame()
    .sort_values(by='Accuracy Score', ascending=False)
)
index = models.index

fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(x='Accuracy Score', y='Model', data=models.reset_index(), ax=ax)
ax.set_title('Test accuracy by model')
plt.show()

models

As observed, XGBoost holds the best accuracy compared to the other models, viz. 67.39%.