In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,plot_roc_curve
import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print every file's full path.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the water-potability CSV into `df`; the cell output is its
# (rows, columns) shape.
df = pd.read_csv(
    filepath_or_buffer='../input/water-potability/water_potability.csv',
)
df.shape

First Five rows of dataset

In [None]:
# Preview the first five rows (5 is DataFrame.head's default).
df.head(n=5)

Last five rows of dataset

## **Data Preprocessing**

In [None]:
# Print pandas' concise summary of `df` (DataFrame.info) as a first look at
# column types and missing data.
df.info()

In [None]:
# Show pandas' descriptive statistics (DataFrame.describe) for `df`.
df.describe()

**Null values in dataset**

In [None]:
# Count missing values per column (`isna` is the same method as `isnull`).
df.isna().sum()

In [None]:
# Heatmap of the boolean missing-value mask: one row per sample, one column
# per feature. Row tick labels and the colour bar are switched off.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, ax=ax)

Replacing Null values with mean

In [None]:
# Fill missing values in `ph`, `Sulfate` and `Trihalomethanes` with the
# respective column mean.
#
# `DataFrame.fillna` returns a NEW frame, so `ndf` is an independent, imputed
# copy and the raw `df` (shown in the cells above) is left untouched. The
# previous `ndf = df` only created a second name for the same object, so the
# raw data was silently overwritten.
#
# NOTE(review): the means are computed over the whole dataset, i.e. before the
# train/test split performed later in this notebook, so test rows influence
# the imputed values. Consider imputing after the split.
ndf = df.fillna(df[['ph', 'Sulfate', 'Trihalomethanes']].mean())

In [None]:
# Re-count missing values per column on the imputed frame
# (`isna` is the same method as `isnull`).
ndf.isna().sum()

####  **Visualization**

Count of Target variable

In [None]:
# Bar chart of how many rows fall into each `Potability` class.
sns.countplot(
    data=ndf,
    x="Potability",
    palette="Set3",
)

Distribution of the feature variables

In [None]:
# Plot the distribution (histogram + KDE overlay) of every feature column,
# three panels per row.
#
# * `sns.histplot(..., kde=True, stat="density")` replaces the deprecated
#   `sns.distplot`.
# * The data is read from `cdf` (the feature-only frame built here) instead of
#   reaching back to `df`.
# * The number of grid rows is derived from the number of columns, so the cell
#   does not fail if the feature count differs from nine.
cdf = ndf.drop('Potability', axis=1)
n_cols = 3
n_rows = max(1, -(-len(cdf.columns) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 12), squeeze=False)
panels = axes.ravel()
for ax, column in zip(panels, cdf.columns):
    sns.histplot(cdf[column], kde=True, stat="density", ax=ax)
# Hide any panels left over when the grid has more slots than features.
for ax in panels[len(cdf.columns):]:
    ax.set_visible(False)
fig.tight_layout()

Distribution of each feature column over the target variable

In [None]:
# Stacked KDE of the `ph` column, one layer per `Potability` class.
sns.displot(
    data=ndf,
    x="ph",
    hue="Potability",
    kind="kde",
    multiple="stack",
)

In [None]:
# Stacked KDE of the `Hardness` column, one layer per `Potability` class.
sns.displot(
    data=ndf,
    x="Hardness",
    hue="Potability",
    kind="kde",
    multiple="stack",
)

In [None]:
# Stacked KDE of the `Solids` column, one layer per `Potability` class.
sns.displot(
    data=ndf,
    x="Solids",
    hue="Potability",
    kind="kde",
    multiple="stack",
)

In [None]:
# Stacked KDE of the `Chloramines` column, one layer per `Potability` class.
sns.displot(
    data=ndf,
    x="Chloramines",
    hue="Potability",
    kind="kde",
    multiple="stack",
)

In [None]:
# Stacked KDE of the `Sulfate` column, one layer per `Potability` class.
sns.displot(
    data=ndf,
    x="Sulfate",
    hue="Potability",
    kind="kde",
    multiple="stack",
)

In [None]:
# Stacked KDE of the `Conductivity` column, one layer per `Potability` class.
sns.displot(
    data=ndf,
    x="Conductivity",
    hue="Potability",
    kind="kde",
    multiple="stack",
)

Pairwise relationships: plot of every pair of variables, with bivariate graphs off the diagonal and univariate graphs on the diagonal

In [None]:
# Pairwise scatter plots of all columns, coloured by `Potability`, with
# histograms on the diagonal.
#
# `sns.pairplot` is a figure-level function: it always creates its own figure.
# The former `plt.figure(figsize=(10,8), dpi=80)` call therefore had no effect
# on the plot and only produced an extra, empty figure, so it was removed.
sns.pairplot(ndf, hue="Potability", diag_kind="hist")
plt.show()

ph and Hardness with distribution over Potability

In [None]:
# Joint plot of `Hardness` (x) against `ph` (y), coloured by `Potability`.
sns.jointplot(
    x="Hardness",
    y="ph",
    hue="Potability",
    data=ndf,
)

In [None]:
# Bivariate KDE of `ph` (x) against `Hardness` (y), one contour set per
# `Potability` class.
sns.displot(
    data=ndf,
    x="ph",
    y="Hardness",
    hue="Potability",
    kind="kde",
)

## **Modeling**

Splitting the feature and target variables

In [None]:
# Target vector: an independent copy of the `Potability` column.
y = ndf['Potability'].copy()
# Feature matrix: every column except the target.
X = ndf.drop(columns='Potability')

**Data Splitting and Scaling**

Split percentages: **Train: 80%, Test: 20%**


In [None]:
# Shuffled 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Candidate classifiers to compare, keyed by display name.
# Each estimator gets a fixed `random_state` (the same value used for the
# train/test split) so that the comparison printed by the next cell is
# reproducible across kernel restarts.
models = {
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'LGBMClassifier': LGBMClassifier(random_state=42),
}

In [None]:
# Fit every candidate model on the scaled training data and report its
# accuracy (as a percentage) on both the training and the test partition.
# The raw scores (fractions in [0, 1]) are collected in `training_scores` and
# `testing_scores`, in the iteration order of `models`.
training_scores = []
testing_scores = []

for name, estimator in models.items():
    estimator.fit(X_train_scaled, y_train)
    train_score = estimator.score(X_train_scaled, y_train)
    test_score = estimator.score(X_test_scaled, y_test)
    training_scores.append(train_score)
    testing_scores.append(test_score)

    print(f"{name}\n")
    print("Training Accuracy: {0:.3f}".format(train_score*100))
    # Fixed label: this line reports the TEST score (it was mislabelled as
    # "Training Accuracy").
    print("Testing Accuracy: {0:.3f} \n".format(test_score*100))

**RandomForestClassifier** and **LGBMClassifier** perform well.

#### Hyperparameter Tuning of **Random Forest Classifier** 

In [None]:
# Exhaustive grid search over `n_estimators` x `max_features` for a random
# forest, scored by accuracy with repeated stratified 10-fold CV (3 repeats).
# That is 6 parameter combinations x 30 folds = 180 fits, so this cell is slow.
#
# The estimator is seeded as well as the CV splitter; without a seed on the
# forest itself the selected "best" parameters could change between runs.
model = RandomForestClassifier(random_state=42)
n_estimators = [10, 100, 500]
max_features = ['sqrt', 'log2']

# Grid search
grid = dict(n_estimators=n_estimators, max_features=max_features)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# error_score=0: a parameter combination whose fit raises is scored 0 instead
# of aborting the whole search.
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X_train_scaled, y_train)

# Results: best combination first, then mean (std) accuracy for every one.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Train the random forest with the hyperparameters chosen above and report
# train/test accuracy as percentages.
# `random_state` is fixed so the reported numbers (and the ROC curve drawn from
# `rclf` later) are reproducible across runs.
rclf = RandomForestClassifier(max_features='log2', n_estimators=500, random_state=42)
rclf.fit(X_train_scaled, y_train)
y_pred = rclf.predict(X_test_scaled)
print("Training Accuracy: {0:.3f}".format(rclf.score(X_train_scaled, y_train)*100))
print("Testing Accuracy: {0:.3f}".format(accuracy_score(y_test, y_pred)*100))


In [None]:
# Exhaustive grid search over `max_bin` x `learning_rate` x `num_leaves` for a
# DART-boosted LightGBM classifier, scored by accuracy with repeated
# stratified 10-fold CV (3 repeats).
# That is 4 * 4 * 3 = 48 parameter combinations x 30 folds = 1440 fits, so this
# cell is slow.
model = LGBMClassifier(boosting_type='dart')
max_bin = [255, 300, 350, 450]
lr = [0.01, 0.001, 0.0001, 0.11]
num_leaves = [31, 100, 250]

grid = dict(max_bin=max_bin, learning_rate=lr, num_leaves=num_leaves)
# Seed the splitter (same seed as the random-forest search) so the folds, and
# therefore the selected parameters, are reproducible.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# error_score=0: a parameter combination whose fit raises is scored 0 instead
# of aborting the whole search.
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X_train_scaled, y_train)

# Results: best combination first, then mean (std) accuracy for every one.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Train the DART LightGBM classifier with fixed hyperparameters and report
# train/test accuracy as percentages.
lgb = LGBMClassifier(
    boosting_type='dart',
    num_leaves=31,
    max_bin=255,
    learning_rate=0.11,
)
lgb.fit(X_train_scaled, y_train)
y_pred = lgb.predict(X_test_scaled)

train_accuracy = lgb.score(X_train_scaled, y_train)
test_accuracy = accuracy_score(y_test, y_pred)
print("Training Accuracy: {0:.3f}".format(train_accuracy * 100))
print("Testing Accuracy: {0:.3f}".format(test_accuracy * 100))

In [None]:
# Overlay the test-set ROC curves of the two tuned classifiers on one axes.
#
# The list gets its own name instead of rebinding `models`: that name holds the
# dict of candidate estimators used by the scoring cell, and overwriting it
# with a list made that cell fail (`list` has no `.items()`) when re-run.
#
# NOTE(review): `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed
# in 1.2; `RocCurveDisplay.from_estimator` is the replacement. Switching also
# requires changing the import at the top of the notebook - confirm the
# installed scikit-learn version first.
tuned_models = [lgb, rclf]
fig, ax = plt.subplots()
for estimator in tuned_models:
    plot_roc_curve(estimator, X_test_scaled, y_test, ax=ax)
ax.set_title("ROC curves on the test set")
plt.show()
    
