In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'darkgrid' style to the plots drawn below.
sns.set_style('darkgrid')

import warnings
# NOTE(review): this hides every warning category, so deprecation and
# convergence warnings from the libraries used below are not shown.
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        file_path = os.path.join(root, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data

In [None]:
# Location of the CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/water-potability/water_potability.csv'

df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

# Null-Values

In [None]:
# Distribution of `ph`, inspected before its missing values are filled in.
ax = df['ph'].hist()
ax.set(title='ph', xlabel='ph', ylabel='Count')
plt.show()  # also keeps the Axes repr out of the cell output

In [None]:
# Fill missing `ph` values with the column mean.
# NOTE(review): the mean is taken over every row, including rows that later
# end up in the test split.
ph_mean = df['ph'].mean()
df['ph'] = df['ph'].fillna(ph_mean)

In [None]:
# Distribution of `Sulfate`, inspected before its missing values are filled in.
ax = df['Sulfate'].hist()
ax.set(title='Sulfate', xlabel='Sulfate', ylabel='Count')
plt.show()  # also keeps the Axes repr out of the cell output

In [None]:
# Fill missing `Sulfate` values with the column mean.
# NOTE(review): the mean is taken over every row, including rows that later
# end up in the test split.
sulfate_mean = df['Sulfate'].mean()
df['Sulfate'] = df['Sulfate'].fillna(sulfate_mean)

In [None]:
# Distribution of `Trihalomethanes`, inspected before its missing values are filled in.
ax = df['Trihalomethanes'].hist()
ax.set(title='Trihalomethanes', xlabel='Trihalomethanes', ylabel='Count')
plt.show()  # also keeps the Axes repr out of the cell output

In [None]:
# Fill missing `Trihalomethanes` values with the column mean.
# NOTE(review): the mean is taken over every row, including rows that later
# end up in the test split.
trihalomethanes_mean = df['Trihalomethanes'].mean()
df['Trihalomethanes'] = df['Trihalomethanes'].fillna(trihalomethanes_mean)

# Exploratory Data Analysis

In [None]:
# One histogram per column, laid out on a single 15x15 inch figure.
hist_axes = df.hist(figsize=(15, 15))
plt.show()

**The feature distributions already look approximately normal (bell-shaped)**

In [None]:
# Pairwise plots of the numeric columns, coloured by the target class.
sns.pairplot(data=df, hue='Potability')

**Yes, the features look well distributed and approximately normal**

In [None]:
# Annotated correlation matrix of all columns, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(df.corr(), annot=True, ax=ax)

**No pair of features is highly correlated**

In [None]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed so the row order, and every score derived
# from it further down, is the same on each Restart & Run All.
df = shuffle(df, random_state=42)
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
target = 'Potability'
y = df[target]
X = df.drop(columns=[target])

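**We still standardize the features (zero mean, unit variance), because several of the models below (KNN, SVC, logistic regression) are sensitive to feature scale**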

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature; X becomes a plain NumPy array here.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split made further down, so test rows contribute to the mean/std used.
ss = StandardScaler().fit(X)
X = ss.transform(X)

In [None]:
# Wrap the scaled array back into a labelled frame. Reusing df's (shuffled)
# index keeps X row-aligned with y, which is a column of df; without it X would
# get a fresh 0..n-1 index and any index-based operation combining X and y
# would silently pair the wrong rows.
feature_names = df.columns.drop('Potability')
X = pd.DataFrame(X, columns=feature_names, index=df.index)

In [None]:
# Preview the first rows of the standardized feature frame.
X.head()

In [None]:
# Stratified 80/20 hold-out split. The fixed seed makes the split, and the
# scores reported for it below, identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Training our Models

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

In [None]:
# Candidate classifiers. The stochastic tree-based models get a fixed seed so
# their scores are repeatable across runs.
estimators = [
    KNeighborsClassifier(),
    LogisticRegression(),
    RandomForestClassifier(random_state=42),
    GaussianNB(),
    DecisionTreeClassifier(random_state=42),
    XGBClassifier(),
    SVC(),
]

# Key each estimator by its own class name, so a label can never get out of
# step with the model it describes (same keys and order as before).
models = {type(estimator).__name__: estimator for estimator in estimators}

In [None]:
# Accuracy of each candidate on the training and the hold-out split, in the
# iteration order of `models`.
training_scores = []
testing_scores = []

# The loop uses `name`/`model` rather than `key`/`value`, so it does not
# overwrite the `key` and `value` lists that `models` was built from.
for name, model in models.items():
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    training_scores.append(train_score)
    testing_scores.append(test_score)

    print(f"{name}\n")
    print(f"Training Score: {train_score}")
    print(f"Testing Score: {test_score} \n")

In [None]:
from sklearn.model_selection import cross_val_score

# Mean 5-fold cross-validation accuracy of each candidate, in the iteration
# order of `models`.
# NOTE(review): X was standardized using all rows, so each validation fold
# contributed to the scaling statistics.
cv_scores = []

# `name`/`model` rather than `key`/`value`, so the loop does not overwrite the
# `key` and `value` lists that `models` was built from.
for name, model in models.items():
    mean_cv_score = cross_val_score(model, X, y, cv=5).mean()  # computed once, reused below
    cv_scores.append(mean_cv_score)

    print(f"{name}\n")
    print(f"CV Score: {mean_cv_score} \n")

# SVC and RandomForestClassifier are performing best

# Hyperparameter Tuning

In [None]:
# Measure how much the hold-out accuracy of SVC and RandomForest moves with the
# seed of the train/test split (seeds 1..149).
random_states = list(range(1, 150))

svc = SVC()
rfc = RandomForestClassifier()

svc_acc, rfc_acc = [], []

for seed in random_states:
    xtrain, xtest, ytrain, ytest = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=seed
    )
    # Both models are fitted before either is scored, as in the original order.
    svc.fit(xtrain, ytrain)
    rfc.fit(xtrain, ytrain)
    svc_acc.append(svc.score(xtest, ytest))
    rfc_acc.append(rfc.score(xtest, ytest))
    



In [None]:
# SVC hold-out accuracy for each split seed tried above.
fig, ax = plt.subplots()
ax.plot(random_states, svc_acc)
ax.set(title='SVC: test accuracy per split seed',
       xlabel='random_state of the split', ylabel='Test accuracy')
plt.show()  # also keeps the Line2D repr out of the cell output

In [None]:
# RandomForest hold-out accuracy for each split seed tried above.
fig, ax = plt.subplots()
ax.plot(random_states, rfc_acc)
ax.set(title='RandomForest: test accuracy per split seed',
       xlabel='random_state of the split', ylabel='Test accuracy')
plt.show()  # also keeps the Line2D repr out of the cell output

In [None]:
# Position of the first highest SVC accuracy in the sweep.
int(np.argmax(svc_acc))

In [None]:
# Position of the first highest RandomForest accuracy in the sweep.
m = int(np.argmax(rfc_acc))

In [None]:
# Seed used below for the final split, the forest and the randomized search.
# It is deliberately a fixed constant: picking `random_states[m]`, the split
# on which RandomForest happened to score best on its own test set, would tune
# the evaluation on the test data and report an optimistically biased accuracy.
r = 42

**Overall, RandomForest seems to be working slightly better than SVC**

In [None]:
# Recreate the hold-out split with seed `r`, then fit and score the forest on it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=r
)

rfc.fit(X_train, y_train).score(X_test, y_test)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Fresh forest seeded with `r`; this is the estimator the search will tune.
rfc = RandomForestClassifier(random_state=r)

# Candidate values sampled by the randomized search.
params = dict(
    n_estimators=[10, 100, 200, 500],
    max_depth=[5, 10, 20, 80],
    min_samples_leaf=[1, 10, 25],
)

# NOTE(review): the name `random` would shadow the stdlib `random` module if
# that were ever imported in this notebook.
random = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=params,
    cv=5,
    random_state=r,
)

In [None]:
# Run the randomized search with 5-fold CV on the training split only.
random.fit(X_train,y_train)

In [None]:
# Keep the estimator the search selected as best.
best= random.best_estimator_

In [None]:
# `best` comes from RandomizedSearchCV, which (refit=True by default) has
# already refitted it on X_train/y_train, so it is not trained a second time.
y_pred = best.predict(X_test)

# Accuracy of the tuned forest on the hold-out split.
best.score(X_test, y_test)

# Result

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# `plot_roc_curve` was removed in scikit-learn 1.2. Importing it unguarded makes
# this whole cell fail there, so fall back to its replacement under the old name.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

# Per-class precision/recall/F1 and the confusion matrix on the hold-out split.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


In [None]:
# `plot_roc_curve` was removed in scikit-learn 1.2; `RocCurveDisplay.from_estimator`
# is its replacement. It is imported here so this cell does not rely on the old name.
from sklearn.metrics import RocCurveDisplay

# ROC curve of the tuned forest on the hold-out split.
RocCurveDisplay.from_estimator(best, X_test, y_test)
plt.show()  # also keeps the display object's repr out of the cell output

# Upvote the notebook if you liked it :)