In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
warnings.filterwarnings('ignore')

from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt, seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score,\
RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, plot_roc_curve,\
plot_confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
import xgboost as xgb

import math

from sklearn.pipeline import Pipeline

from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [None]:
# Load the water-potability CSV and preview its first rows.
csv_path = '/kaggle/input/water-potability/water_potability.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Share of missing values per column, as a percentage of all rows.
null = df.isna().sum().div(len(df)).mul(100)
print('Percentage of null values \n\n', null)

In [None]:
# Columns with fewer than 15% missing values are imputed with their own median.
null_index = null[null < 15].index
column_medians = df[null_index].median()
for col, med in column_medians.items():
    df[col] = df[col].fillna(med)

df.head()

In [None]:
# Recompute the per-column missing-value percentage after imputation.
null = df.isna().sum().div(len(df)).mul(100)
print('Percentage of null values \n\n', null)

In [None]:
# Columns with 15% or more missing values are not imputed; rows where any of
# them is missing are dropped instead.
# `>=` (rather than `>`) makes this the exact complement of the `< 15` rule
# used for median imputation, so a column sitting at exactly 15% cannot slip
# through both steps with its NaNs intact.
null_index = null[null >= 15].index

if len(null_index) > 0:
    # One pass over all affected columns; the original row index is kept.
    df = df.dropna(subset=list(null_index))

In [None]:
# Inspect column dtypes and non-null counts after the missing-value handling.
df.info()

In [None]:
# Class distribution of the Potability target before any resampling.
df['Potability'].value_counts()

In [None]:
# Balance the Potability classes by oversampling with SMOTE, then rebuild `df`
# from the resampled features and target (target column placed last).
#
# NOTE(review): the resampling is applied to the whole frame, before the
# train/test split performed later in the notebook. Synthetic rows can
# therefore land in the test set and in the CV validation folds, which may
# make the reported scores optimistic. Consider resampling the training
# portion only.

# Select the features by name instead of by position so the cell does not
# depend on Potability being the last column (matches the later
# drop('Potability') used when splitting).
X = df.drop(columns='Potability')
y = df.Potability

smote = SMOTE(random_state=72)
X_smote, y_smote = smote.fit_resample(X, y)

df = pd.concat([X_smote, y_smote], axis=1)
df.head()

In [None]:
# Class distribution of the Potability target after SMOTE resampling.
df['Potability'].value_counts()

In [None]:
# Re-check dtypes and non-null counts now that `df` holds the resampled data.
df.info()

In [None]:
# Same class-distribution check as two cells earlier (output is unchanged).
df['Potability'].value_counts()

In [None]:
# sns.pairplot(df)

In [None]:
# Draw one horizontal box plot per feature to eyeball outliers.
# Every column except the last one (the Potability target) is plotted.
outlier_cols = df.columns[:-1]

# Size the subplot grid from the number of features instead of assuming 3x3,
# so extra columns do not overflow the grid.
n_grid_cols = 3
n_grid_rows = math.ceil(len(outlier_cols) / n_grid_cols)

plt.figure(figsize=(20,8))
for position, col in enumerate(outlier_cols, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    # Pass the series explicitly as `x`: a bare positional argument is read as
    # `x` by older seaborn releases but as `data` by newer ones, which changes
    # the orientation of the plot.
    sns.boxplot(x=df[col])
plt.show()

In [None]:
# (rows, columns) of the resampled frame that is split below.
df.shape

## Splitting and scaling data

In [None]:
# Keep 70% of the rows for training and the rest for testing; the fixed seed
# makes the split repeatable.
train, test = train_test_split(df, train_size=0.7, random_state=100)

# Separate the features from the Potability target in each partition.
target_col = 'Potability'
X_train, y_train = train.drop(columns=target_col), train[target_col]
X_test, y_test = test.drop(columns=target_col), test[target_col]





In [None]:
# Fit the min-max scaler on the training features only, then reuse the fitted
# scaler on the test features so no test statistics leak into the scaling.
scaler = MinMaxScaler()

train_scaled = scaler.fit_transform(X_train)
X_train[X_train.columns] = train_scaled

test_scaled = scaler.transform(X_test)
X_test[X_test.columns] = test_scaled

In [None]:
# Summary statistics of the scaled training features.
X_train.describe()

In [None]:
# Summary statistics of the scaled test features; values may fall slightly
# outside [0, 1] because the scaler was fitted on the training set only.
X_test.describe()

## Logistic Regression

In [None]:
# Baseline: default logistic regression, mean accuracy over 5 shuffled
# stratified folds of the training set.
lr = LogisticRegression()
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
cv_score = cross_val_score(lr, X_train, y_train, scoring='accuracy', cv=folds)
cv_score.mean()

In [None]:
# Tune logistic regression with an exhaustive grid search
# (5 shuffled stratified folds, accuracy).
#
# The grid is split per penalty so that every candidate is a valid
# configuration; with warnings silenced, invalid combinations would otherwise
# fail quietly and be scored as NaN:
#   * 'l1' and 'elasticnet' are not supported by the default lbfgs solver, so
#     those candidates use saga, which handles l1, l2 and elasticnet alike.
#   * 'elasticnet' additionally requires an l1_ratio.
#   * C is the inverse regularisation strength and must be strictly positive,
#     so the value 0 is not included.
#   * an unpenalised fit ignores C, so it is listed once instead of per C.
# max_iter is raised because saga often needs more than the default 100
# iterations to converge; random_state makes saga's data shuffling repeatable.
lg_reg = LogisticRegression(max_iter=1000, random_state=60)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=60)

C_values = [0.0001,0.01,0.2,0.4,0.6,0.8]
hyper_Param = [
    {'penalty': ['l1', 'l2'], 'solver': ['saga'], 'C': C_values},
    {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.25, 0.5, 0.75],
     'C': C_values},
    # NOTE(review): 'none' is the spelling used by the scikit-learn releases
    # that still provide plot_confusion_matrix (imported at the top of this
    # notebook); newer releases expect penalty=None instead.
    {'penalty': ['none'], 'solver': ['lbfgs']},
]

grid = GridSearchCV(estimator=lg_reg, param_grid=hyper_Param, cv=folds, scoring='accuracy',
                   verbose=1, return_train_score=True)
grid.fit(X_train, y_train)

In [None]:
# Mean cross-validated accuracy of the best logistic-regression candidate.
grid.best_score_

## XGBoost

In [None]:
# Baseline: XGBoost with default hyper-parameters, mean accuracy over
# 5 shuffled stratified folds of the training set.
# 'logloss' is the binary log-loss metric; 'mlogloss' is its multi-class
# counterpart, whereas the Potability target appears to be binary (0/1).
# No eval_set is passed, so the metric does not influence the fitted model.
xb = xgb.XGBClassifier(random_state=20, eval_metric='logloss')
folds = StratifiedKFold(n_splits=5,shuffle=True,random_state=100)
cv_score = cross_val_score(estimator=xb, X=X_train, y=y_train, cv=folds, scoring='accuracy')
cv_score.mean()

In [None]:
# Randomised search over XGBoost hyper-parameters: 10 sampled candidates,
# each scored by accuracy on 5 shuffled stratified folds.
# 'logloss' is the binary log-loss metric; 'mlogloss' is its multi-class
# counterpart, whereas the Potability target appears to be binary (0/1).
xb = xgb.XGBClassifier(random_state=20, eval_metric='logloss')
folds = StratifiedKFold(n_splits=5,shuffle=True,random_state=100)
hyper_Params = {'n_estimators':[100,200,300,400],
                'learning_rate': [0.05,0.08,0.1,0.2,0.3],
               'max_depth': [3,5,7,10,13,15,20],
               'gamma': [0,0.1,0.3,0.5,0.7],
               'reg_lambda':[0.1,0.2,0.4,0.6,0.8]}

# random_state pins which candidates get sampled, so a re-run of the notebook
# evaluates the same 10 configurations and reports the same best score.
rnd_xb = RandomizedSearchCV(estimator=xb, param_distributions=hyper_Params, n_iter=10, n_jobs=-1, cv=folds,
                        scoring='accuracy', verbose=3, random_state=100)
rnd_xb.fit(X_train, y_train)

In [None]:
# Mean cross-validated accuracy of the best sampled XGBoost candidate.
rnd_xb.best_score_

## SVM

In [None]:
# Baseline: default support-vector classifier, mean accuracy over
# 5 shuffled stratified folds of the training set.
sv = SVC()
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
cv_score = cross_val_score(sv, X_train, y_train, scoring='accuracy', cv=folds)
cv_score.mean()

In [None]:
# Randomised search over SVC hyper-parameters: 100 candidates sampled from
# the kernel x gamma x C grid, each scored by accuracy on 5 shuffled
# stratified folds.
sv = SVC()
folds = StratifiedKFold(n_splits=5,shuffle=True,random_state=100)
# NOTE(review): gamma is only used by the rbf and poly kernels here, so
# sampled 'linear' candidates that differ only in gamma are effectively
# duplicates of each other.
hyper_Params = [ {'kernel': ['rbf','poly','linear'],
                    'gamma': [1e-2, 1e-3, 1e-4, 0.1, 0.2, 0.5, 0.8,0.85,0.9,1],
                     'C': [1, 10, 100, 1000]}]

# random_state pins which candidates get sampled, so a re-run of the notebook
# evaluates the same configurations and reports the same best score.
rnd_svc = RandomizedSearchCV(estimator=sv, param_distributions=hyper_Params, n_iter=100, n_jobs=-1, cv=folds,
                        scoring='accuracy', verbose=3, random_state=100)
rnd_svc.fit(X_train, y_train)

In [None]:
# Mean cross-validated accuracy of the best sampled SVC candidate.
rnd_svc.best_score_

## Random Forest

In [None]:
# Baseline: random forest with default hyper-parameters (seeded), mean
# accuracy over 5 shuffled stratified folds of the training set.
rf = RandomForestClassifier(random_state=60)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
cv_score = cross_val_score(rf, X_train, y_train, scoring='accuracy', cv=folds)
cv_score.mean()

In [None]:
# Exhaustive grid search over random-forest hyper-parameters, scored by
# accuracy on 5 shuffled stratified folds.
rf = RandomForestClassifier(random_state=60)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)

hyper_Params = dict(
    criterion=['gini', 'entropy'],
    min_samples_split=[50, 60, 100],
    min_samples_leaf=[10, 15, 25, 30],
    max_depth=[2, 4, 7, 10, 15],
)

grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=hyper_Params,
    scoring='accuracy',
    cv=folds,
    n_jobs=-1,
    verbose=3,
)
grid_rf.fit(X_train, y_train)

In [None]:
# Mean cross-validated accuracy of the best random-forest grid candidate.
grid_rf.best_score_

In [None]:
# Combine the four model families into a hard-voting (majority-label)
# ensemble, fit it on the training set and score it on the held-out test set.
lr = Pipeline([('lr', grid.best_estimator_)])
xb = Pipeline([('xb', rnd_xb.best_estimator_)])
sv = Pipeline([('sv', rnd_svc.best_estimator_)])
# NOTE(review): unlike the other three members, the forest here is a fresh
# default model rather than grid_rf.best_estimator_ - confirm this is
# intentional.
rf = Pipeline([('rf', RandomForestClassifier(random_state=60))])

models = [('lr', lr), ('xb', xb), ('sv', sv), ('rf', rf)]

# NOTE(review): with an even number of voters, tied votes are possible.
vote = VotingClassifier(estimators=models, voting='hard')
vote.fit(X_train, y_train)

y_pred = vote.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Plot the ensemble's confusion matrix on the held-out test set.
# NOTE(review): plot_confusion_matrix is only available in older scikit-learn
# releases; newer ones provide ConfusionMatrixDisplay.from_estimator instead.
plot_confusion_matrix(estimator=vote, X=X_test, y_true=y_test)
plt.show()

In [None]:
# Report specificity and accuracy of the ensemble on the test set.
conf = confusion_matrix(y_test, y_pred)

# Row 0 holds the samples whose true class is 0; its first entry counts those
# also predicted as 0, so the ratio below is the true-negative rate.
negatives_row = conf[0]
specificity = negatives_row[0] / negatives_row.sum()
accuracy = accuracy_score(y_test, y_pred)

print('Specificity =', round(specificity*100, 2), '%')
print('Accuracy =', round(accuracy*100, 2), '%')