In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from scipy import stats
import statistics as stat
import matplotlib.pyplot as plt

## Load Data

In [None]:
# Read the water-potability CSV; literal "?" cells are parsed as missing values.
DATA_PATH = "../input/water-potability/water_potability.csv"
df = pd.read_csv(DATA_PATH, na_values="?")
df.head()

## Check Missing Values

In [None]:
# Number of missing entries in each column.
df.isnull().sum()

## Impute Missing Values with the Column Mean

In [None]:
# Mean imputation: every missing entry is replaced by the mean of its own column.
column_means = df.mean()
df = df.fillna(column_means)
df.head()

## Outlier Detection and Handling

In [None]:
def z_score_method(df, variable_name, threshold=3):
    """Return the row positions of z-score outliers in one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    variable_name : str
        Name of the column of interest. It must hold numeric data.
    threshold : float, default 3
        A value is an outlier when the absolute value of its z-score is
        strictly greater than this number.

    Returns
    -------
    list of int
        Zero-based row positions (not index labels) of the outliers, in
        ascending order. Empty when there are none.

    Raises
    ------
    KeyError
        If ``variable_name`` is not a column of ``df``. (Previously an
        unknown name silently fell back to the first column.)
    """
    if variable_name not in df.columns:
        raise KeyError(f"column {variable_name!r} not found in dataframe")

    # Score only the requested column, as a plain float array: this avoids
    # re-scoring the entire frame on every call and keeps the result
    # positionally indexable regardless of what container scipy hands back.
    values = np.asarray(df[variable_name], dtype=float)
    z = np.abs(stats.zscore(values))

    # NaN z-scores (NaN input or zero variance) compare False and are
    # therefore never reported as outliers.
    return np.flatnonzero(z > threshold).tolist()

In [None]:
# Detect outliers for every column before changing anything, so that the
# detection for one column is not affected by replacements made in another.
# `outlier[i]` holds the outlier row positions of `df.columns[i]`;
# `col` lists the columns that have at least one outlier.
outlier = [z_score_method(df, k) for k in df.columns]
col = [k for k, rows in zip(df.columns, outlier) if rows]

# handle outlier: replace each outlier with the median of its column.
for name, rows in zip(df.columns, outlier):
    if not rows:
        continue
    # The median is computed once per column (from the values as they were
    # when detection ran) instead of being recomputed after every single
    # replacement.
    column_median = df[name].median()
    # Assign through .iloc: writing into `df[name].values` relies on that
    # array being a writable view of the frame, which pandas does not
    # guarantee.
    df.iloc[rows, df.columns.get_loc(name)] = column_median

## Correlation Matrix

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)

## Boxplot Visualization

In [None]:
# One boxplot per feature, laid out row by row on a 3x3 grid.
f, axes = plt.subplots(3, 3, figsize=(15,8))
boxplot_columns = [
    'ph', 'Hardness', 'Solids',
    'Chloramines', 'Sulfate', 'Conductivity',
    'Organic_carbon', 'Trihalomethanes', 'Turbidity',
]
for panel, column_name in zip(axes.flat, boxplot_columns):
    df.boxplot(column=[column_name], ax=panel)

In [None]:
def distributionPlot(dataset, ncols=3):
    """
    Creates a grid of distribution plots, one subplot per column of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are plotted, in column order.
    ncols : int, default 3
        Number of subplot columns in the grid; the number of rows is derived
        from it.

    Returns
    -------
    None
        The plots are drawn on a new 20x20 inch matplotlib figure.
    """
    n_plots = len(dataset.columns)
    # add_subplot requires integer grid dimensions; np.ceil returns a float,
    # so convert explicitly.
    nrows = int(np.ceil(n_plots / ncols))

    fig = plt.figure(figsize=(20, 20))
    for i in range(n_plots):
        ax = fig.add_subplot(nrows, ncols, i + 1)
        # NOTE(review): seaborn marks distplot as deprecated in recent
        # releases; consider histplot/kdeplot + rugplot when upgrading.
        sns.distplot(dataset.iloc[:, i], color="lightcoral", rug=True, ax=ax)
    # Lay the figure out once, after every subplot exists, rather than
    # re-running the layout on each iteration.
    fig.tight_layout(pad=3.0)

In [None]:
# Plot the distribution of every feature; the target column is excluded.
distributionPlot(df.drop(columns=['Potability']))

## Train/Test Split

In [None]:
# Features are every column except the last; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Hold out 15% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=100,
)

## Data Distribution

In [None]:
# Class balance of the training target.
# Plot y_train directly: concatenating X_train and y_train with pd.concat's
# default axis stacks them row-wise into a NaN-padded frame, which only gave
# the right counts because NaN targets are ignored.
sns.countplot(x="Potability", data=y_train.to_frame(name="Potability"))

## Standardize Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same transformation to both the training and the test features.
sc = StandardScaler()
sc.fit(X_train)
x_train = sc.transform(X_train)
x_test = sc.transform(X_test)

## Machine Learning Model

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid searched for the random forest.
# 'auto' is intentionally not listed under max_features: for classifiers it
# was only an alias of 'sqrt' (so it doubled the search for no gain) and it
# is rejected by newer scikit-learn releases.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 1000],
    'max_features': ['sqrt'],
    'bootstrap': [True, False],
    'criterion': ['entropy', 'gini'],
}

# Display name -> estimator. The random forest entry is wrapped in a 10-fold
# grid search, so fitting it also selects its hyper-parameters.
models = {
    'LogisticRegression': LogisticRegression(),
    'KNeighborsClassifier': KNeighborsClassifier(algorithm='kd_tree', n_jobs=1, n_neighbors=1, weights='uniform'),
    'SVC': SVC(C=.5, gamma=0.1, kernel='rbf'),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    # verbose=1 keeps a progress summary without printing a line per fit.
    'RandomForestClassifier': GridSearchCV(RandomForestClassifier(), param_grid, verbose=1, cv=10, n_jobs=-2),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'XGBClassifier': xgb.XGBClassifier(),
}

# Parallel lists kept for the cells that consume them.
key = list(models)
value = list(models.values())
print(models)

In [None]:
# Fit every registered model on the standardized training data and record its
# accuracy on the held-out test data, in registry order.
predicted = []
for name, algo in models.items():
    model = algo.fit(x_train, y_train)
    acc = accuracy_score(y_test, model.predict(x_test))
    predicted.append(acc)
    print(name, acc)

## Deep Neural Network

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from datetime import datetime

In [None]:
 #creating model
model = Sequential()

# Layer widths and activations, listed from the first hidden layer to the
# single-unit sigmoid output.
neuron_hidden = [100,75,30,1]
act_func = ['relu','relu','relu','sigmoid']

# The first layer additionally declares the number of input features.
model.add(Dense(neuron_hidden[0], input_dim=x_train.shape[1], activation=act_func[0]))
for units, activation in zip(neuron_hidden[1:], act_func[1:]):
    model.add(Dense(units, activation=activation))

# Compile for binary classification.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train without shuffling; the test split is passed as validation data, so
# `history` contains per-epoch metrics for it.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=100,
    batch_size=64,
    verbose=0,
    shuffle=False,
)

# evaluate() returns (loss, accuracy); only the accuracy is kept.
_, acc_train = model.evaluate(x_train, y_train, verbose=0)
_, acc_test = model.evaluate(x_test, y_test, verbose=0)
print('acc train: ',acc_train,'acc test: ',acc_test)

In [None]:
# Build the plotting inputs as new lists instead of appending to `key` and
# `predicted`: appending in place made this cell non-idempotent (running it
# twice added 'ANN' twice and desynchronised names and scores).
model_names = list(models) + ['ANN']
accuracies = list(predicted) + [acc_test]

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=accuracies, y=model_names, ax=ax)
ax.set(xlabel='Test accuracy', ylabel='Model', title='Test accuracy by model');