<br>
<h1 style = "font-size:40px; font-family:Garamond ; font-weight : normal; background-color: #C66363 ; color : #E8D6D8; text-align: center; border-radius: 100px 100px;">CONTENT </h1>
<br>

* [Add Libraries](#1)
* [Load and Examine Data](#2)
* [Quick Look With Visualization](#3)
* [Processing Data](#4)
* [Model Training](#5)

<a id="1"> </a>
# Add Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library warnings raised by later cells
# (e.g. failed grid-search fits) — consider narrowing the filter.
warnings.filterwarnings("ignore")

<a id="2"> </a>
# Load and Examine Data

In [None]:
# Read the water-potability CSV from the Kaggle input folder into a DataFrame.
train_df = pd.read_csv("../input/water-potability/water_potability.csv")

In [None]:
# Preview the first few rows of the loaded data.
train_df.head()

In [None]:
# Summary statistics for the columns of the data set.
train_df.describe()

In [None]:
# Column dtypes and non-null counts (shows which columns contain missing values).
train_df.info()

<a id="3"> </a>
# Quick Look With Visualization

In [None]:
# 3x3 grid of box plots: one panel per measurement, grouped by Potability.
fig, axes = plt.subplots(3, 3, figsize=(18, 14))

fig.suptitle('Risk Analysis')

# Listed in panel order (left to right, top to bottom).
measurements = [
    'ph', 'Hardness', 'Solids',
    'Chloramines', 'Sulfate', 'Conductivity',
    'Organic_carbon', 'Trihalomethanes', 'Turbidity',
]

# axes.flat walks the grid row by row, so it lines up with the list above.
for panel, measurement in zip(axes.flat, measurements):
    sns.boxplot(ax=panel, data=train_df, x='Potability', y=measurement,
                palette="ch:start=.2,rot=-.3")

In [None]:
# Set the figure size BEFORE drawing: rcParams only apply to figures created
# afterwards, so configuring it after the heatmap left this plot at the old size.
sns.set(rc={'figure.figsize':(10,10)})

# Pairwise correlation between all columns of the data set (target included),
# rendered as an annotated heatmap.
corr_df = train_df
corrMatrix = corr_df.corr()
sns.heatmap(corrMatrix, annot=True, cmap="Blues")

<a id="4"> </a>
# Processing Data

In [None]:
def detect_miss(df,features):
    """Return the index labels of rows that are NaN in more than two of `features`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    features : iterable of str
        Column names to check for missing values.

    Returns
    -------
    list
        Index labels whose NaN count across `features` exceeds two, in the
        order in which each label was first seen while scanning the columns.
    """
    nan_hits = Counter()

    # Tally, per row label, how many of the requested columns are NaN.
    for column in features:
        nan_hits.update(df.loc[df[column].isna()].index)

    return [label for label, hits in nan_hits.items() if hits > 2]

In [None]:
# Display the rows that detect_miss flags for the listed measurement columns.
# NOTE(review): 'Chloramines' is not in this list — confirm that is intentional.
train_df.loc[detect_miss(train_df,['ph','Hardness','Solids','Sulfate','Conductivity','Organic_carbon','Trihalomethanes' ,'Turbidity'])]

In [None]:
# Remove the rows flagged by detect_miss, then renumber the index from zero.
rows_to_drop = detect_miss(
    train_df,
    ['ph','Hardness','Solids','Sulfate','Conductivity','Organic_carbon','Trihalomethanes' ,'Turbidity'],
)
train_df = train_df.drop(index=rows_to_drop).reset_index(drop=True)

In [None]:
def detect_outliers(df,features):
    """Return the index labels of rows that are IQR outliers in more than two columns.

    For every column in `features` a value is an outlier when it lies more than
    1.5 * IQR below the first quartile or above the third quartile.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    features : iterable of str
        Names of the numeric columns to test.

    Returns
    -------
    list
        Index labels flagged as outliers in more than two of `features`, in the
        order in which each label was first seen while scanning the columns.
    """
    outlier_indices = []

    for c in features:
        # nanpercentile skips missing values. np.percentile returns NaN as soon
        # as the column holds a single NaN, which turns both comparisons below
        # into all-False and silently disables detection for that column.
        Q1 = np.nanpercentile(df[c], 25)
        Q3 = np.nanpercentile(df[c], 75)
        IQR = Q3 - Q1
        outlier_step = IQR * 1.5
        # NaN entries compare False on both sides, so they are never flagged.
        outlier_list_col = df[(df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)].index
        outlier_indices.extend(outlier_list_col)

    # Count in how many columns each row was flagged.
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = [i for i, v in outlier_indices.items() if v > 2]

    return multiple_outliers

In [None]:
# Display the rows that detect_outliers flags for the listed measurement columns
# (the rows are only shown here, not removed).
# NOTE(review): 'Chloramines' is not in this list — confirm that is intentional.
train_df.loc[detect_outliers(train_df,['ph','Hardness','Solids','Sulfate','Conductivity','Organic_carbon','Trihalomethanes' ,'Turbidity'])]

In [None]:
# Number of missing values remaining in each column.
train_df.isnull().sum()

In [None]:
# Fill the gaps in each of these columns with that column's own mean.
# NOTE(review): the mean is taken over every row, i.e. before the train/test
# split further down — confirm that this is acceptable for the evaluation.
for column in ("ph", "Sulfate", "Trihalomethanes"):
    train_df[column] = train_df[column].fillna(train_df[column].mean())

<a id="5"> </a>
# Model Training

In [None]:
# Target, kept as a one-column DataFrame because a later cell assigns into
# Y["Potability"]. The explicit .copy() makes Y independent of train_df so that
# assignment does not go through pandas' chained-assignment (SettingWithCopy) path.
Y = train_df[['Potability']].copy()
# Features: every column except the target.
X = train_df.drop(columns=['Potability'])

In [None]:
# Standardize the feature columns with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on all rows, before the train/test split in
# a later cell — test rows therefore influence the scaling statistics.
scaler = preprocessing.StandardScaler().fit(X)

# transform() returns a bare array; rebuild the DataFrame with the original
# column labels and row index instead of falling back to 0..n-1 for both.
X = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

In [None]:
# Store the target column with pandas' categorical dtype.
Y["Potability"] = train_df["Potability"].astype("category")

In [None]:
# Hold out 25% of the rows as a test set; random_state pins the split so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.25, random_state = 42)

In [None]:
# Baseline: logistic regression with default settings, scored on both splits.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Accuracy as a percentage rounded to two decimals, train split first.
acc_log_train, acc_log_test = (
    round(logreg.score(features, labels) * 100, 2)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print("Training Accuracy: % {}".format(acc_log_train))
print("Testing Accuracy: % {}".format(acc_log_test))

In [None]:
# Candidate estimators and their hyper-parameter grids for the grid search.
# `classifier` and `classifier_param` are parallel lists: entry i of one
# belongs to entry i of the other.
random_state = 42
classifier = [GradientBoostingClassifier(random_state = random_state),
             SVC(random_state = random_state),
             RandomForestClassifier(random_state = random_state),
             # liblinear supports both the "l1" and "l2" penalties searched below;
             # the default lbfgs solver rejects "l1", so half of the grid failed to fit.
             LogisticRegression(solver = "liblinear", random_state = random_state),
             KNeighborsClassifier()]

gb_param_grid = {"n_estimators":[5,50,250,500],
                 "max_depth":[1,3,5,7,9],
                 "learning_rate":[0.01,0.1,1,2]}

svc_param_grid = {"kernel" : ["rbf"],
                 "gamma": [0.001, 0.01, 0.1, 1],
                 "C": [1,10,50,100,200,300,1000]}

# max_features must not exceed the number of feature columns. Nine measurement
# columns are plotted earlier in this notebook, so the former value of 10 was
# out of range (scikit-learn releases that enforce the bound fail those fits).
rf_param_grid = {"max_features": [1,3,9],
                "min_samples_split":[2,3,10],
                "min_samples_leaf":[1,3,10],
                "bootstrap":[False],
                "n_estimators":[100,300],
                "criterion":["gini"]}

# C spans 1e-3 .. 1e3 in powers of ten.
logreg_param_grid = {"C":np.logspace(-3,3,7),
                    "penalty": ["l1","l2"]}

# Odd neighbour counts 1, 3, ..., 19.
knn_param_grid = {"n_neighbors": np.linspace(1,19,10, dtype = int).tolist(),
                 "weights": ["uniform","distance"],
                 "metric":["euclidean","manhattan"]}

# Same order as `classifier`.
classifier_param = [gb_param_grid,
                   svc_param_grid,
                   rf_param_grid,
                   logreg_param_grid,
                   knn_param_grid]

In [None]:
# Scratch check: list the hyper-parameter names GradientBoostingClassifier exposes.
GradientBoostingClassifier().get_params().keys()

In [None]:
# Grid-search every estimator against its own parameter grid and record, in the
# same order, the best mean cross-validated accuracy and the refitted best model.
cv_result = []
best_estimators = []

for estimator, grid in zip(classifier, classifier_param):
    clf = GridSearchCV(
        estimator,
        param_grid=grid,
        cv=StratifiedKFold(n_splits=10),
        scoring="accuracy",
        n_jobs=-1,
        verbose=1,
    )
    clf.fit(X_train, y_train)

    cv_result.append(clf.best_score_)
    best_estimators.append(clf.best_estimator_)
    print(clf.best_score_)

In [None]:
# Pair each best cross-validation score with its model name.
# The names are listed in the same order as the `classifier` list.
cv_results = pd.DataFrame({"Cross Validation Means":cv_result, "ML Models":["GradientBoostingClassifier","SVM","RandomForestClassifier",
             "LogisticRegression",
             "KNeighborsClassifier"]})

# x/y are passed by keyword: current seaborn releases accept only `data`
# positionally, so the previous positional call raised a TypeError there.
g = sns.barplot(x="Cross Validation Means", y="ML Models", data=cv_results)
g.set_xlabel("Mean Accuracy")
g.set_title("Cross Validation Scores")

In [None]:
# Hard-voting ensemble of the tuned SVC, random forest and KNN models
# (positions 1, 2 and 4 of best_estimators); the SVC vote counts double.
votingC = VotingClassifier(estimators = [('svc',best_estimators[1]),
                                        ("rfc",best_estimators[2]),
                                        ("knn",best_estimators[4])],
                                        voting = "hard", n_jobs = -1, weights=[2, 1, 1])
votingC = votingC.fit(X_train, y_train)

# accuracy_score is documented as (y_true, y_pred); pass the arguments in that order.
print(accuracy_score(y_test, votingC.predict(X_test)))