# Water Quality

## Drinking water potability

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib import style
import missingno as msno
import seaborn as sns
from collections import Counter

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from xgboost.sklearn import XGBClassifier

import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import os
# List every file available under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

## Importing the dataset

In [None]:
# Load the water-potability CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/water-potability/water_potability.csv')

In [None]:
# Show the first 10 rows; a bare expression uses the notebook's rich table
# display instead of the plain-text dump produced by print().
df.head(10)

In [None]:
# (number of rows, number of columns) of the loaded dataset.
print(df.shape)

### Information about the data

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()

In [None]:
# Number of distinct values per column; dropna=False counts NaN as its own
# value, exactly like len(Series.unique()).
unique_count = [df[column].nunique(dropna=False) for column in df.columns]

print(pd.Series(unique_count, index=df.columns))

- All 10 variables of the data are **numerical**. 
- The **target variable takes binary values** 0 and 1. 
- The **feature variables are real numbers**.

## Exploratory Data Analysis

In [None]:
# Summary statistics for every column, rendered with the notebook's rich
# table display rather than as plain text.
df.describe(include='all')

### Looking at the target distribution

In [None]:
# Class counts of the target. value_counts() orders by frequency, so sort by
# class label and take the labels from the index; this guarantees that every
# count is paired with its own label in the plots that use `x` and `labels`.
x = df.Potability.value_counts().sort_index()
labels = x.index.tolist()
print(x)

In [None]:
# Target distribution shown two ways: class share (pie) and raw counts (bar).
class_colors = ['orange', 'steelblue']
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
pie_ax, bar_ax = ax

pie_ax.pie(
    x,
    labels=labels,
    autopct='%1.1f%%',
    colors=class_colors,
    explode=[0.005 for _ in labels],  # small uniform gap between the wedges
    textprops={'size': 'x-large'},
    wedgeprops={'linewidth': 3.0, 'edgecolor': 'white'},
)

bar_ax.bar(labels, height=x, color=class_colors)
bar_ax.set_xlabel('Potability')
bar_ax.set_ylabel('Count')
bar_ax.set_xticks([0, 1])

plt.show()

- 1998 samples with Potability=0
- 1278 samples with Potability=1

Hence we conclude that the **data is imbalanced**.

### Correlation between different features

In [None]:
# Pairwise correlation of all columns, drawn as a lower-triangular heat map.
cor_mat = df.corr()

# Boolean mask: True hides a cell. Only the strict upper triangle (k=1) is
# hidden, so the diagonal and the lower triangle stay visible. A real boolean
# mask does not depend on the correlation values themselves being non-zero.
mask = np.triu(np.ones_like(cor_mat, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(30, 12))
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True, ax=ax)

Inferences from the heat map:
- No pair of features is strongly correlated, i.e. there is no sign of multicollinearity.
- Most of the features are negatively correlated with Potability.

### Violin Plot & Box Plot

In [None]:
features = list(df.columns.drop('Potability'))
target = 'Potability'

# One row of plots per feature (violin on the left, box plot on the right).
# The row count follows the number of features instead of being hard-coded,
# and squeeze=False keeps `ax` two-dimensional even for a single feature.
fig, ax = plt.subplots(ncols=2, nrows=len(features), squeeze=False,
                       figsize=(14, 42))

for idx, col in enumerate(features):
    sns.violinplot(data=df, y=col, x=target, ax=ax[idx, 0],
                   inner='quartile', color='pink')

    sns.boxplot(data=df, y=col, x=target, ax=ax[idx, 1],
                palette=('orange', 'steelblue'))

plt.show()

In [None]:
# Histogram of every feature (target column excluded), 10 bins each.
feature_frame = df.drop(columns='Potability')
feature_frame.hist(bins=10, figsize=(20, 20))
plt.show()

## Handling missing data

In [None]:
# Visualise where values are missing in each column (missingno matrix, drawn in black).
msno.matrix(df, color=(0, 0, 0))

Let's look at the percentage of missing values in each column.

In [None]:
# Percentage of missing values per column. The mean of the boolean null mask
# is the missing fraction relative to the actual number of rows in `df`, so
# no hard-coded row count is needed.
df.isnull().mean() * 100

- The ph feature has almost 15% of its data missing.
- The Sulfate feature has almost 24% of its data missing.
- The Trihalomethanes feature has almost 5% of its data missing.

## Preparing data for model 

In [None]:
# Fill the missing values of each incomplete feature with the mean of that
# feature within the row's own Potability class.
# NOTE(review): the fill value depends on the target label and is computed on
# the whole dataset before the train/test split, so label information leaks
# into the test features. Consider imputing after the split, using statistics
# from the training rows only.
for col in ['ph', 'Sulfate', 'Trihalomethanes']:
    # Per-row class mean; NaNs are ignored when the group means are computed.
    class_mean = df.groupby('Potability')[col].transform('mean')
    df[col] = df[col].fillna(class_mean)

## Splitting the dataset into Training set and Testing set

In [None]:
# Separate the features from the target, then hold out 20% of the rows for testing.
y = df['Potability']
X = df.drop(columns='Potability')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(X_train.shape, X_test.shape)

In [None]:
print('Balancing the data by SMOTE - Oversampling of Minority level\n')
# Seeded so the synthetic samples, and every score computed from them, are
# reproducible across runs. Only the training split is resampled.
smt = SMOTE(random_state=42)

counter = Counter(y_train)
print('Before SMOTE', counter)

X_train, y_train = smt.fit_resample(X_train, y_train)

counter = Counter(y_train)
print('\nAfter SMOTE', counter)

In [None]:
# Baseline: default-parameter classifiers trained on the unscaled data.
# Keeping each name next to its estimator prevents the two from drifting apart.
baseline_models = {
    'LogisticRegression': LogisticRegression(),
    'LinearSVM': LinearSVC(),
    'rbfSVM': SVC(kernel='rbf'),
    'KNearestNeighbors': KNeighborsClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'GaussianNB': GaussianNB(),
}
model_names = list(baseline_models)

acc = []
for clf in baseline_models.values():
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    # accuracy_score expects (y_true, y_pred).
    acc.append(accuracy_score(y_test, pred))

# Column-oriented record of the results, turned into a DataFrame afterwards.
models = {'Modelling Algo': model_names, 'Accuracy': acc}

In [None]:
# Tabulate the baseline results: one row per algorithm with its test accuracy.
models_df = pd.DataFrame(models)

In [None]:
# Display the baseline accuracy table.
models_df

In [None]:
# Horizontal bar chart of the baseline (unscaled) test accuracy per algorithm.
sns.barplot(y='Modelling Algo', x='Accuracy', data=models_df)

## Feature Scaling

In [None]:
def feature_scaling(X_train, X_test, y_train, y_test, name_scaler):
    """Evaluate the baseline classifiers on already-scaled data.

    Despite its name, this function performs no scaling itself: the caller
    passes in scaled train/test features. Every baseline classifier is fitted
    on the training data and scored on the test data.

    Parameters
    ----------
    X_train, X_test
        Scaled feature matrices of the training and test splits.
    y_train, y_test
        Target labels matching ``X_train`` and ``X_test``.
    name_scaler : str
        Name of the ``models_df`` column that receives the accuracies.

    Returns
    -------
    list
        Test accuracy of each classifier, in the order of the list below.

    Side effects
    ------------
    Stores the accuracies in the global ``models_df`` under ``name_scaler``.
    The values are assigned positionally, so the classifier order here has to
    match the row order of ``models_df``.
    """
    classifiers = [LogisticRegression(), LinearSVC(), SVC(kernel='rbf'), KNeighborsClassifier(),
                   RandomForestClassifier(), DecisionTreeClassifier(), GradientBoostingClassifier(),
                   GaussianNB()]

    acc_sc = []
    for clf in classifiers:
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        # accuracy_score expects (y_true, y_pred).
        acc_sc.append(accuracy_score(y_test, pred))

    models_df[name_scaler] = np.array(acc_sc)
    return acc_sc

In [None]:
# Re-run the baseline models once per scaler; feature_scaling stores each
# run's accuracies in models_df under the given column name.
scalers = [MinMaxScaler(), StandardScaler()]
names = ['Acc_Min_Max_Scaler', 'Acc_Standard_Scaler']

for scaler, column_name in zip(scalers, names):
    # Fit on the training split only, then apply the same transform to the test split.
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)

    feature_scaling(scaled_X_train, scaled_X_test, y_train, y_test, column_name)

In [None]:
# Display the accuracy table, now including one column per scaler.
models_df

In [None]:
# Standardise the features for the tuning section: the scaler learns its
# statistics from the training split and is then applied to both splits.
ssc = StandardScaler().fit(X_train)

X_train, X_test = ssc.transform(X_train), ssc.transform(X_test)

In [None]:
# Test accuracy per algorithm on the unscaled features.
sns.barplot(y='Modelling Algo', x='Accuracy', data=models_df)

In [None]:
# Test accuracy per algorithm on the min-max scaled features.
sns.barplot(y='Modelling Algo', x='Acc_Min_Max_Scaler', data=models_df)

In [None]:
# Test accuracy per algorithm on the standardised features.
sns.barplot(y='Modelling Algo', x='Acc_Standard_Scaler', data=models_df)

## Parameter Tuning and Model Selection

In [None]:
# Accumulators for the tuned models: display names and their test accuracies.
model = []
test_accuracy = []

### LogisticRegression

In [None]:
# Search over the regularisation type and strength with 10-fold cross-validation.
param_grid = {'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# The default 'lbfgs' solver does not support the l1 penalty, so every l1
# candidate would fail to fit; 'liblinear' handles both l1 and l2.
logreg_clf = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid,
                          scoring='accuracy', cv=10)
logreg_clf.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
logreg_clf.best_params_

In [None]:
# Mean cross-validated accuracy of that best combination.
logreg_clf.best_score_

In [None]:
# Score the tuned logistic regression on the held-out test split and record it.
pred = logreg_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)

model.append('LogisticRegression')
test_accuracy.append(accuracy)

print("Logistic Regression Accuracy:", accuracy)

In [None]:
# Per-class precision, recall and F1 for the logistic regression test predictions.
print(classification_report(y_test, pred))

### SVM

In [None]:
# One grid per kernel: gamma is ignored by the linear kernel, so crossing it
# with 'linear' would refit identical models once per gamma value.
c_values = [0.98, 1.0, 1.2, 1.5, 2.0, 5.0]
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values,
     'gamma': [0.50, 0.60, 0.70, 0.80, 0.90, 1.00]},
]

svm_clf = GridSearchCV(SVC(), param_grid, scoring='accuracy', cv=10)
svm_clf.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
svm_clf.best_params_

In [None]:
# Mean cross-validated accuracy of that best combination.
svm_clf.best_score_

In [None]:
# Score the tuned SVM on the held-out test split and record it.
pred = svm_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)

model.append('SVM')
test_accuracy.append(accuracy)

print("SVM Accuracy:", accuracy)

In [None]:
# Per-class precision, recall and F1 for the SVM test predictions.
print(classification_report(y_test, pred))

### KNN

In [None]:
# Try k = 1..50 neighbours; n_jobs is a single-value entry that only enables
# parallel neighbour queries.
param_grid = {'n_neighbors': list(range(1, 51)), 'n_jobs': [-1]}

knn_clf = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)
knn_clf.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
knn_clf.best_params_

In [None]:
# Mean cross-validated accuracy of that best combination.
knn_clf.best_score_

In [None]:
# Score the tuned KNN model on the held-out test split and record it.
pred = knn_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)

model.append('KNN')
test_accuracy.append(accuracy)

print("KNN Accuracy:", accuracy)

In [None]:
# Per-class precision, recall and F1 for the KNN test predictions.
print(classification_report(y_test, pred))

### Decision Tree

In [None]:
# Search over the split criterion and tree depth: every depth from 4 to 12,
# then a coarser set of larger depths.
depth_candidates = list(range(4, 13)) + [15, 20, 30, 40, 50, 70, 90, 120, 150]
param_grid = {'criterion': ['gini', 'entropy'], 'max_depth': depth_candidates}

dt_clf = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)
dt_clf.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
dt_clf.best_params_

In [None]:
# Mean cross-validated accuracy of that best combination.
dt_clf.best_score_

In [None]:
# Score the tuned decision tree on the held-out test split and record it.
pred = dt_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)

model.append('Decision Tree')
test_accuracy.append(accuracy)

print("Decision Tree Accuracy:", accuracy)

In [None]:
# Per-class precision, recall and F1 for the decision tree test predictions.
print(classification_report(y_test, pred))

### Random Forest Classifier

In [None]:
# Search over forest size and the number of features considered per split.
# 'auto' is not listed: for classifiers it was only an alias of 'sqrt', and
# recent scikit-learn releases reject it, which would make those fits fail.
param_grid = {'n_estimators': [100, 200, 300, 400, 500], 'max_features': ['sqrt', 'log2']}

rf_clf = GridSearchCV(RandomForestClassifier(n_jobs=-1), param_grid, scoring='accuracy', cv=10)
rf_clf.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
rf_clf.best_params_

In [None]:
# Mean cross-validated accuracy of that best combination.
rf_clf.best_score_

In [None]:
# Score the tuned random forest on the held-out test split and record it.
pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)

model.append('Random Forest')
test_accuracy.append(accuracy)

print("Random Forest Accuracy:", accuracy)

In [None]:
# Per-class precision, recall and F1 for the random forest test predictions.
print(classification_report(y_test, pred))

### Gradient Boosting

In [None]:
# Search over the number of boosting stages (100 to 700 in steps of 100).
# No `scoring` is passed, so the estimator's own score method is used.
param_grid = {'n_estimators': list(range(100, 701, 100))}

gb_clf = GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=param_grid, cv=10)
gb_clf.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
gb_clf.best_params_

In [None]:
# Mean cross-validated score of that best combination.
gb_clf.best_score_

In [None]:
# Score the tuned gradient boosting model on the held-out test split and record it.
pred = gb_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)

model.append('Gradient Boosting')
test_accuracy.append(accuracy)

print("Gradient Boosting Accuracy:", accuracy)

In [None]:
# Per-class precision, recall and F1 for the gradient boosting test predictions.
print(classification_report(y_test, pred))

### Adaboost Classifier

In [None]:
# Number of boosting rounds to try: 1, 21, 41, ..., 181.
param_dict = {'n_estimators': list(range(1, 201, 20))}

# Pass the grid defined in this cell. Referring to `param_grid` here would
# silently reuse whatever grid an earlier cell left behind.
# `algorithm` is left at the library default: 'SAMME.R' was the default in
# older scikit-learn releases and is no longer accepted by recent ones.
adaboost_clf = GridSearchCV(
    AdaBoostClassifier(DecisionTreeClassifier(criterion='gini', max_depth=1000)),
    param_dict)
adaboost_clf.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
adaboost_clf.best_params_

In [None]:
# Mean cross-validated score of that best combination.
adaboost_clf.best_score_

In [None]:
# Score the tuned AdaBoost model on the held-out test split and record it.
pred = adaboost_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)

model.append('Adaboost')
test_accuracy.append(accuracy)

print("Adaboost Accuracy:", accuracy)

In [None]:
# Per-class precision, recall and F1 for the AdaBoost test predictions.
print(classification_report(y_test, pred))

### XGBoost

In [None]:
# Search over the number of boosting rounds only.
param_grid = {'n_estimators': [100, 200, 300, 400, 500, 600, 700]}

# The evaluation metric is a fixed setting, not something to tune, so it is
# passed to the estimator rather than listed in the grid. 'logloss' is the
# binary log-loss; 'mlogloss' is its multi-class counterpart and does not
# match this two-class target.
xg_boost = GridSearchCV(XGBClassifier(eval_metric='logloss'), param_grid)
xg_boost.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
xg_boost.best_params_

In [None]:
# Mean cross-validated score of that best combination.
xg_boost.best_score_

In [None]:
# Score the tuned XGBoost model on the held-out test split and record it.
pred = xg_boost.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)

model.append('XGBoost')
test_accuracy.append(accuracy)

print("XGBoost Accuracy:", accuracy)

## Final Evaluation

In [None]:
# Final comparison table: one row per tuned model with its test accuracy.
evalutation = pd.DataFrame(dict(Model=model, Accuracy=test_accuracy))

In [None]:
# Display the final comparison table.
evalutation

Random Forest, XGBoost and Gradient Boosting each reach a test accuracy of more than 75%.

## Upvote if you learned from it!