In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Library Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter("ignore")

## Load the dataset and validate the data load

In [None]:
# Location of the raw CSV inside the Kaggle input directory
data_path = "../input/water-potability/water_potability.csv"
waterpotability = pd.read_csv(data_path)

# Preview the first rows to confirm the file was parsed as expected
waterpotability.head()

In [None]:
# Check the data structure: column names, dtypes and non-null counts per column
waterpotability.info()

In [None]:
# Count the missing values in each column
waterpotability.isnull().sum()

There are missing values in the `ph`, `Sulfate` and `Trihalomethanes` columns.

## Exploratory Data Analysis

In [None]:
# Explore the target variable: bar chart of the class counts plus the raw numbers.
# The column is passed by keyword because seaborn's first positional parameter
# is `data`; a Series passed positionally is no longer interpreted as `x`.
sns.countplot(x='Potability', data=waterpotability)
waterpotability['Potability'].value_counts()

In [None]:
# Revisit the pairwise correlations between all columns as an annotated heatmap
corr_matrix = waterpotability.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, ax=ax)

The correlations between the attributes are fairly low, which suggests there is little linear relationship between them (low correlation alone does not prove that the attributes are independent).

In [None]:
# Let us check the distribution of the attributes: one histogram + KDE per
# feature, laid out on a 3x3 grid.
feature_columns = [
    'ph', 'Hardness', 'Solids',
    'Chloramines', 'Sulfate', 'Conductivity',
    'Organic_carbon', 'Trihalomethanes', 'Turbidity',
]

fig = plt.figure(figsize=(22, 11))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for position, column in enumerate(feature_columns, start=1):
    ax = fig.add_subplot(3, 3, position)
    # sns.distplot is deprecated; histplot with kde=True and stat='density'
    # draws the same density-scaled histogram with a KDE curve on top.
    sns.histplot(
        data=waterpotability,
        x=column,
        bins=10,
        kde=True,
        stat='density',
        edgecolor='k',
        linewidth=1,
        ax=ax,
    )
plt.show()

All the distributions are approximately normal and centred around their respective means.

In [None]:
# Mean of every attribute within each class of the response variable
waterpotability.groupby('Potability').agg('mean')

In [None]:
# Standard deviation of every attribute within each class of the response variable
waterpotability.groupby('Potability').agg('std')

The means and standard deviations of each attribute are also fairly similar across the two classes of the response variable.

In [None]:
# Pairwise scatter plots of all attributes, coloured by the target class
sns.pairplot(waterpotability, hue='Potability')

## Impute Missing Values
The approach is to substitute each missing value with the mean of that column computed within the row's target class (Potability).

In [None]:
# Fill the gaps in each incomplete column with the mean of that column
# computed within the row's Potability class.
# NOTE(review): the fill values depend on the target label and are computed on
# the full dataset before the train/test split - confirm this is acceptable for
# the evaluation, since the label is not available for unseen samples.
for column in ('ph', 'Sulfate', 'Trihalomethanes'):
    class_mean = waterpotability.groupby('Potability')[column].transform('mean')
    waterpotability[column] = waterpotability[column].fillna(class_mean)

# Confirm that no missing values remain
waterpotability.isna().sum()

## Train Test Split and Scaling of Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the response column from the predictor columns
target = waterpotability['Potability']
features = waterpotability.drop(columns=['Potability'])

# Hold out 30% of the rows for testing, keeping the class ratio of the target
# identical in both partitions (stratified split with a fixed seed).
split_parts = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=101,
    stratify=target,
)
features_train, features_test, target_train, target_test = split_parts

In [None]:
# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions.
sc = StandardScaler().fit(features_train)
features_train, features_test = sc.transform(features_train), sc.transform(features_test)

## Model Building

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score

### Random Forest Classifier

In [None]:
# Hyper-parameter grid for the random forest.
# 'auto' is no longer listed under max_features: for RandomForestClassifier it
# was only an alias of 'sqrt', it was deprecated in scikit-learn 1.1 and removed
# in 1.3 (where it fails parameter validation), and keeping both values made the
# search fit every combination twice.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt'],
    'bootstrap': [True, False],
    'criterion': ['entropy', 'gini'],
}

# 10-fold cross-validated grid search; n_jobs=-2 uses all CPU cores but one.
# verbose=1 reports overall progress without printing a line per fit.
rfcgrid = GridSearchCV(
    RandomForestClassifier(random_state=101),
    param_grid,
    verbose=1,
    cv=10,
    n_jobs=-2,
)
rfcgrid.fit(features_train, target_train)

In [None]:
# Best params of Random Forest: the grid combination with the best
# cross-validated score found by the search above
rfcgrid.best_params_

In [None]:
# Predict on the held-out test partition with the best estimator found by the
# grid search, then print the evaluation metrics.
rfcpredictions = rfcgrid.predict(features_test)

# The confusion-matrix heading no longer claims "Using Entropy Index": the split
# criterion is selected by the grid search and may be either entropy or gini.
evaluation_sections = [
    ("Confusion Matrix - Random Forest", confusion_matrix(target_test, rfcpredictions)),
    ("Accuracy Score - Random Forest", accuracy_score(target_test, rfcpredictions)),
    ("Classification Report - Random Forest", classification_report(target_test, rfcpredictions)),
    ("F1 Score - Random Forest", f1_score(target_test, rfcpredictions)),
]

for section_index, (heading, result) in enumerate(evaluation_sections):
    if section_index > 0:
        # Blank separator between sections (same spacing as print("\n"))
        print("\n")
    print(heading)
    print(result)