In [None]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# EDA

In [None]:
# Load the water-potability CSV into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_csv("../input/water-potability/water_potability.csv") # reading dataset

In [None]:
df.head() # preview the first five rows to sanity-check the load

In [None]:
df.shape # (number of rows, number of columns) of the dataset

In [None]:
# Inspect the dtype of each column
df.dtypes

In [None]:
# One histogram per column, arranged on a 3x4 grid of subplots
df.hist(layout = (3, 4), figsize = (20, 10))
plt.show()

# Handling missing values

In [None]:
df.isnull().sum() # number of NaN values in each column

In [None]:
# Impute missing values in the affected columns with each column's own median.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split further down, so test rows influence the imputed values.
for col in ('ph', 'Sulfate', 'Trihalomethanes'):
    col_median = df[col].median()
    df[col] = df[col].fillna(col_median)

In [None]:
df.isnull().sum() # re-check the per-column NaN counts after imputation

# Data visualization

In [None]:
# Class balance of the target: count each class once and reuse the result
class_counts = df['Potability'].value_counts()
values = class_counts.to_list()
labels = class_counts.index.to_list()
plt.pie(values, labels = labels, explode = [0.05, 0.05], autopct = "%1.1f%%",
        startangle = 120, shadow = True)
plt.show()

Here 1 means the water is potable (safe for human consumption) and 0 means the water is not potable.

In [None]:
# Shaded kernel-density plot of every column, laid out on a 4x3 grid.
# The figure is sized once up front rather than on every loop iteration.
fig = plt.figure(figsize = (20, 20))
fig.suptitle("Distribution Plots", fontsize = 25)
fig.subplots_adjust(wspace = 0.2, hspace = 0.3)
for i, x in enumerate(df.columns):
    ax = fig.add_subplot(4, 3, i + 1)
    # sns.distplot and the `shade` keyword are deprecated; kdeplot(fill=True)
    # draws the same filled density curve.
    sns.kdeplot(x = df[x], color = 'violet', fill = True, ax = ax)
plt.show()

Since the Solids distribution is slightly skewed, we will apply a square-root transformation to reduce the skew.


In [None]:
# Square-root transform of Solids to reduce its skew.
# NOTE: this overwrites the column, so re-running the cell applies the
# transform again; restart and run all cells to get a clean state.
df = df.assign(Solids = np.power(df['Solids'], 0.5))

In [None]:
# Re-plot the Solids density after the transform.
# sns.distplot and the `shade` keyword are deprecated; kdeplot(fill=True)
# draws the same filled density curve.
sns.kdeplot(x = df['Solids'], color = 'violet', fill = True)
plt.show()

In [None]:
# Violin plot of every feature, split by the Potability class, on a 4x3 grid.
# The figure is sized once up front rather than on every loop iteration.
fig = plt.figure(figsize = (20, 20))
fig.suptitle("Violin Plots", fontsize = 25)
fig.subplots_adjust(wspace = 0.2, hspace = 0.3)
# Skip the target itself: plotting Potability against Potability carries no information.
feature_cols = df.columns.drop('Potability')
for i, x in enumerate(feature_cols):
    ax = fig.add_subplot(4, 3, i + 1)
    sns.violinplot(x = df['Potability'], y = df[x], ax = ax)
plt.show()

In some of the plots we can see that the distribution for potable water is more spread out (higher standard deviation). From these plots we can also observe that lower values of sulfate, hardness and solids mean a higher chance of the water being potable.

Also in the pH plots we see that potable water has a higher peak, which means that most values of pH for drinkable water lie between 6 and 8.

In [None]:
# Pairwise scatter plots of all columns, coloured by the Potability class
sns.pairplot(df, hue = "Potability")

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
corr_matrix = df.corr()
plt.figure(figsize = (10, 10))
sns.heatmap(corr_matrix, cmap = "RdYlGn", annot = True)

From the above **pairplot** and **correlation heatmap** we see that almost all pairs of columns have Pearson correlation values between **-0.15** and **0.1**, which is good for us as we do not have to deal with **multicollinearity**.

# Data Preprocessing

### Splitting data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split
# Features are every column except the target; the target is Potability.
X = df.drop('Potability', axis = 1)
y = df['Potability']
# 70% of the rows go to the training set and 30% to the test set.
# random_state makes the split (and every score computed from it) reproducible
# across runs; stratify keeps the class proportions the same in both splits.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.3,
                                                random_state = 42, stratify = y)

In [None]:
# Shapes of the train/test features and targets, to confirm the split sizes
xtrain.shape, ytrain.shape, xtest.shape, ytest.shape

### Standardizing the data

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only (StandardScaler.fit ignores y),
# then apply the same transformation to both splits.
scale = StandardScaler().fit(xtrain)
xtrain_scaled, xtest_scaled = scale.transform(xtrain), scale.transform(xtest)

# Model Selection

**Looking only at the accuracy of our model is not sufficient in this case. It is more important that we correctly classify water that is not drinkable, because a model that labels non-drinkable water as drinkable would be very problematic.**

### Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline linear classifier with scikit-learn's default settings,
# trained on the standardized training features.
logreg = LogisticRegression()
logreg.fit(X = xtrain_scaled, y = ytrain)

In [None]:
# Predicted classes for the standardized test features
yhat_logreg = logreg.predict(xtest_scaled)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# transposes the confusion matrix and swaps precision with recall in the report.
print(accuracy_score(ytest, yhat_logreg), confusion_matrix(ytest, yhat_logreg),
      classification_report(ytest, yhat_logreg), sep = '\n\n')

In [None]:
# Test accuracy of the logistic regression, kept for the final model comparison
logreg_score = accuracy_score(y_true = ytest, y_pred = yhat_logreg)

The logistic regression is not classifying properly, so we will try another model.

### K-nearest neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours with default settings: fit on the standardized
# training data, then predict the test set.
knn = KNeighborsClassifier().fit(xtrain_scaled, ytrain)
yhat_knn = knn.predict(xtest_scaled)

In [None]:
# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# transposes the confusion matrix and swaps precision with recall in the report.
print(accuracy_score(ytest, yhat_knn), confusion_matrix(ytest, yhat_knn),
      classification_report(ytest, yhat_knn), sep = '\n\n')

Let's see if changing some hyperparameters makes a positive change to our model.

In [None]:
# Sweep n_neighbors and record train/test accuracy for each value.
neighbor_counts = list(range(2, 20, 2))
train_score = []
test_score = []
for n in neighbor_counts:
    knn = KNeighborsClassifier(n_neighbors = n)
    knn.fit(xtrain_scaled, ytrain)
    train_score.append(knn.score(xtrain_scaled, ytrain))
    test_score.append(knn.score(xtest_scaled, ytest))
# Plot against the actual n_neighbors values. Without an explicit x argument
# matplotlib uses the list index (0, 1, 2, ...), which is easy to misread as
# the number of neighbours.
plt.plot(neighbor_counts, train_score, color = 'r', label = 'train score')
plt.plot(neighbor_counts, test_score, color = 'g', label = 'test_score')
plt.xlabel('n_neighbors')
plt.ylabel('accuracy')
plt.legend()

In [None]:
# Refit KNN with the chosen number of neighbours and predict the test set.
# NOTE(review): 7 is not one of the (even) values tried in the sweep above - confirm the choice.
knn = KNeighborsClassifier(n_neighbors = 7).fit(xtrain_scaled, ytrain)
yhat_knn = knn.predict(xtest_scaled)

In [None]:
# Report the tuned KNN model. The original cell printed the logistic-regression
# predictions (yhat_logreg) here by mistake.
# Metrics take (y_true, y_pred); the reverse order transposes the confusion
# matrix and swaps precision with recall.
print(accuracy_score(ytest, yhat_knn), confusion_matrix(ytest, yhat_knn),
      classification_report(ytest, yhat_knn), sep = '\n\n')

In [None]:
# Test accuracy of the tuned KNN model, kept for the final model comparison
knn_score = accuracy_score(y_true = ytest, y_pred = yhat_knn)

The KNN model is better than the logistic regression model, but it still does not predict class 1 correctly.

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default hyperparameters. The tree-building procedure is
# randomized, so random_state is fixed to make the fitted tree (and its scores)
# reproducible across runs.
dt = DecisionTreeClassifier(random_state = 42)
dt.fit(xtrain_scaled, ytrain)

In [None]:
# Predicted classes for the standardized test features
yhat_dt = dt.predict(xtest_scaled)

In [None]:
# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# transposes the confusion matrix and swaps precision with recall in the report.
print(accuracy_score(ytest, yhat_dt), confusion_matrix(ytest, yhat_dt),
      classification_report(ytest, yhat_dt), sep = '\n\n')

The decision tree model is a bit better: even though its accuracy score is lower than that of the logreg model, it is at least acknowledging both classes.
To improve this model we will try tuning some hyperparameters, such as max_depth.

In [None]:
# Sweep max_depth and record train/test accuracy for each depth.
depths = range(2,25)
train_score = []
test_score = []
for max_depth in depths:
    # random_state is fixed so that differences between depths are not
    # confounded by the tree's own randomness.
    dtc = DecisionTreeClassifier(max_depth = max_depth, random_state = 42)
    dtc.fit(xtrain_scaled, ytrain)
    train_score.append(dtc.score(xtrain_scaled, ytrain))
    test_score.append(dtc.score(xtest_scaled, ytest))
# Plot against the actual depths (not the list index) and label both curves.
plt.plot(list(depths), train_score, label = 'train score')
plt.plot(list(depths), test_score, label = 'test score')
plt.xlabel('max_depth')
plt.ylabel('accuracy')
plt.legend()

There are more hyperparameters we can tune to improve the model, so instead of tuning them one by one we will search over them with RandomizedSearchCV.

In [None]:
from sklearn.model_selection import RandomizedSearchCV, KFold # KFold for creating cross validation sets
# Candidate values sampled by the randomized search.
dtc_grid = {'max_depth' : range(2,30),
           'min_samples_split' : range(2, 100, 7),
           'min_samples_leaf' : range(2, 100, 7)}
# Both the tree and the parameter sampling are random; fixing random_state on
# each makes the selected hyperparameters reproducible across runs.
dtc = DecisionTreeClassifier(random_state = 42)
dtc_rcv = RandomizedSearchCV(dtc, param_distributions = dtc_grid, cv = KFold(n_splits = 10),
                             scoring = 'accuracy', random_state = 42)

In [None]:
# Run the randomized hyperparameter search on the standardized training data
dtc_rcv.fit(xtrain_scaled, ytrain)

In [None]:
# Best cross-validated score found by the search (scoring was set to accuracy)
dtc_rcv.best_score_

In [None]:
# Hyperparameter combination that achieved the best cross-validated score
dtc_rcv.best_params_

In [None]:
# Predict the test set with the tuned search object
yhat_dtc = dtc_rcv.predict(xtest_scaled)

In [None]:
# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# transposes the confusion matrix and swaps precision with recall in the report.
print(accuracy_score(ytest, yhat_dtc), confusion_matrix(ytest, yhat_dtc),
      classification_report(ytest, yhat_dtc), sep = '\n\n\n')

In [None]:
# Test accuracy of the tuned decision tree, kept for the final model comparison
dtc_score = accuracy_score(y_true = ytest, y_pred = yhat_dtc)

These are the best hyperparameters found for our dataset. Classification is a bit better with this model, but it is still not ideal.


### Support Vector Classifier

In [None]:
from sklearn.svm import SVC
# Support vector classifier with default settings: fit on the standardized
# training data, then predict the test set.
svc = SVC().fit(xtrain_scaled, ytrain)
yhat_svc = svc.predict(xtest_scaled)

In [None]:
# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# transposes the confusion matrix and swaps precision with recall in the report.
print(accuracy_score(ytest, yhat_svc), confusion_matrix(ytest, yhat_svc),
      classification_report(ytest, yhat_svc), sep = '\n\n')

In [None]:
# Test accuracy of the support vector classifier, kept for the final model comparison
svc_score = accuracy_score(y_true = ytest, y_pred = yhat_svc)

This has been the best predictor so far as it is classifying the non-drinkable water well and has a higher overall accuracy.

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyperparameters. Forest construction is randomized,
# so random_state is fixed to make the reported test accuracy reproducible.
rfc = RandomForestClassifier(random_state = 42)
rfc.fit(xtrain_scaled, ytrain)
rfc.score(xtest_scaled, ytest)

In [None]:
# Candidate values sampled by the randomized search over the random forest.
random_grid = {'n_estimators' : range(100,1000,100),
              'max_depth' : range(2,50,2),
              'min_samples_split' : range(2,100,7),
              'min_samples_leaf' : range(2,100,7)}
# Both the forest and the parameter sampling are random; fixing random_state on
# each makes the selected hyperparameters reproducible across runs.
rfc_rcv = RandomizedSearchCV(estimator = RandomForestClassifier(random_state = 42),
                             param_distributions = random_grid,
                             cv = KFold(n_splits = 5), scoring = 'accuracy', verbose = 2,
                             random_state = 42)

In [None]:
# Run the randomized hyperparameter search on the standardized training data
rfc_rcv.fit(xtrain_scaled, ytrain)

In [None]:
# Hyperparameter combination that achieved the best cross-validated score
rfc_rcv.best_params_

In [None]:
# Predict the test set with the tuned random-forest search object.
yhat_rfc = rfc_rcv.predict(xtest_scaled)
# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# transposes the confusion matrix and swaps precision with recall in the report.
print(accuracy_score(ytest, yhat_rfc), confusion_matrix(ytest, yhat_rfc),
      classification_report(ytest, yhat_rfc), sep = '\n\n')

In [None]:
# Test accuracy of the tuned random forest, kept for the final model comparison
rfc_score = accuracy_score(y_true = ytest, y_pred = yhat_rfc)

In [None]:
# Collect every model's test accuracy into a single-column frame indexed by model name.
model_names = ['Logistic Regression', 'Decision Tree', 'Support Vector Classifier', 'Random Forest', 'K-nearest neighbors']
scores = [logreg_score, dtc_score, svc_score, rfc_score, knn_score]
scores_df = pd.DataFrame({'Scores' : scores}, index = model_names)

In [None]:
# Horizontal bar chart comparing the test accuracy of every model
scores_df.plot.barh()

So choosing the Support Vector Classifier or the Random Forest Classifier would be the best choice for our case.