Welcome! My name is Samuel Haines, and I will be building a model that uses nine different features to predict whether or not a certain body of water is safe for human consumption. The dataset, along with a further description of the features, is available at https://www.kaggle.com/adityakadiwal/water-potability.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Visualizations Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.offline as pyo
import plotly.express as px
import plotly.graph_objs as go
pyo.init_notebook_mode()
import plotly.figure_factory as ff
import missingno as msno
# Five-step colour ramps, each running from its darkest to its lightest shade.
colors_blue = [
    "#132C33",
    "#264D58",
    '#17869E',
    '#51C4D3',
    '#B4DBE9',
]
colors_dark = [
    "#1F1F1F",
    "#313131",
    '#636363',
    '#AEAEAE',
    '#DADADA',
]
colors_green = [
    '#01411C',
    '#4B6F44',
    '#4F7942',
    '#74C365',
    '#D0F0C0',
]

# Preview the palettes as swatch strips: blue, then green, then dark.
for palette in (colors_blue, colors_green, colors_dark):
    sns.palplot(palette)

I will start by importing and examining the data. I'm going to drop all of the rows with missing values, as I don't want to risk distorting the dataset by trying to fill them in with the mean/median.

In [None]:
# Load the water-potability CSV, tidy the pH column name, and keep only complete samples.
df = (
    pd.read_csv('/kaggle/input/water-potability/water_potability.csv')
    .rename(columns={'ph': 'pH'})  # 'ph' -> 'pH'
    .dropna()  # removes every ROW that has at least one missing value
)
# Summary statistics of the remaining rows.
df.describe()

In [None]:
# Count the samples in each class. The frame is built so that the counts column
# is always named 'Potability' and the index is unnamed: pandas >= 2.0 names the
# value_counts() result 'count', which would break values='Potability' below.
class_counts = df['Potability'].value_counts()
d = class_counts.rename('Potability').rename_axis(None).to_frame()

# Name each slice from its class value rather than assuming the order in which
# value_counts() returns the classes (it sorts by frequency, not by label).
slice_names = ['Potable' if cls == 1 else 'Not Potable' for cls in class_counts.index]

# Donut chart of the class balance, labelled with percentage and class name.
fig = px.pie(d, values='Potability', names=slice_names, hole=0.4, opacity=0.6,
             color_discrete_sequence=[colors_green[3], colors_blue[3]],
             labels={'label': 'Potability', 'Potability': 'No. Of Samples'})
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.show()

In [None]:
# Scatter-plot matrix of the columns of df other than 'Potability'; points are
# coloured and given marker symbols according to the 'Potability' column.
# NOTE(review): the second positional argument is a DataFrame, whereas plotly
# documents `dimensions` as a list of column references — presumably it is
# iterated as column names here; confirm.
# NOTE(review): both a discrete colour sequence and a continuous colour scale
# are supplied; which one takes effect depends on how plotly treats the dtype
# of 'Potability' — confirm.
fig = px.scatter_matrix(df,df.drop('Potability',axis=1),height=1250,width=1250,template='plotly_white',opacity=0.7,
                        color_discrete_sequence=[colors_blue[3],colors_green[3]],color='Potability',
                       symbol='Potability',color_continuous_scale=[colors_green[3],colors_blue[3]])

# Cosmetics: monospace font, no colour-axis scale, legend placed near the
# top-left on a light grey background, and a horizontally centred title.
fig.update_layout(font_family='monospace',font_size=10,
                  coloraxis_showscale=False,
                 legend=dict(x=0.02,y=1.07,bgcolor=colors_dark[4]),
                 title=dict(text='Scatter Plot Matrix b/w Features',x=0.5,y=0.97,
                   font=dict(color=colors_dark[2],size=24)))
fig.show()

Now we will set up the prediction target as well as the features.

In [None]:
# Prediction target: the Potability column of df.
y = df['Potability']

In [None]:
# Names of the nine columns used as model inputs (everything except the target).
df_features = [
    'pH',
    'Hardness',
    'Solids',
    'Chloramines',
    'Sulfate',
    'Conductivity',
    'Organic_carbon',
    'Trihalomethanes',
    'Turbidity',
]

In [None]:
# Feature matrix: the columns of df listed in df_features, in that order.
X = df[df_features] # sets up the features
# Preview the first few rows of the feature matrix.
X.head()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

# Hold out 20% of the samples for validation; the fixed random_state keeps the
# split identical from run to run.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

I will start off by using a decision tree classifier. Initially, I will use the default model, but then I will see how changing max_leaf_nodes affects the accuracy.

In [None]:
# Baseline: a decision tree with default hyper-parameters (seeded so reruns match).
potability_model = DecisionTreeClassifier(random_state=1)
potability_model.fit(train_X, train_y)

# Score the baseline on the validation split. accuracy_score is documented as
# accuracy_score(y_true, y_pred), so the true labels are passed first; this also
# matches the confusion_matrix(val_y, val_predictions) calls used in this notebook.
val_predictions = potability_model.predict(val_X)
val_acc = accuracy_score(val_y, val_predictions)
print(f"Validation accuracy when not specifying max_leaf_nodes: {val_acc:.5}")

# Refit with the tree capped at 33 leaf nodes and score it the same way.
# NOTE(review): the message calls 33 the "best" value; the search that selected
# it is not shown in this cell.
potability_model = DecisionTreeClassifier(max_leaf_nodes=33, random_state=1)
potability_model.fit(train_X, train_y)
val_predictions = potability_model.predict(val_X)
val_acc = accuracy_score(val_y, val_predictions)
print(f"Validation accuracy for best value of max_leaf_nodes: {val_acc:,.5}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Args:
        cf: Confusion matrix as a 2-D array-like of counts. The y axis is
            labelled 'True Label' and the x axis 'Predicted Label', so rows
            are treated as true classes and columns as predicted classes.
        group_names: Optional sequence of captions, one per cell in row-major
            order. Ignored unless its length equals the number of cells.
        categories: Tick labels handed to the heatmap for both axes.
        count: If true, show the raw count in each cell.
        percent: If true, show each cell's share of all observations.
        cbar: If true, draw the colour bar.
        xyticks: If false, suppress the category tick labels.
        xyplotlabels: If true, label the axes 'True Label' / 'Predicted Label'.
        sum_stats: If true, append summary statistics below the x axis:
            accuracy always, plus precision, recall and F1 for a 2x2 matrix.
        figsize: Figure size; defaults to matplotlib's 'figure.figsize'.
        cmap: Colour map name passed to the heatmap.
        title: Optional figure title.

    Returns:
        None. The heatmap is drawn on a newly created matplotlib figure.
    """
    # Accept any array-like (e.g. nested lists), not only ndarrays.
    cf = np.asarray(cf)
    n_cells = cf.size
    total = cf.sum()
    blanks = [''] * n_cells

    def _ratio(numerator, denominator):
        # A ratio with an empty denominator is undefined; report 0.0 rather
        # than NaN (and a divide-by-zero warning).
        return numerator / denominator if denominator else 0.0

    # Each cell annotation is assembled from up to three parts: caption, count, percentage.
    if group_names is not None and len(group_names) == n_cells:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(_ratio(value, total)) for value in cf.flatten()]
    else:
        group_percentages = blanks

    box_labels = [
        f"{v1}{v2}{v3}".strip()
        for v1, v2, v3 in zip(group_labels, group_counts, group_percentages)
    ]
    box_labels = np.asarray(box_labels).reshape(cf.shape)

    # Summary statistics shown under the x-axis label.
    stats_text = ""
    if sum_stats:
        # Accuracy is the sum of the diagonal divided by the total observations.
        accuracy = _ratio(np.trace(cf), float(total))

        if cf.shape == (2, 2):
            # Binary case: class 1 (second row/column) is treated as positive.
            precision = _ratio(cf[1, 1], cf[:, 1].sum())
            recall = _ratio(cf[1, 1], cf[1, :].sum())
            f1_score = _ratio(2 * precision * recall, precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy, precision, recall, f1_score)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)

    # Figure parameters derived from the remaining arguments.
    if figsize is None:
        # Fall back to matplotlib's configured default size.
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Passing False as the tick labels hides the categories on both axes.
        categories = False

    # Draw the heatmap; fmt="" because the annotations are pre-formatted strings.
    plt.figure(figsize=figsize)
    sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar,
                xticklabels=categories, yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label' + stats_text)
    else:
        plt.xlabel(stats_text)

    if title:
        plt.title(title)

   

In [None]:
# Cell captions (row-major order) and the tick labels for the two classes.
labels = ['True Neg.', 'False Pos.', 'False Neg.', 'True Pos.']
categories = ['Zero', 'One']

# Confusion matrix for the most recent validation predictions, drawn with the
# make_confusion_matrix helper (template from Dennis T.).
cf_matrix = confusion_matrix(val_y, val_predictions)
make_confusion_matrix(cf_matrix, group_names=labels, categories=categories, cmap='Blues')

So we can see that initially we got an accuracy of about 0.596, which isn't terrible for a first run but is definitely not great. I experimented with max_leaf_nodes and was able to achieve an accuracy score of 0.700 using 33 leaf nodes, which is shown above in the confusion matrix. That's a huge jump of around 10 percentage points in accuracy just by changing one parameter. I'm very happy with that result, but I'm going to test some other models to see if I can get any better results.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: a random forest with default hyper-parameters (seeded so reruns match).
potability_model = RandomForestClassifier(random_state=1)
potability_model.fit(train_X, train_y)

# Score the baseline on the validation split. accuracy_score is documented as
# accuracy_score(y_true, y_pred), so the true labels are passed first; this also
# matches the confusion_matrix(val_y, val_predictions) calls used in this notebook.
val_predictions = potability_model.predict(val_X)
val_acc = accuracy_score(val_y, val_predictions)
print(f"Validation accuracy: {val_acc:.5}")

# Refit with 30 trees in the forest and score it the same way.
# NOTE(review): the message calls 30 the "best" value; the search that selected
# it is not shown in this cell.
potability_model = RandomForestClassifier(n_estimators=30, random_state=1)
potability_model.fit(train_X, train_y)
val_predictions = potability_model.predict(val_X)
val_acc = accuracy_score(val_y, val_predictions)
print(f"Validation accuracy for best value of n_estimators: {val_acc:,.5}")

In [None]:
# Cell captions (row-major order) and the tick labels for the two classes.
labels = ['True Neg.', 'False Pos.', 'False Neg.', 'True Pos.']
categories = ['Zero', 'One']

# Confusion matrix for the most recent validation predictions, drawn with the
# make_confusion_matrix helper (template from Dennis T.).
cf_matrix2 = confusion_matrix(val_y, val_predictions)
make_confusion_matrix(cf_matrix2, group_names=labels, categories=categories, cmap='Blues')

We now get the accuracy score up to 0.715! That's a nice 0.015 jump over the tuned decision tree, but I'm still not content. Let's try one more model and see how it compares.

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Baseline: an extra-trees ensemble with default hyper-parameters (seeded so reruns match).
potability_model = ExtraTreesClassifier(random_state=1)
potability_model.fit(train_X, train_y)

# Score the baseline on the validation split. accuracy_score is documented as
# accuracy_score(y_true, y_pred), so the true labels are passed first; this also
# matches the confusion_matrix(val_y, val_predictions) calls used in this notebook.
val_predictions = potability_model.predict(val_X)
val_acc = accuracy_score(val_y, val_predictions)
print(f"Validation accuracy: {val_acc:.5}")

# Refit with 87 trees in the ensemble and score it the same way.
# NOTE(review): the message calls 87 the "best" value; the search that selected
# it is not shown in this cell.
potability_model = ExtraTreesClassifier(n_estimators=87, random_state=1)
potability_model.fit(train_X, train_y)
val_predictions = potability_model.predict(val_X)
val_acc = accuracy_score(val_y, val_predictions)
print(f"Validation accuracy for best value of n_estimators: {val_acc:,.5}")

In [None]:
# Cell captions (row-major order) and the tick labels for the two classes.
labels = ['True Neg.', 'False Pos.', 'False Neg.', 'True Pos.']
categories = ['Zero', 'One']

# Confusion matrix for the most recent validation predictions, drawn with the
# make_confusion_matrix helper (template from Dennis T.).
cf_matrix3 = confusion_matrix(val_y, val_predictions)
make_confusion_matrix(cf_matrix3, group_names=labels, categories=categories, cmap='Blues')

We're now up to 0.732 using the ExtraTreesClassifier. I am very happy with this score, as this dataset did not seem to have many strongly predictive features.