In this notebook I study whether we can categorize water as potable or non-potable given the data at hand. I use 3 classification models: logistic regression, decision tree and random forest. I also study the discrepancies between deleting rows with missing values and imputing the missing values with the mean.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
# Print the full path of every file found under the Kaggle input directory
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# Load the water-potability table from the Kaggle input directory
data = pd.read_csv('../input/water-potability/water_potability.csv')
# Display (n_rows, n_columns) of the loaded table
data.shape

In [None]:
# Summary statistics of every column in the loaded table
data.describe()

In [None]:
# Number of missing values in each column
data.isnull().sum()

There are a lot of missing values. I am going to build two datasets, one where the rows with missing values are removed and one where the missing values are replaced with the column mean, and check for differences with one of the models.

In [None]:
# Variant 1: discard every row that has at least one missing value and
# renumber the remaining rows from 0
data_clean = data.dropna().reset_index(drop=True)
# Variant 2: keep every row and fill each gap with the mean of its column
data_clean2 = data.fillna(value=data.mean())
data_clean.shape

In [None]:
# Checking for outliers with Tukey's fences (1.5 * IQR beyond the quartiles).
# The quartiles are computed separately for every column: pooling all columns
# into one set of percentiles would compare features measured on very
# different scales against the same pair of fences.
for col in data_clean.columns:
    q25, q75 = np.percentile(data_clean[col], [25, 75])
    iqr = q75 - q25
    mini = q25 - 1.5 * iqr
    maxi = q75 + 1.5 * iqr
    n_above = int((data_clean[col] > maxi).sum())
    n_below = int((data_clean[col] < mini).sum())
    print(f'{col}: {n_above} above upper fence, {n_below} below lower fence')

The third column shows a number of outliers equal to the number of observations, but as shown in the distributions below there are no outliers, so I am going to ignore them.

In [None]:
# Split the raw table by class label to compare the two class sizes
one = data.loc[data['Potability'].eq(1)]
zero = data.loc[data['Potability'].eq(0)]
one.shape, zero.shape

In [None]:
# Definition to make a subplot grid of histograms
def hist_plots(data: pd.DataFrame,
               rows: int,
               cols: int,
               figsize: tuple) -> None:
    """Draw one 30-bin histogram per column of ``data`` on a rows x cols grid.

    Columns are plotted in alphabetical order. Grid cells left over after the
    last column are removed from the figure; columns that do not fit in the
    grid (more than ``rows * cols``) are not drawn.

    Args:
        data: Table whose columns are plotted.
        rows: Number of subplot rows.
        cols: Number of subplot columns.
        figsize: Figure size passed to ``plt.subplots``.
    """
    # squeeze=False always returns a 2-D array of axes, so a 1x1 grid (which
    # would otherwise come back as a bare Axes object) can be flattened too.
    fig, axes = plt.subplots(rows, cols, figsize=figsize, squeeze=False)
    # Sort once instead of re-sorting the column names on every iteration
    column_names = sorted(data.columns)
    for i, ax in enumerate(axes.flatten()):
        if i < len(column_names):
            name = column_names[i]
            data[name].plot.hist(bins=30, ax=ax)
            ax.set_title(f'{name} distribution', fontsize=10)
            ax.tick_params(axis='x', labelsize=10)
            ax.tick_params(axis='y', labelsize=10)
            ax.get_yaxis().get_label().set_visible(False)
        else:
            # No column left for this cell: drop the empty axes
            fig.delaxes(ax=ax)
    fig.tight_layout()

In [None]:
# Histograms of every column of the dataset with missing rows dropped (3 x 4 grid)
hist_plots(data=data_clean,
          rows=3,
          cols=4,
          figsize=(20,10))

In [None]:
# Histograms of every column of the mean-imputed dataset (3 x 4 grid)
hist_plots(data=data_clean2,
          rows=3,
          cols=4,
          figsize=(20,10))

In [None]:
# Pairwise correlations between the feature columns (last column excluded)
cols = data.columns[:-1]
corr_matrix = data[cols].corr()

# Keep only the strictly-upper triangle so that every feature pair appears
# once and the self-correlations on the diagonal are excluded.
# (np.nan-based masking via where(); the np.NaN alias no longer exists in NumPy 2.)
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

# Stack the data and convert to a data frame. The explicit dropna() removes
# the masked entries whether or not stack() drops NaNs on its own.
corr_values = (corr_matrix
               .where(upper_mask)
               .stack()
               .dropna()
               .to_frame()
               .reset_index()
               .rename(columns={'level_0':'feature1',
                                'level_1':'feature2',
                                0:'correlation'}))

# Get the absolute values for sorting
corr_values['abs_correlation'] = corr_values['correlation'].abs()

In [None]:
# Histogram of the absolute pairwise feature correlations
sns.set_context('talk')
sns.set_style('white')

ax = corr_values['abs_correlation'].hist(bins=20, figsize=(12, 8))
ax.set_xlabel('Absolute Correlation')
ax.set_ylabel('Frequency');

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
cols = data_clean.columns[:-1]
# Stratified 70/30 splitter; only the first generated split is used below.
# With an integer random_state every call to split() is reproducible.
strat_shuf_split = StratifiedShuffleSplit(n_splits=3,
                                          test_size=0.3,
                                          random_state=42)

# --- Dataset with missing rows dropped ---
train_idx, test_idx = next(strat_shuf_split.split(data_clean[cols], data_clean['Potability']))

# split() yields positional indices, so select rows by position
X_train = data_clean[cols].iloc[train_idx]
y_train = data_clean['Potability'].iloc[train_idx]

X_test  = data_clean[cols].iloc[test_idx]
y_test  = data_clean['Potability'].iloc[test_idx]

# --- Mean-imputed dataset ---
# data_clean2 has a different number of rows than data_clean, so it needs its
# own stratified split: reusing the indices above would pick unrelated rows
# and would never touch the rows beyond len(data_clean).
train_idx2, test_idx2 = next(strat_shuf_split.split(data_clean2[cols], data_clean2['Potability']))

X_train2 = data_clean2[cols].iloc[train_idx2]
y_train2 = data_clean2['Potability'].iloc[train_idx2]

X_test2  = data_clean2[cols].iloc[test_idx2]
y_test2  = data_clean2['Potability'].iloc[test_idx2]

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Learn the min/max of each feature from the training split only, then apply
# the same scaling to both the training and the test split
mm = MinMaxScaler()
mm.fit(X_train)
X_train = mm.transform(X_train)
X_test = mm.transform(X_test)

The models' hyperparameters are tuned with GridSearchCV.

# # Model 1 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Search space: 20 log-spaced regularisation strengths x both penalty types
params = {
    "C": np.logspace(-3, 3, 20),
    "penalty": ["l1", "l2"],
}
grid = GridSearchCV(estimator=LogisticRegression(solver='liblinear'),
                    param_grid=params)

# Fit to the train data (fit returns the search object itself)
GR = grid.fit(X_train, y_train)
GR.best_params_, GR.best_score_

In [None]:
# Hard class labels and per-class probabilities for the test split
y_pred, y_prob = grid.predict(X_test), grid.predict_proba(X_test)

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Per-class precision / recall / F1 from the hard predictions
cr = classification_report(y_test, y_pred)
print(cr)

# ROC AUC measures ranking quality, so it is scored with the predicted
# probability of the positive class (second column) rather than hard labels.
# No trailing comma: auc must be a float, not a 1-tuple.
auc = roc_auc_score(y_test, y_prob[:, 1])

print('auc: ', auc)


# # Model 2 Decision Tree 

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Take the feature count from the training matrix itself rather than from the
# notebook-global `cols`, which other cells rebind.
n_features = X_train.shape[1]
# max_features goes up to and including n_features so that "consider every
# feature at each split" is part of the search.
param_grid = {'max_depth': range(1, 300, 2),
              'max_features': range(1, n_features + 1)}

GR2 = GridSearchCV(DecisionTreeClassifier(random_state=42),
                   param_grid=param_grid,
                   scoring='accuracy',
                   n_jobs=-1)

GR2 = GR2.fit(X_train, y_train)
# Best hyperparameters, their cross-validated accuracy, and the size of the refitted tree
GR2.best_params_, GR2.best_score_, GR2.best_estimator_.tree_.node_count

In [None]:
# Class predictions of the tuned decision tree on the test split
y_pred_gr2 = GR2.predict(X_test)

In [None]:
# Per-class precision / recall / F1 from the hard predictions
cr2 = classification_report(y_test, y_pred_gr2)
print(cr2)

# Score ROC AUC with the positive-class probability instead of hard labels.
# No trailing comma: auc2 must be a float, not a 1-tuple.
auc2 = roc_auc_score(y_test, GR2.predict_proba(X_test)[:, 1])

print('auc: ', auc2)

# # Model 3 Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Candidate forest sizes: the odd values 1, 3, ..., 399
param_grid = {'n_estimators': range(1, 400, 2)}
# GridSearchCV clones the estimator for every candidate, so warm_start could
# never reuse trees between fits; the out-of-bag score was computed on every
# fit but never read (and warns for very small forests). Both options are
# therefore left at their defaults; the fitted trees are unaffected.
RF = RandomForestClassifier(random_state=42,
                            n_jobs=-1)
rf = GridSearchCV(RF,
                  param_grid=param_grid)

rf = rf.fit(X_train, y_train)
rf.best_params_, rf.best_score_

In [None]:
# Class predictions of the tuned random forest on the test split
y_pred_rf = rf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 from the hard predictions
cr3 = classification_report(y_test, y_pred_rf)
print(cr3)

# Score ROC AUC with the positive-class probability instead of hard labels.
# No trailing comma: auc3 must be a float, not a 1-tuple.
auc3 = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

print('auc: ', auc3)

In [None]:
# Same random-forest search, this time on the mean-imputed dataset
param_grid = {'n_estimators': range(1, 400, 2)}
# warm_start and oob_score are left at their defaults: GridSearchCV clones the
# estimator for every candidate (so warm_start has no effect) and the
# out-of-bag score is never read.
RF = RandomForestClassifier(random_state=42,
                            n_jobs=-1)
# Use a dedicated search object so that `rf` keeps referring to the model
# trained on the dataset with missing rows dropped.
rf2 = GridSearchCV(RF,
                   param_grid=param_grid)
rf2 = rf2.fit(X_train2, y_train2)
rf2.best_params_, rf2.best_score_

In [None]:
# Class predictions of the forest trained on the mean-imputed data, on its test split
y_pred_rf2 = rf2.predict(X_test2)

In [None]:
# Per-class precision / recall / F1 from the hard predictions
cr4 = classification_report(y_test2, y_pred_rf2)
print(cr4)

# Score ROC AUC with the positive-class probability instead of hard labels.
# No trailing comma: auc4 must be a float, not a 1-tuple.
auc4 = roc_auc_score(y_test2, rf2.predict_proba(X_test2)[:, 1])

print('auc: ', auc4)