In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the water-potability CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/water-potability/water_potability.csv")

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and tick labels on every figure below.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Describing the dataset

In [None]:
# Print pandas' concise summary of the raw frame.
df.info()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Summary statistics for every column (include="all" keeps all dtypes).
df.describe(include = "all")

In [None]:
def describe_dataframe(df=None):
    """Print a short textual overview of a DataFrame.

    Prints the number of rows and columns, the list of column names, and the
    names of the columns that contain at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to describe. When omitted, an empty DataFrame is described.

    Returns
    -------
    None
    """
    # A fresh empty frame per call avoids sharing one default instance
    # that would otherwise be created once at definition time.
    if df is None:
        df = pd.DataFrame()

    n_rows, n_cols = df.shape
    columns_with_nulls = df.columns[df.isnull().any()].tolist()

    print("\n\n")
    print("*"*30)
    print("Data Description")
    print("*"*30)

    print("Number of rows::", n_rows)
    print("Number of columns::", n_cols)
    print("\n")

    print("Column Names::", df.columns.values.tolist())
    print("\n")

    print("Columns with Missing Values::", columns_with_nulls)
    print("\n")
    

In [None]:
# Print the row/column counts, column names and columns with missing values for df.
describe_dataframe(df)

# Find the missing values and impute them

In [None]:
# Fraction of missing values per column (null count divided by the number of rows).
df.isna().sum().div(len(df))

In [None]:
# One 50-bin histogram per column of df, drawn on a 20x15 figure.
df.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Class counts of the target; dropna=False also counts missing labels, if any.
df['Potability'].value_counts(dropna=False)

In [None]:
!pip install ptitprince

In [None]:
from ptitprince import PtitPrince as pt
import seaborn as sns

In [None]:
# dx: every column name of df (used as the x variable of each raincloud plot);
# dy: the Potability column (used as the grouping variable).
dx = df.columns
dy = df["Potability"]


# A ‘raincloud’ plot combines a boxplot, raw jittered data, and a split-half violin. It makes the distribution of the data much more intuitive to understand

In [None]:
# Draw one horizontal raincloud plot (half violin + jittered points + boxplot)
# per feature, grouped by Potability.
ort = "h"
pal = "Set2"  # one palette for every figure; no need to reassign it per iteration

for x in dx:
    # Plotting the target against itself carries no information, so skip it.
    if x == "Potability":
        continue

    f, ax = plt.subplots(figsize=(12, 5))

    # Pass ax explicitly so each layer lands on this figure's axes rather
    # than on whatever pyplot currently considers the "current" axes.
    pt.half_violinplot(x=x, y=dy, data=df, palette=pal, bw=.2, cut=0.,
                       scale="area", width=.6, inner=None, orient=ort, ax=ax)
    sns.stripplot(x=x, y=dy, data=df, palette=pal, edgecolor="white",
                  size=3, jitter=1, zorder=0, orient=ort, ax=ax)
    sns.boxplot(x=x, y=dy, data=df, color="black", width=.15, zorder=10,
                showcaps=True, boxprops={'facecolor': 'none', "zorder": 10},
                showfliers=True, whiskerprops={'linewidth': 2, "zorder": 10},
                saturation=1, orient=ort, ax=ax)
    ax.set_title(x)



# Interpreting Skewness & Kurtosis

If the skewness is between -0.5 and 0.5, the data are fairly symmetrical
If the skewness is between -1 and -0.5 or between 0.5 and 1, the data are moderately skewed
If the skewness is less than -1 or greater than 1, the data are highly skewed

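A normal distribution has kurtosis exactly 3. Any distribution with kurtosis ≈3 (excess ≈0) is called mesokurtic. Note that pandas' `kurtosis()`, used below, reports *excess* kurtosis (kurtosis minus 3), so the values in the table should be compared against 0 rather than 3.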
A distribution with kurtosis <3 is called platykurtic. Compared to a normal distribution, its tails are shorter and thinner, and often its central peak is lower and broader.
A distribution with kurtosis >3 is called leptokurtic. Compared to a normal distribution, its tails are longer and fatter, and often its central peak is higher and sharper


In [None]:
# Per-column skewness and kurtosis of df, shown side by side as two columns.
skew = df.skew().rename("skew")
kurtosis = df.kurtosis().rename("kurtosis")
pd.concat([skew, kurtosis], axis="columns")

# Time to split the dataset to train & test

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Single stratified 80/20 split on the target so both parts keep its class ratio.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices, so select with .iloc; .loc would
# only be correct while df happens to carry a default 0..n-1 index.
train_index, test_index = next(split.split(df, df["Potability"]))
strat_train_set = df.iloc[train_index]
strat_test_set = df.iloc[test_index]

In [None]:
# Class proportions of the target in the full dataset.
df['Potability'].value_counts().div(len(df))

In [None]:
# Class proportions of the target in the stratified test split, for comparison.
strat_test_set['Potability'].value_counts().div(len(strat_test_set))

In [None]:
# Work on a copy of the training split so later steps leave strat_train_set untouched.
df_copy =strat_train_set.copy()

# Imputation to be done only after split

In [None]:
#to_impute = ['ph','Sulfate','Trihalomethanes']
#df_copy[to_impute]=df_copy[to_impute].fillna((df_copy[to_impute].median()))

# Better to handle it in Sklearn pipeline 

# EDA Starts

First, let's check the correlation matrix. The correlation levels between the IVs are very low

In [None]:
# Annotated correlation matrix of the training copy, drawn on a 10x10 figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    df_copy.corr(),
    annot=True,
    fmt='.1g',
    cmap='coolwarm',
    linewidths=1,
    linecolor='black',
    ax=ax,
)

In [None]:
import scipy.stats as stats

In [None]:
# Names of the nine feature columns (every column except the Potability target).
x = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

In [None]:
# Column labels of the training copy.
df_copy.columns

In [None]:
# Separate the training features from the Potability label.
train_x = df_copy.drop(columns="Potability")
train_Y = df_copy.loc[:, "Potability"].copy()

In [None]:
# Separate the test features from the Potability label.
test_x = strat_test_set.drop(columns="Potability")
test_Y = strat_test_set.loc[:, "Potability"].copy()

In [None]:
# Preprocessing: fill missing values with the column median, then standardize.
full_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

In [None]:
# Fit the imputer and scaler on the training features and transform them in one step.
train_final = full_pipeline.fit_transform(train_x)

In [None]:
# Peek at the first transformed training row.
train_final[0]

In [None]:
# Only transform the test features: the imputer medians and scaler statistics
# must come from the training data. Refitting here would leak test-set
# information and scale the two sets inconsistently.
test_final = full_pipeline.transform(test_x)

# Create a new dataset post transformation to do some EDA

In [None]:
# Wrap the transformed training array in a DataFrame with the original feature names.
# No index is passed, so the frame gets a fresh 0..n-1 index rather than train_x's row labels.
x_imputed_train = pd.DataFrame(train_final, columns = train_x.columns)

In [None]:
# Turn the label Series into a one-column DataFrame. The guard makes the cell
# safe to re-run: a DataFrame has no to_frame(), so a second run would fail.
if isinstance(train_Y, pd.Series):
    train_Y = train_Y.to_frame()

In [None]:
# Summary statistics of the transformed training features, transposed so each feature is a row.
x_imputed_train.describe().T

In [None]:
# Column labels of the training label frame.
train_Y.columns

In [None]:
# Attach the labels to the transformed features by position. Rows of
# x_imputed_train are in the same order as train_Y, but train_Y keeps the
# original row labels from df, so an index-based merge against a rebuilt
# index would drop most rows and pair the rest with the wrong labels.
df_train = x_imputed_train.assign(Potability=train_Y["Potability"].to_numpy())

# Point-biserial correlation is appropriate for a continuous variable vs. a binary categorical variable

In [None]:
# Point-biserial correlation of every feature with the binary target.
# The result is unpacked as (coefficient, p-value) and printed with the
# feature name so each output line can be told apart.
for i in x:
    r, p_value = stats.pointbiserialr(x_imputed_train[i], train_Y['Potability'])
    print(f"{i}: r={r:.4f}, p-value={p_value:.4g}")

# Predictive Power Score
Predictive Power Score is another interesting way to analyse the relationship between variables. It is touted to be better than correlations

In [None]:
!pip install ppscore

In [None]:
import ppscore as pps

In [None]:
# Compute the PPS matrix for the merged training frame and show it as the cell output.
pps.matrix(df_train)

In [None]:
def heatmap(df):
    """Draw a PPS matrix as an annotated heatmap and return the Axes.

    ``df`` must provide the columns 'x', 'y' and 'ppscore'; it is pivoted so
    that 'x' values become the columns and 'y' values the rows.
    """
    pivoted = df[['x', 'y', 'ppscore']].pivot(columns='x', index='y', values='ppscore')
    ax = sns.heatmap(pivoted, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)
    ax.set(title="PPS matrix", xlabel="feature", ylabel="target")
    return ax

In [None]:
# Keep the PPS matrix of the merged training frame for plotting.
matrix = pps.matrix(df_train)

# Overall it seems like the predictors that are available are very weak to predict the Water Potability

In [None]:
# Plot the PPS matrix as a heatmap.
heatmap(matrix)

# Modelling, Hyperparameter Tuning and Feature Importances

In [None]:
# Models from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Model Evaluations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve

In [None]:
# Final modelling inputs.
X_train = x_imputed_train
# Give the test features the same column names as X_train; passing a bare
# array to estimators fitted on a named DataFrame triggers feature-name warnings.
X_test  = pd.DataFrame(test_final, columns=test_x.columns)
# Estimators expect a 1-D label vector; flatten the one-column label frame.
y_train = np.ravel(train_Y)
y_test  = test_Y

In [None]:
# Baseline estimators keyed by display name, all with default hyperparameters
models = dict([
    ("Logistic Regression", LogisticRegression()),
    ("KNN", KNeighborsClassifier()),
    ("Random Forest", RandomForestClassifier()),
])

# Helper that trains each candidate model and records its test score
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Train every estimator in ``models`` and collect its score on the test data.

    models : dict mapping a display name to a Scikit-Learn style estimator
    X_train : training data (no labels)
    X_test : testing data (no labels)
    y_train : training labels
    y_test : test labels

    Returns a dict mapping each name to ``model.score(X_test, y_test)``.
    The estimators are fitted in place.
    """
    # Seed NumPy's global random number generator before any fitting
    np.random.seed(42)
    results = {}
    for label in models:
        estimator = models[label]
        estimator.fit(X_train, y_train)
        results[label] = estimator.score(X_test, y_test)
    return results

In [None]:
# Fit the baseline models and display their test-set scores.
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)

model_scores

In [None]:
# One-row frame of scores, transposed so each model becomes one bar.
model_compare = pd.DataFrame(model_scores, index=["accuracy"])
model_compare.transpose().plot(kind="bar");

Now we've got a baseline model... and we know a model's first predictions aren't always what we should base our next steps on. What should we do?

Let's look at the following:

Hyperparameter tuning
Feature importance
Confusion matrix
Cross-validation
Precision
Recall
F1 score
Classification report
ROC curve
Area under the curve (AUC)

In [None]:
# Search space for LogisticRegression: 20 log-spaced C values, liblinear solver
log_reg_grid = dict(
    C=np.logspace(-4, 4, num=20),
    solver=["liblinear"],
)

# Search space for RandomForestClassifier
rf_grid = dict(
    n_estimators=np.arange(10, 1000, 50),
    max_depth=[None, 3, 5, 10],
    min_samples_split=np.arange(2, 20, 2),
    min_samples_leaf=np.arange(1, 20, 2),
)

In [None]:
# Tune LogisticRegression

# Seed NumPy's global random number generator before the search
np.random.seed(42)

# Randomized search over log_reg_grid: 20 candidates, 5-fold cross-validation
rs_log_reg = RandomizedSearchCV(
    estimator=LogisticRegression(),
    param_distributions=log_reg_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)

# Run the search on the training data
rs_log_reg.fit(X_train, y_train)

In [None]:
# Hyperparameters of the best LogisticRegression candidate found by the search.
rs_log_reg.best_params_


In [None]:
# Score of the tuned LogisticRegression search on the test data.
rs_log_reg.score(X_test, y_test)


Let's do it for RF as well

In [None]:
# Seed NumPy's global random number generator before the search
np.random.seed(42)

# Randomized search over rf_grid: 20 candidates, 5-fold cross-validation
rs_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=rf_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)

# Run the search on the training data
rs_rf.fit(X_train, y_train)

In [None]:

# Hyperparameters of the best RandomForestClassifier candidate found by the search
rs_rf.best_params_

In [None]:

# Score of the tuned RandomForestClassifier search on the test data
rs_rf.score(X_test, y_test)

In [None]:
# Score of the tuned RandomForestClassifier search on the test data.
# NOTE(review): the identical call already appears in the preceding cell; one of the two is likely redundant.
rs_rf.score(X_test, y_test)

In [None]:
# Predict test-set labels with the tuned RandomForestClassifier search
y_preds = rs_rf.predict(X_test)

In [None]:

# Plot the ROC curve and report the AUC metric.
# plot_roc_curve is not available in newer scikit-learn releases, where
# RocCurveDisplay.from_estimator replaces it; use that when it exists and
# fall back to the legacy helper otherwise.
from sklearn.metrics import RocCurveDisplay

if hasattr(RocCurveDisplay, "from_estimator"):
    RocCurveDisplay.from_estimator(rs_rf, X_test, y_test)
else:
    plot_roc_curve(rs_rf, X_test, y_test)

In [None]:
# Confusion matrix of the true test labels against the tuned model's predictions.
print(confusion_matrix(y_test, y_preds))


In [None]:
# Classification report of the true test labels against the tuned model's predictions.
print(classification_report(y_test, y_preds))


In [None]:
# Feature importances of the best RandomForestClassifier found by the search.
importance = rs_rf.best_estimator_.feature_importances_
# summarize feature importance


In [None]:
# Feature names, taken from the training feature frame's columns.
a = train_x.columns

In [None]:
# Pair each feature name with its importance. Materialize the pairs: a bare
# zip object is a single-use iterator and would be empty on second use.
b = list(zip(a, importance))

In [None]:
# (feature, importance) pairs ordered from most to least important.
# Built directly from the names and importances so the result is ordered and
# the cell can be re-run; a set would lose the order.
mapped = sorted(zip(a, importance), key=lambda pair: pair[1], reverse=True)
print(mapped)

In [None]:
# Tabulate the (feature, importance) pairs, sorted from most to least important
# so the row order (and the bar order in the plot) is deterministic.
f = pd.DataFrame(
    sorted(mapped, key=lambda pair: pair[1], reverse=True),
    columns=['Feature', 'Importance'],
)


In [None]:
# Bar chart of importance per feature.
f.plot.bar(x='Feature', y='Importance')