# What is Potable Water?

At its most basic level, potable water is water that is safe to drink.

Many questions begin to emerge.
* Are we able to consume all fresh water types?
* What percentage of the world's fresh water can be accessed?
* Has the water table increased as sea levels have risen?

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset location can be confirmed before loading it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# EDA

In [None]:
# Import the dataset for review as a DataFrame
df = pd.read_csv("../input/water-potability/water_potability.csv")

In [None]:
# Review the first 5 observations
df.head()

In [None]:
# Display information about the DataFrame
df.info(memory_usage="deep")

In [None]:
# Shape of the dataframe
print(df.shape)
# Find the number of rows within a dataframe
print(len(df))
# Extracting information from the shape tuple
print(f'Number of rows: {df.shape[0]} \nNumber of columns: {df.shape[1]}')

### 1a. Summary statistics

In [None]:
# Review the high level summary details for each variable
df.describe()

### 1b. Missing values

In [None]:
# Check for the missing values by columns
df.isnull().sum()

In [None]:
# Proportion of missing values by column
def isnull_prop(df):
    """Summarise missing values for every column of ``df``.

    Returns a dict keyed by column name; each value is a two-item list
    ``[number of nulls, nulls divided by the number of rows]``.
    """
    n_rows = len(df.index)

    def _count_and_share(column):
        # Null count for one column, paired with its share of all rows
        n_missing = df[column].isnull().sum()
        return [n_missing, n_missing / n_rows]

    return {column: _count_and_share(column) for column in df.columns}

# Apply the missing value method
null_dict = isnull_prop(df)
print(null_dict.items())

In [None]:
# Create a dataframe of the missing value information
df_missing = pd.DataFrame.from_dict(null_dict, orient="index", columns=['missing', 'miss_percent'])
df_missing

In [None]:
# Display missing values using a heatmap to understand if any patterns are present
plt.figure(figsize=(15,8))
sns.heatmap(df.isnull());

In [None]:
# Histogram of pH with the mean and median marked as vertical lines
sns.displot(df["ph"], kde=False)
plt.axvline(x=df.ph.mean(), linewidth=3, color='g', label="mean", alpha=0.5)
plt.axvline(x=df.ph.median(), linewidth=3, color='y', label="median", alpha=0.5)

# Set title, legend and labels
plt.xlabel("ph")
plt.ylabel("Count")
plt.title("Distribution of ph", size=14)
# The two lines already carry their labels (label=... above), so let the legend
# pick them up. Passing a bare list of labels instead ties each label to an
# artist purely by position, which is easy to get wrong when the plot changes.
plt.legend();

# Numeric summary of the same column shown in the plot
print(f'Mean pH value {df.ph.mean()} \n Median pH value {df.ph.median()} \n Min pH value {df.ph.min()} \n Max pH value {df.ph.max()}')

Do these pH values relate to actual water, or is there a wider range of sources being supplied?
![pH scale](https://www.scienceabc.com/wp-content/uploads/2019/07/A-pH-scale-on-white-background-illustration-VectorBlueRingMedias.jpg)

# Predict Potability

In [None]:
# Preprocessing
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Performance metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [None]:
# Apply mean value to the missing values.
# The filled columns are assigned back to the frame: calling
# fillna(inplace=True) on a column selection such as df['ph'] works on an
# intermediate object, which recent pandas versions warn about and which does
# not update df when Copy-on-Write is enabled.
# NOTE(review): the means are taken over the whole dataset, i.e. before the
# train/test splits further down, so held-out rows influence the imputed values.
mean_imputed_cols = ['ph', 'Sulfate', 'Trihalomethanes']
df[mean_imputed_cols] = df[mean_imputed_cols].fillna(df[mean_imputed_cols].mean())

# Confirm that no missing values remain
df.isnull().sum()

In [None]:
# Separate into X and y variables
X = df.drop(['Potability'], axis=1)
y = df['Potability'].values

In [None]:
# Display the features
X.head()

In [None]:
# Does scaling the features change the dynamics
X_scaled = scale(X)

# Both sides are summarised per feature with an explicit axis. Calling
# np.mean / np.std without an axis treats the DataFrame (X) and the ndarray
# (X_scaled) differently, and the DataFrame result has changed between pandas
# versions, so the two printouts were not comparable.

# Print the mean and standard deviation of the unscaled features
print("Mean of Unscaled Features: {}".format(X.mean(axis=0)))
# ddof=0 matches the population standard deviation that np.std uses by default
print("Standard Deviation of Unscaled Features: {}".format(X.std(axis=0, ddof=0)))

# Print the mean and standard deviation of the scaled features,
# labelled with the original column names for readability
print("Mean of Scaled Features: {}".format(pd.Series(np.mean(X_scaled, axis=0), index=X.columns)))
print("Standard Deviation of Scaled Features: {}".format(pd.Series(np.std(X_scaled, axis=0), index=X.columns)))

In [None]:
# k-NN classifier (baseline, fitted on the unscaled features in X)

# Split into training and test set: 40% held out, stratified on y, fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=2, stratify=y)

# Create a k-NN classifier with 7 neighbors
knn = KNeighborsClassifier(n_neighbors=7)

# Fit the classifier to the training data
knn.fit(X_train, y_train)

# Print the accuracy on the held-out test set
print(knn.score(X_test, y_test))

In [None]:
# Let's understand the performance of the k-NN classifier across a range of neighbor counts (k = 1..11)
# Setup arrays to store train and test accuracies, one slot per value of k
neighbors = np.arange(1, 12)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

# Loop over different values of k
for i, k in enumerate(neighbors):
    # Setup a k-NN Classifier with k neighbors
    # (this rebinds the global `knn`; after the loop it holds the last model fitted)
    knn = KNeighborsClassifier(n_neighbors=k)

    # Fit the classifier to the training data
    knn.fit(X_train, y_train)
    
    # Compute accuracy on the training set
    train_accuracy[i] = knn.score(X_train, y_train)

    # Compute accuracy on the testing set
    test_accuracy[i] = knn.score(X_test, y_test)

# Generate plot: train and test accuracy against the number of neighbors
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label = 'Testing Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Setup the pipeline steps: standardise the features, then classify with k-NN
steps = [('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier())]
        
# Create the pipeline
pipeline = Pipeline(steps)

# Fit the pipeline to the training set
# (the scaler is fitted inside the pipeline, on the training rows only)
knn_scaled = pipeline.fit(X_train, y_train)

# Instantiate and fit a k-NN classifier to the unscaled data
knn_unscaled = KNeighborsClassifier().fit(X_train, y_train)

# Compute and print test-set accuracy for both variants
print('Accuracy with Scaling: {}'.format(knn_scaled.score(X_test, y_test)))
print('Accuracy without Scaling: {}'.format(knn_unscaled.score(X_test, y_test)))

In [None]:
# Decision Tree classifier
# Setup the parameters and distributions to sample.
# scipy's randint(low, high) never returns `high`, so the upper bound for
# max_features is derived from the number of columns in X (+1); a hard-coded 9
# would never let a tree consider all features when X has nine columns.
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, X.shape[1] + 1),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

# Instantiate a Decision Tree classifier (seeded so repeated runs agree)
tree = DecisionTreeClassifier(random_state=42)

# Instantiate the RandomizedSearchCV object with 5-fold cross-validation.
# The seed fixes which parameter combinations are sampled.
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5, random_state=42)

# Fit it to the data (the full X and y; scores come from the internal CV folds)
tree_cv.fit(X, y)

# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

In [None]:
class ModelBuild():
    """Scale the features, fit a single classifier and print its hold-out accuracy.

    Parameters
    ----------
    X : feature table handed to ``train_test_split``.
    y : target labels aligned with ``X``.
    model : estimator used as the final pipeline step. When omitted, a new
        ``DecisionTreeClassifier(criterion='gini', max_depth=3,
        min_samples_leaf=8)`` is created for this instance.
    """

    # Constructor
    def __init__(self, X, y, model=None):
        # Build the default estimator per instance. An estimator created in the
        # signature is evaluated once and then shared (and re-fitted) by every
        # instance that relies on the default.
        if model is None:
            model = DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_leaf=8)
        self.X = X
        self.y = y
        self.model = model

    # Method to perform the train test split
    def _train_test_split(self):
        """Return ``X_train, X_test, y_train, y_test`` for a seeded 70/30 split."""
        return train_test_split(self.X, self.y, test_size=0.3, random_state=42)

    # Method to set the pipeline
    def _pipeline(self):
        """Return a pipeline that standardises the features and then applies ``self.model``."""
        steps = [('scaler', StandardScaler()),
                 ('model_name', self.model)]
        return Pipeline(steps)

    # Method to run all steps
    def model_build(self):
        """Split the data, fit the pipeline on the training part and print test accuracy.

        Returns ``None``; the accuracy is reported via ``print``.
        """
        # No `__name__` check here: inside a method it only made the call a
        # silent no-op whenever the class was imported from another module.
        X_train, X_test, y_train, y_test = self._train_test_split()
        pipeline = self._pipeline()
        pipeline.fit(X_train, y_train)
        print("Accuracy: {}".format(pipeline.score(X_test, y_test)))

In [None]:
ModelBuild(X, y).model_build()

In [None]:
class FeatureSelection(ModelBuild):
    """Recursive feature elimination (RFE) experiments built on ``ModelBuild``.

    Parameters
    ----------
    X : feature table; its ``columns`` are used to size and label the experiments.
    y : target labels aligned with ``X``.
    model : estimator RFE uses to rank features. When omitted, a new
        ``RandomForestClassifier()`` is created for this instance.
    """

    # Inherit the ModelBuild features
    def __init__(self, X, y, model=None):
        # Create the default per instance rather than sharing one estimator
        # object through the signature.
        if model is None:
            model = RandomForestClassifier()
        # Hand the caller's model to the parent, which stores X, y and model.
        super().__init__(X, y, model=model)

    # Method to evaluate list of models
    def rfe_model(self):
        """Return ``{str(n): pipeline}`` for n = 2 .. (number of columns - 1).

        Each pipeline selects ``n`` features with RFE (ranked by ``self.model``)
        and then fits a fresh ``DecisionTreeClassifier`` on them.
        """
        model_dict = {}
        for n_features in range(2, len(self.X.columns)):
            rfe = RFE(estimator=self.model, n_features_to_select=n_features)
            model_dict[str(n_features)] = Pipeline(
                steps=[('rfe', rfe), ('mod', DecisionTreeClassifier())]
            )
        return model_dict

    # Method to evaluate the models
    def eval_model(self, model):
        """Return the accuracy scores of ``model`` from 10-fold CV repeated 3 times."""
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=6)
        scores = cross_val_score(model, self.X, self.y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
        return scores

    # Lets understand the features being selected
    def feature_select(self, n_feature):
        """Fit RFE for ``n_feature`` features and print each column's selection flag and rank."""
        rfe = RFE(estimator=self.model, n_features_to_select=n_feature)
        rfe.fit(self.X, self.y)
        # Use this instance's columns; reading a module-level `X` would tie the
        # report to whatever global happens to exist.
        for col, selected, rank in zip(self.X.columns, rfe.support_, rfe.ranking_):
            print('Column: %s, Selected %s, Rank: %.3f' % (col, selected, rank))

    # Method to run all steps
    def feature_selection(self):
        """Cross-validate every RFE pipeline, print its mean/std accuracy and box-plot the scores.

        Returns the value of ``plt.boxplot`` for all evaluated pipelines, or
        ``None`` when there was nothing to evaluate.
        """
        results, names = [], []
        for name, model in self.rfe_model().items():
            scores = self.eval_model(model)
            results.append(scores)
            names.append(name)
            print(f'{name}, mean_score: {np.mean(scores)}, std_score: {np.std(scores)}')
        if not results:
            # Too few columns to build any pipeline: nothing to plot
            return None
        # Draw once, after all scores are collected; plotting inside the loop
        # re-drew the growing set of boxes on the same axes every iteration.
        return plt.boxplot(results, labels=names, showmeans=True)

In [None]:
box = FeatureSelection(X, y, model=DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_leaf=8)).feature_selection()
plt.show()

In [None]:
features = FeatureSelection(X, y, model=DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_leaf=8)).feature_select(5)

In [None]:
# Lets try a Light GBM
from lightgbm import LGBMClassifier

In [None]:
# Split into training and test set: 40% held out, stratified on y, fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=2, stratify=y)

# Instantiate the LGBM with its default parameters
lgbm = LGBMClassifier()

# Fit the classifier to the training data
lgbm.fit(X_train, y_train)

# Perform prediction on the held-out test set
y_pred = lgbm.predict(X_test)

# Print the accuracy
print(lgbm.score(X_test, y_test))

# Per-class precision/recall/F1 and the confusion matrix for the test predictions
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

In [None]:
# Lets understand the baseline params
lgbm.get_params()

In [None]:
# Setup the pipeline: standardise the features, then fit a LightGBM classifier
steps = [('scaler', StandardScaler()),
         ('lgbm', LGBMClassifier())]

pipeline = Pipeline(steps)

# Specify the hyperparameter space
# Keys are written as '<step name>__<parameter>' so they target the 'lgbm' step
parameters = {
    'lgbm__learning_rate':[0.03, 0.05, 0.1],
    'lgbm__objective':['binary'],
    'lgbm__metric':['binary_logloss'],
    'lgbm__max_depth':[10],
    'lgbm__n_estimators':[100, 200, 300]
}

# Create train and test sets (30% held out, fixed seed)
# NOTE(review): unlike the 60/40 splits used for the k-NN and baseline LightGBM
# models, this split does not pass stratify=y — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Instantiate the GridSearchCV object (3-fold cross-validation over `parameters`)
# NOTE: `cv` here names the search object itself, not a number of folds
cv = GridSearchCV(pipeline, parameters, cv=3)

# Fit to the training set
cv.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = cv.predict(X_test)

In [None]:
# Display best score and params
print(f'Best score : {cv.best_score_}')
print(f'Best params : {cv.best_params_}')

# Compute and print metrics
print("Accuracy: {}".format(cv.score(X_test, y_test)))
print(classification_report(y_test, y_pred))