In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import tensorflow as tf 
import pathlib
from sklearn.metrics import confusion_matrix ,accuracy_score

## 1- loading the dataset

In [None]:
# Location of the water-potability CSV, relative to the notebook's working directory.
data_apth = "../input/water-potability/water_potability.csv"

# Read the whole table into a DataFrame; the following cells refer to it as `data`.
data = pd.read_csv(filepath_or_buffer=data_apth)

## 2- summarizing and cleaning the data 

In [None]:
# Preview the first five rows to see the columns and typical values.
data.head(n=5)

In [None]:
# Report how many rows (examples) the table holds.
print("The number of examples: {}".format(data.shape[0]))

In [None]:
# Number of missing values in each column of the raw table.
data.isna().sum()

In [None]:
# `DataFrame.info()` writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()

In [None]:
# Inspect the target distribution to see whether the classes are balanced.
# Accuracy and a confusion matrix will be used to evaluate the results.
print(data['Potability'].value_counts())
print("\n")

# Derive the class shares from the data itself instead of hard-coding the
# counts, so the message stays correct if the dataset changes.
# Assumes label 0 encodes "not potable" and label 1 "potable" -- TODO confirm
# against the dataset description.
not_potable_pct = (data['Potability'] == 0).mean() * 100
potable_pct = (data['Potability'] == 1).mean() * 100
print("{:.2F}% of the data is from the Not potable class and {:.2F}% is from the potable class".format(not_potable_pct,
                                                                                                       potable_pct))
data['Potability'].plot(kind='hist')

In [None]:
# Store the target as a pandas categorical column (unordered, categories
# inferred from the values), then re-print the frame summary to confirm the
# dtype change.
data['Potability'] = data['Potability'].astype(pd.CategoricalDtype())
data.info()

In [None]:
# There are no duplicated rows: the count of rows that repeat an earlier row
# (first occurrence kept) is expected to be 0.
data.duplicated(keep='first').sum()

## Handling missing values

In [None]:
# Impute every missing numeric value with the median of its column.
# `numeric_only=True` is needed because 'Potability' was cast to a categorical
# column earlier: a median is undefined for that dtype, and recent pandas
# versions raise a TypeError instead of silently skipping the column.
# The result is re-bound rather than filled in place, so the statement reads
# as a plain transformation of `data`.
data = data.fillna(value=data.median(numeric_only=True))

In [None]:
# Re-count missing values per column to verify the imputation.
data.isna().sum()

## 3- Split the dataset 

In [None]:
# Separate the predictor columns from the label column.
examples = data.drop(columns=['Potability'])
target = data.loc[:, 'Potability']

In [None]:
# Hold out 20% of the rows for testing, then 20% of the remainder for
# validation (roughly 64% / 16% / 20% overall).
# - `random_state` pins the shuffle so Restart & Run All reproduces the same
#   split and therefore comparable metrics.
# - `stratify` keeps the class proportions of the target the same in every
#   split, which matters because the two classes are not equally frequent.
x_train_full, x_test, y_train_full, y_test = train_test_split(
    examples,
    target,
    test_size=0.2,
    shuffle=True,
    stratify=target,
    random_state=42,
)

x_train, x_val, y_train, y_val = train_test_split(
    x_train_full,
    y_train_full,
    test_size=0.2,
    shuffle=True,
    stratify=y_train_full,
    random_state=42,
)

print("Training data shape: {}".format(x_train.shape))
print("Validation data shape: {}".format(x_val.shape))
print("Testing data shape: {}".format(x_test.shape))

## 4-  Input pipeline

In [None]:
def preprocessing(data, label):
    """Standardize ``data`` along its first axis and pass ``label`` through.

    The mean and standard deviation are reduced over axis 0 of whatever tensor
    is received, so the result is a per-feature standardization only when
    axis 0 indexes the samples. No guard is applied for a zero standard
    deviation.

    Returns:
        A ``(standardized_data, label)`` tuple; ``label`` is returned unchanged.
    """
    centered = data - tf.reduce_mean(data, axis=0)
    scale = tf.math.reduce_std(data, axis=0)

    return centered / scale, label

In [None]:
def get_batches(x, y, buffer_size, batch_size, shuffle=False):
    """Build a batched ``tf.data`` pipeline of standardized features and labels.

    The features are standardized once, over the whole of ``x``, by calling
    ``preprocessing`` on the full array. ``preprocessing`` reduces along
    axis 0; mapping it over the unbatched dataset handed it a single row at a
    time, so every sample was normalized across its own features instead of
    every feature being normalized across the samples.

    Note: the statistics come from the ``x`` passed in, so each split is
    scaled with its own mean and standard deviation.

    Args:
        x: 2-D array-like of numeric features, one row per example.
        y: Labels aligned with the rows of ``x``.
        buffer_size: Shuffle buffer size; only used when ``shuffle`` is true.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle the examples before batching.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` batches.
    """
    # Whole-array tensor: axis 0 is the sample axis, axis 1 the feature axis.
    features = tf.convert_to_tensor(np.asarray(x, dtype=np.float32))
    features, labels = preprocessing(features, y)

    # Build the pipeline from data that already lives in memory.
    data_ds = tf.data.Dataset.from_tensor_slices((features, labels))

    if shuffle:
        data_ds = data_ds.shuffle(buffer_size)

    # Batching and prefetching are identical for both modes.
    return data_ds.batch(batch_size).prefetch(1)

In [None]:
# Only the training pipeline is shuffled; validation and test keep row order.
training_batches = get_batches(x_train, y_train,
                               buffer_size=1024, batch_size=64, shuffle=True)
validation_batches = get_batches(x_val, y_val,
                                 buffer_size=256, batch_size=64, shuffle=False)
testing_batches = get_batches(x_test, y_test,
                              buffer_size=256, batch_size=64, shuffle=False)

## 5- Creating and training a model

In [None]:
# learning rate scheduling
def exponential_decay(lr_0, s):
    """Return a schedule that divides the learning rate by 10 every ``s`` epochs.

    Args:
        lr_0: Learning rate at epoch 0.
        s: Number of epochs over which the rate shrinks by a factor of ten.

    Returns:
        A function mapping an epoch index to ``lr_0 * 0.1 ** (epoch / s)``.
    """
    def schedule(epoch):
        decay_factor = 0.1 ** (epoch / s)
        return lr_0 * decay_factor

    return schedule

# Start at a learning rate of 0.01 and shrink it tenfold every 20 epochs.
exponential_decay_func = exponential_decay(lr_0=0.01, s=20)

# Keras callback that applies the schedule during training.
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(
    schedule=exponential_decay_func
)

In [None]:
def leaky_relu(x):
    """Leaky ReLU with slope 0.01: the element-wise maximum of ``x`` and ``0.01 * x``."""
    scaled = 0.01 * x
    return tf.maximum(scaled, x)

In [None]:
# Functional-API classifier over the 9 input features: two hidden stages
# (256 then 128 units), each made of an L2-regularized, He-initialized Dense
# layer, the custom leaky ReLU and batch normalization, followed by a single
# sigmoid output unit.
model_input = tf.keras.Input(shape=(9,))

x = model_input
for hidden_units in (256, 128):
    x = tf.keras.layers.Dense(
        hidden_units,
        kernel_regularizer=tf.keras.regularizers.L2(),
        kernel_initializer='he_normal',
    )(x)
    x = tf.keras.layers.Lambda(leaky_relu)(x)
    x = tf.keras.layers.BatchNormalization()(x)

model_output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.models.Model(inputs=model_input, outputs=model_output)

In [None]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# reported under the name 'acc'.
model.compile(optimizer=tf.keras.optimizers.RMSprop(),
              loss='binary_crossentropy',
              metrics=['acc'])

# Train for 500 epochs with the exponential learning-rate schedule, checking
# the validation batches after every epoch.
model.fit(x=training_batches,
          validation_data=validation_batches,
          epochs=500,
          callbacks=[lr_scheduler])

In [None]:
# Loss and accuracy on the held-out test batches.
model.evaluate(x=testing_batches)

## Binning Data

In [None]:
def from_continuous_to_category(datafrmae, num_bins, reference=None):
    """One-hot encode every column of a numeric frame after equal-width binning.

    For each column, ``num_bins`` equally spaced edges are placed between the
    column's minimum and maximum, each value is assigned the index of the edge
    interval it falls in, and that index is expanded into ``num_bins``
    indicator columns.

    Every feature always contributes exactly ``num_bins`` output columns, even
    when some bin is empty, so frames encoded from different splits have the
    same width. (Fitting a fresh encoder per call produced fewer columns
    whenever a bin happened to be empty in one split.)

    Args:
        datafrmae: DataFrame of numeric columns to encode.
        num_bins: Number of bin edges, and of indicator columns, per feature.
        reference: Optional DataFrame with the same columns whose min/max
            define the bin edges (for example the training split). Values of
            ``datafrmae`` outside that range fall into the first or last bin.
            Defaults to ``datafrmae`` itself.

    Returns:
        DataFrame with a fresh 0..n-1 index and ``num_bins`` float indicator
        columns per input column, named ``"<column>_bin<k>"`` for k = 1..num_bins.

    Raises:
        ValueError: If ``num_bins`` is smaller than 1.
    """
    if num_bins < 1:
        raise ValueError("num_bins must be at least 1")
    if reference is None:
        reference = datafrmae

    identity = np.eye(num_bins)
    features_list = []

    for column in datafrmae.columns:
        edges = np.linspace(reference[column].min(), reference[column].max(), num_bins)

        # np.digitize yields 1..num_bins for values inside [min, max] (the
        # maximum itself lands in bin num_bins) and 0 for values below the
        # first edge; clipping folds out-of-range values into the end bins.
        which_bin = np.clip(np.digitize(datafrmae[column], bins=edges), 1, num_bins)

        # Row k-1 of the identity matrix is the indicator vector for bin k.
        one_hot = identity[which_bin - 1]

        names = ["{}_bin{}".format(column, k) for k in range(1, num_bins + 1)]
        features_list.append(pd.DataFrame(one_hot, columns=names))

    if not features_list:
        # pd.concat refuses an empty list; a frame without columns encodes to
        # a frame without columns.
        return pd.DataFrame(index=range(len(datafrmae)))

    return pd.concat(features_list, axis=1)

In [None]:
# Bin and one-hot encode each split separately, using 5 bins per feature.
x_train_binned, x_val_binned, x_test_binned = (
    from_continuous_to_category(split, 5)
    for split in (x_train, x_val, x_test)
)

In [None]:
# Fully connected classifier over the one-hot binned features: two
# L2-regularized ReLU layers with batch normalization, then a sigmoid unit.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu',
                          input_shape=(x_train_binned.shape[1],),
                          kernel_regularizer=tf.keras.regularizers.L2()),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(128, activation='relu',
                          kernel_regularizer=tf.keras.regularizers.L2()),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

# Train for 10 epochs, validating on the binned validation split.
model.fit(x=x_train_binned, y=y_train,
          validation_data=(x_val_binned, y_val),
          epochs=10)

In [None]:
# Loss and accuracy of the binned-feature model on the test split.
model.evaluate(x=x_test_binned, y=y_test)

## sklearn scaling, PolynomialFeatures

In [None]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

In [None]:
# Expand the features with polynomial terms (default settings), then
# standardize them.
pipe = Pipeline(steps=[
    ('poly', PolynomialFeatures()),
    ('scaling', StandardScaler()),
])

# Fit on the training split only, then apply the fitted transform to every split.
pipe.fit(x_train)

x_train_ready, x_val_ready, x_test_ready = (
    pipe.transform(split) for split in (x_train, x_val, x_test)
)

In [None]:
class stop_training(tf.keras.callbacks.Callback):
    """Stop training once validation accuracy exceeds a threshold.

    Args:
        threshold: Value that the ``'val_acc'`` entry of the epoch logs must
            exceed (strictly) for training to be cancelled. Defaults to 0.69.
    """

    def __init__(self, threshold=0.69):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check ``val_acc`` at the end of an epoch and request a stop if it is high enough."""
        # `logs` defaults to None rather than a shared mutable dict. 'val_acc'
        # is missing when training runs without validation data; comparing
        # None with a float would raise a TypeError, so skip the check then.
        val_acc = (logs or {}).get('val_acc')
        if val_acc is not None and val_acc > self.threshold:
            print("\nReached {:.0%} accuracy so canceling training!".format(self.threshold))
            self.model.stop_training = True

# Custom callback that cancels training once validation accuracy is high enough.
my_callbacks = stop_training()

# Early stopping watches the training loss (`monitor='loss'`), waits 5 epochs
# without improvement, and restores the best weights seen when it stops.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Classifier for the polynomial + scaled features: two L2-regularized ReLU
# layers (512 and 256 units) with batch normalization, then a sigmoid unit.
# No input shape is declared, so the model is built on first use.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(512, activation='relu',
                          kernel_regularizer=tf.keras.regularizers.L2()),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(256, activation='relu',
                          kernel_regularizer=tf.keras.regularizers.L2()),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Same training setup as the earlier models: RMSprop, binary cross-entropy,
# accuracy reported as 'acc'.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

In [None]:
# Train for at most 5 epochs; either callback may end training earlier.
model.fit(x=x_train_ready, y=y_train,
          validation_data=(x_val_ready, y_val),
          epochs=5,
          callbacks=[early_stopping, my_callbacks])

In [None]:
# Loss and accuracy of the polynomial-feature model on the test split.
model.evaluate(x=x_test_ready, y=y_test)