In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the water-potability CSV inside the Kaggle input directory.
DATA_PATH = '../input/water-potability/water_potability.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summary statistics for every numeric column of the loaded frame.
df.describe()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
import tensorflow as tf
import pandas_profiling as profiling
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline


In [None]:
# Separate the target column from the feature columns.
y = df["Potability"]
X = df.drop(columns=["Potability"])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report (rows, columns) of the full feature frame, before the train/validation split.

print("Shape of X ", X.shape)

In [None]:
# Report the shape of the target column, before the train/validation split.
print("Shape of y", y.shape)

In [None]:
# Count missing entries per feature column and show only the columns that have any.
m_value = X.isna().sum()
print(m_value.loc[m_value.gt(0)])

In [None]:
# `y` is a single column, so `isnull().sum()` is one number rather than a
# per-column Series. Report it directly: boolean-masking the scalar only ever
# printed `[]` (no missing values) or `[n]`.
m_y_value = y.isnull().sum()
print("Missing values in y:", m_y_value)

In [None]:
# Pairwise correlations between the training features, drawn on an explicit 12x10 axes.
corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(X_train.corr(), annot=True, cmap='YlGnBu', ax=corr_ax)

In [None]:
# Class balance of the target column.
sns.countplot(data=df, x='Potability')

In [None]:
# Pairwise scatter plots of four features, coloured by the target value.
scatter_dimensions = ['ph', "Hardness", "Solids", "Turbidity"]
fig = px.scatter_matrix(df, dimensions=scatter_dimensions, color="Potability")
fig.show()

In [None]:
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a 10-tree random-forest regressor and return its validation MAE.

    Args:
        X_train: training features passed to ``fit``.
        X_valid: validation features passed to ``predict``.
        y_train: training targets.
        y_valid: validation targets compared against the predictions.

    Returns:
        The mean absolute error between ``y_valid`` and the predictions.
    """
    forest = RandomForestRegressor(n_estimators=10, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))


In [None]:
from sklearn.impute import SimpleImputer

# Fill missing values with SimpleImputer's default strategy. The imputer is
# fitted on the training rows only and then re-used for the validation rows.
my_impute = SimpleImputer()
imputed_X_train = pd.DataFrame(my_impute.fit_transform(X_train), columns=X_train.columns)
imputed_X_valid = pd.DataFrame(my_impute.transform(X_valid), columns=X_valid.columns)

print("Mean absolute error for this model is: ")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

In [None]:
# Profiling report for the imputed training features.
profiling.ProfileReport(imputed_X_train)




In [None]:
# Profiling report for the imputed validation features.
profiling.ProfileReport(imputed_X_valid)

In [None]:
# # comparing different models

# def score_model(model, X_t = imputed_X_train, X_v=imputed_X_valid, y_t=y_train, y_v=y_valid):
#     model.fit(X_t, y_t)
#     preds=model.predict(X_v)
#     return mean_absolute_error(y_v, preds)

# for i in range(0, len(models)):
#     mae= score_model(models[i])
#     print("Model %d MAE: %d" % (i+1, mae))
    
# takes too long to even run for 200 n_estimators

In [None]:
# Alternative to imputation: discard every feature that has a missing value
# in the training split.
cols_with_missing = X_train.columns[X_train.isnull().any()].tolist()

# Remove the same columns from both the training and the validation features.
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)

print("Mean absolute error for this model is: ")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

In [None]:
# (rows, columns) left after dropping the columns with missing values.
print(reduced_X_train.shape)

In [None]:
# Profiling report for the column-reduced training features.
profiling.ProfileReport(reduced_X_train)

In [None]:
# Baseline classifier on the column-reduced features; `fit` returns the
# estimator itself, so prediction can be chained onto it.
clf = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=42)
y_preds = clf.fit(reduced_X_train, y_train).predict(reduced_X_valid)

rand_accur = accuracy_score(y_valid, y_preds)




In [None]:
from sklearn.metrics import confusion_matrix

# Actual-vs-predicted counts for the baseline classifier. The table gets its own
# name so the `confusion_matrix` function imported above is not overwritten.
confusion_table = pd.crosstab(y_valid, y_preds, rownames=["Actual"], colnames=["Predicted"])
# fmt="d" prints the counts as whole numbers instead of the default short
# float format (which switches to scientific notation for 3-digit counts).
sns.heatmap(confusion_table, annot=True, fmt="d")

print(f"Accuracy score is {rand_accur}")

In [None]:
# Rank the features by the importance the fitted forest assigns to them.
importance_by_feature = {
    "feature": list(reduced_X_train.columns),
    "Importance": clf.feature_importances_,
}
feature_importance_df = pd.DataFrame(importance_by_feature).sort_values(
    "Importance", ascending=False
)

feature_importance_df

In [None]:
# Bar chart of feature importance: features on the x axis, importance on the y axis.
sns.barplot(x=feature_importance_df.feature, y=feature_importance_df.Importance)
# The axis labels follow the data actually plotted on each axis
# (they were previously swapped).
plt.xlabel("Features")
plt.ylabel("Feature Importance Score")
plt.title("Visualizing Important Features")
# Rotate the feature names so they do not overlap.
plt.xticks(rotation=45, horizontalalignment="right", fontweight="light", fontsize="x-large")

plt.show()

In [None]:
# Hand-picked subset of features used for the "selected features" experiments.
selected_feature_names = [
    "Hardness",
    "Solids",
    "Chloramines",
    "Organic_carbon",
    "Turbidity",
    "Conductivity",
]
new_x_train = reduced_X_train[selected_feature_names]



In [None]:
# Accuracy score after selecting the best features.

# The validation frame must carry exactly the columns, in the same order, that
# the model is fitted on. Predicting on `reduced_X_valid` directly feeds the
# model differently arranged columns than it was trained with.
selected_x_valid = reduced_X_valid[list(new_x_train.columns)]

clf = RandomForestClassifier(max_depth=10, n_estimators=300, random_state=42)
clf.fit(new_x_train, y_train)
yy_preds = clf.predict(selected_x_valid)

random_accur = accuracy_score(y_valid, yy_preds)

# Named `selected_confusion_table` so the sklearn `confusion_matrix` function
# is not shadowed by a DataFrame.
selected_confusion_table = pd.crosstab(y_valid, yy_preds, rownames=["Actual"], colnames=["Predicted"])
sns.heatmap(selected_confusion_table, annot=True, fmt="d")

print(f"Accuracy score is {random_accur}")

In [None]:
# Mean Absolute Error for the best features selection.

# Score against validation features restricted and ordered like the training
# features; `reduced_X_valid` itself does not match `new_x_train`'s columns.
print("Mean absolute error for this model is: ")
print(score_dataset(new_x_train, reduced_X_valid[list(new_x_train.columns)], y_train, y_valid))

In [None]:
# Base estimator and candidate values for the randomized hyper-parameter search.
rfc = RandomForestClassifier()

# Number of trees: 12 evenly spaced values between 100 and 1000.
n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=12)]

criterion = ['gini', 'entropy']

# For classifiers 'auto' is only an alias of 'sqrt' (and newer scikit-learn
# releases reject it), so offer two genuinely different choices instead.
max_features = ['sqrt', 'log2']

# Maximum tree depth: 6 evenly spaced values between 5 and 30.
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]

# Was `[2,5,10.15,100]`: the `10.15` was a typo for the two values 10 and 15.
min_samples_split = [2, 5, 10, 15, 100]
# Original (misspelt) name kept as an alias for anything that still refers to it.
min_simples_split = min_samples_split

min_samples_leaf = [1, 2, 3, 10]


In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Parameter name -> candidate values, in the form RandomizedSearchCV expects.
random_grid = dict(
    n_estimators=n_estimators,
    criterion=criterion,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
)
print(random_grid)

In [None]:
# Randomized search over `random_grid`: 10 sampled settings, 5-fold CV each.
# The estimator is a classifier, so candidates are scored with accuracy rather
# than a regression metric (mean squared error).
rfc_random = RandomizedSearchCV(estimator=rfc,
                                param_distributions=random_grid,
                                scoring='accuracy',
                                n_iter=10,
                                cv=5,
                                verbose=2,
                                random_state=42,
                                n_jobs=2)

In [None]:
from sklearn.metrics import accuracy_score

# NOTE(review): this hand-configured forest is never fitted or scored in this
# cell. `rfc_random` was constructed from the `rfc` that existed when the search
# object was created, so the values below do not influence the result printed
# here. `max_features='sqrt'` replaces the deprecated 'auto' alias.
rfc = RandomForestClassifier(n_estimators=590, criterion='gini', max_features='sqrt', max_depth=10, min_samples_leaf=10)

# Run the randomized search and predict the validation rows with its result.
rfc_random.fit(reduced_X_train, y_train)
y_prediction = rfc_random.predict(reduced_X_valid)
# scikit-learn metrics take (y_true, y_pred) in that order.
my_accuracy = accuracy_score(y_valid, y_prediction)
print("Accuracy score is", my_accuracy)


In [None]:
from sklearn.metrics import accuracy_score

# Forest with hand-set hyper-parameters. `max_features='sqrt'` replaces the
# deprecated 'auto' alias (same behaviour for classifiers), and a fixed seed
# makes the reported accuracy repeatable, as for the earlier classifiers.
clf = RandomForestClassifier(n_estimators=918, max_depth=25, max_features='sqrt',
                             criterion='entropy', random_state=42)
clf.fit(reduced_X_train, y_train)
y_pred = clf.predict(reduced_X_valid)

# scikit-learn metrics take (y_true, y_pred) in that order.
accur = accuracy_score(y_valid, y_pred)

print("Accuracy score is ", accur)

In [None]:
# Store the current `clf`'s predictions for the column-reduced validation features.
prediction = clf.predict(reduced_X_valid)

In [None]:
# Display the predicted values.
prediction

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# Per-class metrics for `y_prediction`, the output of `rfc_random.predict`.
print(classification_report(y_valid, y_prediction))

In [None]:
# Per-class metrics for `y_pred`, the output of the hand-configured `clf`.
print(classification_report(y_valid, y_pred))

In [None]:
# Gradient-boosted regressor (XGBoost) on the column-reduced features.
# Training stops early once the score on the evaluation pairs has not improved
# for 5 rounds.
validation_pairs = [(reduced_X_valid, y_valid)]

my_model = XGBRegressor(learning_rate=0.01, n_estimators=500)
my_model.fit(
    reduced_X_train,
    y_train,
    eval_set=validation_pairs,
    early_stopping_rounds=5,
    verbose=False,
)


In [None]:
# Validation MAE of the boosted model. The metric is symmetric in its two
# arguments, so the conventional (y_true, y_pred) order gives the same value.
xgpredictions = my_model.predict(reduced_X_valid)
mae_2 = mean_absolute_error(y_valid, xgpredictions)

print("Mean Absolute Error", mae_2)

In [None]:
# MAE of XGBoost restricted to the hand-picked features.
# `my_model` was fitted on `reduced_X_train`; handing it a re-ordered column
# subset does not match the features it was trained with. A dedicated model is
# therefore fitted on exactly the selected columns before predicting.
xgb_selected_features = ["Hardness", "Solids", "Chloramines", "Organic_carbon", "Turbidity", "Conductivity"]
new_x_train_selected = reduced_X_train[xgb_selected_features]
new_x_valid = reduced_X_valid[xgb_selected_features]

xgb_selected = XGBRegressor(n_estimators=500, learning_rate=0.01)
xgb_selected.fit(new_x_train_selected, y_train,
                 early_stopping_rounds=5,
                 eval_set=[(new_x_valid, y_valid)],
                 verbose=False)

xgpredictions = xgb_selected.predict(new_x_valid)
# scikit-learn metrics take (y_true, y_pred) in that order.
mae3 = mean_absolute_error(y_valid, xgpredictions)
print("Mean Absolute Error", mae3)

In [None]:
# Predictions of `my_model` for the column-reduced validation features;
# these are the values written to the CSV file at the end of the notebook.
pred_test =  my_model.predict(reduced_X_valid)

pred_test

In [None]:
# `my_model` was fitted on `reduced_X_train`, so the frame passed to `predict`
# must present that frame's columns in that frame's order. `new_x_valid` lists
# them in a different order, hence the re-indexing.
# NOTE(review): this assumes `new_x_valid` contains every column of
# `reduced_X_train` — confirm against the data.
pred_test_2 = my_model.predict(new_x_valid[list(reduced_X_train.columns)])

pred_test_2

In [None]:
# Second experiment: a manual 80/20 split of the complete frame (label included).
training_data = df.sample(frac=0.8, random_state=25)
# Every row that was not sampled for training becomes a test row.
testing_data = df.drop(index=training_data.index)

print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

In [None]:
# Columns of the training split that contain missing values, with their counts.
m_value = training_data.isna().sum()
print(m_value.loc[m_value.gt(0)])

In [None]:
# Columns of the test split that contain missing values, with their counts.
m_value = testing_data.isna().sum()
print(m_value.loc[m_value.gt(0)])

In [None]:
# Z-score every column using statistics learned from the training split only.
# NOTE(review): the label column ("Potability") is part of these frames and is
# therefore z-scored as well — confirm this is intended.
train_df_mean = training_data.mean()
train_df_std = training_data.std()
train_df_norm = (training_data - train_df_mean) / train_df_std

# Test-split statistics are kept for inspection only.
test_df_mean = testing_data.mean()
test_df_std = testing_data.std()
# The test split is scaled with the TRAINING mean/std so both splits share one
# scale; scaling it with its own statistics would put the two on different
# scales and use test-set information.
test_df_norm = (testing_data - train_df_mean) / train_df_std

In [None]:
# Profiling report for the z-scored training split.
profiling.ProfileReport(train_df_norm)

In [None]:
# Drop the columns that contain missing values.

cols_with_missing_train = [col for col in train_df_norm.columns if train_df_norm[col].isnull().any()]
cols_with_missing_t = [col for col in test_df_norm.columns if test_df_norm[col].isnull().any()]

# Remove the union of both lists from BOTH splits. Dropping each split's own
# list independently can leave the two frames with different columns whenever
# a column has gaps in only one of them.
cols_with_missing_any = sorted(set(cols_with_missing_train) | set(cols_with_missing_t))

reduced_train_df_norm = train_df_norm.drop(columns=cols_with_missing_any)
reduced_test_df_norm = test_df_norm.drop(columns=cols_with_missing_any)

In [None]:
# Sanity check: list any columns of the reduced training frame that still
# contain missing values (none are expected after the drop above).
m_value = reduced_train_df_norm.isna().sum()
print(m_value.loc[m_value.gt(0)])

In [None]:
# Profiling report for the z-scored, column-reduced training split.
profiling.ProfileReport(reduced_train_df_norm)

In [None]:
# Feature columns that make up the network's input layer.
feature_columns = []

# Bucket width for the two bucketized features. The columns were z-scored
# earlier, so 0.3 means 3/10 of a standard deviation.
resolution_in_Zs = 0.3

# Bucketized "Solids": boundaries run from the truncated minimum up to (but
# excluding) the truncated maximum of the training column.
solids_values = reduced_train_df_norm["Solids"]
solids_numeric = tf.feature_column.numeric_column("Solids")
solids_boundaries = list(
    np.arange(int(min(solids_values)), int(max(solids_values)), resolution_in_Zs)
)
solids = tf.feature_column.bucketized_column(solids_numeric, solids_boundaries)

# Bucketized "Hardness", built the same way.
hardness_values = reduced_train_df_norm["Hardness"]
hardness_numeric = tf.feature_column.numeric_column("Hardness")
hardness_boundaries = list(
    np.arange(int(min(hardness_values)), int(max(hardness_values)), resolution_in_Zs)
)
hardness = tf.feature_column.bucketized_column(hardness_numeric, hardness_boundaries)

# Cross of the Solids and Hardness buckets, hashed into 100 bins and wrapped
# in an indicator column.
solids_x_hardness = tf.feature_column.crossed_column([solids, hardness], hash_bucket_size=100)
cross_feature = tf.feature_column.indicator_column(solids_x_hardness)
feature_columns.append(cross_feature)

# "Conductivity" enters the model as a plain numeric feature.
conductivity = tf.feature_column.numeric_column("Conductivity")
feature_columns.append(conductivity)

# "Organic_carbon" enters the model as a plain numeric feature.
carbon = tf.feature_column.numeric_column("Organic_carbon")
feature_columns.append(carbon)

# Wrap the feature columns in a single layer that is later placed at the
# front of the model.
my_feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [None]:
# Helper for plotting the training loss.

def plot_the_loss_graph(epochs, mse):
    """Draw the loss curve: mean squared error against epoch.

    Args:
        epochs: x values (one entry per epoch).
        mse: y values; must provide ``min()`` and ``max()``, which set the
            y-axis limits to 95% of the minimum and 103% of the maximum.
    """
    lower_limit = mse.min() * 0.95
    upper_limit = mse.max() * 1.03

    plt.figure()
    plt.plot(epochs, mse, label="Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Mean Squared Error")
    plt.legend()
    plt.ylim([lower_limit, upper_limit])
    plt.show()

print("Defined the plot the curve function")

In [None]:
def create_model(my_learning_rate, feature_layer):
    """Create and compile a small feed-forward regression network.

    Args:
        my_learning_rate: learning rate handed to the Adam optimizer.
        feature_layer: Keras layer that turns the raw feature dict into the
            network input; it is added as the first layer of the model.

    Returns:
        A compiled ``tf.keras`` Sequential model:
        feature_layer -> Dense(20, relu) -> Dense(12, relu) -> Dense(1),
        with mean squared error as both loss and reported metric.
    """
    model = tf.keras.models.Sequential()

    # Use the layer passed in by the caller, not the module-level
    # `my_feature_layer`, so the function honours its own argument.
    model.add(feature_layer)

    # First hidden layer with 20 nodes.
    model.add(tf.keras.layers.Dense(units=20,
                                    activation='relu',
                                    name='Hidden1'))

    # Second hidden layer with 12 nodes.
    model.add(tf.keras.layers.Dense(units=12,
                                    activation='relu',
                                    name='Hidden2'))

    # Single-unit output layer with no activation.
    model.add(tf.keras.layers.Dense(units=1,
                                    name='Output'))

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=my_learning_rate),
                  loss="mean_squared_error",
                  metrics=[tf.keras.metrics.MeanSquaredError()])

    return model


def train_model(model, dataset, epochs, bacth_size, label_name):
    """Fit `model` on `dataset` and return the per-epoch mean squared error.

    Args:
        model: object with a Keras-style
            ``fit(x=, y=, batch_size=, epochs=, shuffle=)`` method.
        dataset: mapping of column name -> values (for example a DataFrame);
            must contain `label_name`.
        epochs: number of training epochs.
        bacth_size: mini-batch size. The misspelt parameter name is kept so
            that existing callers keep working.
        label_name: column used as the target; every other column is fed to
            the model as a feature.

    Returns:
        ``(epochs, mse)``: ``history.epoch`` and the "mean_squared_error"
        column of the training history.
    """
    # Split the dataset into a dict of feature arrays and the label array.
    features = {name: np.array(value) for name, value in dataset.items()}
    label = np.array(features.pop(label_name))

    # Pass the function's own argument. The body previously read `batch_size`,
    # which silently resolved to a module-level variable of that name.
    history = model.fit(x=features, y=label, batch_size=bacth_size,
                        epochs=epochs, shuffle=True)

    # Values needed for plotting the loss curve.
    mse = pd.DataFrame(history.history)["mean_squared_error"]

    return history.epoch, mse
# Confirmation message printed once the cell defining both functions has run.
print("Define the create_model and train_model functions")


In [None]:
# The following variables are the hyperparameters.

learning_rate = 0.01
epochs = 200
batch_size = 1000
label_name = "Potability"

# Establish the model's topology.

my_model = create_model(learning_rate, my_feature_layer)

# The returned epoch list gets its own name so that the `epochs` hyperparameter
# (an int) is not silently replaced by a list.
epoch_index, mse = train_model(my_model, reduced_train_df_norm, epochs, batch_size, label_name)
plot_the_loss_graph(epoch_index, mse)

# Split the test frame into a dict of feature arrays and the label array.
test_features = {name: np.array(value) for name, value in reduced_test_df_norm.items()}
test_label = np.array(test_features.pop(label_name))  # isolate the label
# The model has two hidden ReLU layers, so the message no longer calls it a
# linear regression.
print("\n Evaluate the neural network model against the test set: ")
my_model.evaluate(x=test_features, y=test_label, batch_size=batch_size)


In [None]:
# Same experiment as the previous run, with a smaller learning rate.

learning_rate = 0.003
epochs = 200
batch_size = 1000
label_name = "Potability"

# Establish the model's topology.

my_model = create_model(learning_rate, my_feature_layer)

# The returned epoch list gets its own name so that the `epochs` hyperparameter
# (an int) is not silently replaced by a list.
epoch_index, mse = train_model(my_model, reduced_train_df_norm, epochs, batch_size, label_name)
plot_the_loss_graph(epoch_index, mse)

# Split the test frame into a dict of feature arrays and the label array.
test_features = {name: np.array(value) for name, value in reduced_test_df_norm.items()}
test_label = np.array(test_features.pop(label_name))  # isolate the label
# The model has two hidden ReLU layers, so the message no longer calls it a
# linear regression.
print("\n Evaluate the neural network model against the test set: ")
my_model.evaluate(x=test_features, y=test_label, batch_size=batch_size)

In [None]:
# Write the stored `pred_test` predictions to CSV, keyed by the validation rows' index.
submission_columns = {"Id": reduced_X_valid.index, "Potability": pred_test}
output = pd.DataFrame(submission_columns)
output.to_csv('dong_potability.csv', index=False)