In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# WATER POTABILITY PREDICTION (0/1) 
# Results accuracy: 0.9977

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore")

In [None]:
# Read the raw water-potability table from the attached Kaggle dataset.
csv_path = "/kaggle/input/water-potability/water_potability.csv"
water_df = pd.read_csv(csv_path)

# Show the first rows as a quick sanity check of the load.
water_df.head()

In [None]:
# Summary statistics of the table as loaded, i.e. before rows with missing values are dropped.
water_df.describe()

In [None]:
# Keep only complete rows: every row containing at least one missing value is discarded.
water_df = water_df.dropna()

In [None]:
# Summary statistics again, now that rows with missing values have been removed.
water_df.describe()

# CORRELATION MATRIX
The matrix shows no single dominant factor that decides whether water is potable or not.

In [None]:
# Compute the correlation matrix once and reuse it for both the mask and the plot.
corr = water_df.corr()

# Boolean mask covering the upper triangle (diagonal included) so every pair of
# columns is drawn only once. It is built from an all-True array rather than
# from the correlation values, so a coefficient of exactly 0 is still hidden.
matrix = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, linewidth=.8, mask=matrix, cmap="viridis");

**Splitting the data into train/test sets before converting it to CSV for export to AutoML**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned rows for testing; the fixed random_state keeps
# the split reproducible between runs.
training_data, testing_data = train_test_split(water_df, test_size=0.2, random_state=25)

print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

# Export the splits (and the full cleaned table) as CSV for H2O.
# index=False keeps the pandas row index out of the files. Without it the index
# is written as an extra unnamed first column, which would then be read back as
# a feature (a row identifier is not a water-quality measurement) and would
# shift any positional column selection done on the re-imported data.
training_data.to_csv("train_df.csv", index=False)
testing_data.to_csv("test_df.csv", index=False)
water_df.to_csv("alldataclean.csv", index=False)

In [None]:
#manual splitting

#water_df[0:1700].to_csv("train_df.csv")
#water_df[1700::].to_csv("test_df.csv")

# AUTOML FOR MODELING WITH BINARY CLASSIFICATION 
(POTABLE OR NOT POTABLE PREDICTION)

In [None]:
import h2o
from h2o.automl import H2OAutoML

# Initialise the H2O backend for this session, requesting at most 4 GB of memory.
h2o.init(max_mem_size='4G')

In [None]:
# Load the train/test CSV splits written earlier in this notebook as H2O frames.
train, test = (h2o.import_file(path) for path in ("./train_df.csv", "./test_df.csv"))

In [None]:
# Display the imported training frame to check that the CSV was parsed as expected.
train

In [None]:
# Summarise the training frame.
train.describe()

In [None]:
# Summarise the test frame, for comparison with the training frame.
test.describe()

In [None]:
%%time

# Identify predictors and response
x = train.columns
y = "Potability"
x.remove(y)

# For binary classification, response should be a factor
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

# Run AutoML for 20 base models (limited to 1 hour max runtime by default)
aml = H2OAutoML(max_models=20, seed=1, max_runtime_secs=7200)
aml.train(x=x, y=y, training_frame=train)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

In [None]:
# Rebuild the leaderboard for the finished AutoML run, requesting every
# optional column (`extra_columns` = 'ALL'), and display it.
lb = h2o.automl.get_leaderboard(aml, extra_columns = 'ALL')
lb

In [None]:
# To generate predictions on a test set, you can make predictions
# directly on the `"H2OAutoML"` object or on the leader model
# object directly.
# The predictions for the held-out test frame are stored in `preds`;
# no evaluation metric is computed in this cell.
preds = aml.predict(test)

In [None]:
# Display the leader model of the AutoML run.
aml.leader

In [None]:
import h2o
h2o.init()
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Import the full cleaned water-potability table exported earlier in this notebook
data = h2o.import_file("./alldataclean.csv")

# Set the predictor names and the response column name.
# Predictors are selected by name rather than by position: a positional slice
# silently picks the wrong columns whenever the file carries an extra leading
# column (e.g. an exported pandas row index), pulling in that column and
# dropping the last real feature.
response = "Potability"
predictors = [name for name in data.names if name != response and name in water_df.columns]

# Convert the response column to a factor
data[response] = data[response].asfactor()

# Train a GBM model setting nfolds to 5
data_gbm = H2OGradientBoostingEstimator(nfolds = 5, seed = 1)
data_gbm.train(x=predictors, y=response, training_frame=data)

# AUC of cross-validated holdout predictions
data_gbm.auc(xval=True)