In [None]:
!pip install hyperopt
!pip install mlflow
!pip install pyspark
!pip install findspark

In [None]:
import mlflow
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics
import sklearn.model_selection
import sklearn.ensemble
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from sklearn.feature_selection import f_classif, chi2
import pyspark
from sklearn.model_selection import cross_val_score, cross_validate
from hyperopt import fmin, tpe, hp, SparkTrials, Trials, STATUS_OK
from hyperopt.pyll import scope
from scipy.stats import shapiro
import warnings

warnings.filterwarnings('ignore')

# Import Data

In [None]:
# Load the water-potability CSV into a DataFrame.
# The relative path appears to follow the Kaggle `../input/<dataset>/` layout — adjust it when running elsewhere.
data = pd.read_csv('../input/water-potability/water_potability.csv')

# First Exploration

In [None]:
# Print column dtypes and non-null counts; columns with fewer non-null entries than rows contain missing values.
data.info()

In [None]:
# One histogram per column. Let pandas create the subplot grid itself: handing it a
# single pre-made Axes for several columns makes it clear that figure and emit a warning.
# The trailing semicolon keeps the array-of-Axes repr out of the cell output.
data.hist(grid=True, figsize=(15, 10));

We can see that the data is imbalanced. Moreover, there are some missing values to deal with.

We shall replace these NaN values with the corresponding means.

# Replace NaN values with the mean of their class

In [None]:
# Fill every missing value in the columns below with the mean of that column
# among rows sharing the same Potability class (NaN entries are skipped when
# the mean is computed). One loop replaces three copy-pasted per-column blocks.
#
# NOTE(review): the imputation reads the target label and runs before any
# train/test split, so label information leaks into the features of rows that
# are later used for evaluation. Consider imputing from training rows only.
COLUMNS_TO_IMPUTE = ['ph', 'Sulfate', 'Trihalomethanes']

for column in COLUMNS_TO_IMPUTE:
  # Series aligned with `data`: each row carries the mean of its own class.
  class_means = data.groupby('Potability')[column].transform('mean')
  data[column] = data[column].fillna(class_means)

# Let's check correlation

In [None]:
# Pairwise correlation of all columns, rendered as a colour-graded table.
correlations = data.corr()
correlations.style.background_gradient(cmap='coolwarm')

There is no significant correlation between the features.

# Undersampling

In [None]:
# Randomly drop majority-class rows until minority/majority = 0.9.
# Features are every column but the last; the target is the last column.
# random_state is fixed (0, like the other seeded steps in this notebook) so that
# a rerun keeps the same rows and downstream scores are reproducible.
under = RandomUnderSampler(sampling_strategy=0.9, random_state=0)
X, y = under.fit_resample(data.iloc[:, :-1], data.iloc[:, -1])

# Let's train some models

## first we scale the data as the features are continuous

In [None]:
## Standardise the features (zero mean, unit variance per column)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Wrap the scaled array in a DataFrame, reusing the feature names
# (all columns of `data` except the last one).
feature_names = data.columns[:-1]
X_scaled_df = pd.DataFrame(X_scaled, columns=feature_names)

# Logistic Regression

## We first try to select some features to train a logistic regression

In [None]:
# Background reading on pitfalls of univariate feature selection:
# https://towardsdatascience.com/mistakes-in-applying-univariate-feature-selection-methods-34c43ce8b93d

# The features are continuous and the target is categorical, so an ANOVA F-test
# is used to check for linear dependencies between each feature and the target.
N_continuous = 5
N_categorical = 5  # not referenced anywhere in this cell

f_statistics, p_values = f_classif(X_scaled_df.values, y)

# Ascending p-value: the features most dependent on the target come first.
ranking = p_values.argsort()
top_positions = ranking[:N_continuous]
most_dependent_numerical_variables = list(X_scaled_df.columns[top_positions])

most_dependent_numerical_variables

### Logistic regression with the selected features and Mlflow

In [None]:
### Select the most important features

X_selected = X_scaled_df[most_dependent_numerical_variables]

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
  X_selected,
  y,
  test_size=0.2,
  random_state=0
)


with mlflow.start_run(run_name='logistic_regression') as run:
  # class_weight must be the None object to disable re-weighting. The string
  # 'None' is not an accepted value (only a dict, 'balanced' or None are) and
  # recent scikit-learn releases reject it when fit() is called.
  model = LogisticRegression(C=0.2, class_weight=None, penalty='l2',
                   solver='saga',random_state=0)

  # mlflow.autolog() has not been called before this cell, so unless the
  # environment enables autologging by default, only the metric logged
  # explicitly below is recorded for this run.
  model.fit(X_train, y_train)

  # Second column of predict_proba is used as the score for the positive class.
  predicted_probs = model.predict_proba(X_test)
  roc_auc = sklearn.metrics.roc_auc_score(y_test, predicted_probs[:,1])

  # The AUC score on test data is not automatically logged, so log it manually
  mlflow.log_metric("test_auc", roc_auc)
  print("Test AUC of: {}".format(roc_auc))

## Train LR model with MLflow (all features)

In [None]:
# Split all features into train/test (80/20, fixed seed).
# The standardised frame is used rather than the raw X: the solvers tried below
# (sag/saga in particular) only converge quickly when features share a similar
# scale, and tree-based models fitted on this split are unaffected by scaling.
# The selected rows are the same as for raw X because the split is positional
# and uses the same seed.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
  X_scaled_df,
  y,
  test_size=0.2,
  random_state=0
)

### let's train and evaluate  our model

In [None]:
#with mlflow.start_run(run_name='logistic_regression') as run:
    #model_4 = LogisticRegression(random_state=0)
  
    # Models, parameters, and training metrics are tracked automatically
    #model_4.fit(X_train, y_train)

    #predicted_probs = model_4.predict_proba(X_test)
    #roc_auc = sklearn.metrics.roc_auc_score(y_test, predicted_probs[:,1])
  
    # The AUC score on test data is not automatically logged, so log it manually
    #mlflow.log_metric("test_auc", roc_auc)
    #print("Test AUC of: {}".format(roc_auc))

The AUC is lower than before.

## GridSearch + Logistic Regression

In [None]:
# GridSearch
# Values shared by every solver/penalty combination.
# - class_weight uses the None object; the string 'None' is not a valid value.
# - C keeps the original candidates in their original order, minus a duplicated 0.3.
# - n_jobs is not searched: it only changes how the fit is parallelised, not the
#   fitted model, so it merely multiplied the number of fits.
common = {
  'class_weight': ['balanced', None],
  'C': [0.2, 0.1, 0.15, 0.12, 0.3, 0.4, 0.6, 0.5, 1, 1.2, 1.4, 1.5],
  'max_iter': [100, 150, 200],
}

# Only solver/penalty pairs that LogisticRegression supports are listed, so no
# fits are wasted on combinations that can only fail.
# The solver name is lower-case 'newton-cg'. The unpenalised option is left out
# because its spelling differs between scikit-learn releases ('none' vs None)
# and it ignores C anyway.
param = [
  {'solver': ['newton-cg', 'lbfgs', 'sag'], 'penalty': ['l2'], **common},
  {'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2'], **common},
  # elasticnet needs an explicit l1_ratio; 0.5 is an even L1/L2 mix.
  {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5], **common},
]

# random_state fixes the shuffling done by the sag/saga/liblinear solvers.
h = GridSearchCV(LogisticRegression(random_state=0), param, cv=5, n_jobs=5).fit(X_train, y_train)

h.best_estimator_

### let's train and evaluate our model with mlflow

In [None]:
with mlflow.start_run(run_name='logistic_regression') as run:
  # class_weight must be the None object to disable re-weighting. The string
  # 'None' is not an accepted value (only a dict, 'balanced' or None are) and
  # recent scikit-learn releases reject it when fit() is called.
  model_3 =LogisticRegression(C=0.3, class_weight=None, penalty='l1',
                   solver='saga',max_iter=150,random_state=0)

  # mlflow.autolog() has not been called before this cell, so unless the
  # environment enables autologging by default, only the metric logged
  # explicitly below is recorded for this run.
  model_3.fit(X_train, y_train)

  # Second column of predict_proba is used as the score for the positive class.
  predicted_probs = model_3.predict_proba(X_test)
  roc_auc = sklearn.metrics.roc_auc_score(y_test, predicted_probs[:,1])

  # The AUC score on test data is not automatically logged, so log it manually
  mlflow.log_metric("test_auc", roc_auc)
  print("Test AUC of: {}".format(roc_auc))

# Gradient Boosting

## Gradient Boosting with MLflow

In [None]:
with mlflow.start_run(run_name='gradient_boost') as run:
  # Gradient boosting with 200 estimators, trying a new n_estimators setting.
  model_2 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=200, random_state=0)
  model_2.fit(X_train, y_train)

  # Score the held-out rows; the second predict_proba column feeds the AUC.
  predicted_probs = model_2.predict_proba(X_test)
  roc_auc = sklearn.metrics.roc_auc_score(y_test, predicted_probs[:, 1])

  # Record the test AUC on the active MLflow run and echo it in the cell output.
  mlflow.log_metric("test_auc", roc_auc)
  print(f"Test AUC of: {roc_auc}")

In [None]:
# Gradient boosting with fixed hyperparameters.
# NOTE(review): 137 / 0.18 / 3 presumably come from an earlier tuning run — confirm.
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=137, learning_rate=0.18,max_depth=3, random_state=0)

scoring = ['accuracy', 'roc_auc', 'f1', 'precision', 'recall']

# 5-fold cross-validation on the training split. return_estimator=True keeps the
# five fitted models in result['estimator'].
result = cross_validate(clf, X_train, y_train, cv=5, scoring=scoring, return_estimator=True)

# Report mean +- standard deviation across the folds for every metric.
metric_labels = {
  'accuracy': 'Accuracy',
  'roc_auc': 'ROC AUC',
  'f1': 'F1',
  'precision': 'Precision',
  'recall': 'Recall',
}
for metric in scoring:
  fold_scores = result['test_' + metric]
  print("{} confidence interval: {} +- {}".format(metric_labels[metric], fold_scores.mean(), fold_scores.std()))

In [None]:
# Score the five estimators that cross_validate fitted on the training folds
# (result['estimator']) against the held-out test set. Cross-validating on the
# test set itself would train fresh models on test rows and say nothing about
# how the models trained above generalise.
# The scores go into a separate dict so `result` stays intact and this cell can be rerun.
test_result = {
  'test_' + metric: np.array([
    sklearn.metrics.get_scorer(metric)(estimator, X_test, y_test)
    for estimator in result['estimator']
  ])
  for metric in scoring
}

# Report mean +- standard deviation across the five estimators for every metric.
test_metric_labels = {
  'accuracy': 'Accuracy',
  'roc_auc': 'ROC AUC',
  'f1': 'F1',
  'precision': 'Precision',
  'recall': 'Recall',
}
for metric in scoring:
  estimator_scores = test_result['test_' + metric]
  print("{} confidence interval: {} +- {}".format(test_metric_labels[metric], estimator_scores.mean(), estimator_scores.std()))

## Parallel training with Hyperopt and SparkTrials (need to set up spark)

In [None]:
def train_model(params):
  """Hyperopt objective: fit a gradient-boosting classifier and return its loss.

  `params` is unpacked as keyword arguments into GradientBoostingClassifier
  (random_state is fixed to 0). The model is fitted on the notebook-level
  X_train / y_train and scored on X_test / y_test inside a nested MLflow run.

  Returns a dict with 'status' (STATUS_OK) and 'loss' (the negated test AUC).
  """
  mlflow.autolog()
  with mlflow.start_run(nested=True):
    booster = sklearn.ensemble.GradientBoostingClassifier(random_state=0, **params)
    booster.fit(X_train, y_train)

    # Tuning is driven by the test AUC; in production settings a separate
    # validation set could be used instead.
    positive_scores = booster.predict_proba(X_test)[:, 1]
    auc = sklearn.metrics.roc_auc_score(y_test, positive_scores)
    mlflow.log_metric('test_auc', auc)

    # fmin minimises the loss, so negate the AUC to have it maximised.
    return {'status': STATUS_OK, 'loss': -auc}

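# SparkTrials distributes the tuning across Spark workers.
# Greater parallelism speeds up processing, but each hyperparameter trial then has less information from other trials.
# The author's note says parallelism=2 was chosen, but the disabled example below passes parallelism=4 — confirm the intended value before re-enabling it.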
#spark_trials = SparkTrials(
  #parallelism=4
#)

# Search space for the gradient-boosting hyperparameters:
# - n_estimators: integer steps between 20 and 1000
# - learning_rate: log-uniform between exp(-3) and exp(0)
# - max_depth: integer steps between 2 and 5
search_space = dict(
  n_estimators=scope.int(hp.quniform('n_estimators', 20, 1000, 1)),
  learning_rate=hp.loguniform('learning_rate', -3, 0),
  max_depth=scope.int(hp.quniform('max_depth', 2, 5, 1)),
)


with mlflow.start_run(run_name='gb_hyperopt') as run:
  # Run 32 TPE-guided evaluations of train_model to find the parameters yielding the highest AUC.
  best_params = fmin(train_model, search_space, algo=tpe.suggest, max_evals=32)

In [None]:
#best_run = mlflow.search_runs(
#  order_by=['metrics.test_auc DESC', 'start_time DESC'],
#  max_results=10,
#).iloc[0]
#print('Best Run')
#print('AUC: {}'.format(best_run["metrics.test_auc"]))
#print('Num Estimators: {}'.format(best_run["params.n_estimators"]))
#print('Max Depth: {}'.format(best_run["params.max_depth"]))
#print('Learning Rate: {}'.format(best_run["params.learning_rate"]))