# Imports

In [None]:
# Imports:
# ----------------------------------------------------------------------------------------------------------------------
# Annotations
from typing import Optional, List, Dict, Any

# Data Manipulations
import numpy as np
import pandas as pd

# Static plots
import matplotlib.pyplot as plt
import seaborn as sns

# Interactive plots
import plotly
from plotly.offline import iplot
import plotly.express as px
import cufflinks as cf

from sklearn.metrics import confusion_matrix

# ----------------------------------------------------------------------------------------------------------------------
# Plots config
# Apply seaborn's default theme to the static plots.
sns.set()

# Switch cufflinks and plotly to offline notebook mode, then store the
# cufflinks defaults (theme, offline) used by the `.iplot` calls in this notebook.
cf.go_offline()
plotly.offline.init_notebook_mode()
cf.set_config_file(world_readable=True, theme='space', offline=True)


def show_feats_distributions(
        df: pd.DataFrame,
        feats: Optional[list] = None
) -> None:
    """
    Draw one interactive histogram per feature.

    :param df: DataFrame holding the features
    :param feats: Names of the columns to plot; every column of ``df`` when omitted
    """
    selected = df.columns if feats is None else feats
    for name in selected:
        column = df[name]
        # `.iplot` is the plotting accessor cufflinks attaches to pandas objects
        column.iplot(kind='hist', title=name)


def show_target_distributions_by_feats(
        df: pd.DataFrame,
        target: str,
        feats: Optional[list] = None
) -> None:
    """
    Draw one interactive histogram per feature, coloured by the target value.

    Features with more than 10 distinct values are drawn with overlaid bars,
    all others with grouped bars. The target column itself is skipped.

    :param df: DataFrame
    :param target: Name of the target column
    :param feats: Names of the columns to plot; every column of ``df`` when omitted
    """
    candidates = df.columns if feats is None else feats
    for name in candidates:
        if name != target:
            # Many distinct values -> overlay the per-class bars; few -> place them side by side
            bar_mode = 'overlay' if df[name].nunique() > 10 else 'group'
            figure = px.histogram(
                data_frame=df,
                x=name,
                color=target,
                barmode=bar_mode,
                opacity=0.6,
                template='plotly_dark',
            )
            iplot(figure)


def show_correlations(
        df: pd.DataFrame,
        targets: Optional[List[str]] = None,
        interactive: bool = False
) -> None:
    """
    Display the correlation matrix of the selected columns as a heatmap.

    :param df: DataFrame
    :param targets: Names of the columns to correlate; every column of ``df`` when omitted
    :param interactive: True for a plotly figure, False (default) for a static seaborn heatmap
    """
    if targets is not None:
        df = df[targets]
    # Compute the matrix once and take the axis labels from the matrix itself,
    # so the labels always match its shape even if `corr()` leaves columns out.
    corr = df.corr()
    if interactive:
        fig = px.imshow(
            corr,
            x=corr.columns,
            y=corr.index
        )
        fig.show()
    else:
        sns.heatmap(corr, cmap='inferno')

# Data

In [None]:
# Load the water potability CSV and display summary statistics of its columns.
df = pd.read_csv("../input/water-potability/water_potability.csv")
df.describe()

In [None]:
target = 'Potability'
# Split the rows by class: target == 0 -> `bad`, target == 1 -> `good`.
bad, good = df[df[target] == 0], df[df[target] == 1]
# Balanced frame used by the plots below: as many `bad` rows as there are `good` rows.
# Seeded (same seed as the rest of the notebook) so the sample, and every plot
# built from it, is identical on each run.
wdf = bad.sample(good.shape[0], random_state=0)
wdf = pd.concat([wdf, good])
# Share of target == 1 rows in the balanced frame.
wdf[target].mean()

In [None]:
# Names of the columns that contain at least one missing value.
nafeats = df.columns[df.isna().any()]
nafeats

In [None]:
# Fill each gap with the mean of that column within the row's own target class.
# The concatenated pieces are re-aligned to `df` by index on assignment.
# NOTE(review): the class means are taken over the whole dataset, before the
# train/validation split further down, and they depend on the label — this
# leaks target information into the validation rows; confirm this is acceptable.
for f in nafeats:
    df[f] = pd.concat([
        part[f].fillna(part[f].dropna().mean())
        for part in (good, bad)
    ])

In [None]:
# Histogram of every column of the balanced sample `wdf`.
show_feats_distributions(wdf)

In [None]:
# Per-feature histograms of `wdf`, coloured by the target column.
show_target_distributions_by_feats(wdf, target=target)

In [None]:
# Replace three columns with their squares (source column -> new column name).
squared = {'ph': 'ph_2', 'Sulfate': 'sulf_2', 'Chloramines': 'chl_2'}
for src, dst in squared.items():
    df[dst] = df[src] ** 2
# The un-squared originals are dropped from `df` in place.
df.drop(columns=list(squared), inplace=True)

In [None]:
# Correlation heatmap of the balanced sample.
# NOTE(review): `wdf` was built before the squared features were added to `df`,
# so it still holds the original columns — confirm this is the intended frame.
show_correlations(wdf)

# Model selection

In [None]:
def show_confusion_matrix(
        actual: pd.Series,
        predict: pd.Series
) -> None:
    """
    Plot the confusion matrix as a heatmap annotated with each cell's share of all samples.

    For a 2x2 matrix every cell is also tagged TN / FP / FN / TP. For any other
    shape (one class only, or more than two classes) only the percentages are shown.

    :param actual: True labels
    :param predict: Predicted labels
    """
    cfm = confusion_matrix(actual, predict)
    shares = [
        '{0:.2%}'.format(value)
        for value in cfm.flatten() / np.sum(cfm)
    ]
    if cfm.shape == (2, 2):
        # Row-major order of a binary matrix: [[TN, FP], [FN, TP]]
        group_names = ['TN', 'FP', 'FN', 'TP']
        cells = [
            f"{name}\n{share}"
            for name, share in zip(group_names, shares)
        ]
    else:
        cells = shares
    # The annotation grid must have the same shape as the matrix being drawn
    labels = np.asarray(cells).reshape(cfm.shape)
    sns.heatmap(cfm, annot=labels, fmt='', cmap='inferno')
    plt.show()


def fit_predict(model, train_x, train_y, test_x):
    """
    Fit ``model`` on the training data and predict for ``test_x``.

    :param model: Estimator exposing ``fit(x, y)`` and ``predict(x)``
    :param train_x: Training features
    :param train_y: Training labels
    :param test_x: Features to predict for
    :return: Predictions wrapped in a new ``pd.Series`` (default index)
    """
    model.fit(train_x, train_y)
    predictions = model.predict(test_x)
    return pd.Series(predictions)


def try_models(
        _models: Dict[str, Any],
        _metrics: Dict[str, Any],
        x_train, y_train,
        x_test, y_test,
        fp=fit_predict
) -> None:
    """
    Train every model, print its metrics and plot its confusion matrix.

    :param _models: Display name -> model instance
    :param _metrics: Display name -> callable invoked as ``metric(y_true, y_pred)``
    :param x_train: Training features
    :param y_train: Training labels
    :param x_test: Evaluation features
    :param y_test: Evaluation labels
    :param fp: Callable ``(model, x_train, y_train, x_test) -> predictions``
    """
    for label in _models:
        predicted = fp(_models[label], x_train, y_train, x_test)
        print(f"\n{label}:")
        for score_name, score_fn in _metrics.items():
            score = round(score_fn(y_test, predicted), 4)
            print(f"{score_name}: {score}")
        show_confusion_matrix(y_test, predicted)


In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column from the features, then make a seeded
# train (tX, ty) / validation (vX, vy) split.
y = df[target]
X = df.drop(columns=[target])
tX, vX, ty, vy = train_test_split(X, y, random_state=0)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

# Candidate classifiers, keyed by the name printed in the report.
models = {
    "Random Forest": RandomForestClassifier(random_state=0, class_weight='balanced'),
    "CatBoost": CatBoostClassifier(verbose=False),
    # Seeded like the forest above: SGD shuffles the training data, so an
    # unseeded instance gives different scores on every run.
    "SGDClassifier": SGDClassifier(random_state=0)
}

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score
# Metric name -> scoring callable, invoked by try_models as metric(y_true, y_pred).
# NOTE(review): try_models passes the class labels returned by model.predict,
# so 'ROC-AUC' is computed from hard labels rather than probability scores.
metrics = {'Accuracy': accuracy_score, 'ROC-AUC': roc_auc_score}

In [None]:
# Fit every candidate on the training split and report its metrics on the validation split.
try_models(models, metrics, tX, ty, vX, vy)

# Hyperparameters optimization

In [None]:
import optuna

def rf_objective(trial):
    """
    Optuna objective: train a random forest with trial-suggested parameters.

    Reads the module-level split ``tX, ty`` (training) and ``vX, vy`` (validation).

    :param trial: Optuna trial used to sample the hyperparameters
    :return: Negated validation accuracy, so that a minimizing study maximizes accuracy
    """
    params = {
        "n_estimators": trial.suggest_int('n_estimators', 10, 100),
        "max_depth": trial.suggest_categorical("max_depth", [7, 8, 9, 10, 11, 12, None]),
        "criterion": trial.suggest_categorical('criterion', ["gini", "entropy"]),
        "min_samples_split": trial.suggest_int('min_samples_split', 2, 5),
        "min_samples_leaf": trial.suggest_categorical('min_samples_leaf', [1, 2]),
        # "auto" is no longer offered: for classifiers it was an alias of "sqrt"
        # and recent scikit-learn releases reject it.
        "max_features": trial.suggest_categorical('max_features', ["sqrt", "log2"]),
        # Single-choice categoricals: fixed settings that are still recorded
        # among the trial's parameters.
        "class_weight": trial.suggest_categorical('class_weight', ["balanced"]),
        "random_state": trial.suggest_categorical('random_state', [0]),
        "n_jobs": trial.suggest_categorical('n_jobs', [-1]),
    }
    model = RandomForestClassifier(**params)
    model.fit(tX, ty)
    return -accuracy_score(vy, model.predict(vX))

In [None]:
# The Optuna search is kept for reference but disabled; presumably the values
# hard-coded below are the result it produced — TODO confirm.
# study = optuna.create_study()
# study.optimize(rf_objective, n_trials=200, timeout=3600 * 2)
# print(f"Best RandomForest accuracy: {-round(study.best_value, 4)} with parameters {study.best_params}\n\n")

rf_best_score = 0.8132
rf_best_params = dict(
    n_estimators=81,
    criterion='gini',
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='log2',
    class_weight='balanced',
    random_state=0,
    n_jobs=-1,
)
# Rebuild the forest from the stored parameters and evaluate it on the same split.
rf = RandomForestClassifier(**rf_best_params)
try_models({'Final Model': rf}, metrics, tX, ty, vX, vy)

***We trained a good model with ~86% precision***