In [None]:
import json
import os
import time

import catboost as ctb
import eli5
import mlflow
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import recall_score, precision_score, fbeta_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold

In [None]:
#df_train = pd.read_hdf("../input/train_churn_pred.h5") 
#df_train.shape

In [None]:
def get_or_create_experiment(name):
    """Look up the MLflow experiment called ``name``, creating it when absent.

    Returns whatever ``mlflow.get_experiment_by_name`` yields for ``name``;
    when the first lookup comes back as ``None`` the experiment is created
    and then looked up a second time by name.
    """
    found = mlflow.get_experiment_by_name(name)
    if found is not None:
        return found

    # Not registered yet: create it, then fetch the freshly created entry.
    mlflow.create_experiment(name)
    return mlflow.get_experiment_by_name(name)

def _eid(name):
    """Return the ``experiment_id`` of the MLflow experiment called ``name``.

    The experiment is resolved through ``get_or_create_experiment``, so it is
    created first if it does not exist yet.
    """
    experiment = get_or_create_experiment(name)
    return experiment.experiment_id

In [None]:
def _short_model_name(model):
    """Return a short lowercase label for ``model`` based on its class name.

    The class name is used instead of parsing ``repr(model)`` so the label does
    not depend on the estimator's parameters or on how it implements ``__repr__``.
    """
    return type(model).__name__.lower().replace("classifier", "")


def _random_feature_columns(n):
    """Build four ``n``-row columns of pure noise, keyed by column name.

    Presumably these serve as a baseline to compare real feature weights
    against (any real feature ranked below noise is suspect).
    """
    return {
        "random_normal": np.random.normal(0, 1, size=n),
        "random_exponential": np.random.exponential(1, size=n),
        "random_binary": np.random.choice([0, 1], size=n),
        "random_cats": np.random.choice(list(range(10)), size=n),
    }


def _threshold_metrics(y_true, y_proba, thresholds):
    """Compute recall, precision, F1.5 and confusion counts for each threshold.

    Returns one flat dict whose keys are suffixed with the threshold value,
    e.g. ``recall_0.1`` or ``tp_0.5``.
    """
    metrics = {}
    for threshold in thresholds:
        y_pred = (y_proba > threshold).astype("int")
        # labels=[0, 1] keeps the matrix 2x2 even if one class is missing
        # from both y_true and y_pred, so the 4-way unpack cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        metrics.update({
            "recall_{}".format(threshold): recall_score(y_true, y_pred),
            "precision_{}".format(threshold): precision_score(y_true, y_pred),
            "f1.5_{}".format(threshold): fbeta_score(y_true, y_pred, beta=1.5),

            "tn_{}".format(threshold): int(tn),
            "fp_{}".format(threshold): int(fp),
            "fn_{}".format(threshold): int(fn),
            "tp_{}".format(threshold): int(tp),
        })
    return metrics


def _log_json_artifact(payload, path, artifact_path="json"):
    """Write ``payload`` as JSON to ``path`` and attach it to the active MLflow run."""
    with open(path, "w", encoding="utf-8") as f:
        # default=str: model parameters may hold objects json cannot encode.
        json.dump(payload, f, default=str)
    mlflow.log_artifact(path, artifact_path)


def make_experiment(
    df, model, feats=None,
    mlflow_experiment="churn_prediction",
    make_random_feats=False,
    extra_params=None):
    """Cross-validate ``model`` on ``df`` and log everything to MLflow.

    A parent MLflow run stores the parameters, the feature list and the model
    parameters. Inside it, one nested run per fold (3-fold ``StratifiedKFold``,
    shuffled, ``random_state=0``) stores the per-threshold metrics, an eli5
    weights page and the same parameters.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"id"`` and ``"churn_probability"`` (the
        target; thresholded predictions are scored against it as 0/1 labels).
        The frame is never modified.
    model : estimator
        Object exposing ``get_params``, ``fit`` and ``predict_proba``. It is
        refitted on every fold, so it is left fitted on the last fold.
    feats : list of str, optional
        Feature columns. Defaults to every numeric column except ``"id"`` and
        ``"churn_probability"``. The passed list is not modified.
    mlflow_experiment : str
        Name of the MLflow experiment (created if missing).
    make_random_feats : bool
        When true, four ``random_*`` noise columns are added to a copy of
        ``df`` and appended to the feature list.
    extra_params : dict, optional
        Extra key/values merged into the logged parameters (they win on
        key collisions).

    Returns
    -------
    pandas.DataFrame
        Out-of-fold predictions with columns ``y_pred_proba``, ``y_test`` and
        ``id``, one row per input row.

    Side effects
    ------------
    Writes ``feats.json``, ``model_params.json`` and ``eli5.html`` under
    ``../artifact`` (created if missing) and logs them as MLflow artifacts.
    """
    if feats is None:
        black_list = ["id", "churn_probability"]
        feats = [x for x in df.select_dtypes("number").columns if x not in black_list]
    else:
        # Work on a copy so the caller's list is never extended in place.
        feats = list(feats)

    if make_random_feats:
        # assign() returns a new frame: the caller's df keeps its columns, so
        # noise features cannot leak into a later run that did not ask for them.
        df = df.assign(**_random_feature_columns(df.shape[0]))
        random_feats = [
            x for x in df.columns if isinstance(x, str) and x.startswith("random_")
        ]
        # Skip names already selected (e.g. df carried random_* columns in).
        feats = feats + [x for x in random_feats if x not in feats]

    X = df[feats].values
    y = df["churn_probability"].values
    # Take ids from the frame being evaluated, not from a notebook global.
    ids = df["id"].values

    cv = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)
    thresholds = [0.1, 0.15, 0.18, 0.2, 0.5]

    model_name = _short_model_name(model)
    model_params = model.get_params()
    params = {
        "model": model_name,
        "count_feats": len(feats),
        "X.shape": X.shape,
        "y.shape": y.shape,
        "make_random_feats": make_random_feats,
    }
    params.update(model_params)
    params.update(extra_params or {})

    # The function must not depend on a separate "mkdir" cell having been run.
    os.makedirs("../artifact", exist_ok=True)

    # Resolve the experiment once instead of once per fold.
    experiment_id = _eid(mlflow_experiment)

    # parent mlflow run
    timeprefix = time.strftime("%Y%m%d%H%M")
    run_name = "{}-{}".format(model_name, timeprefix)
    fold_frames = []
    with mlflow.start_run(experiment_id=experiment_id, run_name=run_name):

        mlflow.log_params(params)
        _log_json_artifact(feats, "../artifact/feats.json")
        _log_json_artifact(model_params, "../artifact/model_params.json")

        for cv_idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
            # child mlflow run, one per fold
            timeprefix = time.strftime("%Y%m%d%H%M")
            fold_run_name = "cv{}-{}-{}".format(cv_idx, model_name, timeprefix)
            with mlflow.start_run(
                experiment_id=experiment_id,
                run_name=fold_run_name, nested=True):

                X_train, y_train = X[train_idx], y[train_idx]
                X_test, y_test = X[test_idx], y[test_idx]

                model.fit(X_train, y_train)
                y_pred_proba = model.predict_proba(X_test)[:, 1]

                mlflow.log_metrics(
                    _threshold_metrics(y_test, y_pred_proba, thresholds)
                )

                fold_frames.append(pd.DataFrame({
                    "y_pred_proba": y_pred_proba,
                    "y_test": y_test,
                    "id": ids[test_idx],
                }))

                result = eli5.show_weights(model, feature_names=feats, top=len(feats))
                with open("../artifact/eli5.html", "w", encoding="utf-8") as f:
                    f.write("<html>{}</html>".format(result.data))
                mlflow.log_artifact("../artifact/eli5.html", "plot")

                mlflow.log_params(params)

    # Concatenate once, after the loop, and hand the predictions back.
    return pd.concat(fold_frames, ignore_index=True)
    


In [None]:
# Create the directory that make_experiment writes its artifact files into
# (-p: no error if it already exists).
!mkdir -p ../artifact

In [None]:
# threshold = 0.15
# cls = 1

# df_proba[ 
#     (df_proba["y_pred_proba"] < threshold) &
#     (df_proba["y_test"] == cls)
# ]["y_pred_proba"].hist(color="red");

# df_proba[ 
#     (df_proba["y_pred_proba"] >= threshold) &
#     (df_proba["y_test"] == cls)
# ]["y_pred_proba"].hist(bins=50, color="green");

In [None]:

# make_experiment(
#     df_train, model, 
#     make_random_feats=True,
#     extra_params={"my_idea": "random feats"})