## Tutorial based on https://github.com/alteryx/evalml

### TODO methodology ideas:
* algorithmic bias
* calibration objective
* sensitivity at low alert rates objective

### Workflow
* semantic commits
* git flow

### Documentation
* add problem_type argument to documentation
* update the woodwork documentation so that its examples run without the dreaded value slice error: https://woodwork.alteryx.com/en/stable/guides/statistical_insights.html

In [None]:
from zipfile import ZipFile
import logging.config

import pandas as pd
import numpy as np
import woodwork as ww

from kaggle.api.kaggle_api_extended import KaggleApi

import evalml
from evalml.data_checks import DataCheck, DataChecks, HighlyNullDataCheck, NoVarianceDataCheck, ClassImbalanceDataCheck, TargetLeakageDataCheck, InvalidTargetDataCheck, IDColumnsDataCheck, MulticollinearityDataCheck, OutliersDataCheck
from evalml.model_understanding import confusion_matrix, get_prediction_vs_actual_data, explain_predictions, explain_predictions_best_worst
from evalml.objectives import get_core_objectives
from evalml.objectives.binary_classification_objective import BinaryClassificationObjective
from evalml.problem_types import detect_problem_type

from evalml.automl import AutoMLSearch

from IPython.core.debugger import set_trace

In [None]:
# Override two woodwork type-inference thresholds before any data is loaded.
# NOTE(review): the exact meaning of each threshold is defined by woodwork — confirm against its docs.
ww.config.set_option('numeric_categorical_threshold', 2)
ww.config.set_option('natural_language_threshold', 20)

In [None]:
# Bare expression: the notebook renders the current woodwork configuration as the cell output.
ww.config

In [None]:
def save_kaggle_data():
    """Download the Kaggle 'titanic' competition archive and unpack it into ``data/``.

    Authenticates a ``KaggleApi`` client, downloads the competition files and
    extracts ``titanic.zip`` (read from the current working directory) into the
    ``data/`` directory.

    Returns:
        None
    """
    # token stored in .kaggle/kaggle.json
    api = KaggleApi()
    api.authenticate()

    # must accept competition rules on kaggle.com
    api.competition_download_files('titanic')

    # The context manager closes the archive even if extraction fails part-way.
    with ZipFile('titanic.zip') as archive:
        archive.extractall('data/')

In [None]:
# Fetch the competition archive and unpack it so data/<split>.csv exists for the cells below.
save_kaggle_data()

In [None]:
def print_checks(check, **kwargs):
    """Run one data check and print every warning and error message it reports.

    Arguments:
        check: object exposing ``validate(**kwargs)`` that returns a mapping with
            ``'warnings'`` and ``'errors'`` entries, each an iterable of mappings
            carrying a ``'message'`` key
        **kwargs: forwarded unchanged to ``check.validate``

    Returns:
        None
    """
    outcome = check.validate(**kwargs)
    # Warnings are printed first, then errors; each section is read only when reached.
    for label, section in (("Warning", "warnings"), ("Error", "errors")):
        for entry in outcome[section]:
            print(f"{label}: {entry['message']}")
    
def get_relevant_objectives(prob_type):
    """Lazily yield the ``name`` of each objective returned by ``get_core_objectives``.

    Arguments:
        prob_type: problem type passed straight through to ``get_core_objectives``

    Yields:
        the ``name`` attribute of every returned objective, in the order returned
    """
    yield from (core_objective.name for core_objective in get_core_objectives(prob_type))
    
def check_data(x, y):
    """Run a fixed sequence of evalml data checks and print what each one reports.

    HighlyNullDataCheck is validated against the features only; every other
    check is validated against both features and target. Findings are printed
    through ``print_checks``; nothing is returned.

    Arguments:
        x: feature data passed to each check as ``X``
        y: target data passed to each check as ``y``

    Returns:
        None
    """
    def _report(data_check):
        # Shared call shape for the checks that validate features and target together.
        print_checks(data_check, X=x, y=y)

    print_checks(HighlyNullDataCheck(pct_null_threshold=0.5), X=x)

    _report(NoVarianceDataCheck())
    _report(ClassImbalanceDataCheck(threshold=0.1))
    _report(TargetLeakageDataCheck(pct_corr_threshold=0.7))

    # The target is validated once per core objective of the detected problem type.
    detected_type = str(detect_problem_type(y))
    for objective_name in get_relevant_objectives(detected_type):
        _report(InvalidTargetDataCheck(detected_type, objective_name))

    _report(IDColumnsDataCheck(id_threshold=0.9))
    _report(MulticollinearityDataCheck(threshold=0.8))
    _report(OutliersDataCheck())
    
    
def process_kaggle_data(split_name, index = 'PassengerId', y = 'Survived'):
    """Load ``data/<split_name>.csv``, separate features from target and run the data checks.

    The detected problem type of the target is printed, followed by whatever
    ``check_data`` prints for the resulting features and target.

    Arguments:
        split_name (str): file stem of the CSV inside ``data/``
        index (str): name of the identifier column, dropped from the features
        y (str): name of the target column

    Returns:
        tuple: ``(features, target)`` where the features exclude the target,
        the identifier column, ``'Name'`` and ``'Ticket'``
    """
    csv_path = f'data/{split_name}.csv'
    raw = pd.read_csv(csv_path)

    excluded = [y, index, 'Name', 'Ticket']
    features = raw.drop(columns=excluded)
    target = raw[y]

    print(detect_problem_type(target))
    check_data(features, target)

    return features, target

In [None]:
# Load the training split; process_kaggle_data also prints the problem type and data-check findings.
X, y = process_kaggle_data('train')
print(X.shape)

In [None]:
# Bare expression: the notebook renders evalml's target_distribution result for y as the cell output.
evalml.preprocessing.target_distribution(y)

In [None]:
# Bundle the data-check classes into one DataChecks object; it is handed to automl.search later.
# Only InvalidTargetDataCheck is given explicit parameters (problem type and objective name).
data_checks = DataChecks(data_checks=[HighlyNullDataCheck, NoVarianceDataCheck, 
                                      ClassImbalanceDataCheck, TargetLeakageDataCheck, 
                                      InvalidTargetDataCheck, IDColumnsDataCheck,
                                      MulticollinearityDataCheck, OutliersDataCheck],
                        data_check_params={'InvalidTargetDataCheck':{'problem_type':'binary',
                                                                     'objective':'Log Loss Binary'}})

In [None]:
# Hold out 20% of the data as a test set; the fixed random_seed keeps the split reproducible.
X_train, X_test, y_train, y_test = evalml.preprocessing.split_data(X, y, 
                                                                   problem_type = 'binary',
                                                                   test_size=0.2, random_seed=44133)

In [None]:
class SLA(BinaryClassificationObjective):
    """Sensitivity among the observations with the highest predicted probabilities.

    Observations whose predicted probability is at or above the
    ``1 - alert_rate`` quantile are treated as alerts; the score is the share
    of true positives that fall inside that alerted group.
    """
    name = "Sensitivity at Low Alert Rates"
    greater_is_better = True
    score_needs_proba = True
    perfect_score = 1.0
    is_bounded_like_percentage = True

    def __init__(self, alert_rate=0.01):
        """Create instance of SLA

        Arguments:
            alert_rate (float): fraction, between 0 and 1, of top scores to use
                in calculating sensitivity (0.01 means the top 1%)

        """
        self.alert_rate = alert_rate

    def objective_function(self, y_true, ypred_proba, X=None):
        """Calculate sensitivity for the top ``alert_rate`` fraction of observations

        Labels and probabilities are matched by position, not by index label.

        Arguments:
            y_true (pd.Series): true labels; expected to be 0/1 or boolean, any
                non-zero value is counted as positive
            ypred_proba (pd.Series): predicted probabilities
            X: unused; accepted for compatibility with the objective interface

        Returns:
            float: sensitivity for the observations with the top predicted
            probabilities, or NaN when ``y_true`` contains no positives
        """
        if not isinstance(ypred_proba, pd.Series):
            ypred_proba = pd.Series(ypred_proba)

        if not isinstance(y_true, pd.Series):
            y_true = pd.Series(y_true)

        # Drop the indexes: combining the two Series directly would align them on
        # index labels, which pairs the wrong rows whenever the labels differ
        # (e.g. a held-out y_true against freshly re-indexed probabilities).
        scores = ypred_proba.astype(float).to_numpy()
        actual = y_true.to_numpy().astype(bool)

        prob_thresh = np.quantile(scores, 1 - self.alert_rate)
        logging.info("Calculating sensitivity at threshold %s", prob_thresh)

        # NOTE: scores tied with the threshold are all flagged, so more than
        # alert_rate of the observations can be alerted; when every score is
        # identical everything is flagged and the result is 1.0.
        high_risk = scores >= prob_thresh

        positives = actual.sum()
        if positives == 0:
            # Sensitivity is undefined without positives; return NaN explicitly
            # instead of relying on a 0/0 division.
            return np.nan

        return (actual & high_risk).sum() / positives
        
        

In [None]:
# Instantiate the custom objective with its default alert_rate (0.01).
sla_objective = SLA()

In [None]:
# Bare expression: the notebook renders the model families evalml reports for 'binary' problems.
evalml.pipelines.components.utils.allowed_model_families('binary')

In [None]:
# Configure the search on the training split, optimising the custom SLA objective
# over a restricted list of model families.
# NOTE(review): max_time is presumably in seconds — confirm against the evalml docs.
automl = AutoMLSearch(X_train, y_train, 
                      problem_type = 'binary',
                      objective=sla_objective,
                      max_time = 1000,
                      allowed_model_families=['random_forest','decision_tree','catboost',
                                              'linear_model','extra_trees'],
                      ensembling=True
                     )

# Disable all loggers that already exist before running the search
# (this includes anything logged through them while the search runs).
logging.config.dictConfig({
    'version': 1,
    'disable_existing_loggers': True,
})

# Run the search, validating the data with the DataChecks bundle defined earlier.
automl.search(data_checks=data_checks)

In [None]:
# Bare expression: the notebook renders the rankings table of the searched pipelines.
automl.full_rankings

In [None]:
# Take the id of the first ranked pipeline whose score is not exactly 1.
# NOTE(review): presumably this skips degenerate perfect scores from the SLA objective — confirm.
# .item() raises if no row survives the filter.
selected_id = automl.full_rankings[automl.full_rankings.score != 1].head(1)['id'].item()

In [None]:
# Re-enable the loggers that were disabled before the search.
# NOTE(review): presumably describe_pipeline reports through logging, hence the re-enable — confirm.
logging.config.dictConfig({
    'version': 1,
    'disable_existing_loggers': False,
})
# selected_id is already a pipeline id (taken from the 'id' column), so pass it directly.
# Using it as a row label into full_rankings['id'] would look up whichever pipeline
# sits at that row of the rankings table, not the selected pipeline.
automl.describe_pipeline(selected_id)

In [None]:
# Retrieve the selected pipeline by id (instead of the search's best pipeline) and fit it
# on the training split.
# pipeline = automl.best_pipeline
pipeline = automl.get_pipeline(selected_id)
pipeline.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out split with both AUC and the custom SLA objective.
pipeline.score(X_test, y_test, objectives = ["auc",sla_objective])

In [None]:
# Keep the entry labelled True from the predicted-probability output for the test split.
# NOTE(review): assumes predict_proba is keyed by class label and the positive label is True — confirm.
y_preds = pipeline.predict_proba(X_test)[True]

In [None]:
# Explain the prediction for index 0 of the training data, with SHAP values, as a dataframe.
explain_predictions(pipeline, X_train, y_train, [0], include_shap_values = True, output_format = "dataframe")

In [None]:
# Explain the best and worst training predictions: 2 to explain, top 3 features each,
# with SHAP values, returned as a dataframe.
# NOTE(review): whether num_to_explain applies per group (best and worst) is defined by evalml — confirm.
explain_predictions_best_worst(pipeline, X_train, y_train, num_to_explain = 2, 
                               top_k_features = 3, include_shap_values = True,
                              output_format="dataframe")