## Tutorial based on https://github.com/alteryx/evalml

### TODO methodology ideas:
* algorithmic bias

### Workflow
* semantic commits
* git flow

### Documentation
* add problem_type argument to documentation
* update the woodwork statistical-insights guide so its examples no longer trigger the dreaded "value ... slice" error: https://woodwork.alteryx.com/en/stable/guides/statistical_insights.html

In [None]:
from zipfile import ZipFile

import pandas as pd

import evalml
from evalml.data_checks import HighlyNullDataCheck, NoVarianceDataCheck, ClassImbalanceDataCheck, TargetLeakageDataCheck, InvalidTargetDataCheck, IDColumnsDataCheck, MulticollinearityDataCheck, OutliersDataCheck
from evalml.objectives import get_core_objectives
from evalml.problem_types import detect_problem_type
from evalml.utils import infer_feature_types

from evalml.automl import AutoMLSearch
from kaggle.api.kaggle_api_extended import KaggleApi

import woodwork as ww

In [None]:
# Set two woodwork configuration options before any data is loaded.
# NOTE(review): presumably these thresholds steer when a column is inferred as
# categorical / natural language -- confirm exact semantics in the woodwork docs.
ww.config.set_option('numeric_categorical_threshold', 2)
ww.config.set_option('natural_language_threshold', 20)

In [None]:
# Bare expression: the notebook shows the woodwork config object as cell output.
ww.config

In [None]:
def save_kaggle_data():
    """Download the Kaggle 'titanic' competition files and unpack them into ``data/``.

    Authenticates with ``KaggleApi``, downloads the competition files, and
    extracts ``titanic.zip`` into the ``data/`` directory. Returns nothing.
    """
    # token stored in .kaggle/kaggle.json
    api = KaggleApi()
    api.authenticate()

    # must accept competition rules on kaggle.com
    api.competition_download_files('titanic')

    # The context manager closes the archive handle even if extraction fails
    # part-way through (the manual close() was skipped on any exception).
    with ZipFile('titanic.zip') as archive:
        archive.extractall('data/')

In [None]:
# Download and unpack the competition files; the CSVs read later live under data/.
save_kaggle_data()

In [None]:
def print_checks(check, **kwargs):
    """Validate ``check`` and print every message it reports.

    Args:
        check: Object exposing ``validate(**kwargs)`` that returns a mapping
            with ``'warnings'`` and ``'errors'`` lists; each entry is a
            mapping holding a ``'message'`` item.
        **kwargs: Forwarded unchanged to ``check.validate``.

    All warnings are printed first (prefixed ``Warning: ``), followed by all
    errors (prefixed ``Error: ``). Nothing is returned.
    """
    report = check.validate(**kwargs)
    # Warnings first, then errors -- each bucket is looked up only when reached.
    for label, bucket in (("Warning", "warnings"), ("Error", "errors")):
        for entry in report[bucket]:
            print(f"{label}: {entry['message']}")
    
def get_relevant_objectives(prob_type):
    """Lazily yield the ``name`` of each core objective for ``prob_type``.

    Args:
        prob_type: Problem type handed straight to ``get_core_objectives``.

    Yields:
        The ``name`` attribute of every objective, in the order returned.
    """
    yield from (
        core_objective.name for core_objective in get_core_objectives(prob_type)
    )
    
def check_data(x, y):
    """Run a series of evalml data checks and print whatever they report.

    Every check is built with the thresholds hard-coded below and passed to
    ``print_checks``, which prints its warning and error messages. Nothing is
    returned.

    Args:
        x: Feature data, passed to each check as ``X``.
        y: Target data, passed as ``y`` to every check except the
            highly-null check, which receives only ``X``.
    """
    # The highly-null check is the only one validated without the target.
    print_checks(HighlyNullDataCheck(pct_null_threshold=0.5), X=x)

    print_checks(NoVarianceDataCheck(), X=x, y=y)
    print_checks(ClassImbalanceDataCheck(threshold=0.1), X=x, y=y)
    print_checks(TargetLeakageDataCheck(pct_corr_threshold=0.7), X=x, y=y)

    # InvalidTargetDataCheck takes a problem type and an objective name, so it
    # is run once per core objective of the problem type detected from y.
    detected_type = str(detect_problem_type(y))
    for objective_name in get_relevant_objectives(detected_type):
        print_checks(InvalidTargetDataCheck(detected_type, objective_name), X=x, y=y)

    print_checks(IDColumnsDataCheck(id_threshold=0.9), X=x, y=y)
    print_checks(MulticollinearityDataCheck(threshold=0.8), X=x, y=y)
    print_checks(OutliersDataCheck(), X=x, y=y)
    
    
def process_kaggle_data(split_name, index = 'PassengerId', y = 'Survived'):
    """Load ``data/<split_name>.csv``, separate features from target, and check them.

    Args:
        split_name: Base name of the CSV file inside ``data/`` (e.g. ``'train'``).
        index: Name of the identifier column; dropped from the features.
        y: Name of the target column.

    Returns:
        A ``(features, target)`` pair: the loaded frame without the target,
        index, ``'Name'`` and ``'Ticket'`` columns, and the target column.

    Side effects:
        Prints the detected problem type and the messages from ``check_data``.
    """
    frame = pd.read_csv(f'data/{split_name}.csv')

    excluded_columns = [y, index, 'Name', 'Ticket']
    features = frame.drop(columns=excluded_columns)
    target = frame[y]

    # Report the problem type evalml detects for the target, then run the checks.
    print(detect_problem_type(target))
    check_data(features, target)

    return features, target

In [None]:
# Load the 'train' split; process_kaggle_data also prints the detected problem
# type and any data-check messages while it runs.
X, y = process_kaggle_data('train')
print(X.shape)

In [None]:
# Split features/target into train and test partitions (test_size=0.2) for a binary problem.
X_train, X_test, y_train, y_test = evalml.preprocessing.split_data(X, y, problem_type = 'binary', test_size=0.2)

In [None]:
# Bare expression: display the training features as cell output.
X_train

In [None]:
# Configure an AutoML search over the training partition for a binary problem.
# NOTE(review): both max_batches and max_iterations are set -- check the evalml
# docs for which limit takes precedence; one of them may be redundant.
automl = AutoMLSearch(X_train, y_train, 
                      problem_type = 'binary',
                      max_batches = 10,
                      max_iterations = 10)

In [None]:
# Run the search configured above.
automl.search()

In [None]:
# Describe the pipeline with id 8.
# NOTE(review): the id is hard-coded -- confirm it is still the pipeline of
# interest after re-running the search.
automl.describe_pipeline(8)

In [None]:
# Bare expression: display the search rankings as cell output.
automl.rankings

In [None]:
# Bare expression: display the full rankings as cell output.
automl.full_rankings

In [None]:
# Take the best pipeline found by the search, fit it on the training
# partition, and predict on the held-out test features.
pipeline = automl.best_pipeline
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)

In [None]:
# Bare expression: display the test-set predictions as cell output.
preds

In [None]:
# Bare expression: display the name of the selected pipeline.
pipeline.name

In [None]:
# Bare expression: display the parameters of the selected pipeline.
pipeline.parameters