In [1]:
# Display the value of every top-level expression in a cell, not only the last one,
# so intermediate results appear without explicit print() calls.
# NOTE: this also echoes the return value of every bare method call in later cells.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# suppress warning message
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=UserWarning)

## Business Objective
### Survival Prediction
Predict whether a Titanic passenger is likely to survive, using the `survived` column of the passenger dataset as the target.

### Setup

In [3]:
import os
from ds_capability import FeatureSelect, FeatureEngineer, FeatureTransform, AutoML, Controller

In [4]:
# Where the raw Titanic data is read from and where the predictions are written to.
# Later cells refer to these through '${NAME}' placeholders in their URIs.
os.environ.update({
    'HADRON_PREDICT_SOURCE_DATA': 'https://raw.githubusercontent.com/project-hadron/hadron-asset-bank/master/datasets/toy_sample/titanic.csv',
    'HADRON_PREDICT_PERSIST_DATA': './hadron/data/hadron_docs_titanic_predict.parquet',
})

## Exploratory Data Analysis

In [5]:
# Component used only for exploration; `fs` is re-created in the Feature Selection step.
# NOTE(review): from_memory() presumably keeps its configuration in memory only — confirm.
fs = FeatureSelect.from_memory()

# Load the dataset named by the HADRON_PREDICT_SOURCE_DATA environment variable.
# NOTE(review): the '${...}' placeholder is presumably expanded by the library — confirm.
tbl = fs.set_source_uri('${HADRON_PREDICT_SOURCE_DATA}').load_source_canonical()

In [6]:
# Per-column summary: data type, null %, dominant-value %, valid and unique counts, sample values.
fs.canonical_report(tbl)

Unnamed: 0,Attributes,DataType,Nulls,Dominate,Valid,Unique,Observations
0,pclass,int64,0.0%,54.2%,1309,3,"[3, 1, 2]"
1,survived,int64,0.0%,61.8%,1309,2,"[0, 1]"
2,name,string,0.0%,0.2%,1309,1307,"['Connolly, Miss. Kate', 'Kelly, Mr. James', 'Alle..."
3,sex,string,0.0%,64.4%,1309,2,"['male', 'female']"
4,age,double,20.1%,20.1%,1046,98,"[24.0, 22.0, 21.0, 30.0, 18.0, 25.0, 28.0, 36.0, 2..."
5,sibsp,int64,0.0%,68.1%,1309,7,"[0, 1, 2, 4, 3, 8, 5]"
6,parch,int64,0.0%,76.5%,1309,8,"[0, 1, 2, 3, 4, 5, 6, 9]"
7,ticket,string,0.0%,0.8%,1309,929,"['CA. 2343', '1601', 'CA 2144', 'PC 17608', 'S.O.C..."
8,fare,double,0.1%,4.6%,1308,281,"[8.05, 13.0, 7.75, 26.0, 7.8958, 10.5, 7.775, 7.22..."
9,cabin,string,0.0%,77.5%,1309,182,"['', 'F', 'C23', 'B57', 'G6', 'C22', 'B96', 'C78',..."


## Preprocessing

### Feature Selection

In [7]:
# Create the feature-selection component for the 'survived' task.
# NOTE(review): has_contract=False presumably allows starting without a saved contract — confirm.
fs = FeatureSelect.from_env('survived', has_contract=False)

# Read the raw dataset; persist to 'event://select', which the Feature Engineering step reads.
# Each setter returns the component, so its repr is echoed in the cell output.
fs.set_source_uri('${HADRON_PREDICT_SOURCE_DATA}')
fs.set_persist_uri('event://select')

tbl = fs.load_source_canonical()

<ds_capability.components.feature_select.FeatureSelect at 0x7fe26771e8f0>

<ds_capability.components.feature_select.FeatureSelect at 0x7fe26771e8f0>

In [8]:
# List the HADRON_* environment settings the component is running with.
# NOTE: the rendered table can contain local filesystem paths; review before sharing.
fs.report_environ()

Unnamed: 0,environ,value
0,HADRON_DEFAULT_PATH,/Users/doatridge/code/jupyter/docs
1,HADRON_DEFAULT_MODULE,ds_core.handlers.event_handlers
2,HADRON_DEFAULT_HANDLER,EventPersistHandler
3,HADRON_PM_PATH,/Users/doatridge/code/jupyter/docs/getting_started/hadron/contracts
4,HADRON_PM_REPO,not used
5,HADRON_PM_TYPE,default
6,HADRON_PM_MODULE,ds_capability.handlers.pyarrow_handlers
7,HADRON_PM_HANDLER,PyarrowPersistHandler
8,HADRON_CREATOR,account default
9,HADRON_CONTENT,https://raw.githubusercontent.com/project-hadron/hadron-asset-bank/master/


In [9]:
# Extract the target and store it under a 'label' connector at 'event://label',
# from where the Model Discovery step loads it again.
# NOTE(review): headers=['survived'] with drop=True is expected to leave only the
# 'survived' column — confirm against the auto_drop_columns documentation.
fs.add_connector_uri('label', uri='event://label')
label = fs.tools.auto_drop_columns(tbl, headers=['survived'], drop=True)
fs.save_canonical('label', label)

<ds_capability.components.feature_select.FeatureSelect at 0x7fe26771e8f0>

In [10]:
# Remove the listed columns, including the 'survived' target, from the feature table.
tbl = fs.tools.auto_drop_columns(tbl, headers=['name', 'boat', 'body', 'home.dest', 'ticket', 'survived'])

In [11]:
# Run the feature-selection component end to end.
# NOTE(review): presumably replays the recorded intent from the source URI to the
# persist URI ('event://select') — confirm.
fs.run_component_pipeline()

### Feature Engineering

In [12]:
# Create the feature-engineering component for the 'survived' task.
fe = FeatureEngineer.from_env('survived', has_contract=False)

# Chain the stages: read what Feature Selection persisted, persist for Feature Transformation.
fe.set_source_uri('event://select')
fe.set_persist_uri('event://engineer')

tbl = fe.load_source_canonical()

<ds_capability.components.feature_engineer.FeatureEngineer at 0x7fe26771fcd0>

<ds_capability.components.feature_engineer.FeatureEngineer at 0x7fe26771fcd0>

#### extract cabin features

In [13]:
# NOTE(review): code_str is presumably applied to the column as a pandas Series, and
# intent_order=-1 presumably appends the step rather than replacing one — confirm.

# cabin_level: first character of each non-empty cabin string, otherwise None.
tbl = fe.tools.correlate_on_pandas(tbl, header='cabin',
                                code_str="apply(lambda x: x[0] if isinstance(x, str) and len(x) > 0 else None)",
                                to_header='cabin_level', intent_order=-1)
# cabin: first run of digits in the cabin string, cast to float. This overwrites the
# original 'cabin' column, so it has to run after cabin_level has been derived.
tbl = fe.tools.correlate_on_pandas(tbl, header='cabin',
                                code_str="str.extract('([0-9]+)').astype('float')",
                                to_header='cabin', intent_order=-1)

#### missing data imputation

In [14]:
# Table dimensions, followed by the number of nulls in each column
tbl.shape
for col_name in tbl.column_names:
    print(f"{col_name}: {tbl.column(col_name).null_count}")

(1309, 9)

pclass: 0
sex: 0
age: 263
sibsp: 0
parch: 0
fare: 1
embarked: 0
cabin_level: 1014
cabin: 1027


In [15]:
# fare has a single missing value: fill it with the column mean.
# age and cabin: mark missing values with the sentinel -1 instead of estimating them.
tbl = fe.tools.correlate_missing(tbl, header='fare', strategy='mean', intent_order=-1)
tbl = fe.tools.correlate_missing(tbl, header='age', strategy='constant', constant=-1, intent_order=-1)
tbl = fe.tools.correlate_missing(tbl, header='cabin', strategy='constant', constant=-1, intent_order=-1)

In [16]:
# Fill the missing cabin_level values.
# NOTE(review): presumably sampled according to the observed value frequencies — confirm.
tbl = fe.tools.correlate_missing_probability(tbl, header='cabin_level', intent_order=-1)

In [17]:
# Run the feature-engineering component end to end.
# NOTE(review): presumably replays the recorded steps from 'event://select' to
# 'event://engineer' — confirm.
fe.run_component_pipeline()

### Feature Transformation

In [18]:
# Create the feature-transformation component for the 'survived' task.
ft = FeatureTransform.from_env('survived', has_contract=False)

# Chain the stages: read what Feature Engineering persisted, persist for the model steps.
ft.set_source_uri('event://engineer')
ft.set_persist_uri('event://transform')

tbl = ft.load_source_canonical()

<ds_capability.components.feature_transform.FeatureTransform at 0x7fe267785ae0>

<ds_capability.components.feature_transform.FeatureTransform at 0x7fe267785ae0>

In [19]:
# rare label encoding: integer-encode cabin_level.
# NOTE(review): label_count=6 presumably keeps the six most frequent levels and pools
# the remainder into one code — confirm.
tbl = ft.tools.encode_category_integer(tbl, headers=['cabin_level'], label_count=6, intent_order=-1)

# ordinal: integer-encode sex and embarked
tbl = ft.tools.encode_category_integer(tbl, headers=['sex', 'embarked'], ordinal=True, intent_order=-1)

In [20]:
# Run the feature-transformation component end to end.
# NOTE(review): presumably replays the recorded steps from 'event://engineer' to
# 'event://transform' — confirm.
ft.run_component_pipeline()

In [21]:
# Inspect the persisted transform output: every column should now be numeric with no nulls.
ft.canonical_report(ft.load_persist_canonical())

Unnamed: 0,Attributes,DataType,Nulls,Dominate,Valid,Unique,Observations
0,pclass,int64,0.0%,54.2%,1309,3,"[3, 1, 2]"
1,sibsp,int64,0.0%,68.1%,1309,7,"[0, 1, 2, 4, 3, 8, 5]"
2,parch,int64,0.0%,76.5%,1309,8,"[0, 1, 2, 3, 4, 5, 6, 9]"
3,fare,double,0.0%,4.6%,1309,282,"[8.05, 13.0, 7.75, 26.0, 7.8958, 10.5, 7.775, 7.22..."
4,age,double,0.0%,20.1%,1309,99,"[-1.0, 24.0, 22.0, 21.0, 30.0, 18.0, 25.0, 28.0, 3..."
5,cabin,double,0.0%,78.5%,1309,105,"[-1.0, 6.0, 33.0, 22.0, 101.0, 23.0, 34.0, 78.0, 2..."
6,cabin_level,int64,0.0%,31.9%,1309,7,"[2, 1, 3, 4, 0, 5, 6]"
7,sex,int64,0.0%,64.4%,1309,2,"[1, 0]"
8,embarked,int64,0.0%,69.8%,1309,4,"[3, 1, 2, 0]"


## Model Discovery

In [22]:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

In [23]:
# Component used only to fetch the training data for model discovery.
a = AutoML.from_memory()

# Features come from the transform stage; the target from the 'label' connector
# registered during Feature Selection.
a.set_source_uri('event://transform')
a.add_connector_uri('label', 'event://label')

tbl = a.load_source_canonical()
label = a.load_canonical('label')

<ds_capability.components.automl.AutoML at 0x7fe2677876a0>

<ds_capability.components.automl.AutoML at 0x7fe2677876a0>

In [24]:
# Convert the feature and label tables to numpy arrays for scikit-learn.
X = np.asarray(tbl)
# Flatten the target in case it arrives as an (n, 1) column vector: scikit-learn
# expects a 1-D y and otherwise emits a conversion warning. No-op if already 1-D.
y = np.asarray(label).ravel()

# Hold out 10% for testing. The fixed seed makes the split identical on every
# Restart & Run All, so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [25]:
# Seed the estimator: gradient boosting uses randomness internally (per-tree seeds,
# feature permutation at each split), so an unseeded fit is not reproducible.
model = GradientBoostingClassifier(random_state=42)

model.fit(X_train, y_train)

## Classifier Predict

In [26]:
# Create the prediction component for the 'survived' task.
aml = AutoML.from_env('survived', has_contract=False)

# reset the connectors: read the transformed features, write predictions to the file
# named by HADRON_PREDICT_PERSIST_DATA.
aml.set_source_uri('event://transform')
# Alternative persist target, left for reference: 'event://predict'
aml.set_persist_uri('${HADRON_PREDICT_PERSIST_DATA}')

tbl = aml.load_source_canonical()

<ds_capability.components.automl.AutoML at 0x7fe267ab5600>

<ds_capability.components.automl.AutoML at 0x7fe267ab5600>

In [27]:
# Register the fitted estimator with the component under the name 'GradientBoost'.
aml.add_trained_model(model_name='GradientBoost', trained_model=model)

In [28]:
# Score the transformed features with the registered 'GradientBoost' model.
# NOTE(review): the call presumably also records the prediction step that
# run_component_pipeline replays — confirm.
predict = aml.tools.label_predict(tbl, model_name='GradientBoost')

In [29]:
# Run the prediction component end to end.
# NOTE(review): presumably reads 'event://transform' and writes the predictions to the
# persist URI — confirm.
aml.run_component_pipeline()

## Controller

In [30]:
# Create the controller used to run the registered components together.
# NOTE(review): has_contract=False presumably allows starting without a saved contract — confirm.
ctrl = Controller.from_env(has_contract=False)

In [31]:
# Register the four 'survived' components in the order the data flows through them:
# select -> engineer -> transform -> predict.
# Each call echoes a (rows, columns) shape in the cell output.
# NOTE(review): presumably the shape of that component's result — confirm.
ctrl.register.feature_select('survived')
ctrl.register.feature_engineer('survived')
ctrl.register.feature_transform('survived')
ctrl.register.automl('survived')

(1309, 8)

(1309, 9)

(1309, 9)

(1309, 1)

In [32]:
# Run the registered components through the controller.
# NOTE(review): presumably executed in registration order — confirm.
ctrl.run_controller()

### Review Run

In [33]:
# Load what the prediction component persisted, then show its shape and contents.
result = aml.load_persist_canonical()
result.shape
result

(1309, 1)

pyarrow.Table
predict: int64
----
predict: [[1,0,1,0,1,...,0,0,0,0,0]]