In [202]:
import sys

import pandas as pd
import numpy as np
import scipy
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from tensorflow import keras

sys.path.append('../src')
import columnar as col

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load dataset

In [172]:
# Load the petfinder frame and build the feature selection from the columns
# the loader module picks out of it.
df = col.loaders.petfinder._load('../')
selected_columns = col.loaders.petfinder._select_features(df)
feature_selection = col.FeatureSelection(**selected_columns)

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=123)

# Training split: feature/target frames plus the batched dataset built from them.
X_train, y_train = feature_selection.select_features(df_train)
ds_train = col.embeddings.data.df_to_dataset(
    X_train,
    y_train,
    batch_size=32,
)

# Test split: same treatment, with shuffling switched off.
X_test, y_test = feature_selection.select_features(df_test)
ds_test = col.embeddings.data.df_to_dataset(
    X_test,
    y_test,
    batch_size=32,
    shuffle=False,
)

# Scorer: accuracy and F1 threshold the predictions at 0.5, AUC takes them as-is.
scorer = col.Scorer(
    acc=lambda y_true, y_score: metrics.accuracy_score(y_true, y_score > 0.5),
    f1=lambda y_true, y_score: metrics.f1_score(y_true, y_score > 0.5),
    auc=metrics.roc_auc_score,
)

# Small three-column dataset (two categoricals, one numerical) used by the
# layer and model tests below.
categoricals = ['breed1', 'type']
numericals = ['fee']
dataset = col.embeddings.data.df_to_dataset(
    df_train[categoricals + numericals],
    df_train.target,
)

def assert_equal(a, b, msg, prec=3):
    """Assert that ``a`` and ``b`` agree to within ``10 ** -prec``.

    Args:
        a: Observed value; a number or a NumPy array of any shape.
        b: Expected value; a number or an array that broadcasts against ``a``.
        msg: Message attached to the ``AssertionError`` on mismatch.
        prec: Decimal precision; the absolute difference must be strictly
            smaller than ``10 ** -prec``.

    Raises:
        AssertionError: If any element differs by ``10 ** -prec`` or more.
    """
    diff = abs(a - b)
    # Reduce with np.all so multi-element arrays are checked element-wise
    # instead of raising "truth value of an array is ambiguous".
    assert np.all(diff < 10 ** (-prec)), msg


## Test `embeddings.layers`

In [134]:
# testing TFNormalizationLayer
column = 'fee'
# Use `column` rather than repeating the literal, so the name is set in one place.
normalizer = col.embeddings.layers.TFNormalizationLayer(col_name=column)
normalizer.adapt(dataset)
weights = normalizer.normalizer.get_weights()
# NOTE(review): weights[0] and weights[2] are presumably the adapted mean and the
# number of samples seen by the wrapped normalization layer -- confirm.
assert_equal(weights[0],  20.781, "normalizer weights are abnormal", prec=2)
assert_equal(weights[2], 11994, "normalizer weights are abnormal")

# testing TFEmbeddingLayer
column = 'breed1'
emb_layer = col.embeddings.layers.TFEmbeddingLayer(column, 'max50')
emb_layer.adapt(dataset)
# isinstance is the idiomatic type check and also accepts Embedding subclasses.
assert isinstance(emb_layer.embedding, keras.layers.Embedding), "TFEmbeddingLayer does not have the right type for the embedding attribute"
assert emb_layer.embedding.output_dim == 50, "embedding layer does not output the right embedding size"

# Same column with the 'max2' size strategy: the output size is expected to be 2.
emb_layer = col.embeddings.layers.TFEmbeddingLayer(column, 'max2')
emb_layer.adapt(dataset)
assert emb_layer.embedding.output_dim == 2, "embedding layer does not output the right embedding size"


2022-02-17 23:07:01.693948: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-17 23:07:01.708545: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-17 23:07:02.961498: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-17 23:07:03.549191: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


## Test `embeddings.models`

In [194]:
# test BaseTFTransformStrategy
strat = col.embeddings.tf_strategy.BaseTFTransformStrategy(categoricals, numericals, emb_size_strategy='max50')
assert strat.categoricals == categoricals, "strategy does not keep the categorical columns it was given"
assert strat.numericals == numericals, "strategy does not keep the numerical columns it was given"
# isinstance is the idiomatic type check and also accepts subclasses.
fee_layer = strat.get_encoding_layers()['fee']
assert isinstance(fee_layer, col.embeddings.layers.TFNormalizationLayer), \
    f"'fee' should be encoded by a TFNormalizationLayer, got {type(fee_layer).__name__}"

# test TFCatEmbsEncoder
encoder = col.embeddings.models.TFCatEmbsEncoder(tf_strategy=strat)
encoder.adapt(dataset)
sample = dataset.take(1)  # a single batch
encoded = encoder.predict(sample)
# NOTE(review): 32 is presumably df_to_dataset's default batch size and 52 the
# concatenated width of the encoded columns -- confirm.
assert encoded.shape == (32, 52), f"unexpected encoder output shape {encoded.shape}"

# test TFCatEmbsClassifier (built on a fresh, un-adapted encoder)
encoder = col.embeddings.models.TFCatEmbsEncoder(tf_strategy=strat)
clf = col.embeddings.models.TFCatEmbsClassifier(encoder=encoder)
clf.adapt(dataset)

clf.compile(optimizer='adam', loss=keras.losses.BinaryCrossentropy(from_logits=True), metrics=["accuracy"])
hist = clf.fit(dataset)
first_weight_shape = clf.get_weights()[0].shape
assert first_weight_shape == (169, 50), f"unexpected shape for the first weight matrix: {first_weight_shape}"
first_epoch_accuracy = hist.history['accuracy'][0]
assert first_epoch_accuracy > 0.6, f"first-epoch accuracy too low: {first_epoch_accuracy}"

2022-02-18 08:53:22.917708: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 08:53:23.332494: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 08:53:23.718153: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 08:53:23.732725: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 08:53:25.025222: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 08:53:25.103095: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 08:53:25.679573: I tensorflow/core/grappler/optimizers/cust



## Test `embeddings.wrapper`

In [199]:
m = col.embeddings.wrapper.MonoEmbeddings('max2')
X_train_ = X_train[numericals + categoricals]
# Derive the expected row count from the input instead of hard-coding the
# size of the current train split.
n_rows = len(X_train_)

X_train_transformed = m.fit_transform(X_train_, y_train, categoricals, epochs=1)
assert X_train_transformed.shape == (n_rows, 3), \
    f"unexpected transformed shape {X_train_transformed.shape}"
predicted = m.predict_class_from_df(X_train_)
assert predicted.shape == (n_rows, 1), f"unexpected prediction shape {predicted.shape}"

2022-02-18 09:07:49.370039: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:07:49.741734: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:07:50.117889: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:07:50.128392: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:07:51.166813: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-02-18 09:07:57.480399: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:07:57.963820: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [186]:
embedder = col.embeddings.wrapper.MonoEmbeddings('max2')

# Fit for three epochs on the full feature set; the return value is not needed.
_ = embedder.fit(X_train, y_train, feature_selection.categoricals, epochs=3)

# Score the held-out predictions and require a minimum AUC.
test_predictions = embedder.predict_class_from_df(X_test)
score = scorer.score(y_test, test_predictions)
assert score['auc'] > 0.8

2022-02-17 23:34:47.036011: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-17 23:34:47.466453: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-17 23:34:48.045853: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-17 23:34:48.689224: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-17 23:34:49.305596: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-17 23:34:49.944175: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-17 23:34:50.553039: I tensorflow/core/grappler/optimizers/cust

Epoch 1/3


2022-02-17 23:35:01.546382: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


2022-02-17 23:36:53.743729: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


{'acc': 0.7719239746582194, 'f1': 0.8395872420262664, 'auc': 0.821264291705293}

## Testing integration with `SimpleCompositeTransformer`

In [189]:
# Plug MonoEmbeddings in as the categorical transformer of the composite.
mono_embeddings = col.embeddings.MonoEmbeddings('max2')
transformer = col.transform.composite.SimpleCompositeTransformer(
    cat_transformer=mono_embeddings,
    features=feature_selection,
)

# Fit on the train split, then check the shape produced for the test split.
transformer.fit(X_train, y_train)
transformed_test = transformer.transform(X_test)
assert transformed_test.shape == (2999, 36), "transformed shape does not match the transform strategy"

2022-02-18 08:35:21.999974: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 08:35:22.441996: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 08:35:22.879231: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 08:35:23.309413: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 08:35:23.770583: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 08:35:24.234251: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 08:35:24.681238: I tensorflow/core/grappler/optimizers/cust

Epoch 1/3


2022-02-18 08:35:35.544435: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


SimpleComposite_MonoEmbeddings_Max2Strategy()

## Testing integration with benchmarks

In [201]:
transformers = [col.embeddings.MonoEmbeddings('max2')]

classifiers = [RandomForestClassifier(max_depth=10)]

benchmark = col.benchmark.BenchmarkRunner(
    features=feature_selection,
    cat_transformers=transformers,
    classifiers=classifiers,
    scorer=scorer
)

# Run the benchmark three times, each on a 10,000-row subsample of the train
# split. Seeding the draw with the repetition index keeps the three subsamples
# different from one another but identical on every re-run of the notebook.
for run_seed in range(3):
    idx_train = X_train.sample(10000, random_state=run_seed).index
    benchmark.run(X_train.loc[idx_train, :], y_train.loc[idx_train], X_test, y_test)

reporter = benchmark.create_reporter()
reporter.show().head(3)

2022-02-18 09:12:55.565078: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:12:55.924036: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:12:56.270204: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:12:56.621186: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:12:56.967687: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:12:57.315306: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:12:57.658439: I tensorflow/core/grappler/optimizers/cust

Epoch 1/3


2022-02-18 09:13:05.195763: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


2022-02-18 09:14:30.804762: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:14:32.859702: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:14:33.196356: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:14:33.533876: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:14:33.866564: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:14:34.201463: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:14:34.531272: I tensorflow/core/grappler/optimizers/cust

Epoch 1/3


2022-02-18 09:14:42.020731: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


2022-02-18 09:16:13.382091: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:16:15.437890: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:16:15.775265: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:16:16.107017: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:16:16.439353: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:16:16.771477: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-18 09:16:17.101863: I tensorflow/core/grappler/optimizers/cust

Epoch 1/3


2022-02-18 09:16:24.703361: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


2022-02-18 09:18:11.033446: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Unnamed: 0,classifier,transformer,acc,f1,auc
0,RandomForestClassifier(max_depth=10),SimpleComposite_MonoEmbeddings_Max2Strategy(),0.768923,0.845272,0.809501
