In [149]:
import sys

import pandas as pd
import numpy as np
import scipy
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Add ../src to the import path before importing the local package
# (presumably where `columnar` lives — confirm against the repo layout).
sys.path.append('../src')
import columnar as col

# IPython autoreload in mode 2: modules are reloaded before each cell runs,
# so edits to the package source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# notebook to develop and test the transform module within columnar
Composite transforms generalize the transformation strategy by defining which MonoTransformer is applied to each column.

## load dataset

In [127]:
# load the petfinder frame and derive the feature selection from it
df = col.loaders.petfinder._load('../')
selected_features = col.loaders.petfinder._select_features(df)
feature_selection = col.FeatureSelection(**selected_features)

# split data: 20% held out for testing, fixed seed
df_train, df_test = train_test_split(df, random_state=123, test_size=.2)

# training partition: features/target plus a batched dataset
X_train, y_train = feature_selection.select_features(df_train)
ds_train = col.embeddings.data.df_to_dataset(
    X_train, y_train, batch_size=32)

# test partition: same steps, with shuffling switched off
X_test, y_test = feature_selection.select_features(df_test)
ds_test = col.embeddings.data.df_to_dataset(
    X_test, y_test, batch_size=32, shuffle=False)


# setup scorer
def _thresholded(metric):
    """Wrap `metric` so it receives `ypreds > .5` instead of the raw predictions."""
    return lambda ytrue, ypreds: metric(ytrue, ypreds > .5)


scorer = col.Scorer(
    acc=_thresholded(metrics.accuracy_score),
    f1=_thresholded(metrics.f1_score),
    auc=metrics.roc_auc_score,  # receives the raw predictions, no threshold
)

## Test transform.mono

In [197]:
# test MTE transformer on the categorical columns
mte = col.transform.mono.MeanTargetEncoder(alpha=5)
X_train_ = mte.fit_transform(X_train, y_train, feature_selection.categoricals)

assert X_train_.shape == (11994, 17)
assert str(X_train_.dtypes.iloc[5]) == 'float64'


# test passthrough transformer on the numerical columns
passthrough = col.transform.mono.PassThrough()
X_train_ = passthrough.fit_transform(X_train, y_train, feature_selection.numericals)
assert list(X_train_.columns) == ['photoamt', 'videoamt', 'fee', 'age']


# test sklearn wrapper: OneHotEncoder (applied to the numerical columns here)
ohe = col.transform.mono.MonoFromSklearn(OneHotEncoder())
X_train_ = ohe.fit_transform(X_train, y_train, feature_selection.numericals)
# use the public `scipy.sparse.csr_matrix` name; the `scipy.sparse.csr`
# submodule path is a deprecated private namespace in recent SciPy releases
assert isinstance(X_train_, scipy.sparse.csr_matrix)
assert X_train_.shape == (11994, 209)

## Test transform.strategy

In [198]:
# components used by the strategy
mte = col.transform.mono.MeanTargetEncoder(alpha=5)
passthrough = col.transform.mono.PassThrough()

# testing TransformerStrategy: one (name, transformer, columns) tuple per column group
column_groups = [
    ('cats', mte, feature_selection.categoricals),
    ('nums', passthrough, feature_selection.numericals),
]
strategy = col.transform.strategy.TransformerStrategy.from_tuples(*column_groups)

# two transformations, and a mapping with 21 entries
assert len(strategy.transformations) == 2
assert len(strategy.mapping) == 21
assert strategy.mapping['breed1'] == 'MeanTargetEncoder(alpha=5)'

## Test transform.composite

In [201]:
# column names expected from both composite transformers below
expected_columns = [
    'type_', 'breed1_', 'breed2_', 'gender_', 'color1_', 'color2_',
    'color3_', 'maturitysize_', 'furlength_', 'vaccinated_', 'dewormed_',
    'sterilized_', 'health_', 'quantity_', 'state_', 'rescuerid_',
    'has_name_', 'photoamt', 'videoamt', 'fee', 'age',
]

# CompositeTransformer driven by the explicit strategy
transformer = col.transform.composite.CompositeTransformer(strategy)
X_train_ = transformer.fit_transform(X_train, y_train)
assert X_train_.shape[1] == 21
assert list(X_train_.columns) == expected_columns

# SimpleCompositeTransformer: instantiate a categorical > MTE / numericals > passthrough strategy
transformer = col.transform.composite.SimpleCompositeTransformer(mte, feature_selection)
X_train_ = transformer.fit_transform(X_train, y_train)

assert X_train_.shape[1] == 21
assert type(X_train_) == pd.DataFrame
assert list(X_train_.columns) == expected_columns

# validating that SimpleCompositeTransformer works with sparse matrix
ohe = col.transform.mono.MonoFromSklearn(OneHotEncoder())  # generates sparse output
transformer = col.transform.composite.SimpleCompositeTransformer(ohe, feature_selection)
X_train_ = transformer.fit_transform(X_train, y_train)
assert X_train_.shape == (11994, 5198)
# `sparse` was never imported in this notebook (NameError on a fresh kernel);
# go through the already-imported `scipy` package instead
assert isinstance(X_train_, scipy.sparse.csr_matrix)

# validating that CompositeTransformer works with mix of dataframes and np.arrays
scaler = col.transform.mono.MonoFromSklearn(MaxAbsScaler())  # generates numpy arrays as output
scaler_output = scaler.fit_transform(X_train, y_train, feature_selection.numericals)
assert type(scaler_output) == np.ndarray

# categoricals go through MTE, numericals through the scaler
mixed_groups = (
    ('cats', mte, feature_selection.categoricals),
    ('nums', scaler, feature_selection.numericals),
)
strategy = col.transform.strategy.TransformerStrategy.from_tuples(*mixed_groups)

# the combined result is expected to be a plain ndarray
transformer = col.transform.composite.CompositeTransformer(strategy)
X_train_ = transformer.fit_transform(X_train, y_train)
assert type(X_train_) == np.ndarray

## Testing MonoEmbeddings
Testing of MonoEmbeddings is performed separately in the [test_embeddings.ipynb](./test_embeddings.ipynb) notebook.

## Testing integration with benchmark

In [202]:
# categorical transformers to compare
transformers = [
    col.transform.mono.MeanTargetEncoder(),
    col.transform.mono.MonoFromSklearn(OneHotEncoder(handle_unknown='ignore')),
    col.transform.mono.MonoFromSklearn(OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
]

# classifiers to compare
# NOTE(review): RandomForestClassifier has no random_state here, so its scores can still vary between runs
classifiers = [LogisticRegression(max_iter=500), RandomForestClassifier(max_depth=10)]

benchmark = col.benchmark.BenchmarkRunner(
    features=feature_selection,
    cat_transformers=transformers,
    classifiers=classifiers,
    scorer=scorer
)

# three runs, each on a different 10000-row subsample of the training set;
# seeding every draw with the run index makes the subsamples reproducible
# under Restart & Run All (the original draw was unseeded)
for run in range(3):
    idx_train = X_train.sample(10000, random_state=run).index
    benchmark.run(X_train.loc[idx_train, :], y_train.loc[idx_train], X_test, y_test)

reporter = benchmark.create_reporter()
reporter.show().head(3)

Unnamed: 0,classifier,transformer,acc,f1,auc
0,LogisticRegression(max_iter=500),SimpleComposite_MeanTargetEncoder(alpha=5),0.755252,0.844074,0.789692
1,RandomForestClassifier(max_depth=10),SimpleComposite_MeanTargetEncoder(alpha=5),0.765366,0.850888,0.81082
2,LogisticRegression(max_iter=500),SimpleComposite_Mono_OneHotEncoder(handle_unkn...,0.771702,0.853993,0.801114
