# Synopsis

Study of a variety of FE methods with Logistic Regression and LightGBM

In this study we use a variety of feature engineering methods and check their effect on untuned logistic regression and boosting models.  We include both standard and experimental methods.

The one library that is not pre-installed is [kaggler](https://github.com/jeongyoonlee/Kaggler) by Jeong Yoon Lee.

# Outline

* **[Setup](#Setup)** - imports and variables
* **[Classes](#Classes)** - Custom classes used
* **[Functions](#Functions)** - Custom functions used
* **[Baselines](#Baselines)** - Various baselines to evaluate our results
    * **[Equal Classes](#Equal-Classes)**
    * **[Weighted Classes](#Weighted-Classes)**
    * **[Unmodified Features](#Unmodified-Features)**
* **[Decomposition](#Decomposition)** - Tests of Decomposition Methods
    * **[PCA](#PCA)**
    * **[Factor Analysis](#Factor-Analysis)**
    * **[Factor Analysis Rotated](#Factor-Analysis-Rotated)**
    * **[Fast ICA](#Fast-ICA)**
* **[Supervised DAE](#Supervised-DAE)** - Test of Supervised DAE from kaggler
* **[Target Encoding](#Target-Encoding)**
* **[Numeric Binning](#Numeric-Binning)** - Experimental
* **[One-Hot Encoding](#One-Hot-Encoding)**
* **[Rotate Features and Values](#Rotate-Features-and-Values)** - Experimental
* **[Positive Encoding](#Positive-Encoding)** - Experimental
* **[Positive Encoding Plus Original](#Positive-Encoding-Plus-Original)** - Experimental
* **[Embedding](#Embedding)**
* **[Summary](#Summary)**



# Setup

In [None]:
import copy

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas.io.formats import style

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
import sklearn.preprocessing as sk_prep
import sklearn.model_selection as sk_ms
import sklearn.feature_selection as sk_fs
import sklearn.pipeline as sk_pipe
import sklearn.compose as sk_comp
import sklearn.base as sk_base
import sklearn.ensemble as sk_ens
import sklearn.metrics as sk_met
import sklearn.linear_model as sk_lm
import sklearn.tree as sk_tree
import sklearn.svm as sk_svm
import sklearn.decomposition as sk_de
import category_encoders as ce

from scipy import stats

import lightgbm as lgbm

In [None]:
!pip install kaggler

In [None]:
import kaggler
import kaggler.preprocessing as kag_prep
print(kaggler.__version__)

In [None]:
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

In [None]:
import os
# Print the full path of every file found under /kaggle/input
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Directory that is joined with 'train.csv' when the data is loaded
DATA_DIR = '/kaggle/input/tabular-playground-series-jun-2021'
# Seed shared by the train/validation split and the seeded transformers and models below
RANDOM_STATE = 9003

# Classes

Classes used for feature engineering

In [None]:
class NumCategorizer(sk_base.TransformerMixin, sk_base.BaseEstimator):
    """
    Transform numeric features into ordered categorical features based on
    splits determined by a Decision Tree.
    This is coded as a Scikit-Learn Transformer.

    During ``fit`` a single-feature decision tree is trained for every column
    of ``X``; the thresholds of its split nodes become the bin edges for that
    column.  ``transform`` then replaces each value with the ordinal label of
    the bin it falls into.

    Parameters
    ----------
    min_samples_leaf : int
        Passed to ``DecisionTreeClassifier``.
    min_impurity_decrease : float
        Passed to ``DecisionTreeClassifier``.
    random_state : int
        Passed to ``DecisionTreeClassifier``.

    Attributes
    ----------
    bin_boundaries_ : dict
        Maps each column name to its sorted array of bin edges, starting with
        ``-inf`` and ending with ``+inf``.
    names_ : pandas.Index
        Column names of the frame seen during ``fit``.
    """

    def __init__(self, min_samples_leaf=1000, min_impurity_decrease=0.0001, random_state=1):
        self.min_samples_leaf = min_samples_leaf
        self.min_impurity_decrease = min_impurity_decrease
        self.random_state = random_state

    def fit(self, X, y):
        """
        Learn the bin edges of every column of the DataFrame ``X`` from ``y``.

        Returns ``self``.
        """
        model1 = sk_tree.DecisionTreeClassifier(
            min_samples_leaf=self.min_samples_leaf,
            min_impurity_decrease=self.min_impurity_decrease,
            random_state=self.random_state
        )

        bin_boundaries = {}

        for col in X:
            model1.fit(X.loc[:, [col]], y)
            tree = model1.tree_

            # Leaves also have an entry in ``threshold`` (a placeholder value).
            # Select the split nodes explicitly instead of assuming that the
            # placeholder sorts before every real threshold, which fails for
            # features that have cut points below the placeholder.
            is_split = tree.children_left != tree.children_right
            cuts = np.unique(tree.threshold[is_split])  # sorted and de-duplicated

            # Open-ended outer bins so that every finite value is assigned a bin.
            bin_boundaries[col] = np.concatenate(([-np.inf], cuts, [np.inf]))

        self.bin_boundaries_ = bin_boundaries
        self.names_ = X.columns

        return self

    def transform(self, X, y=None):
        """
        Replace every value of ``X`` by the ordinal label (0, 1, ...) of its bin.

        Returns a DataFrame with the same index and columns as ``X``; ``y`` is
        ignored.
        """
        tr_trans = pd.DataFrame(index=X.index)

        for col in X:
            bounds = self.bin_boundaries_[col]
            tr_trans[col] = pd.cut(X[col], bins=bounds, labels=range(len(bounds) - 1))

        return tr_trans

    def get_feature_names(self):
        """Return the column names seen during ``fit``."""
        return self.names_

In [None]:
class MyPolynomialWrapper(sk_base.TransformerMixin, sk_base.BaseEstimator):
    """
    This is like category_encoders' PolynomialWrapper, but I had inconsistent
    results with that class so I have made my own.

    A separate copy of ``encoder`` is fitted for every target class against the
    binary indicator "row belongs to this class".  ``transform`` concatenates
    the outputs of all copies side by side, prefixing each column name with the
    class it was fitted for.

    Parameters
    ----------
    encoder : object
        Unfitted encoder with ``fit(X, y)`` and ``transform(X)``; ``transform``
        must return a DataFrame.  It is never fitted itself, only copied.

    Attributes
    ----------
    target_cats : numpy.ndarray
        Sorted unique target values seen during ``fit``.
    enc_list : list
        One fitted encoder per entry of ``target_cats``, in the same order.
    """

    def __init__(self, encoder):
        self.encoder = encoder

    def fit(self, X, y):
        """Fit one independent encoder copy per target class and return ``self``."""
        target_cats = np.unique(y)  # np.unique already returns sorted values
        self.target_cats = target_cats

        enc_list = []
        for tcat in target_cats:
            # A fresh copy per class: refitting one shared object would leave
            # every list entry pointing at the encoder fitted for the last class.
            encoder = copy.deepcopy(self.encoder)
            encoder.fit(X, (y == tcat).astype(int))
            enc_list.append(encoder)

        self.enc_list = enc_list

        return self

    def transform(self, X, y=None):
        """
        Encode ``X`` with every per-class encoder.

        Returns a DataFrame whose columns are named ``<class>_<column>``;
        ``y`` is ignored.
        """
        trans_list = []
        for tcat, encoder in zip(self.target_cats, self.enc_list):
            X_e = encoder.transform(X)
            X_e.columns = [f'{tcat!s}_{col!s}' for col in X_e]
            trans_list.append(X_e)

        return pd.concat(trans_list, axis=1)

In [None]:
class EmbeddingModel(nn.Module):
    """
    Small feed-forward pytorch network whose purpose is to train an embedding.

    A single embedding table is applied to the whole integer input, the
    per-feature vectors are flattened into one row per sample, and two hidden
    layers (100 and 50 units) followed by a linear output layer are stacked on
    top.  Dropout with rate ``drop_rate`` is applied after the embedding and
    after each hidden layer.
    """

    def __init__(self, input_node_cnt: int, output_node_cnt: int = 1, drop_rate: float = 0.0, embeddings=100, embedding_dim=3):
        super().__init__()
        flat_width = input_node_cnt * embedding_dim
        self.embed1 = nn.Embedding(embeddings, embedding_dim)
        self.lin1 = nn.Linear(flat_width, 100)
        self.lin2 = nn.Linear(100, 50)
        self.out = nn.Linear(50, output_node_cnt)
        self.drop_rate = drop_rate
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, input):
        """Return the output-layer activations (no softmax applied) for ``input``."""
        hidden = self.encode(input)
        for layer in (self.lin1, self.lin2):
            hidden = self.dropout(nn.functional.relu(layer(hidden)))
        return self.out(hidden)

    def encode(self, input):
        """
        Return the flattened embedding of ``input`` after the dropout layer.

        Dropout is part of this path, so the result is only deterministic while
        the module is in eval mode.
        """
        flat = self.embed1(input).flatten(start_dim=1)
        return self.dropout(flat)
        

# Functions

Functions for the NN training used for feature engineering

In [None]:
def loss_batch(model, loss_func, xb, yb, opt=None):
    """
    Compute the loss of ``model`` on one batch and, when an optimizer is given,
    take one optimisation step (adapted from the pytorch examples).

    Leave ``opt`` as None to only evaluate: no gradients are computed and no
    parameters change.  Returns the loss as a Python float.
    """
    batch_loss = loss_func(model(xb), yb)

    # Evaluation only: nothing to update
    if opt is None:
        return batch_loss.item()

    batch_loss.backward()
    opt.step()
    opt.zero_grad()  # clear the gradients so they do not carry over into the next call
    return batch_loss.item()

In [None]:
def nn_train(X, y, model, epochs):
    """
    This is a basic function to handle training of our pytorch model.

    The model is moved to the GPU when one is available and trained with SGD
    (lr=0.2, momentum=0.9) on a cross-entropy loss.  Every epoch is a single
    update on the complete data set; no mini-batches are used.

    Parameters
    ----------
    X : array-like
        Input values; converted to a ``torch.long`` tensor.
    y : array-like
        Class indices; converted to a ``torch.long`` tensor.
    model : nn.Module
        Model to train in place.  It is left in eval mode on return.
    epochs : int
        Number of full-batch updates.

    Returns
    -------
    float or None
        Training loss of the last epoch, or None when ``epochs`` is not
        positive and therefore no update was performed.
    """

    dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(dev)

    loss_func = nn.CrossEntropyLoss()
    opt = optim.SGD(model.parameters(),
                lr=0.2,
                momentum=0.9)

    X_train = torch.tensor(X, dtype=torch.long).to(dev)
    y_train = torch.tensor(y, dtype=torch.long).to(dev)

    # Defined up front so that a run with zero epochs still has a value to return
    loss1 = None

    model.train()

    for epoch in range(epochs):
        loss1 = loss_batch(model, loss_func, X_train, y_train, opt)

        print(f'Epoch: {epoch}  Loss: {loss1}')

    model.eval()

    return loss1 # Final training loss

# Load Data

In [None]:
# Read the training file from the data directory
train_set = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))

# Features are all columns except the first and the last one
train_data = train_set.iloc[:, 1:-1]
train_ar = train_data.to_numpy()

# Target as strings (train_y) and as integer codes (train_y_num)
train_y = train_set['target']
le_targ = sk_prep.LabelEncoder()
train_y_num = le_targ.fit_transform(train_y)
classes = le_targ.classes_

# Train data with label encoded target
train_w_targ = train_data.assign(target=train_y_num)

print(train_set.shape)
train_set

In [None]:
# Separate train / validation split for initial tests

# I have chosen not to use a stratified split here.

# 30% of the rows are held out for validation; the fixed seed keeps the split reproducible.
train_tv_X, val_tv_X, train_tv_y, val_tv_y = sk_ms.train_test_split(train_data, train_y, test_size=0.3, random_state=RANDOM_STATE)

In [None]:
# Collects one [method name, LR loss, LGBM loss] row per feature engineering method
loss_list_1 = []

# Baselines

## Equal Classes

The most basic prediction is the expectation that all classes have an equal chance for all samples.  This only uses the classes of y and does not require any training data.  This sets a distant outer limit for our scores.

In [None]:
# Every class receives the same probability for every validation row
pred_1 = 1.0 / len(classes)

# Build the frame directly from a float array: filling an empty DataFrame
# afterwards leaves its columns with object dtype.
y_hat = pd.DataFrame(
    np.full((len(val_tv_y), len(classes)), pred_1),
    columns=classes,
    index=val_tv_y.index,
)

# labels pins the column order instead of inferring it from the validation targets
eq_loss = sk_met.log_loss(val_tv_y, y_hat, labels=classes)
eq_loss

## Weighted Classes

This goes one step further and uses the frequency of the classes in y in the training data.  It still does not use any features in the data.  Any model with a score close to this is not profiting at all from the features that are passed to it.

In [None]:
# Relative class frequencies in the training part of the split, ordered like
# `classes`; a class missing from the training part gets probability 0.
y_hat_vals = train_tv_y.value_counts(normalize=True).reindex(classes, fill_value=0.0)

# Repeat the same frequency vector for every validation row.  The array is
# built explicitly rather than assigning a Series through iloc, which depends
# on implicit alignment and leaves the frame with object dtype.
y_hat = pd.DataFrame(
    np.tile(y_hat_vals.to_numpy(), (len(val_tv_y), 1)),
    columns=classes,
    index=val_tv_y.index,
)

# labels pins the column order instead of inferring it from the validation targets
wt_loss = sk_met.log_loss(val_tv_y, y_hat, labels=classes)
wt_loss

## Unmodified Features

In [None]:
%%time

# Fit the untuned logistic regression on the unmodified training features
model = sk_lm.LogisticRegression(solver='lbfgs', max_iter=1000).fit(train_tv_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val_tv_X)
lr_loss = sk_met.log_loss(val_tv_y, y_hat)
lr_loss

In [None]:
%%time

# Fit LightGBM (only the seed is set) on the unmodified training features
model = lgbm.LGBMClassifier(random_state=RANDOM_STATE).fit(train_tv_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val_tv_X)
lgbm_loss = sk_met.log_loss(val_tv_y, y_hat)
lgbm_loss

In [None]:
# Record both validation losses for the summary table
loss_list_1.append(['Original', lr_loss, lgbm_loss])

# Decomposition

## PCA

Although there is not a lot of correlation between features, we will check if PCA makes the data easier for either of the two test models to handle.

In [None]:
# Scaling: statistics come from the training rows only and are reused for validation

scaler_in = sk_prep.StandardScaler().fit(train_tv_X)
tr1_X = scaler_in.transform(train_tv_X)
val1_X = scaler_in.transform(val_tv_X)

# PCA: no component limit is passed; fitted on the scaled training rows

pca_in = sk_de.PCA(random_state=RANDOM_STATE)
tr2_X = pca_in.fit_transform(tr1_X)
val2_X = pca_in.transform(val1_X)

In [None]:
%%time

# Fit the untuned logistic regression on the transformed training features
model = sk_lm.LogisticRegression(solver='lbfgs', max_iter=1000).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lr_loss = sk_met.log_loss(val_tv_y, y_hat)
lr_loss

In [None]:
%%time

# Fit LightGBM (only the seed is set) on the transformed training features
model = lgbm.LGBMClassifier(random_state=RANDOM_STATE).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lgbm_loss = sk_met.log_loss(val_tv_y, y_hat)
lgbm_loss

In [None]:
# Record both validation losses for the summary table
loss_list_1.append(['PCA', lr_loss, lgbm_loss])

## Factor Analysis

In [None]:
# Scaling: statistics come from the training rows only and are reused for validation

scaler_in = sk_prep.StandardScaler().fit(train_tv_X)
tr1_X = scaler_in.transform(train_tv_X)
val1_X = scaler_in.transform(val_tv_X)

# FA: no component limit is passed; fitted on the scaled training rows

fa_in = sk_de.FactorAnalysis(random_state=RANDOM_STATE)
tr2_X = fa_in.fit_transform(tr1_X)
val2_X = fa_in.transform(val1_X)

In [None]:
%%time

# Fit the untuned logistic regression on the transformed training features
model = sk_lm.LogisticRegression(solver='lbfgs', max_iter=1000).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lr_loss = sk_met.log_loss(val_tv_y, y_hat)
lr_loss

In [None]:
%%time

# Fit LightGBM (only the seed is set) on the transformed training features
model = lgbm.LGBMClassifier(random_state=RANDOM_STATE).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lgbm_loss = sk_met.log_loss(val_tv_y, y_hat)
lgbm_loss

In [None]:
# Record both validation losses for the summary table
loss_list_1.append(['FA', lr_loss, lgbm_loss])

We could add tests to run with different numbers of components--sometimes FA works better when only the first set of components is used.

## Factor Analysis Rotated

In [None]:
# Scaling: statistics come from the training rows only and are reused for validation

scaler_in = sk_prep.StandardScaler().fit(train_tv_X)
tr1_X = scaler_in.transform(train_tv_X)
val1_X = scaler_in.transform(val_tv_X)

# FA with varimax rotation; fitted on the scaled training rows

fa_in = sk_de.FactorAnalysis(random_state=RANDOM_STATE, rotation='varimax')
tr2_X = fa_in.fit_transform(tr1_X)
val2_X = fa_in.transform(val1_X)

In [None]:
%%time

# Fit the untuned logistic regression on the transformed training features
model = sk_lm.LogisticRegression(solver='lbfgs', max_iter=1000).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lr_loss = sk_met.log_loss(val_tv_y, y_hat)
lr_loss

In [None]:
%%time

# Fit LightGBM (only the seed is set) on the transformed training features
model = lgbm.LGBMClassifier(random_state=RANDOM_STATE).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lgbm_loss = sk_met.log_loss(val_tv_y, y_hat)
lgbm_loss

In [None]:
# Record both validation losses for the summary table
loss_list_1.append(['FA-Rot', lr_loss, lgbm_loss])

## Fast ICA

In [None]:
# # Scaling
# (disabled: unlike the other decomposition tests, FastICA is fitted on the unscaled features)

# scaler_in = sk_prep.StandardScaler()
# tr1_X = scaler_in.fit_transform(train_tv_X)
# val1_X = scaler_in.transform(val_tv_X)

# FastICA
# The variable name fa_in is reused here although it holds a FastICA instance.
# NOTE(review): depending on the scikit-learn version's `whiten` default, the components
# may come out with a very small variance, which would interact with LogisticRegression's
# default regularisation -- confirm before attributing the LR result to ICA itself.

fa_in = sk_de.FastICA(random_state=RANDOM_STATE)
tr2_X = fa_in.fit_transform(train_tv_X)
val2_X = fa_in.transform(val_tv_X)

In [None]:
%%time

# Fit the untuned logistic regression on the transformed training features
model = sk_lm.LogisticRegression(solver='lbfgs', max_iter=1000).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lr_loss = sk_met.log_loss(val_tv_y, y_hat)
lr_loss

In [None]:
%%time

# Fit LightGBM (only the seed is set) on the transformed training features
model = lgbm.LGBMClassifier(random_state=RANDOM_STATE).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lgbm_loss = sk_met.log_loss(val_tv_y, y_hat)
lgbm_loss

In [None]:
# Record both validation losses for the summary table
loss_list_1.append(['FastICA', lr_loss, lgbm_loss])

None of these decomposition methods has helped logistic regression, and Fast ICA made it much worse--not much better than weighted classes w/o features.  Boosting was not affected much either way by any of these.  PCA or FA might be better using a smaller set of their outputs; this is especially true for FA.

# Supervised DAE

This uses libraries from kaggler and the example at https://www.kaggle.com/jeongyoonlee/tps-6-supervised-dae-keras-gpu

In [None]:
%%time

# Train SDAE on train data of split

# Integer-coded target for the training rows, produced by the label encoder fitted when the data was loaded
train_tv_y_num = le_targ.transform(train_tv_y)

# Every feature column is declared as categorical (cat_cols).
# NOTE(review): the meaning of the remaining arguments is defined by kaggler's SDAE -- verify against its documentation.
sdae = kag_prep.SDAE(cat_cols=train_data.columns.tolist(), encoding_dim=256, n_layer=1, noise_std=.001, batch_size=65536,
            n_epoch=10, random_state=RANDOM_STATE)
# Copies are passed throughout, presumably so that the encoder cannot modify the split in place
sdae.fit(train_tv_X.copy(), train_tv_y_num.copy())

tr2_X = sdae.transform(train_tv_X.copy())
val2_X = sdae.transform(val_tv_X.copy())

In [None]:
%%time

# Fit the untuned logistic regression on the transformed training features
model = sk_lm.LogisticRegression(solver='lbfgs', max_iter=1000).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lr_loss = sk_met.log_loss(val_tv_y, y_hat)
lr_loss

In [None]:
%%time

# Fit LightGBM (only the seed is set) on the transformed training features
model = lgbm.LGBMClassifier(random_state=RANDOM_STATE).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lgbm_loss = sk_met.log_loss(val_tv_y, y_hat)
lgbm_loss

In [None]:
# Record both validation losses for the summary table
loss_list_1.append(['Sup DAE', lr_loss, lgbm_loss])

# Target Encoding

In [None]:
%%time

# MyPolynomialWrapper fits the wrapped TargetEncoder against each target class in turn;
# all feature columns are encoded.
enc2 = MyPolynomialWrapper(ce.target_encoder.TargetEncoder(cols=list(train_data.columns), smoothing=300))

# NOTE(review): the training rows are encoded by an encoder fitted on those same rows and
# their targets -- confirm that this in-sample encoding is acceptable for the comparison.
tr2_X = enc2.fit_transform(train_tv_X, train_tv_y)
val2_X = enc2.transform(val_tv_X)

In [None]:
%%time

# Fit the untuned logistic regression on the transformed training features
model = sk_lm.LogisticRegression(solver='lbfgs', max_iter=1000).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lr_loss = sk_met.log_loss(val_tv_y, y_hat)
lr_loss

In [None]:
%%time

# Fit LightGBM (only the seed is set) on the transformed training features
model = lgbm.LGBMClassifier(random_state=RANDOM_STATE).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lgbm_loss = sk_met.log_loss(val_tv_y, y_hat)
lgbm_loss

In [None]:
# Record both validation losses for the summary table
loss_list_1.append(['Targ Enc', lr_loss, lgbm_loss])

# Numeric Binning

While the concept of binning numeric columns is not new, I did not find previous work that used a decision tree to decide on the best thresholds for the bins.  This method could be further tuned by adjusting the tree parameters.  The class in this notebook allows for adjusting min_samples_leaf and min_impurity_decrease.

In [None]:
%%time

# Bin every numeric column with the tree-based NumCategorizer class defined in this notebook;
# the bin edges are learned from the training rows and their targets only.

nc = NumCategorizer(random_state=RANDOM_STATE).fit(train_tv_X, train_tv_y)

tr1_X = nc.transform(train_tv_X)
val1_X = nc.transform(val_tv_X)

# Expand the bin labels into one indicator column per bin

ohe = sk_prep.OneHotEncoder().fit(tr1_X)
tr2_X = ohe.transform(tr1_X)
val2_X = ohe.transform(val1_X)

In [None]:
%%time

# Fit the untuned logistic regression on the transformed training features
model = sk_lm.LogisticRegression(solver='lbfgs', max_iter=1000).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lr_loss = sk_met.log_loss(val_tv_y, y_hat)
lr_loss

In [None]:
%%time

# Fit LightGBM (only the seed is set) on the transformed training features
model = lgbm.LGBMClassifier(random_state=RANDOM_STATE).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lgbm_loss = sk_met.log_loss(val_tv_y, y_hat)
lgbm_loss

In [None]:
# Record both validation losses for the summary table
loss_list_1.append(['Num Bin', lr_loss, lgbm_loss])

# One-Hot Encoding

In [None]:
# One indicator column per (feature, value) pair seen in the training rows.
# handle_unknown='ignore' lets validation values that never occurred in training pass through.

ohe = sk_prep.OneHotEncoder(handle_unknown='ignore').fit(train_tv_X)
tr2_X = ohe.transform(train_tv_X)
val2_X = ohe.transform(val_tv_X)

In [None]:
%%time

# Fit the untuned logistic regression on the transformed training features
model = sk_lm.LogisticRegression(solver='lbfgs', max_iter=1000).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lr_loss = sk_met.log_loss(val_tv_y, y_hat)
lr_loss

In [None]:
%%time

# Fit LightGBM (only the seed is set) on the transformed training features
model = lgbm.LGBMClassifier(random_state=RANDOM_STATE).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lgbm_loss = sk_met.log_loss(val_tv_y, y_hat)
lgbm_loss

In [None]:
# Record both validation losses for the summary table
loss_list_1.append(['One-Hot', lr_loss, lgbm_loss])

# Rotate Features and Values

I do not know a formal name for this.

Embedding assumes some relationship across all features for a given value.  So the value 5, for example, would be encoded in the same way if it occurred in feature_0, feature_1, etc.  For some types of data, it might be worthwhile to count how often a particular value occurs with no concern for where it occurs.  This is not likely to be the case for this data, but we will take a look.

In [None]:
tr1_ar = train_tv_X.values

# Values above the range of the training set are ignored since we wouldn't have any predictions for them.
max_val = np.max(tr1_ar)

# Column v holds, for every row, the number of features whose value equals v
tr2_ar = np.column_stack(
    [np.sum(tr1_ar == v, axis=1) for v in range(max_val + 1)]
).astype(np.float64)

tr2_X = pd.DataFrame(tr2_ar, columns=np.arange(max_val + 1))
tr2_X

In [None]:
val1_ar = val_tv_X.values

# Same per-row value counts as for the training rows, limited to the value range
# 0..max_val that was taken from the training rows
val2_ar = np.column_stack(
    [np.sum(val1_ar == v, axis=1) for v in range(max_val + 1)]
).astype(np.float64)

val2_X = pd.DataFrame(val2_ar, columns=np.arange(max_val + 1))
val2_X

In [None]:
%%time

# Fit the untuned logistic regression on the transformed training features
model = sk_lm.LogisticRegression(solver='lbfgs', max_iter=1000).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lr_loss = sk_met.log_loss(val_tv_y, y_hat)
lr_loss

In [None]:
%%time

# Fit LightGBM (only the seed is set) on the transformed training features
model = lgbm.LGBMClassifier(random_state=RANDOM_STATE).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lgbm_loss = sk_met.log_loss(val_tv_y, y_hat)
lgbm_loss

In [None]:
# Record both validation losses for the summary table
loss_list_1.append(['Rotate', lr_loss, lgbm_loss])

This method hurt the boosting model, which is no surprise.  It is more interesting that it did not hurt the logistic regression model.

# Positive Encoding

All feature values are simplified to 0 or 1--0 if it was originally 0 and 1 if it was positive. We will call this "positive encoding" in this study.

This was not originally a serious suggestion by itself; the point was to see how much value remained when the data were simplified in this way.

In [None]:
# Boolean mask "value is positive", stored as 0/1 integers under the original column names and row index
tr2_ar = train_tv_X.values > 0
tr2_X = pd.DataFrame(tr2_ar.astype('int32'), columns=train_data.columns, index=train_tv_X.index)

# Same encoding for the validation rows
val2_ar = val_tv_X.values > 0
val2_X = pd.DataFrame(val2_ar.astype('int32'), columns=train_data.columns, index=val_tv_X.index)

In [None]:
%%time

# Fit the untuned logistic regression on the transformed training features
model = sk_lm.LogisticRegression(solver='lbfgs', max_iter=1000).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lr_loss = sk_met.log_loss(val_tv_y, y_hat)
lr_loss

In [None]:
%%time

# Fit LightGBM (only the seed is set) on the transformed training features
model = lgbm.LGBMClassifier(random_state=RANDOM_STATE).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lgbm_loss = sk_met.log_loss(val_tv_y, y_hat)
lgbm_loss

In [None]:
# Record both validation losses for the summary table
loss_list_1.append(['PosEnc', lr_loss, lgbm_loss])

We see that logistic regression improved in spite of the lost data.  This suggests that zero is not simply one end of the range of numbers, but a special value of its own.  It is remarkable that with this simplification, logistic regression can do as well as boosting did on the original data.  I have looked at this in more detail in another [notebook](https://www.kaggle.com/bruceharold/the-power-of-positive-encoding).

# Positive Encoding Plus Original

In [None]:
# 0/1 "value is positive" indicators, named pos_<original column>, appended to the original features
tr2_ar = train_tv_X.values > 0
tr1_X = pd.DataFrame(tr2_ar.astype('int32'), columns='pos_' + train_data.columns, index=train_tv_X.index)
tr2_X = pd.concat([train_tv_X, tr1_X], axis=1)

# Same construction for the validation rows
val2_ar = val_tv_X.values > 0
val1_X = pd.DataFrame(val2_ar.astype('int32'), columns='pos_' + train_data.columns, index=val_tv_X.index)
val2_X = pd.concat([val_tv_X, val1_X], axis=1)

tr2_X

In [None]:
%%time

# Fit the untuned logistic regression on the transformed training features
model = sk_lm.LogisticRegression(solver='lbfgs', max_iter=1000).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lr_loss = sk_met.log_loss(val_tv_y, y_hat)
lr_loss

In [None]:
%%time

# Fit LightGBM (only the seed is set) on the transformed training features
model = lgbm.LGBMClassifier(random_state=RANDOM_STATE).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lgbm_loss = sk_met.log_loss(val_tv_y, y_hat)
lgbm_loss

In [None]:
# Record both validation losses for the summary table
loss_list_1.append(['PosEnc+', lr_loss, lgbm_loss])

Adding in the original data did not improve the scores very much, again underlining how much this dataset depends on the zero vs positive distinction.

# Embedding

Here we use a pytorch neural network to do the original training on the embedding, and then we will use the embedding layer to encode the data for our models.

In [None]:
# Get numeric version of y
# (integer class codes from the label encoder fitted when the data was loaded)

tr2_y = le_targ.transform(train_tv_y)
val2_y = le_targ.transform(val_tv_y)

In [None]:
%%time

# Train model with embedding for sake of the embedding layer.

dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Size the embedding table from both parts of the split: a validation value larger
# than every training value would otherwise index past the end of the table.
embeddings = int(max(np.max(train_tv_X.values), np.max(val_tv_X.values))) + 1

# Feature and class counts are taken from the data instead of being hard-coded
emb_model = EmbeddingModel(
    train_tv_X.shape[1],
    len(classes),
    drop_rate=0.3,
    embeddings=embeddings,
    embedding_dim=4,
)

nn_train(train_tv_X.values, tr2_y, emb_model, 10)

# Check that model is basically trained.  We are not concerned about a "good" score here, just being in the ballpark.

emb_model.eval()  # dropout must be inactive while predicting
with torch.no_grad():  # inference only, no autograd graph needed
    val_X_ten = torch.from_numpy(val_tv_X.values).to(dev)
    y_hat_ten = nn.Softmax(dim=1)(emb_model(val_X_ten))
y_hat = y_hat_ten.detach().cpu().numpy()

sk_met.log_loss(val2_y, y_hat)

In [None]:
# EmbeddingModel.encode applies the dropout layer, so the model has to be in eval
# mode for the encoded features to be deterministic.
emb_model.eval()

# Encoding is pure inference: skip building an autograd graph over the full data set
with torch.no_grad():
    tr_X_ten = torch.from_numpy(train_tv_X.values).to(dev)
    tr2_X = emb_model.encode(tr_X_ten).detach().cpu().numpy()

    val_X_ten = torch.from_numpy(val_tv_X.values).to(dev)
    val2_X = emb_model.encode(val_X_ten).detach().cpu().numpy()

In [None]:
%%time

# Fit the untuned logistic regression on the transformed training features
model = sk_lm.LogisticRegression(solver='lbfgs', max_iter=1000).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lr_loss = sk_met.log_loss(val_tv_y, y_hat)
lr_loss

In [None]:
%%time

# Fit LightGBM (only the seed is set) on the transformed training features
model = lgbm.LGBMClassifier(random_state=RANDOM_STATE).fit(tr2_X, train_tv_y)

# Log loss of the predicted class probabilities on the validation rows
y_hat = model.predict_proba(val2_X)
lgbm_loss = sk_met.log_loss(val_tv_y, y_hat)
lgbm_loss

In [None]:
# Record both validation losses for the summary table
loss_list_1.append(['Embedding', lr_loss, lgbm_loss])

# Summary

In [None]:
# Feature-free baselines for reference
print(f'Equal Class Loss:    {eq_loss}')
print(f'Weighted Class Loss: {wt_loss}')

# One row per feature engineering method
loss_df = pd.DataFrame(loss_list_1, columns=['Method', 'LR_Loss', 'LGBM_Loss'])

# A single colour scale across both loss columns so that LR and LGBM cells are comparable
val_min = loss_df[['LR_Loss', 'LGBM_Loss']].to_numpy().min()
val_max = loss_df[['LR_Loss', 'LGBM_Loss']].to_numpy().max()
style.Styler(loss_df, precision=4).background_gradient(cmap='viridis', vmin=val_min, vmax=val_max)

We can see that logistic regression improved from a number of these feature engineering methods.  In most cases where we found improvement, the amount of improvement was very similar.  Boosting received little improvement from any of the methods tried here.

Many of these FE methods can be tweaked, so some that show no gain in this study may be useful if they are handled in a different way.