kmodel provides reusable machine learning and deep learning helpers for
multi-output modeling workflows: tabular model training, scoring,
post-processing, prediction, and fastai-based deep learning utilities.
The runnable examples below follow the notebooks under nbs/ in order;
each function example lives in its own cell and starts with a short
comment derived from the function's docstring.
from kmodel.ml import get_splits, split_data, train_ml, train_ml_cv, post_process, post_process_oof, predict_ml
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LinearRegression
import seaborn as sns
df = sns.load_dataset("penguins").dropna(
subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
df.shape
# Split samples in a dataframe with stratified, grouped, or stratified-grouped K-fold logic.
splits = get_splits(df, stratified="species", nfold=3)
split0 = splits[0]
len(split0[0]), len(split0[1])
StratifiedKFold(n_splits=3, random_state=123, shuffle=True)
# species in train set: 3
# species in test set: 3
(228, 114)
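get_splits also supports the grouped variants named in its docstring; a minimal sketch, assuming the grouping argument is called group by analogy with stratified (check the kmodel.ml signature before relying on it):

# Hypothetical grouped split: rows sharing an island never straddle folds.
# NOTE: the `group` argument name is an assumption, not confirmed by the notebooks.
grouped_splits = get_splits(df, group="island", nfold=3)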
# Given a split tuple, return X_train, y_train, X_test, and y_test.
X_train, y_train, X_test, y_test = split_data(df, feat_col, target_col, split0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((228, 4), (228, 3), (114, 4), (114, 3))
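split_data is essentially positional indexing over the split tuple; a rough equivalent (a sketch, not the library code):

# Equivalent manual selection using the fold's train/test row indices.
train_idx, test_idx = split0
X_train_manual, y_train_manual = df.iloc[train_idx][feat_col], df.iloc[train_idx][target_col]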
# Fit and predict with a sklearn model, returning validation targets and predictions.
model = LinearRegression()
target, pred = train_ml(df, feat_col, target_col, split0, model)
pred.head()
    species_Adelie  species_Chinstrap  species_Gentoo
0         0.993427           0.137000       -0.130427
3         1.064457           0.046586       -0.111043
9         0.839056           0.118838        0.042105
11        0.669557           0.423417       -0.092974
14        1.050863          -0.073914        0.023052
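LinearRegression fits an independent regressor per target column, so the one-hot targets come back unconstrained; the negative values above are exactly what post_process cleans up below.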
# Run cross-validation through the given splits.
oof = train_ml_cv(df, feat_col, target_col, splits=splits, model=LinearRegression())
oof.head()
   species_Adelie  species_Chinstrap  species_Gentoo  nfold
0        0.993427           0.137000       -0.130427      0
1        0.790344           0.103762        0.105894      1
2        0.673088           0.317647        0.009265      2
3        1.064457           0.046586       -0.111043      0
4        1.122991           0.154406       -0.277398      1
# Clip negatives and renormalize probability-like predictions.
post_process(pred.head())
    species_Adelie  species_Chinstrap  species_Gentoo
0         0.878807       1.211930e-01    8.846216e-09
3         0.958070       4.192990e-02    9.000554e-09
9         0.839056       1.188384e-01    4.210543e-02
11        0.612601       3.873988e-01    9.149350e-09
14        0.978535       9.311731e-09    2.146502e-02
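Note the ~1e-09 entries: the library evidently clips to a small epsilon rather than exactly zero. A minimal pandas sketch of the same idea (not the library implementation):

# Clip negatives to zero, then renormalize each row to sum to one.
clipped = pred.head().clip(lower=0)
clipped.div(clipped.sum(axis=1), axis=0)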
# Post-process prediction columns in an out-of-fold dataframe.
oof = post_process_oof(oof, target_col)
oof[target_col].head()
   species_Adelie  species_Chinstrap  species_Gentoo
0        0.878807           0.121193    8.846216e-09
1        0.790344           0.103762    1.058942e-01
2        0.673088           0.317647    9.264531e-03
3        0.958070           0.041930    9.000554e-09
4        0.879124           0.120876    7.828416e-09
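Because the out-of-fold frame stays row-aligned with df, a quick sanity check (a hypothetical follow-up, not a kmodel helper) is to compare the argmax class with the true label:

# Fraction of rows where the highest-probability column matches the species.
pred_species = oof[target_col].idxmax(axis=1).str.replace("species_", "", regex=False)
(pred_species == df.loc[oof.index, "species"]).mean()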
# Predict from a saved sklearn model.
model_path = Path("_tmp/penguins_ml.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
_ = train_ml(df, feat_col, target_col, split0, LinearRegression(), save=model_path)
predict_ml(df.iloc[split0[1]], feat_col, target_col, model_pth=model_path).head()
    species_Adelie  species_Chinstrap  species_Gentoo
0         0.993427           0.137000       -0.130427
3         1.064457           0.046586       -0.111043
9         0.839056           0.118838        0.042105
11        0.669557           0.423417       -0.092974
14        1.050863          -0.073914        0.023052
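The remaining examples exercise the deep learning helpers in kmodel.dnn, mirroring the ML workflow above with fastai-based training, prediction, and cross-validation.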
from kmodel.dnn import seed_everything, GeneralDataset, MLP, lin_wn, CNN1D, PSSM_model, init_weights, CE, KLD, JSD, train_dl, predict_dl, train_dl_cv
import fastcore.all as fc
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from fastai.vision.all import *
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader
# Set up the objects used by the examples below.
seed_everything(123)
df = sns.load_dataset("penguins").dropna(
subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
n_feature = len(feat_col)
n_target = len(target_col)
n_aa = len(target_col)  # reused as the per-position alphabet size A for the PSSM-style helpers
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
splits = list(skf.split(df.index, df["species"]))
split0 = splits[0]
ds = GeneralDataset(df, feat_col, target_col, A=n_aa)
xb, yb = next(iter(DataLoader(ds, batch_size=8, shuffle=True)))
logits = PSSM_model(n_feature, n_target, A=n_aa, model="MLP")(xb)
df.shape
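To see what GeneralDataset and the untrained model produce, inspect the shapes of one batch:

# Shapes of one shuffled batch and the corresponding logits.
xb.shape, yb.shape, logits.shape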
# Feed-forward model for tabular inputs.
mlp = MLP(n_feature, n_target)
mlp(xb).shape
# Weight-normalized linear block.
lin_wn(10, 3)
Sequential(
(0): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): Dropout(p=0.1, inplace=False)
(2): ParametrizedLinear(
in_features=10, out_features=3, bias=True
(parametrizations): ModuleDict(
(weight): ParametrizationList(
(0): _WeightNorm()
)
)
)
(3): SiLU()
)
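Weight normalization reparameterizes each weight vector as w = g · v/‖v‖, learning the magnitude g separately from the direction v; that is what the ParametrizedLinear wrapper with _WeightNorm in the repr above carries.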
# Initialize convolution layers with Kaiming normal weights.
cnn = CNN1D(n_feature, n_target).apply(init_weights)
cnn(xb).shape
# Cross-entropy with soft labels.
CE(logits, yb)
tensor(1.0681, grad_fn=<MeanBackward0>)
# Average KL divergence across positions between target_probs and softmax(logits).
# With one-hot targets the entropy term is zero, so KLD matches CE above.
KLD(logits, yb)
tensor(1.0681, grad_fn=<MeanBackward0>)
# Average Jensen-Shannon divergence across positions between target_probs and softmax(logits).
JSD(logits, yb)
tensor(0.3023, grad_fn=<MeanBackward0>)
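For reference, JSD can be reproduced from its definition, JSD(p, q) = ½·KL(p‖m) + ½·KL(q‖m) with m = (p + q)/2; a minimal sketch assuming probabilities live on the last dimension (log base and reduction are conventions that may differ from kmodel's):

# Manual Jensen-Shannon divergence; clamping avoids log(0) on one-hot targets.
p, q = F.softmax(logits, dim=-1), yb
m = 0.5 * (p + q)
kl = lambda a, b: (a * (a.clamp_min(1e-12) / b.clamp_min(1e-12)).log()).sum(-1)
(0.5 * kl(p, m) + 0.5 * kl(q, m)).mean()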
# Train a deep learning model with the fastai learner stack.
get_mlp = lambda: PSSM_model(n_feature, n_target, A=n_aa, model='MLP')
target, pred = train_dl(
df,
feat_col,
target_col,
split0,
model_func=get_mlp,
A=n_aa,
n_epoch=1,
bs=16,
lr=3e-3,
save='model',
)
pred.head()
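save='model' writes the trained weights through the fastai learner; predict_dl reloads them via model_pth in the next cell.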
# Predict a dataframe given a deep learning model saved by fastai.
test_pred = predict_dl(
df.iloc[split0[1]].copy(),
feat_col,
target_col,
model_func=get_mlp,
model_pth='model',
A=n_aa,
)
test_pred
# Cross-validation training loop for deep learning models.
oof = train_dl_cv(
df,
feat_col,
target_col,
splits=splits,
model_func=get_mlp,
A=n_aa,
n_epoch=1,
bs=16,
lr=3e-3,
)
oof.nfold.value_counts()
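With 342 rows split into three stratified folds (cf. the 228/114 split above), each fold should contribute 114 out-of-fold rows.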