kmodel provides reusable machine learning and deep learning helpers for
multi-output modeling workflows: tabular model training, scoring,
post-processing, prediction, and fastai-based deep learning utilities.
The runnable examples below follow the notebooks under nbs/ in order;
each function example lives in its own cell and starts with a short
comment derived from the function's docstring.
from kmodel.ml import get_splits, split_data, train_ml, train_ml_cv, post_process, post_process_oof, predict_ml
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LinearRegression
import seaborn as sns
df = sns.load_dataset("penguins").dropna(
subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
df.shape
# Split samples in a dataframe with stratified, grouped, or stratified-grouped K-fold logic.
splits = get_splits(df, stratified="species", nfold=3)
split0 = splits[0]
len(split0[0]), len(split0[1])
StratifiedKFold(n_splits=3, random_state=123, shuffle=True)
# species in train set: 3
# species in test set: 3
(228, 114)
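get_splits also supports the grouped variants named in its docstring; a minimal sketch, assuming the grouping argument is called group by analogy with stratified (check the kmodel.ml signature before relying on it):

# Hypothetical grouped split: rows sharing an island never straddle folds.
# NOTE: the `group` argument name is an assumption, not confirmed by the notebooks.
grouped_splits = get_splits(df, group="island", nfold=3)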
# Given a split tuple, return X_train, y_train, X_test, and y_test.
X_train, y_train, X_test, y_test = split_data(df, feat_col, target_col, split0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((228, 4), (228, 3), (114, 4), (114, 3))
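split_data is essentially positional indexing over the split tuple; a rough equivalent (a sketch, not the library code):

# Equivalent manual selection using the fold's train/test row indices.
train_idx, test_idx = split0
X_train_manual, y_train_manual = df.iloc[train_idx][feat_col], df.iloc[train_idx][target_col]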
# Fit and predict with a sklearn model, returning validation targets and predictions.
model = LinearRegression()
target, pred = train_ml(df, feat_col, target_col, split0, model)
pred.head()
    species_Adelie  species_Chinstrap  species_Gentoo
0         0.993427           0.137000       -0.130427
3         1.064457           0.046586       -0.111043
9         0.839056           0.118838        0.042105
11        0.669557           0.423417       -0.092974
14        1.050863          -0.073914        0.023052
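LinearRegression fits an independent regressor per target column, so the one-hot targets come back unconstrained; the negative values above are exactly what post_process cleans up below.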
# Run cross-validation through the given splits.
oof = train_ml_cv(df, feat_col, target_col, splits=splits, model=LinearRegression())
oof.head()
   species_Adelie  species_Chinstrap  species_Gentoo  nfold
0        0.993427           0.137000       -0.130427      0
1        0.790344           0.103762        0.105894      1
2        0.673088           0.317647        0.009265      2
3        1.064457           0.046586       -0.111043      0
4        1.122991           0.154406       -0.277398      1
# Clip negatives and renormalize probability-like predictions.
post_process(pred.head())
    species_Adelie  species_Chinstrap  species_Gentoo
0         0.878807       1.211930e-01    8.846216e-09
3         0.958070       4.192990e-02    9.000554e-09
9         0.839056       1.188384e-01    4.210543e-02
11        0.612601       3.873988e-01    9.149350e-09
14        0.978535       9.311731e-09    2.146502e-02
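Note the ~1e-09 entries: the library evidently clips to a small epsilon rather than exactly zero. A minimal pandas sketch of the same idea (not the library implementation):

# Clip negatives to zero, then renormalize each row to sum to one.
clipped = pred.head().clip(lower=0)
clipped.div(clipped.sum(axis=1), axis=0)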
# Post-process prediction columns in an out-of-fold dataframe.
oof = post_process_oof(oof, target_col)
oof[target_col].head()
   species_Adelie  species_Chinstrap  species_Gentoo
0        0.878807           0.121193    8.846216e-09
1        0.790344           0.103762    1.058942e-01
2        0.673088           0.317647    9.264531e-03
3        0.958070           0.041930    9.000554e-09
4        0.879124           0.120876    7.828416e-09
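Because the out-of-fold frame stays row-aligned with df, a quick sanity check (a hypothetical follow-up, not a kmodel helper) is to compare the argmax class with the true label:

# Fraction of rows where the highest-probability column matches the species.
pred_species = oof[target_col].idxmax(axis=1).str.replace("species_", "", regex=False)
(pred_species == df.loc[oof.index, "species"]).mean()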
# Predict from a saved sklearn model.
model_path = Path("_tmp/penguins_ml.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
_ = train_ml(df, feat_col, target_col, split0, LinearRegression(), save=model_path)
predict_ml(df.iloc[split0[1]], feat_col, target_col, model_pth=model_path).head()
    species_Adelie  species_Chinstrap  species_Gentoo
0         0.993427           0.137000       -0.130427
3         1.064457           0.046586       -0.111043
9         0.839056           0.118838        0.042105
11        0.669557           0.423417       -0.092974
14        1.050863          -0.073914        0.023052
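The remaining examples exercise the deep learning helpers in kmodel.dnn, mirroring the ML workflow above with fastai-based training, prediction, and cross-validation.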
from kmodel.dnn import seed_everything, GeneralDataset, MLP, lin_wn, CNN1D, PSSM_model, init_weights, CE, KLD, JSD, train_dl, predict_dl, train_dl_cv
import fastcore.all as fc
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from fastai.vision.all import *
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader
# Set up the objects used by the examples below.
seed_everything(123)
df = sns.load_dataset("penguins").dropna(
subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
n_feature = len(feat_col)
n_target = len(target_col)
n_aa = len(target_col)  # reused as the per-position alphabet size A for the PSSM-style helpers
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
splits = list(skf.split(df.index, df["species"]))
split0 = splits[0]
ds = GeneralDataset(df, feat_col, target_col, A=n_aa)
xb, yb = next(iter(DataLoader(ds, batch_size=8, shuffle=True)))
logits = PSSM_model(n_feature, n_target, A=n_aa, model="MLP")(xb)
df.shape
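To see what GeneralDataset and the untrained model produce, inspect the shapes of one batch:

# Shapes of one shuffled batch and the corresponding logits.
xb.shape, yb.shape, logits.shape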
# Feed-forward model for tabular inputs.
mlp = MLP(n_feature, n_target)
mlp(xb).shape
# Weight-normalized linear block.
lin_wn(10, 3)
Sequential(
(0): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): Dropout(p=0.1, inplace=False)
(2): ParametrizedLinear(
in_features=10, out_features=3, bias=True
(parametrizations): ModuleDict(
(weight): ParametrizationList(
(0): _WeightNorm()
)
)
)
(3): SiLU()
)
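Weight normalization reparameterizes each weight vector as w = g · v/‖v‖, learning the magnitude g separately from the direction v; that is what the ParametrizedLinear wrapper with _WeightNorm in the repr above carries.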
# Initialize convolution layers with Kaiming normal weights.
cnn = CNN1D(n_feature, n_target).apply(init_weights)
cnn(xb).shape
# Cross-entropy with soft labels.
CE(logits, yb)
tensor(1.0681, grad_fn=<MeanBackward0>)
# Average KL divergence across positions between target_probs and softmax(logits).
# With one-hot targets the entropy term is zero, so KLD matches CE above.
KLD(logits, yb)
tensor(1.0681, grad_fn=<MeanBackward0>)
# Average Jensen-Shannon divergence across positions between target_probs and softmax(logits).
JSD(logits, yb)
tensor(0.3023, grad_fn=<MeanBackward0>)
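For reference, JSD can be reproduced from its definition, JSD(p, q) = ½·KL(p‖m) + ½·KL(q‖m) with m = (p + q)/2; a minimal sketch assuming probabilities live on the last dimension (log base and reduction are conventions that may differ from kmodel's):

# Manual Jensen-Shannon divergence; clamping avoids log(0) on one-hot targets.
p, q = F.softmax(logits, dim=-1), yb
m = 0.5 * (p + q)
kl = lambda a, b: (a * (a.clamp_min(1e-12) / b.clamp_min(1e-12)).log()).sum(-1)
(0.5 * kl(p, m) + 0.5 * kl(q, m)).mean()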
# Train a deep learning model with the fastai learner stack.
get_mlp = lambda: PSSM_model(n_feature, n_target, A=n_aa, model='MLP')
target, pred = train_dl(
df,
feat_col,
target_col,
split0,
model_func=get_mlp,
A=n_aa,
n_epoch=1,
bs=16,
lr=3e-3,
save='model',
)
pred.head()
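save='model' writes the trained weights through the fastai learner; predict_dl reloads them via model_pth in the next cell.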
# Predict a dataframe given a deep learning model saved by fastai.
test_pred = predict_dl(
df.iloc[split0[1]].copy(),
feat_col,
target_col,
model_func=get_mlp,
model_pth='model',
A=n_aa,
)
test_pred
# Cross-validation training loop for deep learning models.
oof = train_dl_cv(
df,
feat_col,
target_col,
splits=splits,
model_func=get_mlp,
A=n_aa,
n_epoch=1,
bs=16,
lr=3e-3,
)
oof.nfold.value_counts()
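With 342 rows split into three stratified folds (cf. the 228/114 split above), each fold should contribute 114 out-of-fold rows.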