In [None]:
# Clear the interactive namespace; -f skips the confirmation prompt.
%reset -f
# Load the autoreload extension; mode 2 is meant to pick up edits to imported
# modules (e.g. the local filoma sources) without restarting the kernel.
%load_ext autoreload
%autoreload 2

# filoma.ml — minimal examples
Tiny examples showing filename feature discovery and dataset splitting.

In [None]:
# Run these cells with: PYTHONPATH=./src
import polars as pl

from filoma import ml

## Discover tokens from filenames (separator='_')

In [None]:
# Two sample paths whose file names are underscore-separated.
df = pl.DataFrame(
    {
        "path": [
            "LCFM/20200312/LAPAZ_image_01.tif",
            "OTHER/20210101/SITE_image_01.tif",
        ]
    }
)

# Add filename-derived feature columns, splitting on "_".
# include_parent=False and prefix=None are passed explicitly; the resulting
# column names are whatever the library chooses (printed below).
df2 = ml.add_filename_features(
    df,
    sep="_",
    prefix=None,
    include_parent=False,
    path_col="path",
)

# Column names first, then the full two-row frame.
print(df2.columns)
print(df2)

## Split by single token (token1)

In [None]:
# Split on the single "token1" feature column produced by the discovery cell.
# seed=0 is fixed, presumably so the assignment is reproducible — TODO confirm.
train, val, test = ml.split_data(
    df2,
    train_val_test=(60, 20, 20),
    feature=("token1",),
    path_col="path",
    seed=0,
)

# Sizes of the three splits on one line, followed by the training split itself.
print(*(len(part) for part in (train, val, test)))
print(train)

## Split by combined features (parent + token2)

In [None]:
# Re-run discovery on the original frame, this time with include_parent=True
# so that a "parent" column is available to split on.
df3 = ml.add_filename_features(
    df,
    sep="_",
    prefix=None,
    include_parent=True,
    path_col="path",
)

# Split on the combination of the "parent" and "token2" columns.
train, val, test = ml.split_data(
    df3,
    train_val_test=(60, 20, 20),
    feature=("parent", "token2"),
    path_col="path",
    seed=0,
)

# Sizes of the three splits on one line.
print(*(len(part) for part in (train, val, test)))

## Custom token names and auto names

In [None]:
# Explicit names for the filename tokens.
df4 = ml.add_filename_features(
    df,
    sep="_",
    prefix=None,
    token_names=["site", "kind", "idx"],
    path_col="path",
)
print(df4.columns)

# Automatic token naming combined with the "fn" prefix; the printed column
# list shows the names the library generates.
df5 = ml.add_filename_features(
    df,
    sep="_",
    prefix="fn",
    token_names="auto",
    path_col="path",
)
print(df5.columns)

## Include all path parts as features

In [None]:
# Same discovery call as before, but with include_all_parts=True; the printed
# column list shows which extra path-part columns this adds.
df6 = ml.add_filename_features(
    df,
    sep="_",
    prefix=None,
    include_all_parts=True,
    path_col="path",
)
print(df6.columns)

## Use a custom path column

If your paths live in a column with a different name (for example `my_path`), pass `path_col` to discovery and splitting functions.

In [None]:
# Same two sample paths, stored under a non-default column name.
df_custom = pl.DataFrame(
    {
        "my_path": [
            "LCFM/20200312/LAPAZ_image_01.tif",
            "OTHER/20210101/SITE_image_01.tif",
        ]
    }
)

# Discovery must be told which column holds the paths.
df_custom2 = ml.add_filename_features(
    df_custom,
    sep="_",
    prefix=None,
    include_parent=True,
    include_all_parts=True,
    path_col="my_path",
)
print(df_custom2.columns)
print(df_custom2)

# Splitting takes the same path_col. discover=False is passed because the
# features were already added above (presumably skips re-discovery — TODO confirm).
# No train_val_test is given here, so the library default applies.
train, val, test = ml.split_data(
    df_custom2,
    discover=False,
    feature="path_parts",
    path_parts=(-1,),
    path_col="my_path",
    seed=0,
)
print(*(len(part) for part in (train, val, test)))

## Return types: filoma wrapper and pandas
Below are two short examples showing how to request the `filoma.DataFrame` wrapper and a `pandas.DataFrame` from `ml.split_data`. The `pandas` example is skipped with an install hint if pandas or pyarrow is not installed. Run with `PYTHONPATH=./src`.

In [None]:
# Example: return the filoma.DataFrame wrapper
df7 = pl.DataFrame({"path": ["LCFM/20200312/LAPAZ_image_01.tif", "OTHER/20210101/SITE_image_01.tif"]})
df7 = ml.add_filename_features(df7, sep="_", prefix=None, include_parent=True, path_col="path")

# return_type="filoma" asks split_data for filoma.DataFrame objects.
train_f, val_f, test_f = ml.split_data(
    df7,
    train_val_test=(60, 20, 20),
    feature="path_parts",
    path_parts=(-1,),
    path_col="path",
    seed=0,
    return_type="filoma",
)

# filoma.DataFrame implements .to_polars() and other helpers
print("train_f type:", type(train_f))
# Call the method directly; getattr() with a constant name adds nothing over
# plain attribute access.
print("train_f is filoma.DataFrame -> to_polars columns:", train_f.to_polars().columns)
print("split sizes:", len(train_f), len(val_f), len(test_f))
print("train_f head:")
print(train_f.head())

In [None]:
# Example: return pandas.DataFrame (if pandas + pyarrow are installed)
# We check for both pandas and pyarrow and show an actionable message if missing.
try:
    import pandas as pd  # noqa: F401
    import pyarrow  # noqa: F401
except ImportError:
    print("pandas or pyarrow not available, skipping pandas example")
    print("Install with: pip install pandas pyarrow")
else:
    df8 = pl.DataFrame({"path": ["LCFM/20200312/LAPAZ_image_01.tif", "OTHER/20210101/SITE_image_01.tif"]})
    df8 = ml.add_filename_features(df8, sep="_", prefix=None, include_parent=True, path_col="path")
    try:
        # Request pandas output. The original passed return_type="filoma",
        # which made this cell a duplicate of the filoma example and never
        # exercised the pandas conversion it is meant to demonstrate.
        train_p, val_p, test_p = ml.split_data(
            df8,
            train_val_test=(60, 20, 20),
            feature="path_parts",
            path_parts=(-1,),
            path_col="path",
            seed=0,
            return_type="pandas",
        )
        print("train_p type:", type(train_p))
        print("train_p head:")
        print(train_p.head())
    except Exception as e:
        # Deliberately best-effort: report the failure instead of aborting the
        # rest of the notebook.
        print("conversion failed:", e)

In [None]:
# Temporary test: create 10 underscored .txt files, run discovery + split_data, then clean up.
# The cell uses its own variable names so it does not overwrite the frames and
# splits created by the earlier example cells.
import shutil
from pathlib import Path

import polars as pl

from filoma import ml

tmp = Path("tests/tmp_ml_files")
# Start from a clean slate in case a previous run was interrupted before cleanup.
if tmp.exists():
    shutil.rmtree(tmp)

files = [
    tmp / "A" / "LCFM_20200312_LAPAZ_image_01.txt",
    tmp / "A" / "LCFM_20200312_LAPAZ_image_02.txt",
    tmp / "B" / "OTHER_20210101_SITE_image_01.txt",
    tmp / "B" / "OTHER_20210101_SITE_image_02.txt",
    tmp / "B" / "C" / "MISC_20211111_TEST_doc_001.txt",
    tmp / "B" / "C" / "MISC_20211111_TEST_doc_002.txt",
    tmp / "D" / "EXTRA_FILE_01.txt",
    tmp / "D" / "EXTRA_FILE_02.txt",
    tmp / "E" / "sub" / "DEEP_202001_sample_01.txt",
    tmp / "E" / "sub" / "DEEP_202001_sample_02.txt",
]

try:
    # Every directory used above is the parent of at least one file, so
    # creating each file's parent also creates the whole folder layout.
    for file_path in files:
        file_path.parent.mkdir(parents=True, exist_ok=True)
        file_path.write_text("test\n")

    paths = [str(file_path) for file_path in files]
    print("created files:", len(paths))
    for path_str in paths:
        print(" -", path_str)

    tmp_df = pl.DataFrame({"path": paths})
    tmp_features = ml.add_filename_features(tmp_df, sep="_", prefix=None, include_parent=True, path_col="path")
    print("Discovered columns:", tmp_features.columns)

    tmp_train, tmp_val, tmp_test = ml.split_data(
        tmp_features,
        train_val_test=(60, 20, 20),
        feature="path_parts",
        path_parts=(-1,),
        path_col="path",
        seed=42,
        return_type="polars",
    )
    print("Split sizes:", len(tmp_train), len(tmp_val), len(tmp_test))
    print("Train sample:")
    print(tmp_train)
finally:
    # cleanup — runs even if discovery or splitting raised, so the temporary
    # tree is never left behind on disk.
    if tmp.exists():
        shutil.rmtree(tmp)
        print("cleaned up", tmp)