In [None]:
#|hide
#| eval: false
! [ -e /content ] && pip install -Uqq fastai  # upgrade fastai on colab

In [None]:
#|export
from __future__ import annotations
from fastai.basics import *
from fastai.tabular.core import *
from fastai.tabular.model import *
from fastai.tabular.data import *
from fastai.tabular.learner import *
from bigtabular.core import *
from bigtabular.data import *
import dask.dataframe as dd

In [None]:
#|hide
from nbdev.showdoc import *

In [None]:
#|default_exp learner

# BigTabular learner

> The function to immediately get a `Learner` ready to train for tabular data with Dask

The main function you probably want to use in this module is `dask_learner`. It will automatically create a `TabularModel` suitable for your data and infer the right loss function. See the [BigTabular tutorial](tutorial.html) for an example of use in context.

## Main functions

In [None]:
#|export
class DaskLearner(TabularLearner):
    "`Learner` for tabular data in Dask"
    def get_preds(self,
        ds_idx:int=1, # `DataLoader` to use for predictions if `dl` is None. 0: train. 1: valid
        dl=None, # `DataLoader` to use for predictions, defaults to `ds_idx=1` if None
        with_input:bool=False, # Return inputs with predictions
        with_decoded:bool=False, # Return decoded predictions
        with_loss:bool=False, # Return per item loss with predictions
        act=None, # Apply activation to predictions, defaults to `self.loss_func`'s activation
        inner:bool=False, # If False, create progress bar, show logger, use temporary `cbs`
        cbs:Callback|MutableSequence|None=None, # Temporary `Callback`s to apply during prediction
        **kwargs
    )-> tuple:
        "Same as the parent `get_preds`, but always called with `reorder=False`"
        # `reorder` is forced to False below. Drop any caller-supplied value so it cannot
        # collide with it ("got multiple values for keyword argument 'reorder'").
        kwargs.pop('reorder', None)
        return super().get_preds(
            ds_idx=ds_idx, dl=dl, with_input=with_input, with_decoded=with_decoded, with_loss=with_loss,
            act=act, inner=inner, cbs=cbs, reorder=False, **kwargs
        )

    def show_results(self, ds_idx=1, dl=None, max_n=9, **kwargs):
        "Same as the parent `show_results`, but always called with `shuffle=False`"
        # Same reasoning as `reorder` in `get_preds`: `shuffle` is fixed to False here.
        kwargs.pop('shuffle', None)
        return super().show_results(ds_idx=ds_idx, dl=dl, max_n=max_n, shuffle=False, **kwargs)

    def predict(self,
        row:pd.Series, # Features to be predicted
    ):
        "Predict on a single sample; returns `(decoded row, decoded prediction, raw prediction)`"
        # Turn the Series into a one-row frame (a new object, the caller's `row` is untouched).
        row = row.to_frame().T
        cont_names = list(self.dls.cont_names)
        # Continuous features are cast to float32; skipped when there are none so that no
        # empty-column assignment is attempted.
        if cont_names: row[cont_names] = row[cont_names].astype(np.float32)
        # One row always fits in one partition; being explicit also keeps this working on
        # Dask versions where `from_pandas` requires `npartitions` or `chunksize`.
        dl = self.dls.test_dl(dd.from_pandas(row, npartitions=1))
        inp, preds, _, dec_preds = self.get_preds(dl=dl, with_input=True, with_decoded=True)
        df = self.dls.show_batch(inp, max_n=1, show=False)
        full_dec = self.dls.decode(df)
        return full_dec,dec_preds[0],preds[0]

In [None]:
show_doc(DaskLearner, title_level=3)

---

[source](https://github.com/stefan027/bigtabular/blob/main/bigtabular/learner.py#L18){target="_blank" style="float:right; font-size:smaller"}

### DaskLearner



*`Learner` for tabular data in Dask*

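`DaskLearner` inherits from fast.ai's `TabularLearner` and works exactly like a normal `Learner`, with three differences: `get_preds` always runs with `reorder=False`, `show_results` always runs with `shuffle=False`, and `predict` is implemented specifically to work on a single row of data.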

In [None]:
#|export
@delegates(TabularLearner.__init__)
def dask_learner(
        dls:DataLoaders,
        layers:list=None, # Size of the layers generated by `LinBnDrop`
        emb_szs:list=None, # Tuples of `n_unique, embedding_size` for all categorical features
        config:dict=None, # Config params for TabularModel from `tabular_config`
        n_out:int=None, # Final output size of the model
        y_range:Tuple=None, # Low and high for the final sigmoid function
        **kwargs
):
    "Get a `Learner` using `dls`, with `metrics`, including a `TabularModel` created using the remaining params."
    # Work on a shallow copy so that removing `y_range` below never mutates the caller's dict
    # (a `config` reused for a second learner would otherwise have lost its `y_range`).
    config = tabular_config() if config is None else dict(config)
    if layers is None: layers = [200,100]
    emb_szs = get_emb_sz(dls.train_ds, {} if emb_szs is None else emb_szs)
    if n_out is None: n_out = get_c(dls)
    assert n_out, "`n_out` is not defined, and could not be inferred from data, set `dls.c` or pass `n_out`"
    # Always take `y_range` out of `config`: it is passed to `TabularModel` as its own keyword,
    # so leaving it in `**config` would supply it twice. An explicit `y_range` argument wins.
    config_y_range = config.pop('y_range', None)
    if y_range is None: y_range = config_y_range
    model = TabularModel(emb_szs, len(dls.cont_names), n_out, layers, y_range=y_range, **config)
    return DaskLearner(dls, model, **kwargs)

If your data was built with fastai, you probably won't need to pass anything to `emb_szs` unless you want to change the default of the library (produced by `get_emb_sz`), same for `n_out` which should be automatically inferred. `layers` will default to `[200,100]` and is passed to `TabularModel` along with the `config`.

Use `tabular_config` to create a `config` and customize the model used. There is just easy access to `y_range` because this argument is often used.

All the other arguments are passed to `Learner`.

The following function gives the same result as `valid_idx=list(range(800,1000))` in `TabularDataLoaders`. This is only the case for a Dask dataframe with one partition.

In [None]:
def split_func(df):
    "Boolean mask over the rows of `df`: False for positions 800-999, True everywhere else"
    in_train = [not (800 <= pos < 1000) for pos in range(len(df))]
    return pd.Series(in_train)

In [None]:
path = untar_data(URLs.ADULT_SAMPLE)
# `split_func` only reproduces the 800-999 validation window when the Dask dataframe has a
# single partition, so request exactly one instead of relying on the library default.
ddf = dd.from_pandas(pd.read_csv(path/'adult.csv'), npartitions=1)
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [DaskCategorify, DaskFillMissing, DaskNormalize]
dls = DaskDataLoaders.from_ddf(ddf, path, procs=procs, cat_names=cat_names, cont_names=cont_names,
                              y_names="salary", train_mask_func=split_func, bs=64)
# No `layers`/`emb_szs`/`n_out` given: `dask_learner` falls back to its defaults.
learn = dask_learner(dls)



In [None]:
show_doc(DaskLearner.predict)

---

[source](https://github.com/stefan027/bigtabular/blob/main/bigtabular/learner.py#L39){target="_blank" style="float:right; font-size:smaller"}

### DaskLearner.predict

>      DaskLearner.predict (row:pandas.core.series.Series)

*Predict on a single sample*

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| row | pd.Series | Features to be predicted |

We can pass an individual row of data into our `DaskLearner`'s `predict` method. Its output is slightly different from the other `predict` methods, as this one will always return the input as well:

In [None]:
# `predict` returns a 3-tuple: the decoded input row, the decoded prediction and the raw prediction.
row, clas, probs = learn.predict(ddf.head().iloc[0])

In [None]:
row

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num
0,Private,Assoc-acdm,Married-civ-spouse,#na#,Wife,White,False,49.0,101320.001686,12.0


In [None]:
clas, probs

(tensor(1), tensor([0.4863, 0.5137]))

In [None]:
#|hide
#test y_range is passed
# Exercise this module's `dask_learner` (rather than fastai's `tabular_learner`), passing
# `y_range` both directly and through `tabular_config`.
for learner_kwargs in (dict(y_range=(0,32)), dict(config=tabular_config(y_range=(0,32)))):
    learn = dask_learner(dls, **learner_kwargs)
    final_layer = learn.model.layers[-1]
    assert isinstance(final_layer, SigmoidRange)
    test_eq(final_layer.low, 0)
    test_eq(final_layer.high, 32)

## Export -

In [None]:
#|hide
# Run nbdev's export step; presumably this writes the `#|export` cells to the module named by
# `#|default_exp` (here `learner`) — confirm against the nbdev version in use.
from nbdev import nbdev_export
nbdev_export()