In [None]:
#| default_exp block

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import numpy as np, re, inspect
from typing import Optional, Dict
from transformers import AutoTokenizer, BatchEncoding

from fastcore.meta import *

from xcai.data import *
from xcai.transform import *
from xcai.data_sampler import *

In [None]:
#| hide
from nbdev.showdoc import *
import nbdev; nbdev.nbdev_export()

## Config

### `PARAM`

In [None]:
#| export
# Default parameter dictionary. Every config section built by the dataset
# functions in this file stores this same object (not a copy) under its
# 'parameters' key, so mutating it in one place is visible through all of them.
PARAM = {
    
    # collator arguments
    'tfm': 'xc', 
    # NOTE(review): each entry looks like (feature name, int, int); the meaning
    # of the two integers is not visible here -- confirm against the sampler.
    'smp_features': [('lbl2data',1,2), ('hlk2data',1,1), ('hlk2lbl2data',2,1)],
    
    # arguments for Info class
    'info_column_names': ['identifier', 'input_text'], 
    'use_tokenizer': True, 
    'tokenizer': 'bert-base-cased',
    'tokenization_column': 'input_text',
    'max_sequence_length': 32,
    
    # PadFeatTfm arguments
    'pad_side': 'right',
    'drop': True,
    'ret_t': True,
    'in_place': True,
    'collapse': True,
    'device': 'cpu',
    
    # AlignInputIdsTfm arguments
    'inp': 'data',
    'targ': 'lbl2data',
    'ptr': 'lbl2data_data2ptr',
    
    # Data arguments
    'n_data_meta_samples': None,
    'n_lbl_meta_samples': None,
    'n_lbl_samples': None,
    
}

### `CONFIGS`

In [None]:
#| export
def wikiseealsotitles(data_dir):
    """Build the config dictionary for the (mapped) LF-WikiSeeAlsoTitles-320K dataset.

    Every file path is formed by interpolating `data_dir` in front of the
    dataset folder name. The result maps a config name to
    `{'path': {...}, 'parameters': PARAM}`; all sections reference the same
    module-level `PARAM` object.

    Sections returned (in this order):
      - 'train'       : train split only, no metadata
      - 'data'        : train + test, no metadata
      - 'train_meta'  : train split only, with 'hlk_meta'
      - 'data_meta'   : train + test, with 'cat_meta'
      - 'data_metas'  : train + test, with 'hlk_meta' and 'cat_meta'
      - 'data_catlnk' : train + test, with 'cat_meta' and 'lnk_meta'
      - 'data_linker' : train + test, with 'lnk_meta'
    """
    root = f'{data_dir}/(mapped)LF-WikiSeeAlsoTitles-320K'

    # metadata prefix -> (stem of the *_X_Y.txt files, stem of the raw_data info file)
    meta_files = {
        'hlk': ('hyper_link', 'hyper_link'),
        'cat': ('category', 'category'),
        'lnk': ('category_renee', 'category'),
    }
    # split name used in raw/filter filenames -> abbreviation used in *_X_Y.txt filenames
    short_name = {'train': 'trn', 'test': 'tst'}

    def meta_entry(prefix, split):
        # Paths describing one metadata source for one split.
        stem, raw = meta_files[prefix]
        return {
            'prefix': prefix,
            'data_meta': f'{root}/{stem}_{short_name[split]}_X_Y.txt',
            'lbl_meta': f'{root}/{stem}_lbl_X_Y.txt',
            'meta_info': f'{root}/raw_data/{raw}.raw.txt',
        }

    def split_entry(split, prefixes):
        # Core paths of one split, followed by one '<prefix>_meta' entry per prefix.
        entry = {
            'data_lbl': f'{root}/{short_name[split]}_X_Y.txt',
            'data_info': f'{root}/raw_data/{split}.raw.txt',
            'lbl_info': f'{root}/raw_data/label.raw.txt',
            'data_lbl_filterer': f'{root}/filter_labels_{split}.txt',
        }
        for prefix in prefixes:
            entry[f'{prefix}_meta'] = meta_entry(prefix, split)
        return entry

    def section(prefixes=(), splits=('train', 'test')):
        # One named config: per-split paths plus the shared parameter dict.
        return {
            'path': {split: split_entry(split, prefixes) for split in splits},
            'parameters': PARAM,
        }

    return {
        'train': section(splits=('train',)),
        'data': section(),
        'train_meta': section(('hlk',), splits=('train',)),
        'data_meta': section(('cat',)),
        'data_metas': section(('hlk', 'cat')),
        'data_catlnk': section(('cat', 'lnk')),
        'data_linker': section(('lnk',)),
    }

In [None]:
#| export
def wikiseealso(data_dir):
    """Build the config dictionary for the LF-WikiSeeAlso-320K dataset.

    Every file path is formed by interpolating `data_dir` in front of the
    dataset folder name. The result maps a config name to
    `{'path': {...}, 'parameters': PARAM}`; all sections reference the same
    module-level `PARAM` object.

    Sections returned (in this order):
      - 'train'       : train split only, no metadata
      - 'data'        : train + test, no metadata
      - 'train_meta'  : train split only, with 'hlk_meta'
      - 'data_meta'   : train + test, with 'cat_meta'
      - 'data_metas'  : train + test, with 'hlk_meta' and 'cat_meta'
      - 'data_catlnk' : train + test, with 'cat_meta' and 'lnk_meta'
      - 'data_linker' : train + test, with 'lnk_meta'
    """
    root = f'{data_dir}/LF-WikiSeeAlso-320K'

    # metadata prefix -> (stem of the *_X_Y.txt files, stem of the raw_data info file)
    meta_files = {
        'hlk': ('hyper_link', 'hyper_link'),
        'cat': ('category', 'category'),
        'lnk': ('category_renee', 'category'),
    }
    # split name used in raw/filter filenames -> abbreviation used in *_X_Y.txt filenames
    short_name = {'train': 'trn', 'test': 'tst'}

    def meta_entry(prefix, split):
        # Paths describing one metadata source for one split.
        stem, raw = meta_files[prefix]
        return {
            'prefix': prefix,
            'data_meta': f'{root}/{stem}_{short_name[split]}_X_Y.txt',
            'lbl_meta': f'{root}/{stem}_lbl_X_Y.txt',
            'meta_info': f'{root}/raw_data/{raw}.raw.txt',
        }

    def split_entry(split, prefixes):
        # Core paths of one split, followed by one '<prefix>_meta' entry per prefix.
        entry = {
            'data_lbl': f'{root}/{short_name[split]}_X_Y.txt',
            'data_info': f'{root}/raw_data/{split}.raw.txt',
            'lbl_info': f'{root}/raw_data/label.raw.txt',
            'data_lbl_filterer': f'{root}/filter_labels_{split}.txt',
        }
        for prefix in prefixes:
            entry[f'{prefix}_meta'] = meta_entry(prefix, split)
        return entry

    def section(prefixes=(), splits=('train', 'test')):
        # One named config: per-split paths plus the shared parameter dict.
        return {
            'path': {split: split_entry(split, prefixes) for split in splits},
            'parameters': PARAM,
        }

    return {
        'train': section(splits=('train',)),
        'data': section(),
        'train_meta': section(('hlk',), splits=('train',)),
        'data_meta': section(('cat',)),
        'data_metas': section(('hlk', 'cat')),
        'data_catlnk': section(('cat', 'lnk')),
        'data_linker': section(('lnk',)),
    }

In [None]:
#| export
def wikititles(data_dir):
    """Build the config dictionary for the (mapped) LF-WikiTitles-500K dataset.

    Every file path is formed by interpolating `data_dir` in front of the
    dataset folder name. The result maps a config name to
    `{'path': {...}, 'parameters': PARAM}`; all sections reference the same
    module-level `PARAM` object. Unlike the WikiSeeAlso configs in this file,
    no 'data_lbl_filterer' path is emitted.

    Sections returned (in this order):
      - 'train'       : train split only, no metadata
      - 'data'        : train + test, no metadata
      - 'train_meta'  : train split only, with 'hlk_meta'
      - 'data_meta'   : train + test, with 'sal_meta'
      - 'data_metas'  : train + test, with 'hlk_meta' and 'sal_meta'
      - 'data_hlklnk' : train + test, with 'hlk_meta' and 'lnk_meta'
      - 'data_linker' : train + test, with 'lnk_meta'
    """
    root = f'{data_dir}/(mapped)LF-WikiTitles-500K'

    # metadata prefix -> (stem of the *_X_Y.txt files, stem of the raw_data info file)
    meta_files = {
        'hlk': ('hyper_link', 'hyper_link'),
        'sal': ('see_also', 'see_also'),
        'lnk': ('hyper_link_renee', 'hyper_link'),
    }
    # split name used in raw filenames -> abbreviation used in *_X_Y.txt filenames
    short_name = {'train': 'trn', 'test': 'tst'}

    def meta_entry(prefix, split):
        # Paths describing one metadata source for one split.
        stem, raw = meta_files[prefix]
        return {
            'prefix': prefix,
            'data_meta': f'{root}/{stem}_{short_name[split]}_X_Y.txt',
            'lbl_meta': f'{root}/{stem}_lbl_X_Y.txt',
            'meta_info': f'{root}/raw_data/{raw}.raw.txt',
        }

    def split_entry(split, prefixes):
        # Core paths of one split, followed by one '<prefix>_meta' entry per prefix.
        entry = {
            'data_lbl': f'{root}/{short_name[split]}_X_Y.txt',
            'data_info': f'{root}/raw_data/{split}.raw.txt',
            'lbl_info': f'{root}/raw_data/label.raw.txt',
        }
        for prefix in prefixes:
            entry[f'{prefix}_meta'] = meta_entry(prefix, split)
        return entry

    def section(prefixes=(), splits=('train', 'test')):
        # One named config: per-split paths plus the shared parameter dict.
        return {
            'path': {split: split_entry(split, prefixes) for split in splits},
            'parameters': PARAM,
        }

    return {
        'train': section(splits=('train',)),
        'data': section(),
        'train_meta': section(('hlk',), splits=('train',)),
        'data_meta': section(('sal',)),
        'data_metas': section(('hlk', 'sal')),
        'data_hlklnk': section(('hlk', 'lnk')),
        'data_linker': section(('lnk',)),
    }

In [None]:
#| export
def wikipedia(data_dir):
    """Return the named configurations for the LF-Wikipedia-500K dataset.

    Every configuration is a dict with a `'path'` entry (file locations per
    split) and a `'parameters'` entry that refers to the module-level `PARAM`
    dict. All paths are rooted at `{data_dir}/LF-Wikipedia-500K`.
    """
    root = f'{data_dir}/LF-Wikipedia-500K'

    # Short split tag used in matrix file names -> split name used for the
    # `'path'` keys and for the raw text file names.
    split_names = {'trn': 'train', 'tst': 'test'}

    # Metadata prefix -> (file stem of the matrices, file stem of the raw text).
    # The `lnk` matrices carry a `_renee` suffix but reuse the hyper-link text.
    meta_files = {
        'hlk': ('hyper_link', 'hyper_link'),
        'sal': ('see_also', 'see_also'),
        'lnk': ('hyper_link_renee', 'hyper_link'),
    }

    def split_paths(split, metas):
        # Core data/label files first, then one `<prefix>_meta` entry per
        # requested metadata source, in the order given.
        paths = {
            'data_lbl': f'{root}/{split}_X_Y.txt',
            'data_info': f'{root}/raw_data/{split_names[split]}.raw.txt',
            'lbl_info': f'{root}/raw_data/label.raw.txt',
        }
        for prefix in metas:
            matrix_stem, text_stem = meta_files[prefix]
            paths[f'{prefix}_meta'] = {
                'prefix': prefix,
                'data_meta': f'{root}/{matrix_stem}_{split}_X_Y.txt',
                'lbl_meta': f'{root}/{matrix_stem}_lbl_X_Y.txt',
                'meta_info': f'{root}/raw_data/{text_stem}.raw.txt',
            }
        return paths

    def entry(metas=(), splits=('trn', 'tst')):
        # One configuration: per-split paths plus the shared parameter dict.
        return {
            'path': {split_names[s]: split_paths(s, metas) for s in splits},
            'parameters': PARAM,
        }

    return {
        'train': entry(splits=('trn',)),
        'data': entry(),
        'train_meta': entry(('hlk',), splits=('trn',)),
        'data_meta': entry(('sal',)),
        'data_metas': entry(('hlk', 'sal')),
        'data_hlklnk': entry(('hlk', 'lnk')),
        'data_linker': entry(('lnk',)),
    }


In [None]:
#| export
def amazontitles(data_dir):
    """Return the named configurations for the (mapped) LF-AmazonTitles-1.3M dataset.

    Every configuration is a dict with a `'path'` entry (file locations per
    split) and a `'parameters'` entry that refers to the module-level `PARAM`
    dict. All paths are rooted at `{data_dir}/(mapped)LF-AmazonTitles-1.3M`.

    Each metadata entry is stored under `<prefix>_meta` and its `'prefix'`
    field is derived from that same key, so the category metadata is tagged
    `'cat'` and no longer shares the `'rel'` prefix of the related-items
    metadata.
    """
    root = f'{data_dir}/(mapped)LF-AmazonTitles-1.3M'

    # Short split tag used in matrix file names -> split name used for the
    # `'path'` keys, the raw text files and the label-filter files.
    split_names = {'trn': 'train', 'tst': 'test'}

    # Metadata prefix -> file stem shared by its matrices and raw text file.
    meta_stems = {'rel': 'related_items', 'cat': 'category'}

    def split_paths(split, metas):
        # Core data/label files first, then one `<prefix>_meta` entry per
        # requested metadata source, in the order given.
        name = split_names[split]
        paths = {
            'data_lbl': f'{root}/{split}_X_Y.txt',
            'data_info': f'{root}/raw_data/{name}.raw.txt',
            'lbl_info': f'{root}/raw_data/label.raw.txt',
            'data_lbl_filterer': f'{root}/filter_labels_{name}.txt',
        }
        for prefix in metas:
            stem = meta_stems[prefix]
            paths[f'{prefix}_meta'] = {
                # Key and prefix come from the same variable, which rules out
                # two metadata sources ending up with the same prefix.
                'prefix': prefix,
                'data_meta': f'{root}/{stem}_{split}_X_Y.txt',
                'lbl_meta': f'{root}/{stem}_lbl_X_Y.txt',
                'meta_info': f'{root}/raw_data/{stem}.raw.txt',
            }
        return paths

    def entry(metas=(), splits=('trn', 'tst')):
        # One configuration: per-split paths plus the shared parameter dict.
        return {
            'path': {split_names[s]: split_paths(s, metas) for s in splits},
            'parameters': PARAM,
        }

    return {
        'train': entry(splits=('trn',)),
        'data': entry(),
        'train_meta': entry(('rel',), splits=('trn',)),
        'data_meta': entry(('rel',)),
        'data_metas': entry(('rel', 'cat')),
    }

## Block

In [None]:
#| export
# Dataset name -> config factory. Each factory is called with `data_dir` and
# returns that dataset's named configurations.
CFGS = {
    'wikiseealsotitles': wikiseealsotitles,
    'wikiseealso': wikiseealso,
    'wikititles': wikititles,
    'wikipedia': wikipedia,
    'amazontitles': amazontitles,
}

# Value of the `tfm` parameter -> transform classes that are instantiated, in
# this order, to build the collator's pipeline.
TFMS = dict(
    xc=[XCPadFeatTfm, AlignInputIdsTfm],
    ng=[NGPadFeatTfm],
    xcnlg=[XCSamplePadFeatTfm],
    rm=[RamenPadFeatTfm],
    xcs=[XCSamplerFeatTfm],
)

In [None]:
#| export
class XCBlock:
    """Factory that builds an `XCDataBlock` from one of the configurations registered in `CFGS`."""

    @delegates(XCDataBlock.from_cfg)
    @classmethod
    def from_cfg(cls, data_dir:str, cfg:str, dset:Optional[str]='wikiseealsotitles', bsz:Optional[int]=10, **kwargs):
        """Build the data block for configuration `cfg` of dataset `dset`.

        Parameters
        ----------
        data_dir : root directory handed to the dataset's config factory.
        cfg      : name of the configuration inside the dataset (e.g. `'train'`, `'data_meta'`).
        dset     : key into `CFGS` selecting the dataset.
        bsz      : stored in the parameters as `batch_size`.
        **kwargs : any key that already exists in the configuration's parameters and
                   carries a non-`None` value overrides that parameter and is removed
                   from `kwargs`; everything left over is forwarded to
                   `XCDataBlock.from_cfg`.

        Returns
        -------
        Whatever `XCDataBlock.from_cfg` returns for the resolved configuration.

        Raises
        ------
        ValueError : if `dset` is not in `CFGS` or `cfg` is not a configuration of that dataset.
        """
        if dset not in CFGS: raise ValueError(f'Invalid `dset`({dset})')
        cfgs = CFGS[dset](data_dir)

        if cfg not in cfgs: raise ValueError(f'Invalid `cfg`({cfg})')

        # Every configuration's 'parameters' entry is the module-level `PARAM`
        # dict. Copy it (and the configuration holding it) before writing, so
        # overrides and tokenizer-derived values from this call do not leak
        # into later calls or other configurations.
        cfg = dict(cfgs[cfg])
        params = cfg['parameters'] = dict(cfg['parameters'])

        # Only existing parameter names are considered, so the key set is not
        # changed while iterating.
        for k in params:
            if kwargs.get(k) is not None: params[k] = kwargs.pop(k)

        tokz = AutoTokenizer.from_pretrained(params['tokenizer'])
        params['sep_tok'] = tokz.sep_token_id
        params['pad_tok'] = tokz.pad_token_id
        params['batch_size'] = bsz

        # Each transform class registered for `tfm` receives the full parameter set.
        collator = XCCollator(TfmPipeline([o(**params) for o in TFMS[params['tfm']]]))

        return XCDataBlock.from_cfg(cfg, collate_fn=collator, **kwargs)


#### Example

In [None]:
# Root directory that contains the dataset folders referenced by the config factories.
# NOTE(review): machine-specific absolute path — point it at your local dataset root before running.
# The config factories append '/<dataset folder>/...' themselves, so the trailing slash here
# results in a doubled '/' in every generated path.
data_dir = '/home/aiscuser/scratch/datasets/'

##### `WikiSeeAlso`

In [None]:
# Build the 'train_meta' configuration of the 'wikiseealso' dataset, overriding the
# `tfm` and `tokenizer` defaults from PARAM ('xc' and 'bert-base-cased').
block = XCBlock.from_cfg(data_dir, 'train_meta', dset='wikiseealso', tfm='xcnlg', tokenizer='bert-base-uncased')

  self._set_arrayXarray(i, j, x)


In [None]:
b = block.train.one_batch()

In [None]:
b.keys()

dict_keys(['hlk2data_idx', 'phlk2data_idx', 'phlk2data_data2ptr', 'hlk2data_identifier', 'hlk2data_input_text', 'hlk2data_input_ids', 'hlk2data_token_type_ids', 'hlk2data_attention_mask', 'hlk2data_data2ptr', 'hlk2lbl2data_idx', 'phlk2lbl2data_idx', 'phlk2lbl2data_data2ptr', 'hlk2lbl2data_identifier', 'hlk2lbl2data_input_text', 'hlk2lbl2data_input_ids', 'hlk2lbl2data_token_type_ids', 'hlk2lbl2data_attention_mask', 'hlk2lbl2data_data2ptr', 'data_identifier', 'data_input_text', 'data_input_ids', 'data_token_type_ids', 'data_attention_mask', 'lbl2data_idx', 'lbl2data_identifier', 'lbl2data_input_text', 'lbl2data_input_ids', 'lbl2data_token_type_ids', 'lbl2data_attention_mask', 'lbl2data_data2ptr'])

In [None]:
import torch
# Summarise the batch: tensor fields print their shape, every other field its length.
for k,v in b.items():
    if isinstance(v, torch.Tensor): print(k,':', v.shape)
    else: print(k,':',len(v))

hlk2data_idx : torch.Size([10])
phlk2data_idx : torch.Size([310])
phlk2data_data2ptr : torch.Size([10])
hlk2data_identifier : 10
hlk2data_input_text : 10
hlk2data_input_ids : torch.Size([10, 7])
hlk2data_token_type_ids : torch.Size([10, 7])
hlk2data_attention_mask : torch.Size([10, 7])
hlk2data_data2ptr : torch.Size([10])
hlk2lbl2data_idx : torch.Size([10])
phlk2lbl2data_idx : torch.Size([1368])
phlk2lbl2data_data2ptr : torch.Size([10])
hlk2lbl2data_identifier : 10
hlk2lbl2data_input_text : 10
hlk2lbl2data_input_ids : torch.Size([10, 8])
hlk2lbl2data_token_type_ids : torch.Size([10, 7])
hlk2lbl2data_attention_mask : torch.Size([10, 9])
hlk2lbl2data_data2ptr : torch.Size([10])
data_identifier : 10
data_input_text : 10
data_input_ids : torch.Size([10, 10])
data_token_type_ids : torch.Size([10, 10])
data_attention_mask : torch.Size([10, 10])
lbl2data_idx : torch.Size([27])
lbl2data_identifier : 27
lbl2data_input_text : 27
lbl2data_input_ids : torch.Size([27, 13])
lbl2data_token_type_ids :

In [None]:
# Take a batch straight from the underlying dataset object (`block.train.dset`)
# instead of going through `block.train`.
b = block.train.dset.one_batch()

In [None]:
b

[{'data_identifier': 'Doral_(cigarette)',
  'data_input_text': 'Doral (cigarette)',
  'data_input_ids': [101, 21008, 2140, 1006, 9907, 1007, 102],
  'data_token_type_ids': [0, 0, 0, 0, 0, 0, 0],
  'data_attention_mask': [1, 1, 1, 1, 1, 1, 1],
  'lbl2data_idx': [23320, 86157],
  'lbl2data_identifier': ['Tobacco_smoking', 'Cigarette'],
  'lbl2data_input_text': ['Tobacco smoking', 'Cigarette'],
  'lbl2data_input_ids': [[101, 9098, 9422, 102], [101, 9907, 102]],
  'lbl2data_token_type_ids': [[0, 0, 0, 0], [0, 0, 0]],
  'lbl2data_attention_mask': [[1, 1, 1, 1], [1, 1, 1]]},
 {'data_identifier': 'Bappi_Lahiri',
  'data_input_text': 'Bappi Lahiri',
  'data_input_ids': [101, 8670, 9397, 2072, 2474, 11961, 2072, 102],
  'data_token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0],
  'data_attention_mask': [1, 1, 1, 1, 1, 1, 1, 1],
  'lbl2data_idx': [110788],
  'lbl2data_identifier': ['List_of_Indian_film_music_directors'],
  'lbl2data_input_text': ['List of Indian film music directors'],
  'lbl2data_input_i

##### `WikiCategory`

In [None]:
# Build the 'train' configuration of the 'wikititles' dataset with the 'ng' transform
# (`NGPadFeatTfm` in `TFMS`) and the uncased BERT tokenizer.
block = XCBlock.from_cfg(data_dir, 'train', dset='wikititles', tfm='ng', tokenizer='bert-base-uncased')

In [None]:
b = block.train.one_batch(); b.keys()

dict_keys(['lbl2data_idx', 'plbl2data_idx', 'plbl2data_data2ptr', 'lbl2data_identifier', 'lbl2data_input_text', 'lbl2data_input_ids', 'lbl2data_token_type_ids', 'lbl2data_attention_mask', 'lbl2data_data2ptr', 'data_identifier', 'data_input_text', 'data_input_ids', 'data_token_type_ids', 'data_attention_mask'])

In [None]:
import torch
# Summarise the batch: tensor fields print their shape, every other field its length.
for k,v in b.items():
    if isinstance(v, torch.Tensor): print(k,':', v.shape)
    else: print(k,':',len(v))

lbl2data_idx : torch.Size([10])
plbl2data_idx : torch.Size([32])
plbl2data_data2ptr : torch.Size([10])
lbl2data_identifier : 10
lbl2data_input_text : 10
lbl2data_input_ids : torch.Size([10, 9])
lbl2data_token_type_ids : torch.Size([10, 9])
lbl2data_attention_mask : torch.Size([10, 9])
lbl2data_data2ptr : torch.Size([10])
data_identifier : 10
data_input_text : 10
data_input_ids : torch.Size([10, 10])
data_token_type_ids : torch.Size([10, 10])
data_attention_mask : torch.Size([10, 10])


In [None]:
b = block.train.dset.one_batch(); b

[{'data_identifier': 'MiRA_Resource_Centre_for_Black,_Immigrant_and_Refugee_Women',
  'data_input_text': 'MiRA Resource Centre for Black, Immigrant and Refugee Women',
  'data_input_ids': [101,
   18062,
   7692,
   2803,
   2005,
   2304,
   1010,
   11560,
   1998,
   13141,
   2308,
   102],
  'data_token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'data_attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'lbl2data_idx': [194516, 242040, 333379, 334112, 494210],
  'lbl2data_identifier': ['Category:Feminist_organizations',
   'Category:Human_rights_organizations',
   'Category:Organisations_based_in_Norway',
   'Category:Organizations_established_in_1989',
   "Category:Women\\'s_organizations"],
  'lbl2data_input_text': ['Feminist organizations',
   'Human rights organizations',
   'Organisations based in Norway',
   'Organizations established in 1989',
   'women organizations'],
  'lbl2data_input_ids': [[101, 10469, 4411, 102],
   [101, 2529, 2916, 4411, 102],
   [101, 8

##### `AmazonProduct`

In [None]:
# Build the 'train' configuration of the 'amazontitles' dataset with the 'ng' transform
# (`NGPadFeatTfm` in `TFMS`) and the uncased BERT tokenizer.
block = XCBlock.from_cfg(data_dir, 'train', dset='amazontitles', tfm='ng', tokenizer='bert-base-uncased')

In [None]:
b = block.train.one_batch(); b.keys()

dict_keys(['lbl2data_idx', 'plbl2data_idx', 'plbl2data_data2ptr', 'lbl2data_identifier', 'lbl2data_input_text', 'lbl2data_input_ids', 'lbl2data_token_type_ids', 'lbl2data_attention_mask', 'lbl2data_data2ptr', 'data_identifier', 'data_input_text', 'data_input_ids', 'data_token_type_ids', 'data_attention_mask'])

In [None]:
import torch
# Summarise the batch: tensor fields print their shape, every other field its length.
for k,v in b.items():
    if isinstance(v, torch.Tensor): print(k,':', v.shape)
    else: print(k,':',len(v))

lbl2data_idx : torch.Size([10])
plbl2data_idx : torch.Size([188])
plbl2data_data2ptr : torch.Size([10])
lbl2data_identifier : 10
lbl2data_input_text : 10
lbl2data_input_ids : torch.Size([10, 16])
lbl2data_token_type_ids : torch.Size([10, 16])
lbl2data_attention_mask : torch.Size([10, 16])
lbl2data_data2ptr : torch.Size([10])
data_identifier : 10
data_input_text : 10
data_input_ids : torch.Size([10, 22])
data_token_type_ids : torch.Size([10, 22])
data_attention_mask : torch.Size([10, 22])


In [None]:
b = block.train.dset.one_batch(); b

[{'data_identifier': 'B008NFLR7O',
  'data_input_text': 'Anna-Kaci S/M Fit White Prairie Inspired Semi-Sheer Long Smocked Dress w Lace',
  'data_input_ids': [101,
   4698,
   1011,
   10556,
   6895,
   1055,
   1013,
   1049,
   4906,
   2317,
   10996,
   4427,
   4100,
   1011,
   11591,
   2146,
   15488,
   7432,
   2098,
   4377,
   1059,
   12922,
   102],
  'data_token_type_ids': [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  'data_attention_mask': [1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1],
  'lbl2data_idx': [517992,
   556202,
   706552,
   751706,
   845087,
   862841,
   869606,
   884751,
   1092734,
   1209460,
   1212962,
   1225372],
  'lbl2data_identifier': ['B0054R2F0W',
   'B007NLXBZA',
   'B009YDTBJ0',
   'B00CF74DZE',
   'B007YVYOGE',
   'B00BQM73HY',
   'B007YVYP3G',
   'B008ZYCXIA'

## Batch

In [None]:
#| export
def prepare_batch(m, b, m_args=None):
    """Filter batch `b` down to the fields model `m` can consume.

    A field is kept when its name is a parameter of `m.forward`, or when it
    is contained in `m_args` (if given). Kept fields preserve the order they
    have in `b` and are returned wrapped in a `BatchEncoding`.
    """
    forward_params = inspect.signature(m.forward).parameters
    # Treat a missing allow-list as an empty one so the loop needs a single test.
    extra_names = () if m_args is None else m_args

    selected = {}
    for name, value in b.items():
        if name in forward_params or name in extra_names:
            selected[name] = value
    return BatchEncoding(selected)