In [None]:
#| default_exp 15_wikipedia-config-file

In [None]:
#| hide
from nbdev.showdoc import *
import nbdev; nbdev.nbdev_export()

In [None]:
#| export
import json, os, argparse
from xcai.config import PARAM

## `Wikipedia` config

In [None]:
#| export
def get_wikipedia_config(data_dir, metadata_type='', x_prefix='', y_prefix='', z_prefix=''):
    """Build the dataset config dictionary for a Wikipedia benchmark directory.

    Parameters
    ----------
    data_dir : str
        Root directory of the dataset. Trailing slashes are ignored so the
        generated paths never contain `//`.
    metadata_type : str, default ''
        Name appended to the config key (`data_<metadata_type>`). When empty,
        the `cat_meta` entries are left out of both splits.
    x_prefix, y_prefix, z_prefix : str, default ''
        Variant tags inserted into the file names. They only take effect when
        all three are non-empty; otherwise every file name is un-suffixed.

    Returns
    -------
    dict
        `{key: {"path": {"train": {...}, "test": {...}}, "parameters": ...}}`
        where `key` is `data[_<metadata_type>][_<x>-<y>-<z>]`. `parameters` is
        a deep copy of `PARAM`, so editing the returned config does not alter
        the shared defaults.
    """
    import copy  # function-scope import: only needed to isolate `PARAM`

    # Drop trailing slashes so that f"{root}/..." does not yield `//`.
    root = str(data_dir).rstrip('/')

    has_metadata = len(metadata_type) > 0
    metadata_suffix = f'_{metadata_type}' if has_metadata else ''

    xy_prefix = xyz_prefix = xr_prefix = yr_prefix = zr_prefix = ''
    if len(x_prefix) and len(y_prefix) and len(z_prefix):
        # `_`-style suffixes go on matrix/filter files, `.`-style on raw text files.
        xy_prefix = f'_{x_prefix}-{y_prefix}'
        xyz_prefix = f'_{x_prefix}-{y_prefix}-{z_prefix}'

        xr_prefix = f'.{x_prefix}'
        yr_prefix = f'.{x_prefix}-{y_prefix}'
        zr_prefix = f'.{x_prefix}-{y_prefix}-{z_prefix}'

    def split_paths(split, abbrev):
        # Paths of one split; `split` is the long name used for raw/filter files
        # ("train"/"test"), `abbrev` the short one used for matrices ("trn"/"tst").
        paths = {
            "data_lbl": f"{root}/{abbrev}_X_Y{xy_prefix}.npz",
            "data_info": f"{root}/raw_data/{split}{xr_prefix}.raw.txt",
            "lbl_info": f"{root}/raw_data/label{yr_prefix}.raw.txt",
            "data_lbl_filterer": f"{root}/filter_labels_{split}{xy_prefix}.txt",
        }
        if has_metadata:
            # The metadata files are always the `category` ones, whatever
            # `metadata_type` is called; it only switches them on or off.
            paths["cat_meta"] = {
                "prefix": "cat",
                "data_meta": f"{root}/category_{abbrev}_X_Y{xyz_prefix}.npz",
                "lbl_meta": f"{root}/category_lbl_X_Y{xyz_prefix}.npz",
                "meta_info": f"{root}/raw_data/category{zr_prefix}.raw.txt",
            }
        return paths

    key = f"data{metadata_suffix}{xyz_prefix}"
    return {
        key: {
            "path": {
                "train": split_paths("train", "trn"),
                "test": split_paths("test", "tst"),
            },
            # Copy so callers can tweak parameters without touching the shared `PARAM`.
            "parameters": copy.deepcopy(PARAM),
        }
    }
    

In [None]:
#| export
def parse_args(argv=None):
    """Parse the command-line options used to generate a Wikipedia config file.

    Parameters
    ----------
    argv : list of str, optional
        Argument strings to parse. When `None` (the default) argparse reads
        `sys.argv[1:]`, which is the original behaviour; passing a list makes
        the parser usable from a notebook cell or a test.

    Returns
    -------
    argparse.Namespace
        With attributes `data_dir`, `metadata_type`, `x_prefix`, `y_prefix`
        and `z_prefix` (all strings; every one except `data_dir` defaults to '').
    """
    parser = argparse.ArgumentParser(description='Generate a Wikipedia dataset config file.')
    parser.add_argument('--data_dir', type=str, required=True,
                        help='root directory of the dataset')
    parser.add_argument('--metadata_type', type=str, default='',
                        help='metadata name added to the config key; metadata paths are omitted when empty')
    parser.add_argument('--x_prefix', type=str, default='',
                        help='data variant tag (applied only when x, y and z prefixes are all given)')
    parser.add_argument('--y_prefix', type=str, default='',
                        help='label variant tag (applied only when x, y and z prefixes are all given)')
    parser.add_argument('--z_prefix', type=str, default='',
                        help='metadata variant tag (applied only when x, y and z prefixes are all given)')
    return parser.parse_args(argv)
    

## `__main__`

In [None]:
#| export
if __name__ == '__main__':
    # Script entry point: build the config from the CLI options and write it
    # to `<data_dir>/configs/<config key>.json`.
    args = parse_args()

    config = get_wikipedia_config(args.data_dir, args.metadata_type, args.x_prefix, args.y_prefix, args.z_prefix)

    config_dir = os.path.join(args.data_dir, 'configs')
    os.makedirs(config_dir, exist_ok=True)

    # The config has exactly one top-level key (`data[_<metadata>][_<x>-<y>-<z>]`);
    # reuse it as the file stem instead of re-deriving the same suffixes here,
    # so the file name can never drift from the key it contains.
    config_name = next(iter(config))
    with open(os.path.join(config_dir, f'{config_name}.json'), 'w') as file:
        json.dump(config, file, indent=4)
        

In [None]:
# Example: build the config for a dataset with category metadata and
# old/new/new variant tags, then display it as the cell output.
data_dir = "/home/scai/phd/aiz218323/scratch/datasets/benchmarks/20250123-LF-WikiSeeAlsoTitles-320K/"
config = get_wikipedia_config(
    data_dir,
    metadata_type='category',
    x_prefix='old',
    y_prefix='new',
    z_prefix='new',
)

config

{'data_category': {'path': {'train': {'data_lbl': '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/20250123-LF-WikiSeeAlsoTitles-320K//trn_X_Y_old-new.npz',
    'data_info': '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/20250123-LF-WikiSeeAlsoTitles-320K//raw_data/train.old.raw.txt',
    'lbl_info': '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/20250123-LF-WikiSeeAlsoTitles-320K//raw_data/label.old-new.raw.txt',
    'data_lbl_filterer': '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/20250123-LF-WikiSeeAlsoTitles-320K//filter_labels_train_old-new.txt',
    'cat_meta': {'prefix': 'cat',
     'data_meta': '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/20250123-LF-WikiSeeAlsoTitles-320K//category_trn_X_Y_old-new-new.npz',
     'lbl_meta': '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/20250123-LF-WikiSeeAlsoTitles-320K//category_lbl_X_Y_old-new-new.npz',
     'meta_info': '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/20250123-LF-WikiSeeAlso