# Config Gen


Basic notebook to generate and explain Ember's entire configuration file. Change the Join Specification Parameters to operate over a custom dataset.

# Imports & Util Functions

In [83]:
import json
import os
from collections import defaultdict

import numpy as np
import pandas as pd

In [84]:
def save_config(config_path, cfg=None):
    """Write a configuration mapping to ``config_path`` as indented JSON.

    Parameters
    ----------
    config_path : str
        Destination file. Missing parent directories are created first.
    cfg : dict, optional
        Mapping to serialize. When omitted, the notebook-level ``config``
        global is written, which matches the original one-argument call.
    """
    if cfg is None:
        # Backward-compatible fallback: the one-argument form keeps working.
        cfg = config
    parent_dir = os.path.dirname(config_path)
    if parent_dir:
        # open(..., 'w') does not create directories, so a fresh checkout
        # without e.g. ./configs would otherwise fail with FileNotFoundError.
        os.makedirs(parent_dir, exist_ok=True)
    with open(config_path, 'w') as fp:
        json.dump(cfg, fp, indent=4)
        
def load_config(config_path):
    """Parse the JSON file at ``config_path`` and return the resulting object."""
    with open(config_path) as handle:
        return json.load(handle)

# Set base paths for data and ember home, and name config

In [85]:
# path_base is the prefix of the saved config file's location;
# data_base is the prefix prepended to the dataset directory name.
path_base, data_base = '.', './data/'
# Base name (without extension) of the JSON config file to write.
config_name = "demo"

# Init New Config

In [86]:
# Flat key -> value configuration. A plain dict is used on purpose: with
# defaultdict(dict), reading a mistyped key would silently insert an empty
# dict (and later leak "{}" into the saved JSON or into formatted names)
# instead of raising KeyError.
config = {}

# Join Specification Parameters (change me)

These are the only parameters you should change to begin with, before considering any of the lower-level ones:

* <b> Data Directory Name </b>: must have train_table_A ("left"), train_table_B ("right"), train_supervision, and optional test_supervision over test_table_A and test_table_B. Must be rooted from data_base (the data path is built as data_base + data_dir)
* <b> Join Type </b>: must be "INNER", "LEFT", "RIGHT", or "FULL" (currently only left/right one-to-many joins are supported)
* <b> Join sizes</b>: left size and right size

In [87]:
# Join specification: the values a user is expected to edit.
config.update({
    'data_dir': 'abt-buy',
    'join_type': "LEFT",  # Currently only supports left/right one-to-many joins.
    'left_size': 1,
    'right_size': 10,
})

## Derived from the values above -- do not edit!
config['data_path'] = data_base + config['data_dir']

# Encoder Parameters

In [88]:
# Per the original note: alternative configurations are not currently
# supported by the public API, but can be added upon request.
config.update({
    'num_encoders': 'single',
    'model_type': 'distilbert',
    'tokenizer_casing': 'uncased',
})

# Pretraining Parameters

## Data preparing

In [89]:
# 'sentence' is the only available preparer; do not modify unless adding new preparers.
config.update({
    'preparer': 'sentence',
    'new_col_name': 'merged_all',
    'ID_left': "ltable_id",
    'ID_right': "rtable_id",
})

## Optional Pretraining

In [90]:
config.update({
    'pretrain_mlm': True,
    # currently only exposes BM25-based MLM
    'mlm_supervision': 'BM25',
    'from_scratch': False,
    'mlm_train_epochs': 20,  # Decrease to reduce pretraining time
    'mlm_batch_size': 8,
    'mlm_probability': 0.15,
    'mlm_num_seps': None,
    # ALL, BEFORE, AFTER for conditional masking.
    # Please keep to 'ALL' as the others are deprecated.
    'mlm_masking': 'ALL',
})

# Name of the MLM model, assembled from the dataset, encoder, casing,
# masking mode and supervision values set above.
config['mlm_model_name'] = (
    f"{config['data_dir']}-{config['model_type']}-{config['tokenizer_casing']}-"
    f"masked-{config['mlm_masking']}-{config['mlm_supervision']}"
)

# Representation Learning Parameters

In [91]:
# The explicit overrides below are left disabled; per the original note, the
# combination of num_encoders and pretrain_mlm already determines this.
# Base encoder type options listed by the original author:
#   distilbert-base-cased
#   bert-base_cased
# config['arch'] = 'pretrained'
# config['encoder_base'] = path_base + f"/pretraining/models/{config['mlm_model_name']}"

config.update({
    # Tokenizer name is derived from the encoder family and casing chosen earlier.
    'tokenizer': f"{config['model_type']}-base-{config['tokenizer_casing']}",
    'pos_frac': 1,
    # Increase to generate and train with more triplets. Can improve performance.
    'train_frac': 1,
    'epochs': 1,
    'batch_size': 8,
    'final_size': 200,
    'lr': 1e-05,
    'loss': 'triplet',
    'tl_margin': 1.0,
    'tl_p': 2,
    'pool_type': "CLS",
    'tokenizer_max_length': 512,
})

# Name of the trained model, assembled from the settings above.
config['model_name'] = (
    f"{config['data_dir']}-{config['model_type']}-{config['tokenizer_casing']}-"
    f"{config['num_encoders']}-pretrain-{config['pretrain_mlm']}-pos-frac-{config['pos_frac']}"
)

# Save and load (to verify) config

In [92]:
# Destination of the generated config file.
config_path = f"{path_base}/configs/{config_name}.json"

# Show the command that consumes the file.
print(
    'Run the command as follows:',
    f"python ember.py -c {config_path}",
    sep='\n',
)

# Write the config, then read it back so the cell displays what was saved.
save_config(config_path)
load_config(config_path)

Run the command as follows:
python ember.py -c ./configs/demo.json


{'data_dir': 'abt-buy',
 'join_type': 'LEFT',
 'left_size': 1,
 'right_size': 10,
 'data_path': './data/abt-buy',
 'num_encoders': 'single',
 'model_type': 'distilbert',
 'tokenizer_casing': 'uncased',
 'preparer': 'sentence',
 'new_col_name': 'merged_all',
 'ID_left': 'ltable_id',
 'ID_right': 'rtable_id',
 'pretrain_mlm': True,
 'mlm_supervision': 'BM25',
 'from_scratch': False,
 'mlm_train_epochs': 20,
 'mlm_batch_size': 8,
 'mlm_probability': 0.15,
 'mlm_num_seps': None,
 'mlm_masking': 'ALL',
 'mlm_model_name': 'abt-buy-distilbert-uncased-masked-ALL-BM25',
 'tokenizer': 'distilbert-base-uncased',
 'pos_frac': 1,
 'train_frac': 1,
 'epochs': 1,
 'batch_size': 8,
 'final_size': 200,
 'lr': 1e-05,
 'loss': 'triplet',
 'tl_margin': 1.0,
 'tl_p': 2,
 'pool_type': 'CLS',
 'tokenizer_max_length': 512,
 'model_name': 'abt-buy-distilbert-uncased-single-pretrain-True-pos-frac-1'}