# Build SFT data of all labels

In [1]:
from pathlib import Path
import sys
import os
os.environ['HF_HUB_CACHE'] = '/next_share/hf_cache/hub/'
import json
from tqdm import tqdm
from transformers import AutoTokenizer, PreTrainedTokenizer
import importlib
import numpy as np
import difflib
from collections import defaultdict
import pandas as pd

import context
os.chdir(context.proj_dir)

import cont_gen
import cont_gen.data_process.ood.build_src_tgt
import cont_gen.data_process.ood.build_sft_meta_data
importlib.reload(cont_gen.data_process.ood.build_src_tgt)
importlib.reload(cont_gen.data_process.ood.build_sft_meta_data)
from cont_gen.data_process.ood.build_src_tgt import process, SFT_Builder, SFT_Builder_YesNo, SFT_Builder_YesNo_Natural
from cont_gen.data_process.ood.build_sft_meta_data import CUAD_Basic, MetaSFT_Train_Builder, MetaSFT_Test_Builder
from cont_gen.data_loader.cuad_sft import CUAD_SFT_Cached
from cont_gen.utils import load_jsonl, save_jsonl

In [3]:
def build_train_meta(train_para_data, train_labels, output_dir, neg_clause_ratio=1.0, num_neg_quest = 1):
    """
    Create the training meta table and persist it as ``train_meta.csv``.

    The table is produced by ``MetaSFT_Train_Builder.build_pos_neg_samples``,
    which receives ``train_para_data`` and ``train_labels`` positionally and
    ``neg_clause_ratio`` / ``num_neg_quest`` as keyword arguments, all passed
    through unchanged.

    ``output_dir`` (including missing parents) is created after the table has
    been built, and the table is written there without its index.

    Returns:
        The object returned by the builder (it is written with ``to_csv``,
        so presumably a pandas DataFrame).
    """
    meta_table = MetaSFT_Train_Builder.build_pos_neg_samples(
        train_para_data,
        train_labels,
        neg_clause_ratio=neg_clause_ratio,
        num_neg_quest=num_neg_quest,
    )

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    csv_path = target_dir / 'train_meta.csv'
    meta_table.to_csv(csv_path, index=False)
    return meta_table

In [5]:
# Tokenizer keys; each one has its own paragraph file and its own output folder.
tkn_names = ['flan-t5', 'llama3', 'mistral']

# Label ids 0..40 are all used for training (presumably every CUAD clause type - confirm).
train_labels = [*range(41)]

for tkn_name in tkn_names:
    print(f'Handle tokenizer data: {tkn_name}')
    # Load the paragraphs for this tokenizer, then write
    # data/cuad_sft/<tokenizer>/train_meta.csv via build_train_meta.
    train_para_data = load_jsonl(
        f'data/cuad_clean/merge_split/paras_{tkn_name}_512.jsonl'
    )
    build_train_meta(
        train_para_data,
        train_labels,
        f'data/cuad_sft/{tkn_name}',
    )

Handle tokenizer data: flan-t5
Handle tokenizer data: llama3
Handle tokenizer data: mistral


In [6]:
# Add template
from ast import literal_eval
def process_sft_tokenizer(tkn_name, output_dir, builder: SFT_Builder, pmt_name):
    """
    Build and save the SFT training data for one tokenizer and one prompt.

    Steps:
      1. Load ``data/cuad_clean/merge_split/paras_{tkn_name}_512.jsonl`` and
         hand it to ``builder.set_para_data``.
      2. Read ``{output_dir}/{tkn_name}/train_meta.csv``.
      3. Run ``process(builder, train_meta)``.
      4. Write the result to
         ``{output_dir}/{tkn_name}/{pmt_name}/train_data.jsonl``.

    Args:
        tkn_name: Tokenizer key; selects the paragraph file and the
            sub-directory of ``output_dir``.
        output_dir: Root directory holding one sub-directory per tokenizer.
        builder: Builder passed to ``process``. Note that its paragraph data
            is replaced by this call, so a shared instance is modified.
        pmt_name: Name of the sub-directory (under the tokenizer directory)
            that receives ``train_data.jsonl``.

    Returns:
        None. The result is only written to disk.
    """
    all_para_data = load_jsonl(f'data/cuad_clean/merge_split/paras_{tkn_name}_512.jsonl')
    builder.set_para_data(all_para_data)

    print(f'Process {output_dir}')
    meta_dir = Path(output_dir) / tkn_name  # data for one tokenizer
    save_dir = meta_dir / pmt_name

    # Parse the 'answers' column with literal_eval instead of keeping the raw CSV string.
    train_meta = pd.read_csv(meta_dir / 'train_meta.csv', converters={'answers': literal_eval})
    train_data = process(builder, train_meta)

    # Make sure the prompt-specific directory exists before writing, the same
    # way build_train_meta prepares its output directory. Harmless if
    # save_jsonl already creates parents or the directory is present.
    save_dir.mkdir(parents=True, exist_ok=True)
    save_jsonl(train_data, save_dir / 'train_data.jsonl')

# Inputs for the builder: the clause info table and the text of prompt file pmt_01.
clause_info = pd.read_csv('./data/clause/all_info.csv')
# read_text() opens, reads and closes the file in one call; the previous bare
# open(...).read() never closed the handle explicitly.
prompt_01 = Path('config/prompts/pmt_01.txt').read_text()
builder = SFT_Builder(prompt_01, clause_info, None, lambda k: k)

# A single builder is reused for every tokenizer; process_sft_tokenizer swaps in
# that tokenizer's paragraph data before building the training file.
for tkn_name in tkn_names:
    process_sft_tokenizer(tkn_name, 'data/cuad_sft', builder, 'pmt_01')

Process data/cuad_sft


100%|██████████| 26455/26455 [00:00<00:00, 67915.36it/s]


Process data/cuad_sft


100%|██████████| 26309/26309 [00:00<00:00, 68857.12it/s]


Process data/cuad_sft


100%|██████████| 26569/26569 [00:00<00:00, 69741.05it/s]
