In [None]:
#| default_exp 19_map-amazon-meta-from-gpt-generations

In [None]:
#| hide
from nbdev.showdoc import *
import nbdev; nbdev.nbdev_export()

In [None]:
#| export
import pandas as pd, re, numpy as np, os, scipy.sparse as sp
from tqdm.auto import tqdm

In [None]:
#| export
from sugar.core import *

## Helper code

In [None]:
#| export
def extract_text_between_tags(text, tag='Label'):
    """Return the stripped content of the first `<tag>...</tag>` pair in `text`.

    Args:
        text: String to search. Non-string values (e.g. ``None`` or ``NaN``)
            are treated as containing no match.
        tag: Tag name; it is matched literally.

    Returns:
        The text between the first matching pair with surrounding whitespace
        removed, or '' when no pair is found.

    Note:
        `.` does not match a newline here, so a pair whose content spans
        several lines is not matched.
    """
    if not isinstance(text, str): return ''
    # Escape the tag so names containing regex metacharacters are matched literally.
    tag = re.escape(tag)
    pattern = fr"<{tag}>(.*?)</{tag}>"
    match = re.search(pattern, text)
    return match.group(1).strip() if match else ''
    

In [None]:
#| export
def extract_generations(df, tag='Label'):
    """Extract the tagged generation of every row together with the row titles.

    Args:
        df: DataFrame with a 'raw_model_response' column (text to parse) and
            a 'title' column.
        tag: Tag name forwarded to `extract_text_between_tags`.

    Returns:
        `(title, generations)`: the 'title' column as a list, and one extracted
        string per row in row order.
    """
    n_rows = len(df)
    generations = [
        extract_text_between_tags(df['raw_model_response'].iloc[row], tag=tag)
        for row in range(n_rows)
    ]
    title = df['title'].tolist()
    return title, generations
    

In [None]:
#| export
def get_file_key(fname):
    """Return the integer embedded in a generation file name (used as a sort key).

    The name must start with optional lowercase letters followed by digits and
    a literal ``.tsv`` (e.g. ``part12.tsv`` -> 12).

    Raises:
        ValueError: if `fname` does not follow that pattern.
    """
    # The dot is escaped so that only a literal ".tsv" is accepted.
    key = re.match(r'[a-z]*([0-9]+)\.tsv', fname)
    if key is None:
        raise ValueError(f'Cannot extract a numeric key from file name: {fname!r}')
    return int(key.group(1))
    

In [None]:
#| export
def collate_generations(data_dir, tag='Label'):
    """Collect titles and tagged generations from every file in `data_dir`.

    Every directory entry is read with `pd.read_table`, in ascending order of
    `get_file_key`; missing values are replaced by '' before extraction.

    Args:
        data_dir: Directory containing the generation files.
        tag: Tag name forwarded to `extract_generations`.

    Returns:
        `(title, generations)`: two lists concatenated across all files.
    """
    ordered_files = sorted(os.listdir(data_dir), key=get_file_key)

    title, generations = [], []
    for fname in tqdm(ordered_files):
        frame = pd.read_table(f'{data_dir}/{fname}').fillna('')
        file_titles, file_generations = extract_generations(frame, tag=tag)
        title += file_titles
        generations += file_generations

    return title, generations
    

In [None]:
#| export
def collate_metadata(data_dir, tag, sep=None):
    """Collect titles and, per title, the list of metadata items in its generation.

    Args:
        data_dir: Directory containing the generation files.
        tag: Tag name forwarded to `collate_generations`.
        sep: When None, each generation is kept whole as a single-item list.
            Otherwise the generation is split on `sep` and every piece is
            cleaned of separator characters and surrounding whitespace.

    Returns:
        `(title, metadata)`: the titles and one list of strings per title.
    """
    title, generation = collate_generations(data_dir, tag=tag)
    if sep is None:
        metadata = [[text] for text in generation]
    else:
        # str.strip(sep) treats `sep` as a set of characters, so on its own it
        # leaves whitespace around items (e.g. 'a, b' split on ','); strip that too.
        metadata = [[o.strip(sep).strip() for o in text.split(sep)] for text in generation]
    return title, metadata
    

In [None]:
# Load one generation file for ad-hoc inspection. os.listdir returns entries in
# arbitrary order, so which file is picked is not fixed.
# NOTE(review): `df` is not referenced by any later cell visible in this notebook.
data_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-AmazonTitles-1.3M_generations/test-outputs/'

fname = os.listdir(data_dir)[0]
df = pd.read_table(f'{data_dir}/{fname}')
df.fillna('', inplace=True)

In [None]:
# Collect titles and <Label> generations (the default tag) from all test output files.
data_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-AmazonTitles-1.3M_generations/test-outputs/'
tst_title, tst_entity = collate_generations(data_dir)

  0%|          | 0/49 [00:00<?, ?it/s]

In [None]:
# Preview the first ten extracted <Label> generations.
tst_entity[:10]

['Adult:Spec_Attribute, Ballet:Spec_Attribute, Tutu:Product, Cheetah:Spec_Attribute, Pink:Spec_Attribute',
 "Why:Not_Sure, Don't:Not_Sure, They:Not_Sure, Just:Not_Sure, Quit:Other_Entity, ?:O, DVD:Product, Roundtable:Other_Entity, Discussion:Other_Entity, ::O, What:O, Families:Other_Entity, and:O, Friends:Other_Entity, need:O, to:O, Know:O, About:O, Addiction:Other_Entity, and:O, Recovery:Other_Entity",
 'Girls:Spec_Attribute, Ballet:Product, Tutu:Product, Zebra:Spec_Attribute, Hot:Spec_Attribute, Pink:Spec_Attribute',
 'Adult:Spec_Attribute, Ballet:Spec_Attribute, Tutu:Product, Purple:Spec_Attribute',
 "Hilda:Personal_Name, Boswell's:Personal_Name, Omnibus:Other_Entity, -:O, A:O, Treasury:Other_Entity, of:O, Favorites:Other_Entity",
 'Ballet:Spec_Attribute, Dress-Up:Spec_Attribute, Fairy:Spec_Attribute, Tutu:Product',
 'The:O, Greatest:Spec_Attribute, Book:Product, on:O, "Dispensational:Other_Entity, Truth":Other_Entity, in:O, the:O, World:Other_Entity',
 'Chess:Product, for:O, Young:

In [None]:
# Re-read the same test output files, this time extracting the <Category> tag.
# `tst_title` is rebound to the titles returned by this second pass.
data_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-AmazonTitles-1.3M_generations/test-outputs/'
tst_title, tst_category = collate_generations(data_dir, tag='Category')

  0%|          | 0/49 [00:00<?, ?it/s]

In [None]:
# Preview the first ten titles read from the generation files.
tst_title[:10]

['Adult Ballet Tutu Cheetah Pink',
 "Why Don't They Just Quit? DVD Roundtable Discussion: What Families and Friends need to Know About Addiction and Recovery",
 'Girls Ballet Tutu Zebra Hot Pink',
 'Adult Ballet Tutu Purple',
 "Hilda Boswell's Omnibus - A Treasury of Favorites",
 'Ballet Dress-Up Fairy Tutu',
 'The Greatest Book on "Dispensational Truth" in the World',
 'Chess for Young Beginners',
 'Heavenly Highway Hymns: Shaped-Note Hymnal',
 "The Berenstains' B Book (Bright & Early Books)"]

## Visualize generation

In [None]:
import joblib

from xcai.basics import *
from xcai.analysis import *

In [None]:
# Load the preprocessed dataset block.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
pkl_file = '/home/scai/phd/aiz218323/scratch/datasets/processed/mogicX/amazontitles_data-meta_distilbert-base-uncased_sxc.joblib'
block = joblib.load(pkl_file)

In [None]:
# Wrap the test split of the loaded block for display.
# NOTE(review): TextDataset / XCDataset presumably come from the xcai star imports — confirm.
dset = TextDataset(XCDataset._initialize(block.test.dset))

In [None]:
#| export
def map_generation_to_data(data_dir, gen_title, gen_meta_text, data_type='test', prepend_title=True):
    """Align generated metadata text with the rows of a raw data file by title.

    Args:
        data_dir: Directory holding `test.raw.txt` / `train.raw.txt`.
        gen_title: Titles the generations were produced for.
        gen_meta_text: Generated text, parallel to `gen_title`.
        data_type: 'test' reads `test.raw.txt`; any other value reads `train.raw.txt`.
        prepend_title: When True, rows with a generation become
            '<title> :: <generation>' and rows without one keep the title alone.
            When False, rows become the generation alone, or '' if there is none.

    Returns:
        `(ids, text, meta_text)`: the two values returned by `load_raw_file`
        plus one metadata string per entry of `text`.
    """
    if data_type == 'test':
        fname = f'{data_dir}/test.raw.txt'
    else:
        fname = f'{data_dir}/train.raw.txt'
    ids, text = load_raw_file(fname)

    # For a repeated title the last generation wins.
    lookup = dict(zip(gen_title, gen_meta_text))

    if prepend_title:
        meta_text = [f'{item} :: {lookup[item]}' if item in lookup else item for item in text]
    else:
        meta_text = [lookup.get(item, '') for item in text]

    return ids, text, meta_text


In [None]:
# Directory with the raw text files (test.raw.txt / train.raw.txt) read by map_generation_to_data.
data_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/(mapped)LF-AmazonTitles-1.3M/raw_data/'

In [None]:
# Align the <Label> generations with the test rows; rows without a generation get ''.
tst_ids, tst_text, tst_entity_text = map_generation_to_data(data_dir, tst_title, tst_entity, prepend_title=False)

In [None]:
# Same alignment for the <Category> generations; ids and text are discarded here.
_, _, tst_category_text = map_generation_to_data(data_dir, tst_title, tst_category, prepend_title=False)

In [None]:
# Attach the aligned generations to the dataset so they are shown next to the input text.
# NOTE: this mutates `dset` in place; re-running the cell calls `.extend` again and
# so adds the two keys to data_info_keys a second time.
dset.dset.data.data_info['entity_text'] = tst_entity_text
dset.dset.data.data_info['category_text'] = tst_category_text

dset.dset.data.data_info_keys.extend(['entity_text', 'category_text'])

In [None]:
# Display an example from the dataset, including the two fields attached above.
dset.show()

[5m[7m[34mdata_input_text[0m [34m: Integrated Chinese: Level 1, Part 2 Character Workbook (Traditional & Simplified Character) (Chinese Edition) 3rd (third) Edition by Tao-Chung Yao published by Cheng & Tsui (2008) Paperback[0m
[5m[7m[91mdata_entity_text[0m [91m: Integrated:Other_Entity, Chinese:Other_Entity, ::O, Level:Spec_Attribute, 1:Spec_Attribute, ,:O, Part:Spec_Attribute, 2:Spec_Attribute, Character:Spec_Attribute, Workbook:Product, (:O, Traditional:Spec_Attribute, &:O, Simplified:Spec_Attribute, Character:Spec_Attribute, ):O, (:O, Chinese:Other_Entity, Edition:Spec_Attribute, ):O, 3rd:Spec_Attribute, (:O, third:Spec_Attribute, ):O, Edition:Spec_Attribute, by:O, Tao-Chung:Personal_Name, Yao:Personal_Name, published:O, by:O, Cheng:Brand, &:O, Tsui:Brand, (:O, 2008:Spec_Attribute, ):O, Paperback:Spec_Attribute[0m
[5m[7m[36mdata_category_text[0m [36m: Language Learning Book[0m
[5m[7m[96mlbl2data_input_text[0m [96m: ['Integrated Chinese: Textbook Simplified Ch

## Extract and save raw file

In [None]:
#| export
def extract_and_save_generations(generation_dir, data_dir, data_type, tag, save_tag='entity', prepend_title=True):
    """Extract tagged generations, align them with a raw data file and save the result.

    Args:
        generation_dir: Directory with the generation files.
        data_dir: Directory with the raw data files; the output is written here too.
        data_type: 'test' uses the test files; any other value uses the train files.
        tag: Tag name to extract from the model responses.
        save_tag: Infix of the output file name, `<split>_<save_tag>.raw.txt`.
            Defaults to 'entity', the same default as the `--save_tag` CLI option.
        prepend_title: Forwarded to `map_generation_to_data`; defaults to True
            as it does there.
    """
    gen_title, gen_text = collate_generations(generation_dir, tag=tag)

    ids, text, meta_text = map_generation_to_data(data_dir, gen_title, gen_text, data_type, prepend_title)

    # Same split rule as map_generation_to_data: anything other than 'test' means train.
    split = 'test' if data_type == 'test' else 'train'
    fname = f'{data_dir}/{split}_{save_tag}.raw.txt'
    save_raw_file(fname, ids, meta_text)
    

In [None]:
# Settings for the <Label> export cells below.
tag = 'Label'
data_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/(mapped)LF-AmazonTitles-1.3M/raw_data/'

In [None]:
generation_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-AmazonTitles-1.3M_generations/test-outputs/'
data_type = 'test'
# save_tag and prepend_title are passed explicitly: 'entity' mirrors the --save_tag CLI default.
# NOTE(review): confirm prepend_title=True is the intended setting for the Label export.
extract_and_save_generations(generation_dir, data_dir, data_type, tag, save_tag='entity', prepend_title=True)

  0%|          | 0/49 [00:00<?, ?it/s]

In [None]:
generation_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-AmazonTitles-1.3M_generations/train-outputs/'
data_type = 'train'
# save_tag and prepend_title are passed explicitly: 'entity' mirrors the --save_tag CLI default.
# NOTE(review): confirm prepend_title=True is the intended setting for the Label export.
extract_and_save_generations(generation_dir, data_dir, data_type, tag, save_tag='entity', prepend_title=True)

  0%|          | 0/113 [00:00<?, ?it/s]

In [None]:
# Switch to the <Category> tag. The export cells below and the collate_metadata
# cells in the next section all read this `tag` value.
tag = 'Category'

In [None]:
# Write test_category.raw.txt into data_dir, one '<title> :: <category>' entry per
# test row that has a generation.
generation_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-AmazonTitles-1.3M_generations/test-outputs/'
data_type = 'test'
extract_and_save_generations(generation_dir, data_dir, data_type, tag, save_tag='category', prepend_title=True)

  0%|          | 0/49 [00:00<?, ?it/s]

In [None]:
# Write train_category.raw.txt into data_dir, one '<title> :: <category>' entry per
# train row that has a generation.
generation_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-AmazonTitles-1.3M_generations/train-outputs/'
data_type = 'train'
extract_and_save_generations(generation_dir, data_dir, data_type, tag, save_tag='category', prepend_title=True)

  0%|          | 0/113 [00:00<?, ?it/s]

## Extract and save metadata

In [None]:
# sep=None makes collate_metadata keep each generation whole as a single metadata item.
sep = None
data_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/(mapped)LF-AmazonTitles-1.3M/raw_data/'

In [None]:
# Metadata items per test title. NOTE: `tag` is whatever an earlier cell left it as
# ('Category' when the notebook is run top to bottom).
meta_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-AmazonTitles-1.3M_generations/test-outputs/'
tst_title, tst_metadata = collate_metadata(meta_dir, tag=tag, sep=sep)

  0%|          | 0/49 [00:00<?, ?it/s]

In [None]:
# Metadata items per train title, using the same `tag` and `sep` as the test cell.
meta_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-AmazonTitles-1.3M_generations/train-outputs/'
trn_title, trn_metadata = collate_metadata(meta_dir, tag=tag, sep=sep)

  0%|          | 0/113 [00:00<?, ?it/s]

In [None]:
# title -> list of metadata items, over train and test together.
# Test entries are applied last, so they win when a title occurs in both splits.
mapping = dict()
mapping.update({k:v for k,v in zip(trn_title, trn_metadata)})
mapping.update({k:v for k,v in zip(tst_title, tst_metadata)})

In [None]:
# Build the metadata vocabulary and the per-title index lists.
# NOTE(review): create_vocab_and_item2idx is not defined in this notebook (presumably
# from sugar.core); the shapes of its two return values are assumed from how they are used below.
vocab, mapping_item2idx = create_vocab_and_item2idx(mapping)

  0%|          | 0/2829653 [00:00<?, ?it/s]

In [None]:
# Integer ids 0..len(vocab)-1, and the vocab keys ordered by their value in `vocab`.
# Assumes `vocab` maps each metadata string to its index — TODO confirm.
vocab_ids = list(range(len(vocab)))
vocab_text = sorted(vocab, key=lambda x: vocab[x])

In [None]:
# Load ids and texts of the train, test and label raw files.
trn_ids, trn_text = load_raw_file(f'{data_dir}/train.raw.txt')
tst_ids, tst_text = load_raw_file(f'{data_dir}/test.raw.txt')
lbl_ids, lbl_text = load_raw_file(f'{data_dir}/label.raw.txt')

In [None]:
# The raw train texts are passed as `ids`; the second return value rebinds
# `trn_ids`, replacing the ids loaded from train.raw.txt.
trn_matrix, trn_ids = get_matrix_from_item2idx(mapping_item2idx, len(vocab), ids=trn_text)

  0%|          | 0/2248619 [00:00<?, ?it/s]

In [None]:
# Same for the test texts; `tst_ids` is rebound to the second return value.
tst_matrix, tst_ids = get_matrix_from_item2idx(mapping_item2idx, len(vocab), ids=tst_text)

  0%|          | 0/970237 [00:00<?, ?it/s]

In [None]:
# Same for the label texts; `lbl_ids` is rebound to the second return value.
lbl_matrix, lbl_ids = get_matrix_from_item2idx(mapping_item2idx, len(vocab), ids=lbl_text)

  0%|          | 0/1305265 [00:00<?, ?it/s]

In [None]:
def filter_vocab(vocab_ids, vocab_txt, trn_mat, tst_mat, lbl_mat=None):
    """Restrict the vocabulary to columns that are used by the train or label matrix.

    A column is kept when it has at least one stored entry in `trn_mat`, or in
    `lbl_mat` when that is given. `tst_mat` does not influence the selection
    but is restricted to the same columns.

    Args:
        vocab_ids: Sequence of vocabulary ids, indexed by column.
        vocab_txt: Sequence of vocabulary texts, indexed by column.
        trn_mat: Sparse train matrix.
        tst_mat: Sparse test matrix.
        lbl_mat: Optional sparse label matrix.

    Returns:
        `(vocab_ids, vocab_txt, trn_mat, tst_mat, lbl_mat)` restricted to the
        kept columns; `lbl_mat` stays None when it was not given.
    """
    has_labels = lbl_mat is not None

    keep = (trn_mat.getnnz(axis=0) > 0).nonzero()[0]
    if has_labels:
        used_by_labels = (lbl_mat.getnnz(axis=0) > 0).nonzero()[0]
        keep = np.union1d(keep, used_by_labels)

    trn_mat, tst_mat = trn_mat[:, keep].copy(), tst_mat[:, keep].copy()
    if has_labels:
        lbl_mat = lbl_mat[:, keep].copy()

    kept_ids = [vocab_ids[col] for col in keep]
    kept_txt = [vocab_txt[col] for col in keep]

    return kept_ids, kept_txt, trn_mat, tst_mat, lbl_mat
    

In [None]:
# Drop vocabulary columns unused by the train and label matrices.
# NOTE: `vocab_ids` is rebound to the filtered list while `vocab_text` stays
# unfiltered, so re-running this cell alone passes mismatched inputs.
vocab_ids, vocab_txt, trn_mat, tst_mat, lbl_mat = filter_vocab(vocab_ids, vocab_text, trn_matrix, tst_matrix, lbl_matrix)

In [None]:
def save_metadata(save_dir, trn_mat, tst_mat, lbl_mat, metadata_ids, metadata_txt, key, model):
    """Save the metadata matrices and the metadata vocabulary under `save_dir`.

    Writes `<key>_<model>_{trn,tst,lbl}_X_Y.npz` into `save_dir` and the
    vocabulary into `raw_data/<key>_<model>.raw.csv`.

    Args:
        save_dir: Output directory.
        trn_mat: Sparse train matrix.
        tst_mat: Sparse test matrix.
        lbl_mat: Sparse label matrix, or None to skip the label file.
        metadata_ids: Vocabulary ids passed to `save_raw_file`.
        metadata_txt: Vocabulary texts passed to `save_raw_file`.
        key: First part of the output file names.
        model: Second part of the output file names.
    """
    sp.save_npz(f'{save_dir}/{key}_{model}_trn_X_Y.npz', trn_mat)
    sp.save_npz(f'{save_dir}/{key}_{model}_tst_X_Y.npz', tst_mat)
    # filter_vocab returns lbl_mat=None when it was called without a label
    # matrix; sp.save_npz cannot serialize None, so skip the file in that case.
    if lbl_mat is not None:
        sp.save_npz(f'{save_dir}/{key}_{model}_lbl_X_Y.npz', lbl_mat)

    save_raw_file(f'{save_dir}/raw_data/{key}_{model}.raw.csv', metadata_ids, metadata_txt)
    

In [None]:
# Dataset root used as the output directory below (note: not the raw_data/ subfolder used earlier).
data_dir = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/(mapped)LF-AmazonTitles-1.3M/'

In [None]:
# Use the save_metadata helper defined in this notebook; the last two arguments are `key` and `model`.
save_metadata(data_dir, trn_mat, tst_mat, lbl_mat, vocab_ids, vocab_txt, 'category', 'gpt')

## `__main__`

In [None]:
#| export
def parse_args():
    """Parse the command-line options of the export script.

    Returns:
        argparse.Namespace with `generation_dir`, `data_dir`, `tag`,
        `data_type` and `save_tag`.
    """
    # Imported locally: argparse is not among this module's visible top-level imports.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--generation_dir', type=str, required=True,
                        help='directory containing the generation .tsv files')
    parser.add_argument('--data_dir', type=str, required=True,
                        help='directory containing the raw data files; output is written here')
    parser.add_argument('--tag', type=str, default='Label',
                        help='tag to extract from the model responses')
    parser.add_argument('--data_type', type=str, default=None,
                        help="'test' selects the test files; any other value selects the train files")
    parser.add_argument('--save_tag', type=str, default='entity',
                        help='infix of the output file name')
    return parser.parse_args()
    

In [None]:
#| export
if __name__ == '__main__':
    # Imported here so the script does not depend on `timer` arriving through a star import.
    from timeit import default_timer as timer

    start_time = timer()

    args = parse_args()
    # There is no CLI flag for prepend_title; pass the same value that
    # map_generation_to_data uses as its default.
    extract_and_save_generations(args.generation_dir, args.data_dir, args.data_type, args.tag, args.save_tag,
                                 prepend_title=True)

    end_time = timer()
    print(f'Time elapsed: {end_time-start_time:.2f} seconds.')
    