# process original dataset

## load original data

In [None]:
import datasets

In [None]:
# Load the "cjeu_terms" configuration of `lexlms/legal_lama` and keep only
# its "test" split for the rest of the notebook.
dataset_dict = datasets.load_dataset("lexlms/legal_lama", "cjeu_terms")
dataset = dataset_dict["test"]

dataset

# process new dataset

## process text

In [None]:
def _strip_text(example):
    """Return the example's text with leading/trailing whitespace removed."""
    cleaned = example["text"].strip()
    return {"text": cleaned}


dataset = dataset.map(_strip_text, num_proc=32)

dataset

## process mask

In [None]:
def _fill_mask(example):
    """Replace every ``<mask>`` placeholder in the text with the example's label."""
    filled = example["text"].replace("<mask>", example["label"])
    return {"text": filled}


dataset = dataset.map(_fill_mask, num_proc=32)

dataset

## process other columns

In [None]:
# Drop the "category" column from the dataset.
columns_to_drop = ["category"]
dataset = dataset.remove_columns(columns_to_drop)

dataset

In [None]:
# Rename the "label" column to "legal_term".
old_name, new_name = "label", "legal_term"
dataset = dataset.rename_column(old_name, new_name)

dataset

In [None]:
def _lowercase_term(example):
    """Return the example's legal term converted to lower case."""
    lowered = example["legal_term"].lower()
    return {"legal_term": lowered}


dataset = dataset.map(_lowercase_term, num_proc=32)

dataset

## create candidate list

In [None]:
import random
from collections import Counter

# Total number of entries in each candidate list (gold term included).
candidate_num = 10

# Seed the module-level RNG used when sampling candidates.
seed = 2023
random.seed(seed)

# Frequency of each legal term; its keys are the distinct terms.
counter = Counter(dataset["legal_term"])

def create_candidate(example, terms=None, num_candidates=None):
    """Build the candidate list for one example: gold term plus random distractors.

    Args:
        example: Mapping with a ``"legal_term"`` key holding the gold term.
        terms: Iterable of distinct terms to draw distractors from. Defaults
            to the keys of the module-level ``counter``.
        num_candidates: Total length of the returned list, gold term
            included. Defaults to the module-level ``candidate_num``.

    Returns:
        ``{"legal_term_candidate_list": [gold, d1, ..., dk]}``. The gold term
        is always first; the distractors are sampled without replacement
        from the terms that differ from it.

    Raises:
        ValueError: Propagated from ``random.sample`` when fewer than
            ``num_candidates - 1`` other terms are available.
    """
    # NOTE(review): sampling uses the module-level ``random`` state; when this
    # is mapped with ``num_proc > 1`` each worker process holds its own copy
    # of that state -- confirm this gives the reproducibility you expect.
    target = example["legal_term"]
    if terms is None:
        terms = counter
    if num_candidates is None:
        num_candidates = candidate_num

    # One filtered pass; the pool keeps the iteration order of ``terms`` so
    # results stay reproducible for a fixed seed.
    distractor_pool = [term for term in terms if term != target]
    distractors = random.sample(distractor_pool, num_candidates - 1)

    return {"legal_term_candidate_list": [target, *distractors]}

# Add a "legal_term_candidate_list" column to every row.
num_workers = 32
dataset = dataset.map(create_candidate, num_proc=num_workers)

dataset

In [9]:
# Persist the processed dataset so the embedding scripts below can read it.
processed_dataset_dir = "./tmp/cjeu_terms"
dataset.save_to_disk(processed_dataset_dir)

                                                                                              

## get embedding & save

### sup-simcse-bert-base-uncased

In [None]:
!torchrun --nproc_per_node=8 ../embedding/sup-simcse-bert-base-uncased.py \
    --input_dataset "./tmp/cjeu_terms" \
    --output_dataset "your_output_dir" \
    --train_size 0 \
    --valid_size 0 \
    --test_size 2127 \

### e5-large-v2

In [None]:
!torchrun --nproc_per_node=8 ../embedding/e5-large-v2.py \
    --input_dataset "./tmp/cjeu_terms" \
    --output_dataset "your_output_dir" \
    --train_size 0 \
    --valid_size 0 \
    --test_size 2127 \

### bge-large-en

In [None]:
!torchrun --nproc_per_node=8 ../embedding/bge-large-en.py \
    --input_dataset "./tmp/cjeu_terms" \
    --output_dataset "your_output_dir" \
    --train_size 0 \
    --valid_size 0 \
    --test_size 2127 \