# 依不同環境安裝套件並掛載資料夾

In [2]:
import os, site, sys
from pathlib import Path

if "google.colab" in sys.modules:
  from google.colab import drive
  drive.mount('/content/drive')
  !pip install torch --quiet
  !pip install transformers datasets evaluate peft accelerate sentencepiece --quiet
  !pip install numpy matplotlib tqdm --quiet
  !pip install islab-opendeid --quiet
else:
  %pip install torch --quiet
  %pip install transformers datasets evaluate peft accelerate sentencepiece --quiet
  %pip install numpy matplotlib tqdm --quiet
  %pip install islab-opendeid --quiet
  %pip install ipywidgets nbformat nbclient widgetsnbextension pandas-profiling --quiet

Note: you may need to restart the kernel to use updated packages.
Collecting peft
  Downloading peft-0.6.2-py3-none-any.whl.metadata (23 kB)
Downloading peft-0.6.2-py3-none-any.whl (174 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: peft
Successfully installed peft-0.6.2
Note: you may need to restart the kernel to use updated packages.
Collecting matplotlib
  Downloading matplotlib-3.8.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (5.8 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.2.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (5.8 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.45.1-cp310-cp310-macosx_10_9_universal2.whl.metadata (155 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

# 重載套件並切換工作目錄

In [None]:
import os, site, sys
from pathlib import Path

if "google.colab" in sys.modules:
  OUT_DIR_PATH = Path("/content/drive/MyDrive/aicup/out")
  RES_DIR_PATH = Path("/content/drive/MyDrive/aicup/res")
  SRC_DIR_PATH = Path("/content/drive/MyDrive/aicup/src")
else:
  OUT_DIR_PATH = Path(Path.cwd().parent, "out")
  RES_DIR_PATH = Path(Path.cwd().parent, "res")
  SRC_DIR_PATH = Path(Path.cwd().parent, "src")

site.main()
os.chdir(SRC_DIR_PATH)

# 定義 TsvDatasetHelper 以便後續讀取
https://huggingface.co/docs/datasets/main/en/loading#csv

In [None]:
from pathlib import Path
from datasets import load_dataset, Features, Value

class TsvDatasetHelper:
  TsvFeatures = Features(
    {
      "fid": Value("string"),
      "idx": Value("int64"),
      "content": Value("string"),
      "label": Value("string"),
    }
  )
  @staticmethod
  def get_dataset(tsv_path: Path):
    data_files = tsv_path.as_posix()
    print(data_files)
    return load_dataset(
      path="csv",
      delimiter="\t",
      data_files=data_files,
      features=TsvDatasetHelper.TsvFeatures,
      column_names=TsvDatasetHelper.TsvFeatures.keys(),
      keep_default_na=False,
    )


### import package

In [7]:
# import os, re
# from pathlib import Path


# import matplotlib.pyplot as plt
# from tqdm import tqdm, trange

# from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, get_linear_schedule_with_warmup


# from torch import optim
# from torch.optim import AdamW
# from torch.utils.data import DataLoader
# from torch.nn import functional as F
# from torch.utils.data import Dataset


In [5]:
import random
import torch
import numpy as np

def set_torch_seed(seed=0):
    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benckmark = False
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)

set_torch_seed()

# 載入 TSV Dataset

In [8]:
tsv_path = Path(OUT_DIR_PATH, "/merged_train_single_line.tsv")
dataset = TsvDatasetHelper.get_dataset(tsv_path)
dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

# 印出檢視 TSV Dataset

In [None]:

from pprint import pprint

pprint(dataset['train'][161])
pprint(dataset['train'])

# 定義 TokenHelper 方便之後處理

In [None]:
class TokenHelper:
    Bos = "<|endoftext|>"
    Eos = "<|END|>"
    Pad = "<|pad|>"
    Sep = "\n\n####\n\n"
    SpecialTokens = {
        "bos_token": Bos,
        "eos_token": Eos,
        "pad_token": Pad,
        "sep_token": Sep,
    }

    @staticmethod
    def add_token(med_info, phi_info):
        return "{}{}{}{}{}".format(
            TokenHelper.Bos,
            med_info,
            TokenHelper.Sep,
            phi_info,
            TokenHelper.Eos,
        )

### Dataloader Sample

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM

plm = "EleutherAI/pythia-70m" #"EleutherAI/pythia-70m-deduped"
tokenizer = AutoTokenizer.from_pretrained(plm, revision="step3000")
tokenizer.padding_side = 'left'
num_added_toks = tokenizer.add_special_tokens(TokenHelper.SpecialTokens)
print(f"{tokenizer.pad_token}: {tokenizer.pad_token_id}")

Downloading (…)okenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

Downloading (…)p3000/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

<|pad|>: 50278


In [13]:
from torch.utils.data import DataLoader
from islab.aicup import collate_batch_with_prompt_template

train_data = list(dataset['train'])
train_dataloader = DataLoader(
    train_data,
    batch_size=3,
    shuffle=False,
    collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer),
)
titer = iter(train_dataloader)
tks, labels, masks= next(titer)
print(tks.shape)
next(iter(titer))

torch.Size([3, 23])


(tensor([[50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
          50278, 50278, 50278,     0, 10118,  1621,    27, 50276,  2693,    39,
            520, 29195, 50279,  1838, 20872,    27,  2693,    39,   520, 29195,
            209, 50277],
         [50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
          50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
              0, 14311,  4379, 50279,  1267,  1848,  2025,    27, 12965,  4379,
            209, 50277],
         [    0,   416,  1400, 42525, 50276,    53,  1719, 50276,  1235,  2759,
          50279,    36,  7400,    27,    51,  1400, 42525,    61,    79, 19247,
             27,    53,  1719,    61,    79,    59,  3123,    27,  1235,  2759,
            209, 50277]]),
 tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,     0, 10118,  1621,    27, 50276,  2693,    39,
            520, 29195, 50279,  1838, 20872

### DataLoader For training

In [14]:
from islab.aicup import OpenDeidBatchSampler

BATCH_SIZE = 8
bucket_train_dataloader = DataLoader(
    train_data,
    batch_sampler=OpenDeidBatchSampler(train_data, BATCH_SIZE),
    collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer),
    pin_memory=True,
)

In [15]:
from transformers import AutoConfig
# the model config to which we add the special tokens
config = AutoConfig.from_pretrained(
    plm,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    sep_token_id=tokenizer.sep_token_id,
    output_hidden_states=False,
)

model = AutoModelForCausalLM.from_pretrained(plm, revision="step3000", config=config)
model

Downloading (…)lve/main/config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [16]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

EPOCHS = 10 # CHANGE TO THE NUMBER OF EPOCHS YOU WANT
optimizer = AdamW(model.parameters(),lr=3e-5) # YOU CAN ADJUST LEARNING RATE

model.resize_token_embeddings(len(tokenizer))
model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [None]:
from tqdm import tqdm,trange
# 模型儲存資料夾名稱
model_name = "xxx2"
# 模型儲存路徑
model_dir = f"/content/drive/MyDrive/my_aicup/models/{model_name}"
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)
min_loss = 9999

global_step = 0
total_loss = 0

model.train()
for _ in trange(EPOCHS, desc="Epoch"):
    model.train()
    total_loss = 0

    # Training loop
    predictions , true_labels = [], []

    for step, (seqs, labels, masks) in enumerate(bucket_train_dataloader):
        seqs = seqs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        model.zero_grad()
        outputs = model(seqs, labels=labels, attention_mask=masks)
        logits = outputs.logits
        loss = outputs.loss
        loss = loss.mean()

        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(bucket_train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    torch.save(model.state_dict(), os.path.join(model_dir , 'GPT_Finial.pt'))
    if avg_train_loss < min_loss:
        min_loss = avg_train_loss
        torch.save(model.state_dict(), os.path.join(model_dir , 'GPT_best.pt'))

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Average train loss: 1.597884900431119


Epoch:  10%|█         | 1/10 [05:52<52:50, 352.24s/it]

Average train loss: 1.2429722604832947


Epoch:  20%|██        | 2/10 [11:39<46:33, 349.21s/it]

In [None]:
model.load_state_dict(torch.load(os.path.join(model_dir , 'GPT_best.pt')))
model = model.to(device)

def sample_text(model, tokenizer, text, n_words=20):
    model.eval()
    text = tokenizer.encode(text)
    inputs, past_key_values = torch.tensor([text]).to(device), None

    with torch.no_grad():
        for _ in range(n_words):
            out = model(inputs, past_key_values=past_key_values)
            logits = out.logits
            past_key_values = out.past_key_values
            log_probs = F.softmax(logits[:, -1], dim=-1)
            inputs = torch.multinomial(log_probs, 1)
            text.append(inputs.item())
            if tokenizer.decode(inputs.item()) == eos:
                break

    return tokenizer.decode(text)

text = special_tokens_dict['bos_token'] + "D.O.B:  29/9/2000" + special_tokens_dict['sep_token']
print(sample_text(model, tokenizer, text=text , n_words=20))

<|endoftext|>D.O.B:  29/9/2000

####

DATE: 29/9/2000=>2000-09-29 <|END|>


In [None]:
def process_valid_data(test_txts , out_file):
    with open(out_file , 'w' , encoding = 'utf-8') as fw:
        for txt in test_txts:
            m_report = read_file(txt)
            boundary = 0
            # temp = ''.join(m_report)
            fid = txt.split('/')[-1].replace('.txt' , '')
            for idx,sent in enumerate(m_report):
                if sent.replace(' ' , '').replace('\n' , '').replace('\t' , '') != '':
                    sent = sent.replace('\t' , ' ')
                    fw.write(f"{fid}\t{boundary}\t{sent}\n")
                # else:
                #     print(f"{fid}\t{boundary}\t{sent}\n")
                #     assert 1==2
                boundary += len(sent)

test_phase_path = r'/content/drive/MyDrive/aicup/First_Dataset/Validation_Release'
valid_out_file_path = './valid.tsv'
test_txts = list(map(lambda x:os.path.join(test_phase_path , x) , os.listdir(test_phase_path)))
test_txts = sorted(test_txts)
valid_data = process_valid_data(test_txts , valid_out_file_path)

In [None]:
from datasets import load_dataset, Features, Value
valid_data = load_dataset("csv", data_files=valid_out_file_path, delimiter='\t',
                          features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'])
valid_list= list(valid_data['train'])
valid_list

In [None]:
train_phi_category = [
    'PATIENT', 'DOCTOR', 'USERNAME',
    'PROFESSION',
    'ROOM', 'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION', 'STREET', 'CITY', 'STATE', 'COUNTRY', 'ZIP', 'LOCATION-OTHER',
    'AGE',
    'DATE', 'TIME', 'DURATION', 'SET',
    'PHONE', 'FAX', 'EMAIL', 'URL', 'IPADDR',
    'SSN', 'MEDICALRECORD', 'HEALTHPLAN', 'ACCOUNT', 'LICENSE', 'VEHICLE', 'DEVICE', 'BIOID', 'IDNUM',
]

def get_anno_format(sentence , infos , boundary):
    anno_list = []
    lines = infos.split("\n")
    normalize_keys = ['DATE' , "TIME" , "DURATION" , "SET"]
    phi_dict = {}
    for line in lines:
        parts = line.split(":")
        if parts[0] not in train_phi_category or parts[1] == '':
            continue
        if len(parts) == 2:
            phi_dict[parts[0]] = parts[1].strip()
    for phi_key, phi_value in phi_dict.items():
        normalize_time = None
        if phi_key in normalize_keys:
            if '=>' in phi_value:
                temp_phi_values = phi_value.split('=>')
                phi_value = temp_phi_values[0]
                normalize_time = temp_phi_values[-1]
            else:
                normalize_time = phi_value
        try:
            matches = [(match.start(), match.end()) for match in re.finditer(phi_value, sentence)]
        except:
            continue
        for start, end in matches:
            if start == end:
                continue
            item_dict = {
                        'phi' : phi_key,
                        'st_idx' : start + int(boundary),
                        'ed_idx' : end + int(boundary),
                        'entity' : phi_value,
            }
            if normalize_time is not None:
                item_dict['normalize_time'] = normalize_time
            anno_list.append(item_dict)
    return anno_list

def aicup_predict(model, tokenizer, input, template = "<|endoftext|> __CONTENT__\n\n####\n\n"):
    seeds = [template.replace("__CONTENT__", data['content']) for data in input]
    sep = tokenizer.sep_token
    eos = tokenizer.eos_token
    pad = tokenizer.pad_token
    pad_idx = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    """Generate text from a trained model."""
    model.eval()
    device = model.device
    texts = tokenizer(seeds, return_tensors = 'pt', padding=True).to(device)
    outputs = []
    #return
    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**texts, max_new_tokens=400, pad_token_id = pad_idx,
                                        eos_token_id=tokenizer.convert_tokens_to_ids(eos))
        preds = tokenizer.batch_decode(output_tokens)
        for idx , pred in enumerate(preds):
            if "NULL" in pred:
                continue
            phi_infos = pred[pred.index(sep)+len(sep):].replace(pad, "").replace(eos, "").strip()
            annotations = get_anno_format(input[idx]['content'] , phi_infos , input[idx]['idx'])

            for annotation in annotations:
                if 'normalize_time' in annotation:
                    outputs.append(f'{input[idx]["fid"]}\t{annotation["phi"]}\t{annotation["st_idx"]}\t{annotation["ed_idx"]}\t{annotation["entity"]}\t{annotation["normalize_time"]}')
                else:
                    outputs.append(f'{input[idx]["fid"]}\t{annotation["phi"]}\t{annotation["st_idx"]}\t{annotation["ed_idx"]}\t{annotation["entity"]}')
    return outputs

In [None]:
from tqdm.notebook import tqdm
import io
BATCH_SIZE = 32

with open("./answer.txt",'w',encoding='utf8') as f:
    for i in tqdm(range(0, len(valid_list), BATCH_SIZE)):
        with torch.no_grad():
            seeds = valid_list[i:i+BATCH_SIZE]
            outputs = aicup_predict(model, tokenizer, input=seeds)
            for o in outputs:
                f.write(o)
                f.write('\n')

  0%|          | 0/2680 [00:00<?, ?it/s]