In [1]:
from transformers import AutoTokenizer
import pandas as pd
import os

from src.prompts import system_instruction, basic_prompt

def create_dir_if_not_exist(path):
    """Make sure the directory ``path`` exists, creating it when needed.

    Missing parent directories are created as well. A one-line status
    message is printed in both cases.

    Args:
        path: Directory path to check or create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    if os.path.isdir(path):
        print("{} already exists.".format(path))
        return
    # exist_ok=True keeps this from crashing if the directory is created by
    # someone else between the check above and this call.
    os.makedirs(path, exist_ok=True)
    print("{} directory is created!".format(path))

def count_tokens_and_sort(df, tokenizer):
    """Count the tokens of every report and sort the reports longest-first.

    Args:
        df: DataFrame with a "text" column holding one report per row.
            NOTE: a "size_of_tokens" column is added to ``df`` in place;
            this side effect is kept for backward compatibility.
        tokenizer: Callable that maps a string to a mapping with an
            "input_ids" entry (e.g. a Hugging Face tokenizer).

    Returns:
        A new DataFrame: ``df`` sorted by "size_of_tokens", descending.
    """
    # Only the sequence length is needed, so plain id lists are enough:
    # no tensors are built and nothing is moved to a GPU.
    df["size_of_tokens"] = [
        len(tokenizer(text)["input_ids"]) for text in df["text"]
    ]
    return df.sort_values(by="size_of_tokens", ascending=False)

In [2]:
# Root directory of the TCGA pathology-report data files.
data_base_dir = "/secure/shared_data/tcga_path_reports/"

# Model whose tokenizer is loaded, and the local Hugging Face cache directory.
model_name_or_path = "m42-health/med42-70b"
cache_dir = "/secure/chiahsuan/hf_cache"

# The tokenizer is used to count the number of tokens in each report.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    cache_dir=cache_dir,
)

In [3]:
# Make sure the directories for the processed datasets and results exist.
for results_dir in (
    "m01_test_dir/greedy/",
    "t14_test_dir/greedy/",
    "n03_test_dir/greedy/",
):
    create_dir_if_not_exist(results_dir)

m01_test_dir/greedy/ alrady exists.
t14_test_dir/greedy/ alrady exists.
n03_test_dir/greedy/ alrady exists.


# T

In [4]:
# Load the T14 test split, then rank its reports by token count (longest first).
t14_df = pd.read_csv(
    data_base_dir + "t14_data/Target_Data_T14_test.csv"
)
sorted_t14_df = count_tokens_and_sort(df=t14_df, tokenizer=tokenizer)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [5]:
# Keep only the reports shorter than 4096 tokens; print the shape before and after.
print(sorted_t14_df.shape)
t14_test_4096_all = sorted_t14_df.loc[sorted_t14_df["size_of_tokens"].lt(4096)]
print(t14_test_4096_all.shape)
t14_test_4096_all

(1034, 4)
(1016, 4)


Unnamed: 0,patient_filename,t,text,size_of_tokens
15,TCGA-97-7941.f95131f4-d721-441f-86ee-d9981befc2a7,0,SPECIMENS: 1. F/S LEFT UPPER LOBE NODULE. 2. S...,3963
563,TCGA-RS-A6TO.6F3960E4-4E90-4CFD-B318-61C96F3F5281,3,Clinical Diagnosis & History: Malignant lesion...,3919
350,TCGA-CN-4734.8fc796f0-16c1-4a0a-a686-d2802dc77c83,1,PATIENT HISTORY: CHIEF COMPLAINT/ PRE-OP/ POST...,3861
674,TCGA-CN-6019.6f28e868-5622-42e2-ba7a-80380e364ebb,3,Report Type. Date of Event. Authored by. Hosp/...,3809
7,TCGA-E2-A572.5CED2FBF-42DB-4D17-95C0-DEAE27BBFD02,1,SPECIMEN(S): A. LEFT BREAST AND AXILLARY CONTE...,3649
...,...,...,...,...
486,TCGA-VS-A9U7.3FB4D6F8-6625-49A7-AA2C-533FB9EC9310,0,"PRIMARY SITE: Cervix. ""Uterine cervix biopsy"":...",53
891,TCGA-VS-A9UD.D0920498-DCB8-454A-A294-204A280D4129,2,"PRIMARY SITE: Cervix. 1-""Fragment of uterine c...",49
548,TCGA-VS-A959.10A6CADD-C99B-4A34-9D5D-02AD37C1AB80,1,"PRIMARY SITE: Cervix. ""Uterine cervix biopsy"":...",35
577,TCGA-VS-A8EC.5D347DA8-4A0A-4E09-BCF3-DDD115D741C6,2,"PRIMARY SITE: Cervix. ""Biopsy of the cervix"": ...",33


In [6]:
# Persist the filtered T14 test set (the row index is written too, as `index` is left at its default).
t14_test_4096_all.to_csv(path_or_buf="./t14_test_dir/t14_4096_all_test.csv")

# N

In [7]:
# Load the N03 test split, then rank its reports by token count (longest first).
n03_df = pd.read_csv(
    data_base_dir + "n03_data/Target_Data_N03_test.csv"
)
sortedn_n03_df = count_tokens_and_sort(df=n03_df, tokenizer=tokenizer)

In [8]:
# Keep only the reports shorter than 4096 tokens; print the shape before and after.
# The "before" shape is taken from the sorted frame (as in the T section) rather
# than from n03_df, which only carries the token-count column because
# count_tokens_and_sort adds it to its input in place.
print(sortedn_n03_df.shape)
n03_test_4096_all = sortedn_n03_df[sortedn_n03_df["size_of_tokens"] < 4096]
print(n03_test_4096_all.shape)
n03_test_4096_all

(852, 4)
(830, 4)


Unnamed: 0,patient_filename,n,text,size_of_tokens
52,TCGA-A1-A0SB.BEC7EEA9-6DEA-4402-A3A1-07EF33D58174,0,Final Pathologic Diagnosis: A. Sentinel lymph ...,4091
153,TCGA-JY-A938.572009C2-F1D9-42A0-8489-4066729BA0E2,0,M. Facility: Specimen(s) Received. 1. Lymph-No...,4058
263,TCGA-A2-A4S1.041580F0-700A-4A47-83A6-207ED267E844,0,Rég. Att. M.D. This report contains correction...,4047
599,TCGA-97-A4M6.E697A53A-C377-4BC8-8D3F-79562DFF6F0A,0,"SPECIMENS: 1. F/S LUNG, RIGHT LOWER LOBE WEDGE...",4020
129,TCGA-W6-AA0S.2F3CE1B3-0DFD-4729-8A52-5E0BDE92BD12,0,Surgical Pathology Specimen Source (enter 1 pe...,3827
...,...,...,...,...
105,TCGA-2G-AAFE.414A04CE-E608-48CB-86B6-BF901F888D28,1,Summary pathology report. Left orchidectomy; s...,62
551,TCGA-VS-A9UB.73D61A64-9435-4E72-898C-3A4E7C109CF3,0,PRIMARY SITE: Cervix. Biopsy of uterine cervix...,52
836,TCGA-VS-A952.B7893751-3A8F-4FD8-996F-D8C93C108336,0,PRIMARY SITE: Cervix. Biopsy of uterine cervix...,49
98,TCGA-VS-A9V2.0D5FE178-46D8-41A5-BA54-E9746F3D2B4C,0,"PRIMARY SITE: Cervix. 1- ""Biopsy of cervix"": M...",46


In [9]:
# Persist the filtered N03 test set (the row index is written too, as `index` is left at its default).
n03_test_4096_all.to_csv(path_or_buf="./n03_test_dir/n03_4096_all_test.csv")

# M

In [10]:
# Load the M01 test split, then rank its reports by token count (longest first).
m01_df = pd.read_csv(
    data_base_dir + "m01_data/Target_Data_M01_test.csv"
)
sortedn_m01_df = count_tokens_and_sort(df=m01_df, tokenizer=tokenizer)

In [11]:
# Keep only the reports shorter than 4096 tokens; print the shape before and after.
# The "before" shape is taken from the sorted frame (as in the T section) rather
# than from m01_df, which only carries the token-count column because
# count_tokens_and_sort adds it to its input in place.
print(sortedn_m01_df.shape)
m01_test_4096_all = sortedn_m01_df[sortedn_m01_df["size_of_tokens"] < 4096]
print(m01_test_4096_all.shape)
m01_test_4096_all

(692, 4)
(682, 4)


Unnamed: 0,patient_filename,m,text,size_of_tokens
555,TCGA-A1-A0SB.BEC7EEA9-6DEA-4402-A3A1-07EF33D58174,0,Final Pathologic Diagnosis: A. Sentinel lymph ...,4091
352,TCGA-XF-A8HH.33C8B2EA-980D-4CDC-B8A9-B8583E04BCCA,0,SURGICAL PATHOLOGY RE PORT. FINAL DIAGNOSIS: R...,4060
506,TCGA-JY-A938.572009C2-F1D9-42A0-8489-4066729BA0E2,0,M. Facility: Specimen(s) Received. 1. Lymph-No...,4058
40,TCGA-RS-A6TO.6F3960E4-4E90-4CFD-B318-61C96F3F5281,0,Clinical Diagnosis & History: Malignant lesion...,3919
63,TCGA-GU-A42P.EBD243F6-1508-4BDD-A2D4-58E23D812DF0,0,BIOPSY OR S. Patient Information. Provider Sta...,3852
...,...,...,...,...
249,TCGA-A8-A097.5B867CB6-F94D-435D-83E3-0ADB814B8077,0,Diagnosis: 1. Poorly differentiated invasive d...,67
291,TCGA-E8-A419.96AD2BE9-B8DF-4696-BD0F-61FA2CA60871,0,Laterality: Path Report:Tumor size (in cm):2.5...,63
434,TCGA-2G-AAF6.5892E026-307B-4A38-972F-92DF6ACF0A90,0,Summary pathology report. Right orchidectomy; ...,62
391,TCGA-A8-A09G.2728DB14-66D5-4E67-B9B6-63D93DBF509C,0,Diagnosis: ) Poorly differentiated invasive ca...,53


In [12]:
# Persist the filtered M01 test set (the row index is written too, as `index` is left at its default).
m01_test_4096_all.to_csv(path_or_buf="./m01_test_dir/m01_4096_all_test.csv")

# Unused Area 
The following cells are for testing only.

In [13]:
# Prompt template with a system turn, a prompter turn and an open assistant turn.
MED42_PROMPT_TEMPLATE = """
<|system|>:{system_instruction}
<|prompter|>:{prompt}
<|assistant|>:
"""

# Build a complete prompt around the first row of the filtered M01 set
# (the longest report, since the frame is sorted by token count, descending).
selc_text = m01_test_4096_all["text"].iloc[0]
filled_prompt = basic_prompt.format(report=selc_text)
test_txt = MED42_PROMPT_TEMPLATE.format(
    prompt=filled_prompt,
    system_instruction=system_instruction,
)

# Tokenize the rendered prompt and record its length in tokens.
input_ids = tokenizer(test_txt, return_tensors='pt').input_ids.cuda()
start_index = input_ids.size(-1)

In [14]:
# Display the rendered prompt (template filled with the system instruction and report prompt).
test_txt

'\n<|system|>:You are an expert at interpreting pathology reports for cancer staging.\n<|prompter|>:You are provided with a pathology report for a cancer patient. \nPlease review this report and determine the pathologic stage of the patient\'s cancer. \n\nHere is the report:\n```\nFinal Pathologic Diagnosis: A. Sentinel lymph node #1, biopsy: No carcinoma in one lymph node (0/1); see. comment. B. Sentinel lymph node #2, biopsy: No carcinoma in one lymph node (0/1); see. comment. C. Breast, left, wire-guided partial mastectomy: 1. Adenoid cystic carcinoma, SBR Grade 1, 1.2 cm; see comment. 2. Microcalcifications involving benign ducts. 3. Atypical ductal hyperplasia. 4. Apocrine metaplasia. 5. Blopsy site changes. 6. Fibroadenoma. D. Breast, right, mammoplasty: 1. Intraductal papilloma. 2. Sclerosing adenosis. 3. Apocrine metaplasia. 4. Microcalcifications involving benign glands. 5. Microcysts. Page of 6. Working Draft. 6. Skin with no significant pathologic abnormality. 7. No carcinom

In [15]:
# Number of tokens in the rendered prompt `test_txt`.
# NOTE(review): the recorded output (4194) is above the 4096 cut-off used when
# filtering, because that filter counts the report text only. Confirm that the
# prompt overhead still fits the model's context budget.
start_index

4194