This notebook evaluates the performance of Llama2 70b-chat (accessed through HuggingChat) on the de-identification task, using the i2b2 medical record data.

# **Preparing data**

In [None]:
from google.colab import drive
import os
import pandas as pd

# Mount Google Drive under /content/drive so the dataset paths used below resolve;
# force_remount=True asks Colab to remount even when a mount already exists.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
directory = "/content/drive/My Drive/GE Capstone/Data/i2b2/testing-PHI-Gold-fixed"

from bs4 import BeautifulSoup

# Parse every XML file in `directory` into three parallel lists:
#   list_of_files[k]         -> file name without its extension
#   list_of_text_contents[k] -> first child of the <TEXT> element
#   list_of_tags[k]          -> [{"text": ..., "type": ...}, ...], one dict per element under <TAGS>
# NOTE(review): os.listdir() order is filesystem-dependent and later cells look these
# lists up by position, so the iteration order is deliberately left unchanged.
list_of_text_contents = []
list_of_files = []
list_of_tags = []
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    if not os.path.isfile(f):
        continue

    # splitext() drops the extension whatever its length (the old `[:-4]` assumed ".xml").
    list_of_files.append(os.path.splitext(filename)[0])

    with open(f, encoding="utf-8") as fp:
        soup = BeautifulSoup(fp, features="xml")

    text = soup.find('TEXT')
    text_content = text.contents[0]
    list_of_text_contents.append(text_content)

    tags = soup.find('TAGS')
    tag_list = []
    # Only direct child *elements* carry annotations; whitespace strings between them are
    # skipped here explicitly instead of being swallowed by a bare `except`.
    for item in tags.find_all(True, recursive=False):
        if item.has_attr('text') and item.has_attr('TYPE'):
            tag_list.append({"text": item['text'], "type": item["TYPE"]})

    list_of_tags.append(tag_list)


In [None]:
# Positions of the selected records: those whose text length is below 2278
# (the saved run of this cell reports 100 of them).
index = [
    position
    for position, record_text in enumerate(list_of_text_contents)
    if len(record_text) < 2278
]
len(index)

100

In [None]:
# Restrict each of the three parallel lists to the selected positions.
sample_list_of_files, sample_list_of_text_contents, sample_list_of_tags = (
    [parallel_list[position] for position in index]
    for parallel_list in (list_of_files, list_of_text_contents, list_of_tags)
)

len(sample_list_of_text_contents)
len(sample_list_of_tags)

100

In [None]:
# One row per selected record; the positional index is exposed as a `file_index` column.
record_df = (
    pd.DataFrame(sample_list_of_text_contents, columns=["record"])
    .reset_index()
    .rename(columns={"index": "file_index"})
)
record_df

Unnamed: 0,file_index,record
0,0,\n\n\nRecord date: 2086-05-25\n\n ...
1,1,\n\n\nRecord date: 2092-03-26\n\n\n\nReason fo...
2,2,\n\n\nRecord date: 2083-07-20\n\n ...
3,3,\n\n\nRecord date: 2125-07-24\n\nRoutine F/U\n...
4,4,\n\n\nRecord date: 2078-05-31\n\nVASCULAR SURG...
...,...,...
95,95,\n\n\nRecord date: 2096-02-13\n\nSaint John Va...
96,96,\n\n\nRecord date: 2093-04-30\n\n\n\nReason fo...
97,97,\n\n\nRecord date: 2095-08-06\n\nOncology COMP...
98,98,\n\n\nRecord date: 2092-05-24\n\n\n\n\nBAYONNE...


Split each record's text into slices of at most 100 whitespace-separated tokens (if needed).

In [None]:
### split_text function
def split_text(text, slice_length):
  """Split `text` into consecutive slices of at most `slice_length` tokens.

  Tokens are the whitespace-separated pieces of `text`; each slice is returned as a
  single string with its tokens joined by one space.

  Args:
    text: The string to split.
    slice_length: Maximum number of tokens per slice; must be >= 1.

  Returns:
    A list of slice strings in order. Every slice is non-empty; text without any
    tokens yields an empty list.

  Raises:
    ValueError: If `slice_length` is smaller than 1.
  """
  if slice_length < 1:
    raise ValueError("slice_length must be a positive integer")

  # str.split() with no argument splits on runs of whitespace and drops empty pieces.
  text_tokens = text.split()

  # Stepping through the token list by slice_length produces exactly the slices that
  # exist; the earlier `n // slice_length + 1` count appended an empty trailing slice
  # whenever the token count was an exact multiple of slice_length.
  return [
    " ".join(text_tokens[start:start + slice_length])
    for start in range(0, len(text_tokens), slice_length)
  ]

# Split each record into a list of slices of at most SLICE_LENGTH tokens.
# The value is 100 to agree with this section's description and with the saved output
# of the next cell; the literal previously passed here (30) contradicted both.
# NOTE(review): confirm 100 is the slice size actually sent to the model.
SLICE_LENGTH = 100
split_df = (
    record_df["record"]
    .map(lambda record_text: split_text(record_text, SLICE_LENGTH))
    .to_frame()
    .reset_index(drop=True)
)

# Explode the list of sentences in each row into sentence by row
# split_df_row_sep = split_df.explode(split_df.columns[-1]).reset_index()
# split_df_row_sep = split_df_row_sep[[split_df_row_sep.columns[-1]]]
#split_df_row_sep.to_csv('/content/drive/My Drive/GE Capstone/Data/ASRS data/splitted_report_sentbyrow.csv', index=False)

In [None]:
# Show the list of slices for the first record. Purely positional access is used
# because an integer key on the label-indexed row Series (`.loc[0][0]`) depends on a
# deprecated positional fallback in pandas.
split_df.iloc[0, 0]

['Record date: 2086-05-25 FCH EMERGENCY DEPT VISIT TANNER,YUL M. 099-04-65-0 VISIT DATE: 05/25/86 I saw this patient May 25 with the SAR and the attending from the Short-stay Unit. PRESENTING COMPLAINT: Polyuria, fatigue. HISTORY OF PRESENTING COMPLAINT: This 50 year old man was referred in by Dr. Locke covering for Dr. Tirado for polyuria and fatigue. His last recorded fasting blood sugar was 185. He has come week from a trip for one week and he did follow his diet, but had continued increasing polyuria, polydipsia, blurring of vision and weakness. PAST MEDICAL HISTORY: Renal insufficiency and "incipient type II',
 'diabetes" for which he had been following a strict diet. PHYSICAL EXAMINATION: Unremarkable except for a chronic strabismus. LABORATORY EVALUATION: Laboratory studies included a blood glucose of 709, an anion gap of 23 and elevated beta hydroxybutyrate. THERAPY RENDERED/COURSE IN ED: He was begun on intravenous fluids and intravenous insulin, and arrangements were made to 

# **Performance Evaluation**

In [None]:
!pip install tiktoken
import re
import string
import tiktoken
import pandas as pd



In [None]:
# Load the Llama2 responses collected on HuggingChat, one CSV per annotator.
# Each file has one column of original text and one column of text rewritten by Llama2.
folder = "/content/drive/My Drive/GE Capstone/Data/i2b2/llama2_file_split/"
lc, sn, ky, jw, sk = (
    pd.read_csv(os.path.join(folder, csv_name))
    for csv_name in (
        "i2b2_longcong_modified.csv",
        "i2b2_shaonan_modified.csv",
        "i2b2_kaiyuan_modified.csv",
        "i2b2_jiawen_modified.csv",
        "i2b2_shukai_modified.csv",
    )
)

# The rewritten-text column is renamed to "format_text" in the frames where it is
# labelled differently, so that the concatenation below lines the columns up.
lc_result, sk_result = lc, sk
sn_result = sn.rename(columns={"format_all": "format_text"})
ky_result = ky.rename(columns={"formal_text": "format_text"})
jw_result = jw.rename(columns={"format_all": "format_text"})

# Stack the five frames in a fixed order and renumber the rows from 0.
result_df = pd.concat(
    [lc_result, sn_result, ky_result, jw_result, sk_result],
    ignore_index=True,
)

In [None]:
# # combine tuples of 100 tokens based the file it belongs to
# def combine_tuples(df_ls, df_str, start, special=""):
#   if special == "lc":
#     file_idx = [[idx] for idx in range(start, start+20)]
#   else:
#     file_idx = [[idx] for idx in range(start, start+21)]
#   # print(file_idx)

#   tuple_counts = list(df_ls["record"].str.len())
#   # print(tuple_counts)
#   # print(len(tuple_counts))

#   file_idx_all = [l*tuple_counts[l[0]-start] for l in file_idx]

#   final = [item for row in file_idx_all for item in row]

#   if special == "sk":
#     df_str["file_idx"] = final[:-3]
#   else:
#     df_str["file_idx"] = final

#   # drop overlapped files result
#   if special != "lc":
#     df_str = df_str[df_str["file_idx"] > start]

#   df_str = df_str.groupby('file_idx').agg({'formal_text': lambda x: ' '.join(x)}).reset_index()
#   return df_str

In [None]:
# Display the concatenated frame of per-annotator results built in the loading cell.
result_df

Unnamed: 0,file_index,record,format_text
0,0,\n\n\nRecord date: 2086-05-25\n\n ...,Record date: #################### FCH EMERGEN...
1,1,\n\nRecord date: 2092-03-26\n\n\n\nReason for ...,Record date: ##########\n\nReason for Visit\n\...
2,2,\n\n\nRecord date: 2083-07-20\n\n ...,Record date: ######################\n\nSILVER ...
3,3,\n\n\nRecord date: 2125-07-24\n\nRoutine F/U\n...,Record date: #####################\n\nRoutine ...
4,4,\n\n\nRecord date: 2078-05-31\n\nVASCULAR SURG...,Record date: ##########\n\nVASCULAR SURGERY H&...
...,...,...,...
95,95,\n\n\nRecord date: 2096-02-13\n\nSaint John Va...,Record date: 2096-02-13\n\nSaint John Valley H...
96,96,\n\n\nRecord date: 2093-04-30\n\n\n\nReason fo...,Record date: ##########\n\nReason for Visit\n\...
97,97,\n\n\nRecord date: 2095-08-06\n\nOncology COMP...,Record date: ######################\n\nOncolog...
98,98,\n\n\nRecord date: 2092-05-24\n\n\n\n\nBAYONNE...,Record date: ######################\n\nBAYONNE...


In [None]:
# Count tokens with a tiktoken encoding.
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return how many tokens `string` has under the tiktoken encoding `encoding_name`.

    Note: the parameter name `string` shadows the `string` module inside this function;
    the module is not needed here.
    """
    return len(tiktoken.get_encoding(encoding_name).encode(string))

# Tokenize the rewritten (de-identified) text and the original content the same way:
# lowercase, remove every character in string.punctuation, split on whitespace.
def tokenize_text_generation(rewritten_text, text_content):
  """Normalize and tokenize both texts.

  Returns:
    A tuple `(rewritten_text_tokens, original_tokens)` of token lists, in that order.
  """
  punctuation_table = str.maketrans("", "", string.punctuation)

  def _to_tokens(raw_text):
    # Lowercase first, then delete punctuation, then split and drop empty pieces.
    cleaned = raw_text.lower().translate(punctuation_table)
    return [piece for piece in re.split(r'[\s\t\n]+', cleaned) if piece != '']

  return _to_tokens(rewritten_text), _to_tokens(text_content)

# Evaluation metrics for one record.
def evaluation_metrics(original_tokens, tag_list, rewritten_tokens, verbose=True):
  """Score a rewritten record against its gold tags using token presence.

  Tag texts are split on whitespace, lowercased and stripped of punctuation, so they
  are comparable with tokens normalized the same way.

  Counting rules (each occurrence is counted, membership is by token value):
    * tag token absent from `rewritten_tokens`      -> TP (it was removed)
    * tag token present in `rewritten_tokens`       -> FN (it leaked)
    * non-tag original token present in the rewrite -> TN
    * non-tag original token absent from the rewrite -> FP
  FP is a heuristic: the model may rephrase a sentence with different words, so a
  missing non-tag token is not necessarily one that was masked.

  Args:
    original_tokens: Normalized tokens of the original text.
    tag_list: Iterable of dicts; only the "text" entry of each dict is read.
    rewritten_tokens: Normalized tokens of the rewritten text.
    verbose: When True (default), print each tag token counted as FN.

  Returns:
    Dict with keys "TP", "TN", "FP", "FN", "accuracy", "precision", "recall" and
    "F1-score"; a ratio whose denominator is zero is reported as 0.0.
  """
  punctuation_table = str.maketrans("", "", string.punctuation)

  # Normalize every whitespace-separated piece of every tag text. Empty results are
  # dropped *after* punctuation stripping: a punctuation-only piece such as "-" would
  # otherwise become '' and always be counted as a true positive.
  normalized_pieces = (
    piece.lower().translate(punctuation_table)
    for tag in tag_list
    for piece in tag["text"].split()
  )
  tags = [token for token in normalized_pieces if token]

  # Sets give constant-time membership checks; the counts below still run over the
  # original lists so repeated tokens are counted once per occurrence.
  tag_vocabulary = set(tags)
  rewritten_vocabulary = set(rewritten_tokens)

  TP = TN = FP = FN = 0

  # Tokens that should stay visible: kept -> TN, missing -> FP.
  for token in original_tokens:
    if token in tag_vocabulary:
      continue
    if token in rewritten_vocabulary:
      TN += 1
    else:
      FP += 1

  # Tokens that should be hidden: missing -> TP, still present -> FN.
  for token in tags:
    if token in rewritten_vocabulary:
      FN += 1
      if verbose:
        print(token)
    else:
      TP += 1

  def _ratio(numerator, denominator):
    # A zero denominator yields 0.0 instead of raising ZeroDivisionError.
    return numerator / denominator if denominator else 0.0

  precision = _ratio(TP, TP + FP)
  recall = _ratio(TP, TP + FN)
  accuracy = _ratio(TP + TN, TP + TN + FP + FN)
  f1 = _ratio(2 * (precision * recall), precision + recall)

  return {"TP": TP, "TN": TN, "FP": FP, "FN": FN, "accuracy": accuracy, "precision": precision, "recall": recall, "F1-score": f1}

In [None]:
# Sample of the original text: last row, column position 1
# (shown as `record` in the saved output of result_df).
result_df.iloc[-1,1]

"\n\n\nRecord date: 2066-07-26\n\n\n\n\n\n\n\n\n\n\nCHIEF COMPLAINT:  PER DR VARNEY\n\n\n\nHISTORY OF PRESENT ILLNESS: s/p CABG 2 weeks ago; doing well without Sx CP/SOB;  was depressed last week but now feeling much better that she walking outside home and back on Wellbutrin (inadvertantly stopped)  Here to have chest skin sutures removed\n\n\n\n\n\nPAST MEDICAL HISTORY: Entered By:frye  On:03/13/2066 \n\nEndocrine disorders:  type II, non-insulin dependent diabetes mellitus.\n\n\n\n\n\n\n\n\n\n\n\nREVIEW OF SYSTEMS: Const: no fever, chills, sweats, weight change, or malaise\n\n\n\n\n\nCV: no chest pain, dyspnea on exertion or palpitations\n\nResp: no SOB, no cough, no wheezing\n\nGI: no abdominal pain, no diarrhea, no constipation\n\n\n\nMusc: no back, shoulder, limb pain, no myalgias\n\n\nSkin: no rashes or pruritis\n\n\n\nPsych: not depressed or anxious\n\n\n\n.\n\n\n\n\n\n\n\nEXAMINATION: well appearing, in good spirits\n\n\n\nlungs CTA; no JVD, incision C/D without erythema\n\nRR

In [None]:
# Sample of the rewritten text: last row, last column
# (shown as `format_text` in the saved output of result_df).
result_df.iloc[-1,-1]

"Record date: ######################\n\nCHIEF COMPLAINT: PER DR VARNEY\n\nHISTORY OF PRESENT ILLNESS: s/p CABG ###################### ago; doing well without Sx CP/SOB; was depressed last week but now feeling much better that she walking outside home and back on Wellbutrin (inadvertently stopped) Here to have chest skin sutures removed\n\nPAST MEDICAL HISTORY: Entered By: ###################### On: ######################\n\nEndocrine disorders: type II, non-insulin dependent diabetes mellitus.\n\nREVIEW OF SYSTEMS: Const: no fever, chills, sweats, weight change, or malaise\n\nCV: no chest pain, dyspnea on exertion or palpitations\n\nResp: no SOB, no cough, no wheezing\n\nGI: no abdominal pain, no diarrhea, no constipation\n\nMusc: no back, shoulder, limb pain, no myalgias\n\nSkin: no rashes or pruritis\n\nPsych: not depressed or anxious\n\nEXAMINATION: well appearing, in good spirits\n\nlungs CTA; no JVD, incision C/D without erythema\n\nRRR nl S1S2 no murmur/rub\n\nno edema\n\n8 sutur

In [None]:
# Gold tags (text + type) of the last selected record, i.e. the strings that
# should no longer appear after de-identification.
sample_list_of_tags[-1]

[{'text': '2066-07-26', 'type': 'DATE'},
 {'text': 'VARNEY', 'type': 'DOCTOR'},
 {'text': 'frye', 'type': 'DOCTOR'},
 {'text': '03/13/2066', 'type': 'DATE'},
 {'text': 'Austin', 'type': 'DOCTOR'},
 {'text': '07/25/2066', 'type': 'DATE'},
 {'text': '04/17/2067', 'type': 'DATE'},
 {'text': 'emmons', 'type': 'DOCTOR'},
 {'text': '07/25/2066', 'type': 'DATE'},
 {'text': 'emmons', 'type': 'DOCTOR'},
 {'text': '05/10/2066', 'type': 'DATE'},
 {'text': '04/15/2068', 'type': 'DATE'},
 {'text': 'rosenberg', 'type': 'DOCTOR'},
 {'text': '11/20/2067', 'type': 'DATE'},
 {'text': 'rosenberg', 'type': 'DOCTOR'},
 {'text': '04/08/2065', 'type': 'DATE'},
 {'text': '11/12/2067', 'type': 'DATE'},
 {'text': 'stephens', 'type': 'DOCTOR'},
 {'text': '04/24/2067', 'type': 'DATE'},
 {'text': 'stephens', 'type': 'DOCTOR'},
 {'text': '03/02/2065', 'type': 'DATE'},
 {'text': '03/30/2074', 'type': 'DATE'},
 {'text': 'rosenberg', 'type': 'DOCTOR'},
 {'text': '03/02/2065', 'type': 'DATE'},
 {'text': '03/30/2074', '

In [None]:
import json

# Evaluate every row of result_df against the gold tags of the record at the same position.
# The loop variable is named `row_idx` (not `index`) so it does not overwrite the `index`
# list of selected positions that earlier cells rely on when they are re-run.
assert len(result_df) <= len(sample_list_of_tags), (
    "result_df has more rows than there are tag lists; rows and tags are matched by position"
)

result_dict = {}
for row_idx, row in result_df.iterrows():
    orig_text = row["record"]
    rewritten_text = row["format_text"]
    tags = sample_list_of_tags[row_idx]

    # Token count of the original text under the "gpt2" encoding, shown in the header only.
    num_tokens = num_tokens_from_string(orig_text, "gpt2")
    print("-----------No." + str(row_idx + 1) + "(" + str(num_tokens) + ")-----------\n")

    # tokenize and evaluate
    rewritten_tokens, original_tokens = tokenize_text_generation(rewritten_text, orig_text)
    result = evaluation_metrics(original_tokens, tags, rewritten_tokens)
    print(result)

    # keep the per-record result, keyed by row position
    result_dict[f"sentence{row_idx}"] = result

# save results into a JSON file
with open("llama2.i2b2.result.json", "w") as outfile:
    json.dump(result_dict, outfile, indent=4)

In [None]:
# Aggregate the per-record counts and report metrics computed from the summed counts.
TP_ls = [metrics["TP"] for metrics in result_dict.values()]
TN_ls = [metrics["TN"] for metrics in result_dict.values()]
FP_ls = [metrics["FP"] for metrics in result_dict.values()]
FN_ls = [metrics["FN"] for metrics in result_dict.values()]

total_TP, total_TN, total_FP, total_FN = (
    sum(counts) for counts in (TP_ls, TN_ls, FP_ls, FN_ls)
)

total_precision = total_TP / (total_TP + total_FP)
total_recall = total_TP / (total_TP + total_FN)
total_accuracy = (total_TP + total_TN) / (total_TP + total_TN + total_FP + total_FN)
total_f1 = 2 * (total_precision * total_recall) / (total_precision + total_recall)

print(f"TP:{total_TP}, TN:{total_TN}, FP:{total_FP}, FN:{total_FN}")

# Same report lines, in the same order, as separate print calls would produce.
for line_template, value in (
    ("The accuracy is: {}", total_accuracy),
    ("The recall is: {}", total_recall),
    ("The precision is: {}", total_precision),
    ("The F-1 score is: {}", total_f1),
):
    print(line_template.format(value))

TP:1039, TN:20639, FP:2798, FN:752
The accuracy is: 0.8592833359758998
The recall is: 0.5801228364042434
The precision is: 0.2707844670315351
The F-1 score is: 0.369225302061123
