This notebook evaluates the performance of Llama2 70b-chat (accessed through HuggingChat) on the de-identification transfer learning task, using the ASRS aviation safety narrative data.

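# **Preparing data**
Replace the anonymized placeholders (`Aircraft X`, `Aircraft Y`, `XXX`, `ZZZ`) with randomly chosen synthetic entities, so that the reports contain identifiers for the model to de-identify.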

In [None]:
### Entity pools used to put synthetic identifiers back into the de-identified text
### (consumed by replace_with_random_entities, defined below)

# Candidate aircraft registration codes; substituted for the placeholders
# 'Aircraft X' and 'Aircraft Y'.
aircraft_registration_code = [
    "LC-61A", "AL-Z0X", "CO-N4H", "MQ-RST", "UZ-1G3", "XW-9TW", "WI-JY2", "LV-JI7", "SX-H7A", "ES-P5A",
    "UL-CCO", "ET-VDD", "Q670FR", "L677AW", "P859YW", "E805KD", "FA-UP4", "DG-ZQY", "IN-RCK", "U038EM",
    "N195QW", "GH-KJT", "TV-XNF", "ZY-QBT", "O630RS", "B119IP", "V789MT", "V297EC", "FX-MAI", "QX-D3N",
    "IB-3L5", "KP-9EO", "OP-9E0", "IC-HN8", "GH-RAT", "TZ-8EB", "BZ-39F", "O020IM", "C605HB", "Y401FW",
    "U606BO", "WU-4V1", "A833PI", "TK-9A6", "AV-VON", "H512YT", "B929JP", "FJ-FNL", "W186HO", "AM-L80",
    "RP-KGB", "U917DB", "S193BX", "Z527QC", "HP-JMR", "N854UH", "MI-LPC", "B750DX", "FM-DLF", "EF-4XO",

]

# Candidate codes substituted for the 'XXX' placeholder (C001 ... C030).
code_xxx_list = ['C001', 'C002', 'C003', 'C004', 'C005', 'C006', 'C007', 'C008', 'C009', 'C010', 'C011', 'C012', 'C013', 'C014', 'C015', 'C016', 'C017', 'C018', 'C019', 'C020', 'C021', 'C022', 'C023', 'C024', 'C025', 'C026', 'C027', 'C028', 'C029', 'C030']

# Candidate three-letter codes substituted for the 'ZZZ' placeholder.
# NOTE(review): these look like IATA airport codes — confirm.
code_zzz_list = [
    "ATL", "PEK", "DXB", "LAX", "HND", "ORD", "LHR", "PVG", "CDG", "DFW", "AMS", "FRA", "IST", "CAN", "JFK", "SIN", "HKG",
    "ICN", "DEN", "BKK", "SFO", "KUL", "MAD", "CTU", "MIA", "MUC", "MCO", "BCN", "SEA", "YYZ", "FCO", "LGW", "IAH", "SZX",
    "GRU", "PHX", "SYD", "LAS", "DEL", "CLT", "MNL", "TPE", "MEX", "CGK", "ZRH", "NRT", "MXP", "DME", "BRU", "CPH"
]

def replace_with_random_entities(row):
    """Fill the anonymization placeholders of one report string with random entities.

    'Aircraft X' and 'Aircraft Y' receive two different codes drawn from
    aircraft_registration_code, 'XXX' receives one code from code_xxx_list and
    'ZZZ' one code from code_zzz_list. Every occurrence of a placeholder in
    the string gets the same replacement. Matching is by plain substring, so
    a placeholder embedded in a longer token is replaced too (e.g. 'ZZZ1'
    becomes '<code>1').

    Args:
        row: report text (str) containing zero or more placeholders.

    Returns:
        The text with all placeholders substituted.
    """
    first_registration = random.choice(aircraft_registration_code)
    second_registration = random.choice(aircraft_registration_code)
    # Re-draw until the two aircraft end up with distinct registrations.
    while second_registration == first_registration:
        second_registration = random.choice(aircraft_registration_code)

    # (placeholder, replacement) pairs, applied in this order.
    substitutions = (
        ('Aircraft X', first_registration),
        ('Aircraft Y', second_registration),
        ('XXX', random.choice(code_xxx_list)),
        ('ZZZ', random.choice(code_zzz_list)),
    )
    for placeholder, entity in substitutions:
        row = row.replace(placeholder, entity)
    return row


In [None]:
from google.colab import drive
import os
import pandas as pd
import random

drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Load the de-identified ASRS narrative reports (one CSV per export) and
# replace the anonymized placeholders with random synthetic entities.
# For each input CSV an 'un-deidentified_<name>' copy is written with two
# extra columns: 'total_report' (Report 1 + ' ' + Report 2) and
# 'total_report_un-deidentified' (the same text with placeholders filled in).

output_folder_path = "/content/drive/My Drive/GE Capstone/Data/ASRS data/un-deidentified"

folder_path = "/content/drive/My Drive/GE Capstone/Data/ASRS data/"

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before the first write.
os.makedirs(output_folder_path, exist_ok=True)

for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)
        # Assign the filled column back: calling fillna(inplace=True) on a
        # column selection is a chained in-place operation that is not
        # guaranteed to modify df itself (it does not under Copy-on-Write),
        # which would leave NaN values that break the string operations below.
        df['Report 1'] = df['Report 1'].fillna('')
        df['Report 2'] = df['Report 2'].fillna('')
        df['total_report'] = df['Report 1'] + ' ' + df['Report 2']
        df['total_report_un-deidentified'] = df['total_report'].apply(replace_with_random_entities)
        output_file_path = os.path.join(output_folder_path, 'un-deidentified_' + filename)
        df.to_csv(output_file_path, index=False)

In [None]:
# Combine the separate un-deidentified report files into one dataframe
folder_path = "/content/drive/My Drive/GE Capstone/Data/ASRS data/un-deidentified/"
# Only CSV files are read; sorting makes the row order reproducible
# (os.listdir returns entries in arbitrary order).
file_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))

# Read each file exactly once (indexing file_names[0] inside the loop would
# load the first file repeatedly) and concatenate a single time at the end
# instead of growing the dataframe inside the loop.
frames = [
    pd.read_csv(folder_path + name).iloc[1:]  # exclude first row "narrativenarrative" in each file
    for name in file_names
]
all_df = pd.concat(frames) if frames else pd.DataFrame()
report_df = all_df[["total_report_un-deidentified"]]
report_df

Unnamed: 0,total_report_un-deidentified
1,The weather at TPA on the arrival was 1/4 SM a...
2,On descent into DME setting up for the ILS XX ...
3,A Ramp Agent brought me a battery powered whee...
4,I was hand flying the ILS 20R into SNA and was...
5,W186HO from PHXZ blocked into Gate X at XA:39 ...
...,...
2131,DJI Mavic Air 2s was hovering above empty stre...
2132,Runway marking dangerously worn. First reporte...
2133,EA (Engineering Authorization) doesn't require...
2134,I entered a taxiway without permission. Taxiwa...


Split the report text of each narrative into chunks of at most 100 whitespace-separated tokens (if needed)

In [None]:
### split_text function
def split_text(text, slice_length):
  """Split text into consecutive chunks of at most slice_length tokens.

  Tokens are the non-empty pieces obtained by splitting on runs of
  whitespace; each chunk is its tokens re-joined with single spaces.

  Args:
    text: the string to split.
    slice_length: maximum number of tokens per chunk; must be positive.

  Returns:
    A list of chunk strings in original order. Every chunk except possibly
    the last holds exactly slice_length tokens. No empty chunk is produced:
    a token count that is an exact multiple of slice_length does not yield a
    trailing '' entry, and text without any token yields an empty list.

  Raises:
    ValueError: if slice_length is not positive.
  """
  import re
  if slice_length <= 0:
    raise ValueError("slice_length must be a positive integer")

  text_tokens = [x for x in re.split(r'[\s\t\n]+', text) if x != '']
  # Stepping over the start offsets visits exactly the non-empty slices.
  return [
    " ".join(text_tokens[start:start + slice_length])
    for start in range(0, len(text_tokens), slice_length)
  ]

# Chunk every report into a list of pieces of at most 100 tokens each
split_df = (
    report_df["total_report_un-deidentified"]
    .map(lambda report: split_text(report, 100))
    .to_frame()
    .reset_index()
)
# Keep only the last column (the chunk lists); reset_index put the old index in front of it
split_df = split_df.iloc[:, [-1]]
#split_df.to_csv('/content/drive/My Drive/GE Capstone/Data/ASRS data/splitted_report_all.csv', index=False)

# Explode the list of sentences in each row into sentence by row
# split_df_row_sep = split_df.explode(split_df.columns[-1]).reset_index()
# split_df_row_sep = split_df_row_sep[[split_df_row_sep.columns[-1]]]
#split_df_row_sep.to_csv('/content/drive/My Drive/GE Capstone/Data/ASRS data/splitted_report_sentbyrow.csv', index=False)

# **Performance Evaluation**

In [None]:
!pip install tiktoken
import re
import string
import tiktoken
import pandas as pd



In [None]:
# Import the Llama2 response on HuggingChat
# Each file has one column of original text and one column of rewritten text by Llama2

folder = "/content/drive/My Drive/GE Capstone/Data/ASRS data/llama2_file_split/"

# One CSV per annotator, read in the same order as they are combined below
lc, sn, ky, jw, sk = (
    pd.read_csv(os.path.join(folder, csv_name))
    for csv_name in (
        "asrs_longcong_modified.csv",
        "asrs_shaonan_modified.csv",
        "asrs_kaiyuan_modified.csv",
        "asrs_jiawen_modified.csv",
        "asrs_shukai_modified.csv",
    )
)

# Two of the files name the rewritten-text column "formal_text"; align it to "format_text"
ky_result = ky.rename(columns={"formal_text": "format_text"})
jw_result = jw.rename(columns={"formal_text": "format_text"})
lc_result, sn_result, sk_result = lc, sn, sk

# Combined final results, re-indexed 0..n-1
result_df = pd.concat(
    [lc_result, sn_result, ky_result, jw_result, sk_result],
    ignore_index=True,
)

In [None]:
# Sampled original text: second-to-last row, first column of the combined results
# (presumably the first column holds the original text — confirm against the CSV column order)
result_df.iloc[-2,0]

"After reviewing the dispatch release and weather reports the First Officer (PM (Pilot Monitoring)) and myself Captain (PF (Pilot Flying)) coordinated for a flow time to CTU1. I conserved fuel after push back waiting on flow time of which was moved due to ATC delays into CTU1 and additional 20 minute to XA20z EDT. Upon departure we received vectors NE bound up to 10;000 ft. and shortly after to 11;000 ft. During climb it was IMC and light to moderate constant chop occasional turbulence with light to moderate precipitation. The continuous ignition was used throughout the whole sequence per procedure from weather conditions. My radar was set at +5.5 and 20 miles for departure and lowered around tilt +3.5 at the enroute cruise altitude with scattered returns mostly light to moderate precipitation. The aircraft anti ice cowls were activated as temperature dropped. ATC issued us a new arrival and vectors which I was scanning ahead with radar. About this time moderate to extreme precipitatio

In [None]:
# Sampled rewritten text: same row, last column of the combined results
# (presumably the last column holds the Llama2 rewrite — confirm against the CSV column order)
result_df.iloc[-2,-1]

"After reviewing the dispatch release and weather reports, the First Officer (PM) and I (Captain/PF) coordinated for a flow time to CTU1. However, due to ATC delays, our flow time was moved to CTU1 and additional 20 minutes to XA20z EDT. Upon departure, we received vectors northeast bound up to 10,000 ft. and shortly after to 11,000 ft. During the climb, it was IMC and light to moderate constant chop with occasional turbulence and light to moderate precipitation. We used continuous ignition throughout the sequence per procedure from weather conditions. My radar was set at +5.5 and 20 miles for departure and lowered around tilt +3.5 at the enroute cruise altitude with scattered returns mostly light to moderate precipitation. The aircraft anti-ice cowls were activated as the temperature dropped.\n\nATC issued us a new arrival and vectors, which I was scanning ahead with radar. About this time, moderate to extreme precipitation was reported to us, and I asked for a vector eastbound. Sudde

In [None]:
# calculate the number of tokens
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return the number of tokens `string` encodes to under the tiktoken encoding `encoding_name`."""
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

# tokenize the rewritten text after de-identification as well as the original content
# all tokens are lowercased without any punctuation
def tokenize_text_generation(rewritten_text, text_content):
    """Normalize and tokenize a rewritten text and its original.

    Both strings go through the same steps: lowercase, drop every character
    listed in string.punctuation, split on runs of whitespace, discard empty
    pieces.

    Args:
        rewritten_text: the model's rewritten text.
        text_content: the original text.

    Returns:
        (rewritten_text_tokens, original_tokens), two lists of token strings.
    """
    drop_punctuation = str.maketrans("", "", string.punctuation)

    def to_tokens(text):
        cleaned = text.lower().translate(drop_punctuation)
        return [piece for piece in re.split(r'[\s\t\n]+', cleaned) if piece != '']

    return to_tokens(rewritten_text), to_tokens(text_content)

# this function is used as the evaluation metrics
def evaluation_metrics(original_tokens, tag_list, rewritten_tokens):
  """Score one rewritten text against its original at token level.

  Each token of the original is classified by two questions: is it a tag
  (an entity that should have been masked), and does it still occur
  anywhere in the rewritten text?

    tag,     absent from rewrite  -> TP (correctly removed)
    tag,     present in rewrite   -> FN (leaked)
    non-tag, present in rewrite   -> TN (correctly kept)
    non-tag, absent from rewrite  -> FP (removed although not a tag)

  FP cannot distinguish masking from rephrasing: a non-tag word that the
  model merely replaced with a synonym is counted as FP as well.

  Args:
    original_tokens: tokens of the original text; repeated tokens are
      counted once per occurrence.
    tag_list: entity strings to be de-identified. They are lowercased and
      stripped of string.punctuation before comparison; '' entries are
      skipped.
    rewritten_tokens: tokens of the rewritten text; only membership matters.

  Returns:
    dict with integer counts "TP", "TN", "FP", "FN" and floats "accuracy",
    "precision", "recall", "F1-score". A ratio whose denominator is zero is
    reported as 0.0.

  NOTE(review): tags are compared in lowercased form, so when the tokens are
  lowercased too (as tokenize_text_generation produces them) a tag that
  spells an ordinary word after lowercasing (e.g. "CAN" -> "can") makes
  every occurrence of that word count as a token to be masked — confirm this
  is acceptable for the reported recall.
  """
  # all tags are lowercased without any punctuations; a set gives O(1) lookups
  normalized_tags = {
      "".join(ch for ch in tag.lower() if ch not in string.punctuation)
      for tag in tag_list if tag != ''
  }
  # Only presence in the rewrite matters, so a set avoids rescanning the
  # rewritten token list for every original token.
  rewritten_vocab = set(rewritten_tokens)

  TP = TN = FP = FN = 0
  for token in original_tokens:
    still_present = token in rewritten_vocab
    if token in normalized_tags:
      if still_present:
        FN += 1
      else:
        TP += 1
    elif still_present:
      TN += 1
    else:
      FP += 1

  def _ratio(numerator, denominator):
    # Undefined ratios (zero denominator) are reported as 0.0.
    return numerator / denominator if denominator else 0.0

  precision = _ratio(TP, TP + FP)
  recall = _ratio(TP, TP + FN)
  accuracy = _ratio(TP + TN, TP + TN + FP + FN)
  f1 = _ratio(2 * (precision * recall), precision + recall)

  return {"TP": TP, "TN": TN, "FP": FP, "FN": FN, "accuracy": accuracy, "precision": precision, "recall": recall, "F1-score": f1}

In [None]:
import json

# Every entity inserted during data preparation is a tag to be de-identified
tag_list = [*aircraft_registration_code, *code_xxx_list, *code_zzz_list]

# Per-row metrics, keyed "sentence<row index>"
result_dict = {}
for index, row in result_df.iterrows():
    orig_text = result_df.at[index, "total_report_un-deidentified"]
    num_tokens = num_tokens_from_string(orig_text, "gpt2")
    rewritten_text = result_df.at[index, "format_text"]

    # Header line: 1-based row number and the token count of the original text
    print(f"-----------No.{index + 1}({num_tokens})-----------\n")

    # Normalize both texts, then score the rewrite against the original
    rewritten_tokens, original_tokens = tokenize_text_generation(rewritten_text, orig_text)
    result = evaluation_metrics(original_tokens, tag_list, rewritten_tokens)
    print(result)

    result_dict[f"sentence{index}"] = result

# Persist all per-row results as a JSON file
with open("llama2.asrs.result.json", "w") as outfile:
    json.dump(result_dict, outfile, indent=4)

In [None]:
# show the results
# Pool the per-text confusion counts over all evaluated texts and compute the
# metrics from the pooled counts (micro-average).
TP_ls = [metrics["TP"] for metrics in result_dict.values()]
TN_ls = [metrics["TN"] for metrics in result_dict.values()]
FP_ls = [metrics["FP"] for metrics in result_dict.values()]
FN_ls = [metrics["FN"] for metrics in result_dict.values()]

total_TP = sum(TP_ls)
total_TN = sum(TN_ls)
total_FP = sum(FP_ls)
total_FN = sum(FN_ls)

# A ratio with a zero denominator is reported as 0.0, matching
# evaluation_metrics, instead of raising ZeroDivisionError (e.g. when
# result_dict is empty or no tag was ever removed).
total_count = total_TP + total_TN + total_FP + total_FN
total_accuracy = (total_TP + total_TN) / total_count if total_count else 0.0
total_precision = total_TP / (total_TP + total_FP) if (total_TP + total_FP) else 0.0
total_recall = total_TP / (total_TP + total_FN) if (total_TP + total_FN) else 0.0
total_f1 = (
    2 * (total_precision * total_recall) / (total_precision + total_recall)
    if (total_precision + total_recall) else 0.0
)

print(f"TP:{total_TP}, TN:{total_TN}, FP:{total_FP}, FN:{total_FN}")

print("The accuracy is: {}".format(total_accuracy))
print("The recall is: {}".format(total_recall))
print("The precision is: {}".format(total_precision))
print("The F-1 score is: {}".format(total_f1))

TP:66, TN:13378, FP:467, FN:25
The accuracy is: 0.9646957520091849
The recall is: 0.7252747252747253
The precision is: 0.12382739212007504
The F-1 score is: 0.2115384615384615
