In [2]:
# Load packages for data analysis
import pandas as pd
import numpy as np

# Load packages for Big Query 
from google.cloud import bigquery
import os

### Set-up

**Set-up: GCP interface** 

Run before querying to setup Big Query

In [3]:
# Define configurations for Big Query - Stride Datalake
project_id = 'som-nero-phi-boussard' # Location of stride datalake
db = "som-nero-phi-boussard.stride_datalake" # Define the database
msc = "som-nero-phi-boussard.MSc_ACU_Oncology"
es = "som-nero-phi-boussard.ES_ACU_Oncology"
client = bigquery.Client(project=project_id) # Set project to project_id
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/path/to/your/credentials.json'
os.environ['GCLOUD_PROJECT'] = "som-nero-phi-boussard" # specify environment
db = "som-nero-phi-boussard" # Define the database



In [4]:
# 1) Specify the job config to properly read the file
job_config = bigquery.LoadJobConfig()
job_config.autodetect = True # determines the datatype of the variable
job_config.write_disposition = 'WRITE_TRUNCATE'
job_config.max_bad_records = 1 # allow 5 bad records; 

## Vocabulary Development

### 1) Seed terms used to define DX_SHORT

In [5]:
diagnoses = {}
# Class_1 = Pain
diagnoses["class_1"] = ["PAIN","HEADACH","ALGIA","DYSURIA","DYNIA","ACHE","PRESSURE","DISCOMFORT","CRAMP", "TIGHTNESS","RADICULOPATHY","TENDERNES", "ABDOMINAL CRIS"]
# Class_2 = Nausea, Vomitting, Dehydration, diarrhea
diagnoses["class_2"] =  ["DEHY","HYPERNAT", "DIAR", "NAUS", "EMES","VOMIT","GASTROENTER","GASTRIT"]
# Class_3 = Anemia 
diagnoses["class_3"] = ["ANEM","RED CELL"]
# Class_4 = Sepsis, nesutropenia, fever, pneuomia 
diagnoses["class_4"] =  ["FEVER","FEBRILE","HYPERTHERM","PYREX", "NEUT","PANCY","AGRAN", "SEPS","INFLAM","CANDIDEMIA","SEPTIC", "PNEUMO","CONSOLID"]

In [6]:
df = pd.DataFrame(columns=["NAME", "DIAGNOSE"])
for k,v in diagnoses.items():
    dict_words = df[df["DIAGNOSE"] == k].NAME.values.tolist()
    new_words = [*set([w.lower() for w in v])]
    new_words = list(set(new_words) - set(dict_words))
    df2 = pd.DataFrame.from_dict({"NAME": pd.Series(new_words), "DIAGNOSE": k})
    df = pd.concat([df, df2], axis=0)
df["TYPE"] = "seed"
df.shape

(36, 3)

In [7]:
len(df[df["DIAGNOSE"] == "class_1"]), len(df[df["DIAGNOSE"] == "class_2"]), len(df[df["DIAGNOSE"] == "class_3"]), len(df[df["DIAGNOSE"] == "class_4"])

(13, 8, 2, 13)

### 2) Clinician suggestions

In [8]:
diagnoses = {}
# Class 1
diagnoses["class_1"] = ["pain", "painful", "noxious", "excruciating", "agonizing", "pleurisy", "neuropathy",
                    "dyspareunia", "low back pain", "hyperasthesia", "migraine", "throbbing", "aching", "pounding", "splitting", "knife-like", "stabbing",
                    "sharp", "burning", "dull", "shooting", "hurts", "localize", "tender", "radiating", "radiates", "radiat"]
# radiat for radiating and radiates 

# Class 2
diagnoses["class_2"] = ["hyponatremic", "hyponatremia", "enteritis", "bowel movement", "stool", "stooling", "vomit", "vomitted", "vomitus", "nauseated", "puke", "puking", "puked", 
                       "loose", "thirsty", "thirst", "dry lips", "cracked lips", "non-bloody, non-bilious", "gastro", "gastroeneritis", "bloody", "watery", "loose", "mucousy", "tenesmus", "loose stools", "light headed", "passsed out", "loss of consciousness", "lose consciousness"]

# Class 3: 

diagnoses["class_3"] = ['red blood cell', 'iron deficiency', 'macrocytic', 'microcytic', 'normocytic', 'myelodysplastic', 'marrow', 'light headed', 'weak', 'fatigue', 'get out of bed', 'pale', 'dyspnea']

# Class 4: for all diagnoses in the class
diagnoses["class_4"] = ['low count', 'cough', 'fever', 'dyspnea', 'shortness of breath', 'productive cough', 'chest X-ray', 'infiltrate', 'consolidation', 'streaking', 'hypoxia', 'oxygen', 'hypoxemia', 'low-grade', 'high-grade', 'oral', 'rectal', 'core', 'thermometer', 'elevated', 'body temperature', 'pyrexia', 'pyrexia of unknown origin', 'fever of unknown origin', 'systemic inflammatory response', 
                      'quick systematic organ failure assessment', 'systemic organ failure assessment', 'antibiotics', 'febrile', 'rigor', 'chills', 'shaking', 'septic', 'shock', 'vasopressors', 'pressors', 'systolic blood pressure', 'blood pressure', 'mean arterial pressure', 'presumed source', 'urosepsis', 'lower respiratory tract infection', 'white blood cell','low white count', 'neupogen', 'neulasta'] 



In [9]:
for k,v in diagnoses.items():
    dict_words = df[df["DIAGNOSE"] == k].NAME.values.tolist()
    new_words = [*set([w.lower() for w in v])]
    new_words = list(set(new_words) - set(dict_words))
    df2 = pd.DataFrame.from_dict({"NAME": pd.Series(new_words), "DIAGNOSE": k})
    df = pd.concat([df, df2], axis=0)
df.shape

(147, 3)

In [10]:
df.fillna({"TYPE": "clinician"}, inplace=True)

In [11]:
len(df[df["DIAGNOSE"] == "class_1"]), len(df[df["DIAGNOSE"] == "class_2"]), len(df[df["DIAGNOSE"] == "class_3"]), len(df[df["DIAGNOSE"] == "class_4"])

(39, 37, 15, 56)

### 3) Add SNOMED associated terms from UMLS browser

In [12]:
diagnoses = {}
# UMLS terms for class_1: 
diagnoses["class_1"] = pd.read_csv(f"../vocabulary/pain.csv")["NAME"].str.upper()
# UMLS terms for class_2: 
class_2 = []
class_2.extend(pd.read_csv(f"../vocabulary/nausea.csv")["NAME"].str.upper())
class_2.extend(pd.read_csv(f"../vocabulary/dehydration.csv")["NAME"].str.upper())
class_2.extend(pd.read_csv(f"../vocabulary/diarrhea.csv")["NAME"].str.upper())
class_2.extend(pd.read_csv(f"../vocabulary/emesis.csv")["NAME"].str.upper())
class_2.extend(pd.read_csv(f"../vocabulary/vomiting.csv")["NAME"].str.upper())
diagnoses["class_2"] = class_2
# UMLS terms for class 3: 
diagnoses["class_3"] = pd.read_csv(f"../vocabulary/anemia.csv")["NAME"].str.upper()
# UMLS terms for class 4:
class_4 = []
class_4.extend(pd.read_csv(f"../vocabulary/neutropenia.csv")["NAME"].str.upper())
class_4.extend(pd.read_csv(f"../vocabulary/fever.csv")["NAME"].str.upper())
class_4.extend(pd.read_csv(f"../vocabulary/pneumonia.csv")["NAME"].str.upper())
class_4.extend(pd.read_csv(f"../vocabulary/sepsis.csv")["NAME"].str.upper())
diagnoses["class_4"] = class_4

In [13]:
for k,v in diagnoses.items():
    dict_words = df[df["DIAGNOSE"] == k].NAME.values.tolist()
    new_words = [*set([w.lower() for w in v])]
    new_words = list(set(new_words) - set(dict_words))
    df2 = pd.DataFrame.from_dict({"NAME": pd.Series(new_words), "DIAGNOSE": k})
    df = pd.concat([df, df2], axis=0)
df.shape

(3033, 3)

In [14]:
df.fillna({"TYPE": "snomedct_us"}, inplace=True)

In [15]:
len(df[df["DIAGNOSE"] == "class_1"]), len(df[df["DIAGNOSE"] == "class_2"]), len(df[df["DIAGNOSE"] == "class_3"]), len(df[df["DIAGNOSE"] == "class_4"])

(1084, 236, 437, 1276)

## 4) Add Abbreviation

How: from the notes and internet and from clinicians survey 

Ressources:

- https://madisonmemorial.org/wp-content/uploads/Abbreviation-List-for-Medical-Record-Documentation-V20.pdf

- https://en.wikipedia.org/wiki/List_of_medical_abbreviations:_P

- https://reader.elsevier.com/reader/sd/pii/S1386505611002383?token=8417991568AECEDEC869D00482E710CF5EAF5B89D5B35C65560A1EAFA86F79ED086E60256632DCFE530401F31F933EC3&originRegion=us-east-1&originCreation=20230302215156 (appendix B)

Checker si l'espace devant and apres bonne idee -> car pour moi possible quil y ait que un espace avant or apres. Regarder comment checker les notes better que avec is in (faire comme oualid avec le regexp)

In [16]:
diagnoses = {}

In [17]:
# Abbreviations for class 1:
diagnoses["class_1"] = [' HA ', 'H/A', 'D.O.E.', ' CP ', ' LBP ']

# HA = headache
# N/T: numbness and tingling 
# LBP: low back pain

In [18]:
# Abbreviation for class 2:
diagnoses["class_2"] = [' GI ', ' CINV ','N/V', 'N/V/D' ' NBNB ', ' GERD ', ' BM ', ' NVD ', ' NVDC ', 'N&V', 'N&V&D',  ' ABD ', ' PONV ', ' RINV ', ' NV ', ' IBS ', 'NB/NB', ' NB ', ' AGE ', ' CID', ' CRD ', ' AAD ', ' WBM ', 'D&D', 'D&V']

# NVD = nausea, vomiting, diarrhea
# MVDC = nausea, vomiting, diarrhea, constipation
# N&V = nausea and vomiting 
# AAD = antibiotic-associate diarrhea
# A verifier mais pour moi les biobert doivent comprendre ces abbreviations
# NB/NB (non-bloody, non-bilious)
# NB 
# AGE: Acute gastroenteritis 



In [19]:
# Abbreviation for class 3:
diagnoses["class_3"] = [' RBC ', ' CBC',' EBL ', ' HBP ', ' FE ', ' AIHA ', ' SHOB ', ' ABLA ', ' AHA ', 'get OOB']

# AIHA: autoimmune hemolytic anemia
# SS: hemoglobin 
# ABLA: Acute Blood Loss Anemia

In [20]:
# Abbreviation for class 4:
diagnoses["class_4"] = [' FUO ', 'F/C', 'F&C', ' ANC ', ' AIN ', ' FN ', ' CIN ','Nφ', ' NE ', ' WBC ', ' CBC', ' CXR ', ' FUO ', ' PUO ', ' qSOFA ', ' BP ', ' SOFA ', ' SIRS ', ' SBP ', ' MAP ', ' LRTI ', ' INF ', ' SHOB ', ' PNA ', ' PCP ', ' PSI ']

# ANC: A measure of the number of neutrophils in the blood
# CN = cyclic neutropenia
# AIN : Alloimmune neutroenia 
# FN febrile neutropenia 
# CIN Chemotherapy induced neutropenia 
# PCP : Pneumocystis pneumonia
# PNA = Pneumonia
# PSI = Pneumonia severity index
# Nφ = neutrophil
# NE = neutropil granulocytes
# SHOB = shortness of breath 

In [21]:
for k,v in diagnoses.items():
    dict_words = df[df["DIAGNOSE"] == k].NAME.values.tolist()
    new_words = [*set([w.lower() for w in v])]
    new_words = list(set(new_words) - set(dict_words))
    df2 = pd.DataFrame.from_dict({"NAME": pd.Series(new_words), "DIAGNOSE": k})
    df = pd.concat([df, df2], axis=0)
df.shape

(3097, 3)

In [22]:
df.fillna({"TYPE": "abbreviation"}, inplace=True)

In [23]:
len(df[df["DIAGNOSE"] == "class_1"]), len(df[df["DIAGNOSE"] == "class_2"]), len(df[df["DIAGNOSE"] == "class_3"]), len(df[df["DIAGNOSE"] == "class_4"])

(1089, 260, 447, 1301)

### 5) Extend vocabulary with synonym, symptoms, definitions

**a) Use synonyms online tools**

In [24]:
diagnoses = {}

In [25]:
# Add synonyms for class 1
diagnoses["class_1"] = ["HURT", "SUFFER", "DIFFIC", "TENS", "SPASM", "ANNOY", "BOTHER", "IRRIT", "SORE", "UNCOMFORT", "DISTRESS", "RAWNESS", "CRISI", "BAD", "WOUND", "HEAV", "SWOLLEN", "NUMB", "TINGL", "ACH", "ANGUISH", "AGONY", "STINGING", "BURN", "INJUR" "SENSITIVITY", "INFLAMMATION", "SHARP", "LANCINATING"]

In [26]:
# Add synonyms for class 2
diagnoses["class_2"] = ["QUEASINESS", "DISGUST", "DIZZ", "THROAT", "LIGHT-HEADED", "LIGHTHEADED", "GIDDY", "SICK", "VISCERAL", "VERTIGO", "REGURGITAT", "REFLUX", "ACID REFLUX",  "PUK", "BARF", "THROW", "THREW", "DIGEST",  "DISGORGE", "UPSET STOMACH", "STOMACH", "DYSPEPSIA", "RETCHING", "STOMACH ACHE", "THROWING UP", "QUEASY", "HEART BURN", "MOTION SICKNESS", "DRY HEAVE", "SICK TO MY STOMACH","DESSICC", "DRY", "DRI", "THIRST", "EXHAUST", "WATER LOSS", "REDUCED FLUID INTAKE", "LACK OF HYDRATION", "DEPLETION OF FLUIDS", "EXCESSIVE SWEATING", "LOSS OF ELECTROLYTES", "ELECTROLYTE","BURN", "LOSSE", "DYSENTERY", "BLOAT", "RECTUM", "ABDOM"]


In [27]:
# Add synonyms for class 3
diagnoses["class_3"] = ["SKIN", "YELLOW", "IRON-DEFICIENCY", "HYPOTHYROID", "JAUNDICE", "FATIGUE",  "LISTLESS ", "WEARINESS", "WEARY", "SLEEPY", "EXHAUST", 
               "WORN OUT", "LETHARG", "NO ENERGY", "TIRED", "IRON", "RED BLOOD CELL", 
               "HEMOGLOBIN", "HEMATOCRIT", "BLOOD", "DEFICIEN",  "PALLOR", "PALE SKIN", "PALPIT", "OXYGEN", 
               "SHORTNESS OF BREATH",  "COLD", "HEARTBEAT", 
               "RAPID HEART RATE", "DROWS", "SLEEPINESS", "WEAK", "DIZZ", "LIGHT-HEADED", "LIGHTHEADED"] 

**What lab values indicate anemia?**
The diagnostic criterion for anemia is

For men: Hemoglobin < 14 g/dL (140 g/L), hematocrit < 42% (< 0.42), or RBC < 4.5 million/mcL (< 4.5 × 10 12/L)

For women: Hemoglobin < 12 g/dL (120 g/l), hematocrit < 37% (< 0.37), or RBC < 4 million/mcL (< 4 × 10 12/L)

Source: https://www.merckmanuals.com/professional/hematology-and-oncology/approach-to-the-patient-with-anemia/evaluation-of-anemia

In [28]:
# Add synonyms for class 4 
diagnoses["class_4"] = ["FEBRI", "SWEAT", "CHILL", "SHIVER", "SHUDDER", "TREMBL",  "WASHED OUT", "ELEVATED BODY TEMPERATURE", "HIGH TEMPERATURE", "FLUSHING", "HOT FLASHES", "FLUSHED FACE", "GLASSY EYES","LEUKOPENIA", "LEUKOCYTE", "WHITE BLOOD CELL", "GRANULOCYTO", "WEAK", "FATIGUE", "BACTERIA", "FAINT",  "CONFUS",  "DISORIENT", "TOXIN", "TISSUE", "VIRUS", "INFECT","RESPIRAT", "VIRAL", "COUGH", "ENERGY",  "SHAKING CHILLS", "PULMONARY INFECTION", "DIFFICULTY BREATHING", "CHEST INFECTION", "RESPIRATORY INFECTION"]

In [29]:
for k,v in diagnoses.items():
    dict_words = df[df["DIAGNOSE"] == k].NAME.values.tolist()
    new_words = [*set([w.lower() for w in v])]
    new_words = list(set(new_words) - set(dict_words))
    df2 = pd.DataFrame.from_dict({"NAME": pd.Series(new_words), "DIAGNOSE": k})
    df = pd.concat([df, df2], axis=0)
df.shape

(3236, 3)

In [30]:
df.shape

(3236, 3)

**b) Add definition from UMLS**

In [31]:
diagnoses = {}

In [32]:
# Add definition for class 1
diagnoses["class_1"] = ["DAMAGE", "DISTRESS", "AGONY", "MISERY", "PRICK", "STING"]

In [33]:
# Add definition for class 2
diagnoses["class_2"] = ["URGE TO VOMIT", "UPPER ABDOMINAL DISCOMFORT", "THROAT", "FORCEFUL EJECTION", "EJECTION", "SPASM", "CONTRACIONS", "SPASMIC CONTRACTIONS", "EXPULSION", "FLUID", "FEELING VERY THIRSTY", "DRY MOUTH", "SWEAT", "DRY SKIN", "FEELING TIREDD", "DIZZINESS", "WATER LOSS", "WATERY BOWEL MOVEMENTS", "BOWEL", "BOWEL MOVEMENTS", "LOOSE", "WATERY", "LIQUID", "STOOL","CRAMP"]

In [34]:
# Add definition for class 3 
diagnoses["class_3"] = ["ERYTHROCYTES", "PALLOR OF THE SKIN", "MUCOUS MEMBRANES", "MUCOUS", "SHORTNESS OF BREATH", "SOB", "PALPILTATIONS", "LETHARGY", "SOFT SYSTOLIC MURMURS", "FATIGABILITY"]

In [35]:
# Add definition for class 4
diagnoses["class_4"] = ["BODY TEMPERATURE","SHORTNESS OF BREATH", "SOB", "CONFUSION", "DISORIENTATION", "RAPID BREATHING", "HEART RATE", "CLAMMY", "SWEATY", "SKIN", "LUNG", "PARENCHYMA", "INFLAMMATION", "CHILLS", "PHLEGM"]

In [36]:
for k,v in diagnoses.items():
    dict_words = df[df["DIAGNOSE"] == k].NAME.values.tolist()
    new_words = [*set([w.lower() for w in v])]
    new_words = list(set(new_words) - set(dict_words))
    df2 = pd.DataFrame.from_dict({"NAME": pd.Series(new_words), "DIAGNOSE": k})
    df = pd.concat([df, df2], axis=0)
df.shape

(3281, 3)

**c) Use existing dictionary from existing vocabularies from similar studies:**

Ressources: 
- https://github.com/KHP-Informatics/ADRApp/tree/master/application-resources/ADR

It has vocabulary for diarrhoa; nausea; vomiting; pneumonia and fever. There is also specific vocabularies like fatigue; headaches; muscle pain; numbness; stomach pain; adbominal pain;  but I didn't use them cause they are too precised in my opinion. A lot of the words are already captured by the seed terms and included by a) and b) 


In [37]:
diagnoses = {}

In [38]:
### Add words from ADRApp
# Class 2
diagnoses["class_2"] = ["FREQUENT STOOL"] 
# Class 4
diagnoses["class_4"] = ["FEELING HOT", "HIGH TEMPERATURE", "INCREASED TEMPERATURE", "RAISED BODY TEMPERATURE"]


In [39]:
for k,v in diagnoses.items():
    dict_words = df[df["DIAGNOSE"] == k].NAME.values.tolist()
    new_words = [*set([w.lower() for w in v])]
    new_words = list(set(new_words) - set(dict_words))
    df2 = pd.DataFrame.from_dict({"NAME": pd.Series(new_words), "DIAGNOSE": k})
    df = pd.concat([df, df2], axis=0)
df.shape

(3285, 3)

In [40]:
df.fillna({"TYPE": "additional"}, inplace=True)

In [41]:
len(df[df["DIAGNOSE"] == "class_1"]), len(df[df["DIAGNOSE"] == "class_2"]), len(df[df["DIAGNOSE"] == "class_3"]), len(df[df["DIAGNOSE"] == "class_4"])

(1120, 327, 488, 1350)

In [42]:
df.to_csv("../vocabulary/exhaustive_vocab.csv", index=False)

In [43]:
df

Unnamed: 0,NAME,DIAGNOSE,TYPE
0,pain,class_1,seed
1,ache,class_1,seed
2,discomfort,class_1,seed
3,radiculopathy,class_1,seed
4,tightness,class_1,seed
...,...,...,...
11,sweaty,class_4,additional
0,frequent stool,class_2,additional
0,feeling hot,class_4,additional
1,increased temperature,class_4,additional


## Vocabulary Filtering

- Load exhaustive vocabulary

In [44]:
exhaustive_vocab = pd.read_csv("../vocabulary/exhaustive_vocab.csv")

In [45]:
exhaustive_vocab

Unnamed: 0,NAME,DIAGNOSE,TYPE
0,pain,class_1,seed
1,ache,class_1,seed
2,discomfort,class_1,seed
3,radiculopathy,class_1,seed
4,tightness,class_1,seed
...,...,...,...
3280,sweaty,class_4,additional
3281,frequent stool,class_2,additional
3282,feeling hot,class_4,additional
3283,increased temperature,class_4,additional


In [46]:
exhaustive_vocab.DIAGNOSE.value_counts()

class_4    1350
class_1    1120
class_3     488
class_2     327
Name: DIAGNOSE, dtype: int64

- Load training notes

In [47]:
# Load relevant note table
sql_query = f""" SELECT * FROM {es}.note_by_type"""
note_table = (client.query(sql_query)).to_dataframe()

In [48]:
# Get training ids 
train_ids = pd.read_csv('../data_preprocessed/TrainTest/train/ids.csv').PAT_DEID.values
train_ids

array([1068036, 3278509, 1426036, ..., 2423073, 1465165,  542216])

In [49]:
# Get notes from training patients only
note_table = note_table[note_table["PAT_DEID"].isin(train_ids)]

In [50]:
print(f"Number of training patients = {len(pd.unique(note_table['PAT_DEID']))}")

Number of training patients = 1647


In [51]:
# Load outcome_table_1 
sql_query = f""" SELECT * FROM {es}.outcome_table_1"""
outcome_table_1 = (client.query(sql_query)).to_dataframe()

In [52]:
# Get labels of training patients only
outcome_table_1 = outcome_table_1[outcome_table_1["PAT_DEID"].isin(train_ids)]

In [53]:
print(f"Number of training patients = {len(pd.unique(note_table['PAT_DEID']))}")

Number of training patients = 1647


- Obtain text of patients with only one class

In [54]:
single_label_df = outcome_table_1[outcome_table_1["TOTAL"] == 1]
df_class_1 = single_label_df[single_label_df["CLASS_1"] == 1.0]
df_class_2 = single_label_df[single_label_df["CLASS_2"] == 1.0]
df_class_3 = single_label_df[single_label_df["CLASS_3"] == 1.0]
df_class_4 = single_label_df[single_label_df["CLASS_4"] == 1.0]

In [55]:
df_class_1.shape, df_class_2.shape, df_class_3.shape, df_class_4.shape

((281, 7), (88, 7), (75, 7), (175, 7))

In [56]:
note_class_1 = pd.merge(note_table, df_class_1, on ="PAT_DEID")
note_class_2 = pd.merge(note_table, df_class_2, on ="PAT_DEID")
note_class_3 = pd.merge(note_table, df_class_3, on ="PAT_DEID")
note_class_4 = pd.merge(note_table, df_class_4, on ="PAT_DEID")

In [57]:
note_class_1.shape, note_class_2.shape, note_class_3.shape, note_class_4.shape

((557, 12), (196, 12), (232, 12), (430, 12))

In [58]:
note_classes = [note_class_1, note_class_2, note_class_3, note_class_4]

### Remove terms absent in the training notes 

- Count occurence

In [59]:
total_vocab = pd.DataFrame()
for i, class_name in enumerate(["class_1", "class_2", "class_3", "class_4"]):
    note_class = note_classes[i]
    vocab = exhaustive_vocab[exhaustive_vocab["DIAGNOSE"] == class_name]
    names = vocab["NAME"]
    d = dict()
    for word in names: 
        d[word] = note_class['NOTE'].str.upper().apply(lambda x: any(y in x for y in [word.upper()])).sum()
    vocab["OCCURENCE"] = d.values()
    total_vocab = pd.concat([total_vocab, vocab[vocab["OCCURENCE"] != 0]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vocab["OCCURENCE"] = d.values()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vocab["OCCURENCE"] = d.values()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vocab["OCCURENCE"] = d.values()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [60]:
total_vocab.shape

(408, 4)

In [61]:
total_vocab = total_vocab.sort_values(["DIAGNOSE", "OCCURENCE"], ascending=False)
total_vocab

Unnamed: 0,NAME,DIAGNOSE,TYPE,OCCURENCE
27,fever,class_4,seed,342
3273,skin,class_4,additional,331
122,oral,class_4,clinician,313
3222,infect,class_4,additional,306
135,cough,class_4,clinician,292
...,...,...,...,...
1115,radicular pain,class_1,snomedct_us,1
1185,pain score,class_1,snomedct_us,1
3035,h/a,class_1,abbreviation,1
3109,hurt,class_1,additional,1


In [62]:
## Save vocabulary 
# Specify destination for storing dataframe
destination = f"som-nero-phi-boussard.ES_ACU_Oncology.vocab_per_class_nested"

# Save file to Big Query
load_job = client.load_table_from_dataframe(dataframe = total_vocab,                                  
                                                   destination = destination,
                                                   job_config = job_config)

# Run the job:
load_job.result()

LoadJob<project=som-nero-phi-boussard, location=US, id=f4a5003b-1eb1-4b29-966d-d2938c89b2e7>

In [63]:
total_vocab.to_csv("../vocabulary/vocab_per_class_nested.csv", index=False)

### Remove nested terms

In [64]:
total_vocab = pd.read_csv("../vocabulary/vocab_per_class_nested.csv")

In [65]:
diagnoses = {
             "class_1": list(total_vocab[total_vocab["DIAGNOSE"] == "class_1"]["NAME"].values), 
             "class_2": list(total_vocab[total_vocab["DIAGNOSE"] == "class_2"]["NAME"].values), 
             "class_3": list(total_vocab[total_vocab["DIAGNOSE"] == "class_3"]["NAME"].values), 
             "class_4": list(total_vocab[total_vocab["DIAGNOSE"] == "class_4"]["NAME"].values)
            }

In [66]:
len(diagnoses["class_1"]), len(diagnoses["class_2"]), len(diagnoses["class_3"]), len(diagnoses["class_4"])

(158, 82, 48, 120)

In [67]:
## Get rid of redundant terms (i.e if term 1 contains term 2, there is no need to consider term 1)

for key, terms in diagnoses.items():
    terms = [str(x).lower() for x in terms if str(x).lower() != 'nan']
    terms = list(set(terms))

    still_included = True
    while still_included:
        still_included = False
        for y in terms:
            temp = [x for x in terms if y not in x]
            if len(temp) != len(terms) - 1:
                still_included = True
                terms = [y] + temp
                break

    ## Sanity check
    mat = np.zeros((len(terms), len(terms)))
    for i in range(len(terms)):
        for j in range(len(terms)):
            if terms[i] in terms[j]:
                mat[i,j] = 1
    assert(mat.sum() == mat.shape[0])
    
    # Update dict
    diagnoses[key] = terms

In [68]:
len(diagnoses["class_1"]), len(diagnoses["class_2"]), len(diagnoses["class_3"]), len(diagnoses["class_4"])

(52, 56, 35, 84)

In [69]:
vocab_per_class = pd.DataFrame()
for key, terms in diagnoses.items(): 
    vocab_per_class = pd.concat([vocab_per_class, pd.DataFrame({"NAME": terms, "DIAGNOSE": key})])

In [70]:
# Size of the vocabulary
vocab_per_class.shape[0]

227

In [71]:
vocab_per_class = pd.merge(vocab_per_class, total_vocab, how="left", on=["NAME", "DIAGNOSE"])
vocab_per_class = vocab_per_class.sort_values(["DIAGNOSE", "OCCURENCE"], ascending=False)
vocab_per_class

Unnamed: 0,NAME,DIAGNOSE,TYPE,OCCURENCE
143,fever,class_4,seed,342
201,skin,class_4,additional,331
180,oral,class_4,clinician,313
157,infect,class_4,additional,306
145,cough,class_4,clinician,292
...,...,...,...,...
16,pounding,class_1,clinician,1
17,hurt,class_1,additional,1
26,h/a,class_1,abbreviation,1
31,excruciating,class_1,clinician,1


In [72]:
vocab_per_class[vocab_per_class["DIAGNOSE"] == "class_3"].shape[0]

35

### Incorporate Clinician feedback

- Use results of the survey: 2 clinicians review the list and a word is said to be relevant if at least one of the clinician say it was relevant


In [73]:
#vocab_per_class[vocab_per_class["DIAGNOSE"] == "class_1"]

# throat, burn for class 2
# skin, yellow, hypothyroid, jaundice for class 3
# skin for class 4

In [74]:
vocab_class_1 = vocab_per_class[vocab_per_class["DIAGNOSE"] == "class_1"]
vocab_class_2 = vocab_per_class[vocab_per_class["DIAGNOSE"] == "class_2"]
vocab_class_3 = vocab_per_class[vocab_per_class["DIAGNOSE"] == "class_3"]
vocab_class_4 = vocab_per_class[vocab_per_class["DIAGNOSE"] == "class_4"]
vocab_class_1.shape, vocab_class_2.shape, vocab_class_3.shape, vocab_class_4.shape

((52, 4), (56, 4), (35, 4), (84, 4))

In [75]:
vocab_class_4.drop(vocab_class_4.index[(vocab_class_4["NAME"].isin(["skin"]))], axis=0, inplace=True)
vocab_class_3.drop(vocab_class_3.index[(vocab_class_3["NAME"].isin(["skin", "yellow", "hypothyroid", "jaundice"]))], axis=0, inplace=True)
vocab_class_2.drop(vocab_class_2.index[(vocab_class_2["NAME"].isin(["throat", "burn"]))], axis=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vocab_class_4.drop(vocab_class_4.index[(vocab_class_4["NAME"].isin(["skin"]))], axis=0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vocab_class_3.drop(vocab_class_3.index[(vocab_class_3["NAME"].isin(["skin", "yellow", "hypothyroid", "jaundice"]))], axis=0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vocab_class_2.drop(vocab_class_2.index[(vocab_class_2["NAME"].isin(["throat", "burn"]))], axis=0, inplace=True)


In [76]:
vocab_class_1.shape, vocab_class_2.shape, vocab_class_3.shape, vocab_class_4.shape

((52, 4), (54, 4), (31, 4), (83, 4))

In [77]:
vocab_per_class = pd.concat([vocab_class_1, vocab_class_2, vocab_class_3, vocab_class_4], axis=0)

In [78]:
vocab_per_class 

Unnamed: 0,NAME,DIAGNOSE,TYPE,OCCURENCE
6,pain,class_1,seed,512
4,ach,class_1,additional,461
12,tens,class_1,additional,374
2,tender,class_1,clinician,369
49,numb,class_1,additional,353
...,...,...,...,...
174,acute bronchiolitis,class_4,snomedct_us,1
182,low white count,class_4,clinician,1
183,psi,class_4,abbreviation,1
204,ne,class_4,abbreviation,1


### Remove duplicate across classes

In [79]:
duplicates = vocab_per_class.duplicated(["NAME"])
vocab_per_class[duplicates]

Unnamed: 0,NAME,DIAGNOSE,TYPE,OCCURENCE
72,cramp,class_2,additional,6
92,spasm,class_2,additional,5
123,dizz,class_3,additional,96
131,lightheaded,class_3,additional,75
130,light-headed,class_3,additional,2
119,light headed,class_3,clinician,1
202,cbc,class_4,abbreviation,220
211,shortness of breath,class_4,clinician,196
221,sob,class_4,additional,141
189,weak,class_4,additional,131


In [80]:
# check in which class this word appears
vocab_per_class[vocab_per_class["NAME"] == "cramp"]

Unnamed: 0,NAME,DIAGNOSE,TYPE,OCCURENCE
20,cramp,class_1,seed,21
72,cramp,class_2,additional,6


- For duplicated words, calculate the frequency of this word in each class text and assign it to the class for which is has the highest frequency

In [81]:
relevant_note = note_class_1['NOTE'].str.upper().apply(lambda x: any(y in x for y in ["SPASM"]))
relevant_note.sum() / note_class_1.shape[0]

0.07540394973070018

In [82]:
relevant_note = note_class_2['NOTE'].str.upper().apply(lambda x: any(y in x for y in ["SWEAT"]))
relevant_note.sum() / note_class_2.shape[0]

0.14285714285714285

In [83]:
relevant_note = note_class_3['NOTE'].str.upper().apply(lambda x: any(y in x for y in ["FATIGUE"]))
relevant_note.sum() / note_class_3.shape[0]

0.28448275862068967

In [84]:
relevant_note = note_class_4['NOTE'].str.upper().apply(lambda x: any(y in x for y in ["FATIGUE"]))
relevant_note.sum() / note_class_4.shape[0]

0.24651162790697675

In [85]:
vocab_class_4.drop(vocab_class_4.index[(vocab_class_4["NAME"].isin(["weak", "fatigue", "sweat", ' cbc']))], axis=0, inplace=True)
vocab_class_3.drop(vocab_class_3.index[(vocab_class_3["NAME"].isin(["light-headed", "light headed", "shortness of breath", "sob", "dyspnea", "oxygen"]))], axis=0, inplace=True)
vocab_class_2.drop(vocab_class_2.index[(vocab_class_2["NAME"].isin(["dizz", "cramp", "spasm", "lightheaded"]))], axis=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vocab_class_4.drop(vocab_class_4.index[(vocab_class_4["NAME"].isin(["weak", "fatigue", "sweat", ' cbc']))], axis=0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vocab_class_3.drop(vocab_class_3.index[(vocab_class_3["NAME"].isin(["light-headed", "light headed", "shortness of breath", "sob", "dyspnea", "oxygen"]))], axis=0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vocab_class_2.drop(vocab_class_2.index[(vocab_class_2["NAME"].isin(["

In [86]:
vocab_class_1.shape, vocab_class_2.shape, vocab_class_3.shape, vocab_class_4.shape

((52, 4), (50, 4), (25, 4), (79, 4))

In [87]:
vocab_per_class = pd.concat([vocab_class_1, vocab_class_2, vocab_class_3, vocab_class_4], axis=0)

In [88]:
vocab_per_class[vocab_per_class["DIAGNOSE"] == "class_4"][:30]

Unnamed: 0,NAME,DIAGNOSE,TYPE,OCCURENCE
143,fever,class_4,seed,342
180,oral,class_4,clinician,313
157,infect,class_4,additional,306
145,cough,class_4,clinician,292
203,wbc,class_4,abbreviation,261
222,lung,class_4,additional,252
148,neut,class_4,seed,246
170,respirat,class_4,additional,243
147,chill,class_4,additional,235
226,cxr,class_4,abbreviation,199


- Save final vocabulary

In [89]:
## Save vocabulary 
# Specify destination for storing dataframe
destination = f"som-nero-phi-boussard.ES_ACU_Oncology.vocab_per_class"

# Save file to Big Query
load_job = client.load_table_from_dataframe(dataframe = vocab_per_class,                                  
                                                   destination = destination,
                                                   job_config = job_config)

# Run the job:
load_job.result()

LoadJob<project=som-nero-phi-boussard, location=US, id=4149f267-1d60-4d16-b3d0-3b2dab61b3cf>

In [90]:
vocab_per_class.to_csv("../vocabulary/vocab_per_class.csv", index=False)

In [91]:
vocab_per_class = pd.read_csv("../vocabulary/vocab_per_class.csv")

In [92]:
vocab_per_class[vocab_per_class["DIAGNOSE"] == "class_4"][50:80]

Unnamed: 0,NAME,DIAGNOSE,TYPE,OCCURENCE
177,low-grade,class_4,clinician,13
178,faint,class_4,additional,10
179,leukocyte,class_4,additional,10
180,flushing,class_4,additional,9
181,pressors,class_4,clinician,8
182,difficulty breathing,class_4,additional,8
183,hypoxemia,class_4,clinician,7
184,energy,class_4,additional,7
185,map,class_4,abbreviation,6
186,phlegm,class_4,additional,5


**Extract Stats** 

In [93]:
print(f'Size of vocabulary for class_1: {vocab_per_class[vocab_per_class["DIAGNOSE"] == "class_1"].shape[0]}')
print(f'Size of vocabulary for class_2: {vocab_per_class[vocab_per_class["DIAGNOSE"] == "class_2"].shape[0]}')
print(f'Size of vocabulary for class_3: {vocab_per_class[vocab_per_class["DIAGNOSE"] == "class_3"].shape[0]}')
print(f'Size of vocabulary for class_4: {vocab_per_class[vocab_per_class["DIAGNOSE"] == "class_4"].shape[0]}')

Size of vocabulary for class_1: 52
Size of vocabulary for class_2: 50
Size of vocabulary for class_3: 25
Size of vocabulary for class_4: 79


In [94]:
vocab_per_class.TYPE.value_counts()

additional      86
clinician       60
abbreviation    28
seed            27
snomedct_us      5
Name: TYPE, dtype: int64

In [95]:
# Number of words per type
print(f'Number of seed words: {vocab_per_class[vocab_per_class["TYPE"] == "seed"].shape[0]}')
print(f'Number of clinicians terms: {vocab_per_class[vocab_per_class["TYPE"] == "clinician"].shape[0]}')
print(f'Number of SNOMED associated terms: {vocab_per_class[vocab_per_class["TYPE"] == "snomedct_us"].shape[0]}')
print(f'Number of abbreviations: {vocab_per_class[vocab_per_class["TYPE"] == "abbreviation"].shape[0]}')
print(f'Number of additional terms: {vocab_per_class[vocab_per_class["TYPE"] == "additional"].shape[0]}')

Number of seed words: 27
Number of clinicians terms: 60
Number of SNOMED associated terms: 5
Number of abbreviations: 28
Number of additional terms: 86
