# Characterization of Parkinson's Disease through clustering of Medical notes

## Introduction

### Importing packages

In [73]:
import numpy as np
import pandas as pd
import json
import umap
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.tokenize import RegexpTokenizer
from gap_statistic import OptimalK

## Data Pre-processing

### Exploring the data

In [34]:
path = "data/HEHE.csv"
available_memory = 423464092  # Memory in bytes
memory_per_row = 100  # Example memory usage per row in bytes
target_memory_usage = 0.75 * available_memory
chunk_size = int(target_memory_usage / memory_per_row)
chunk_size

3175980

In [41]:
# chunksize = 10_000_000
chunk = pd.read_csv(path, chunksize=chunksize, dtype={'dose_unit_source_value': str}) #Raises a DtypeWarning: Columns (11) have mixed types when it is not specified
df = pd.concat(chunk)

In [178]:
df.sample(10)

Unnamed: 0,person_id,drug_source_value,quantity,sig,route_source_value,dose_source_value,dose_unit_source_value,note_text
909765,250605,"{""med_display_name"": ""amiodarone (CORDARONE) 9...",0.0,CONTINUOUS,INTRAVENOUS,0.0,mg/min,"[**NAME**], MD [**DATE**] 4:49 PM Please ..."
1519082,220757,"{""med_display_name"": ""LORazepam (ATIVAN) injec...",0.5,2 TIMES DAILY,INTRAMUSCULAR,1.0,mg,"[**NAME**], MD [**DATE**] 11:27 AM Departm..."
2292800,178928,"{""med_display_name"": ""0.9% NaCl flush 10 mL"", ...",10.0,PRN,INTRACATHETER,10.0,mL,[**NAME**] [**DATE**] 10:15 AM Orthotic Co...
1234335,241547,"{""med_display_name"": ""insulin regular (HumuLIN...",0.0066,CONTINUOUS,INTRAVENOUS,1.2,Units/hr,"""[**NAME**], MD [**DATE**] 6:46 PM Infect..."
1337669,226132,"{""med_display_name"": ""sodium phosphate 24 mmol...",195.1872,CONTINUOUS,INTRAVENOUS,2.496,mmol/hr,"""[**NAME**], MD [**DATE**] 7:42 AM Psychi..."
55640,258549,"{""med_display_name"": ""heparin 25,000 units in ...",39.101218,CONTINUOUS,INTRAVENOUS,24.0,Units/kg/hr,"[**NAME**], RN [**DATE**] 10:32 AM Consult..."
1282267,236824,"{""med_display_name"": ""busPIRone (BUSPAR) table...",1.5,2 TIMES DAILY,ORAL,7.5,mg,"[**NAME**], MD [**DATE**] 7:07 AM Vascula..."
2240309,187199,"{""med_display_name"": ""mupirocin (BACTROBAN) oi...",1.0,2 TIMES DAILY,NASAL,1.0,application,"""[**NAME**], PhD [**DATE**] 10:36 AM REHAB..."
649424,256708,"{""med_display_name"": ""hydrALAZINE (APRESOLINE)...",0.5,EVERY 6 HOURS PRN,INTRAVENOUS,10.0,mg,"[**NAME**], MD [**DATE**] 3:32 PM admitted"
1156899,246383,"{""med_display_name"": ""prednisoLONE (ORAPRED) o...",2.5,DAILY,ORAL,7.5,mg,"""[**NAME**], MD [**DATE**] 11:02 PM Depart..."


#### Number of patients in this cohort

In [40]:
df['person_id'].unique().shape

(441,)

#### Dealing with the drugs_table column

Because the drug_source_value table contains elements as json, we convert them into a dict so that we can use their original keys

In [183]:
drugs_table = df['drug_source_value'].apply(lambda x: json.loads(x))

Actual len of the whole dataset. It has more than 2 Million rows

In [184]:
len(drugs_table)

2326300

The keys of the dictionary are those used in the drug_source_value column

In [185]:
drugs_table[0].keys()

dict_keys(['med_display_name', 'rxnorm_concat', 'med_dose_unit_desc', 'mar_action', 'med_order_desc'])

Current medication used for the treatment of Parkinson's disease

In [186]:
pd_medication = ["carbidopa", "levodopa", "entacapone", "tolcapone", "opicapone", "pramipexole", "ropinirole", "apomorphine", "rotigotine", "selegiline", "rasagiline", "safinamide", "amantadine", "istradefylline", "trihexyphenidyl", "benztropine", "bromocriptine", "cabergoline", "pergolide", "lisuride"]
pd_drug_info = {}

We use the drugs_table names in pd_medication to look for it in the drugs_table array by using the key med_display_name. This key contains the drug name

In [187]:
for i, drug in enumerate(drugs_table):
    for item in pd_medication:
        if item in drugs_table[i]['med_display_name']:
            pd_drug_info[i] = drugs_table[i]['med_display_name'] # I could have also passed the whole dictionary

Only 16K+ rows have information regarding medication for Parkinson's

In [188]:
len(pd_drug_info)

16040

The keys of this new array are the indixes or rows in the original DataFrame

In [189]:
pd_drug_info.keys()

dict_keys([126, 127, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 2353, 2354, 2355, 2438, 2439, 2521, 2522, 2523, 2524, 2525, 2706, 2707, 2708, 2709, 2710, 2881, 2882, 2883, 2884, 2885, 2886, 2887, 2888, 2889, 2890, 2891, 2892, 2893, 2894, 2895, 2896, 2897, 2898, 2899, 2900, 2901, 2902, 2903, 2904, 2905, 2906, 2907, 2908, 2909, 2910, 3496, 3497, 3498, 3499, 3500, 3501, 350

pd_drug_info is a subset of drugs_table that contains the name of drugs related to PD

In [191]:
pd_drug_info[955]

'amantadine (SYMMETREL) capsule 100 mg'

In [192]:
drugs_table[955]

{'med_display_name': 'amantadine (SYMMETREL) capsule 100 mg',
 'rxnorm_concat': '| 849389 |',
 'med_dose_unit_desc': 'mg',
 'mar_action': 'GIVEN',
 'med_order_desc': 'AMANTADINE HCL 100 MG PO CAPS'}

Here's our new database

In [176]:
df.iloc[list(pd_drug_info.keys())]

Unnamed: 0,person_id,drug_source_value,quantity,sig,route_source_value,dose_source_value,dose_unit_source_value,note_text
126,261006,"{""med_display_name"": ""carbidopa-levodopa (SINE...",2.0,EVERY 4 HOURS,ORAL,2.0,tablet,"""[**NAME**], MD [**DATE**] 7:04 PM Depart..."
127,261006,"{""med_display_name"": ""carbidopa-levodopa (SINE...",2.0,EVERY 4 HOURS,ORAL,2.0,tablet,"""[**NAME**] [**DATE**] 9:51:05 AM Requ..."
955,260996,"{""med_display_name"": ""amantadine (SYMMETREL) c...",1.0,2 TIMES DAILY,ORAL,100.0,mg,"[**NAME**], ORTHOTIST-PR [**DATE**] 6:02 ..."
956,260996,"{""med_display_name"": ""amantadine (SYMMETREL) c...",1.0,EVERY OTHER DAY,ORAL,100.0,mg,"""[**NAME**], RD [**DATE**] 11:30 AM Nutrit..."
957,260996,"{""med_display_name"": ""amantadine (SYMMETREL) c...",1.0,2 TIMES DAILY,ORAL,100.0,mg,[**NAME**] [**DATE**] 7:56 PM Psychology ...
...,...,...,...,...,...,...,...,...
2324745,175605,"{""med_display_name"": ""carbidopa-levodopa (SINE...",0.0,3 TIMES DAILY,ORAL,0.0,tablet,"""[**NAME**], MD [**DATE**] 1:12 PM Geriat..."
2324746,175605,"{""med_display_name"": ""carbidopa-levodopa (SINE...",0.0,3 TIMES DAILY,ORAL,0.0,tablet,"""[**NAME**] [**DATE**] 11:30 PM Rehab Psyc..."
2324747,175605,"{""med_display_name"": ""carbidopa-levodopa (SINE...",0.0,3 TIMES DAILY,ORAL,0.0,tablet,"""[**NAME**], RD [**DATE**] 12:48 AM Nutrit..."
2324748,175605,"{""med_display_name"": ""carbidopa-levodopa (SINE...",0.0,3 TIMES DAILY,ORAL,0.0,tablet,"""[**NAME**], DO [**DATE**] 10:14 AM Neurol..."


### Creating the corpus from medical notes 

#### Using the note_text column for raw data

In [None]:
corpus_raw = list(df["note_text"])

#### Removing words that don't contain much meaning from our notes

In [None]:
words_to_remove = ["Department of Neurosurgery Date of Consult", "Department of Orthopedics Consultation Note Date of Consult", "Geriatric Medicine Consult Date of Consult", "INPATIENT MEDICAL NUTRITION THERAPY", "MSW", "RN" ,"evidence", "Read By", "images", "report", "concur", "findings", "agree", "seen", "residents", "resident", "Resident", "unspecified provider", "Released Date Time", "personally reviewed" ,"D.O", "MD", "M.D.", "Electronically Verified By", "NAME:", "[**NAME**]", "EXAM DATE:", "[**DATE**]", "LOC:", "[**LOCATION_INSTITUTE**]", "[**LOCATION_STREET**]", "[**LOCATION_ZIP**]", "[**LOCATION_CITY**]", "[**CONTACT_PHONE**]", "[**LOCATION_OTHER**]", "MRN:", "[**ID**]", "DOB:", "** VERIFIED **", "ORDERING MD:", "ORDER:", "ORD. SERVICE:", "ORD. LOC:", "TECH", "RMS# / INV#:"]
# words_to_remove = ["NAME:", "[**NAME**]", "EXAM DATE:", "[**DATE**]", "LOC:", "[**LOCATION_INSTITUTE**]", "[**LOCATION_STREET**]", "[**LOCATION_ZIP**]", "[**LOCATION_CITY**]", "[**CONTACT_PHONE**]", "[**LOCATION_OTHER**]", "MRN:", "[**ID**]", "DOB:", "** VERIFIED **", "ORDERING MD:", "ORDER:", "ORD. SERVICE:", "ORD. LOC:", "TECH", "RMS# / INV#:"]

# words_to_remove = words_to_remove + list(STOPWORDS)
corpus_clean = []
for item in corpus_raw:
    for word in words_to_remove:
        item = item.replace(word, '') 
    corpus_clean.append(item)

In [None]:
df['note_text'].iloc[989]

#### Taking each word from the cleaned corpus and making it lowercase

In [None]:
corpus = [word.lower() for word in corpus_clean]
corpus[989]

#### Adding the pre-processed version of the notes to the DataFrame

In [None]:
df["note_text"] = corpus
df