# Large Language Models can better predict optimal medication change in Parkinson's Disease through Medical notes

## Introduction

TODO

### Importing packages

In [1]:
import numpy as np
import pandas as pd
import json
import re
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as mpatches

## Data Pre-processing

### Exploring the data

In [2]:
# Size the read_csv chunks so that one chunk fits in roughly 75% of the memory budget.
path = "data/HEHE.csv"
available_memory = 423_464_092  # bytes
memory_per_row = 100  # bytes; an example estimate, not a measured value
target_memory_usage = available_memory * 0.75
chunk_size = int(target_memory_usage / memory_per_row)
chunk_size

3175980

In [3]:
# Read the large CSV in chunks of `chunk_size` rows, then stitch the chunks into one DataFrame.
# dose_unit_source_value is forced to str: when the dtype is not specified pandas raises
# "DtypeWarning: Columns (11) have mixed types".
chunk = pd.read_csv(
    path,
    chunksize=chunk_size,
    dtype={'dose_unit_source_value': str},
)
df = pd.concat(chunk)

In [4]:
# Drug-exposure extract, read in one go (no chunking).
drug_exposure = pd.read_csv('data/TRY.csv')

Loading demographic info

In [5]:
# NOTE(review): how this file differs from DEMOGRAPHICS_ALL is not visible here;
# presumably it covers only a subset of the patients — confirm.
demographics = pd.read_csv('data/DEMOGRAPHICS.csv')
# demographics_all contains age and gender info for all the patients
demographics_all = pd.read_csv('data/DEMOGRAPHICS_ALL.csv')

In [6]:
df.sample(5)

Unnamed: 0,person_id,drug_source_value,quantity,sig,route_source_value,dose_source_value,dose_unit_source_value,note_text
1872183,213065,"{""med_display_name"": ""traZODone (DESYREL) tabl...",1.5,NIGHTLY,ORAL,150.0,mg,"[**NAME**], MD [**DATE**] 9:00 PM Neurolo..."
1531620,220757,"{""med_display_name"": ""LR bolus 1,000 mL"", ""rxn...",0.0,BOLUS ONCE,INTRAVENOUS,0.0,mL,"[**NAME**], MD [**DATE**] 10:38 AM Body IR..."
1830933,213526,"{""med_display_name"": ""docusate (COLACE) oral l...",10.0,2 TIMES DAILY PRN,PER J TUBE,100.0,mg,"[**NAME**], RD [**DATE**] 10:01 AM Calorie..."
2058038,201787,"{""med_display_name"": ""amantadine (SYMMETREL) s...",0.0,2 TIMES DAILY,ORAL,0.0,mg,"[**NAME**], MD [**DATE**] 2:11 PM Departm..."
801596,254942,"{""med_display_name"": ""furosemide (LASIX) injec...",1.0,ONCE NOW,INTRAVENOUS,40.0,mg,"[**NAME**], RN [**DATE**] 12:43 PM Consult..."


Actual length of the whole dataset: it has more than 2 million rows.

In [7]:
len(df)

2326300

#### Number of patients in this cohort

In [8]:
df['person_id'].nunique()

441

#### Dealing with the drug_source_value column

Because the drug_source_value column contains elements as json, we convert them into a dict so that we can use their original keys:

In [9]:
# Parse every JSON string in the column into a dict; the result keeps df's index.
drug_source_value = df['drug_source_value'].apply(json.loads)

We do the same for the `drug_source_value` column in the drug_exposure table

In [10]:
# Parse every JSON string in drug_exposure's column into a dict; the result keeps its index.
dsv_drug_exposure = drug_exposure['drug_source_value'].apply(json.loads)

dsv_drug_exposure has already been filtered and contains Parkinson's drugs

In [11]:
dsv_drug_exposure

0        {'med_display_name': 'carbidopa-levodopa (SINE...
1        {'med_display_name': 'carbidopa-levodopa (SINE...
2        {'med_display_name': 'carbidopa-levodopa (SINE...
3        {'med_display_name': 'carbidopa-levodopa (SINE...
4        {'med_display_name': 'carbidopa-levodopa (SINE...
                               ...                        
26941    {'med_display_name': 'carbidopa-levodopa (SINE...
26942    {'med_display_name': 'carbidopa-levodopa (SINE...
26943    {'med_display_name': 'carbidopa-levodopa (SINE...
26944    {'med_display_name': 'carbidopa-levodopa (SINE...
26945    {'med_display_name': 'carbidopa-levodopa (SINE...
Name: drug_source_value, Length: 26946, dtype: object

drug_source_value has the same length as the original df. Each position in it corresponds to the same row of the original df.

In [12]:
# Spot check: the parsed entry 12 equals a fresh parse of the raw JSON in row position 12.
# The left side looks up by index label, the right side by position (.iloc); they agree
# only while df keeps its default 0..n-1 index — presumably the case here, confirm.
drug_source_value[12] == json.loads(df.iloc[12]['drug_source_value'])

True

The keys of the dictionary are those used in the drug_source_value column

In [13]:
drug_source_value[0].keys()

dict_keys(['med_display_name', 'rxnorm_concat', 'med_dose_unit_desc', 'mar_action', 'med_order_desc'])

In [14]:
drug_source_value[12]

{'med_display_name': '0.9 % NaCl infusion',
 'rxnorm_concat': '| 313002 |',
 'med_dose_unit_desc': 'ml/hr',
 'mar_action': 'NEW BAG',
 'med_order_desc': 'SODIUM CHLORIDE 0.9 % IV SOLN'}

Current medication used for the treatment of Parkinson's disease

In [15]:
# Generic names (all lowercase) of the drugs used to treat Parkinson's disease.
# They are matched as substrings of med_display_name further down.
pd_medication = [
    "carbidopa",
    "levodopa",
    "entacapone",
    "tolcapone",
    "opicapone",
    "pramipexole",
    "ropinirole",
    "apomorphine",
    "rotigotine",
    "selegiline",
    "rasagiline",
    "safinamide",
    "amantadine",
    "istradefylline",
    "trihexyphenidyl",
    "benztropine",
    "bromocriptine",
    "cabergoline",
    "pergolide",
    "lisuride",
]

We look for instances of the Parkinson's drugs contained in the pd_medication array in drug_source_value so that we can focus on the medical notes relating to Parkinson's.

In [16]:
# Map row position -> med_display_name for every record that mentions a Parkinson's drug.
# Each element of drug_source_value is a dict (the parsed JSON), hence the key lookup.
# NOTE(review): the substring test is case-sensitive and pd_medication is all lowercase;
# display names that spell the drug in upper case would be missed — confirm against the data.
pd_drug_info = {}
for i, drug in enumerate(drug_source_value):
    display_name = drug['med_display_name']
    if any(item in display_name for item in pd_medication):
        pd_drug_info[i] = display_name  # the whole dict could be stored here instead

Something similar

In [17]:
# Map row position -> med_display_name for every drug_exposure record
# (no filtering here; every row is kept).
drug_info_drug_source_value = {
    i: drug['med_display_name'] for i, drug in enumerate(dsv_drug_exposure)
}

In [18]:
pd_drug_info

{126: 'carbidopa-levodopa (SINEMET) 25-100 MG per tablet 2 tablet',
 127: 'carbidopa-levodopa (SINEMET) 25-100 MG per tablet 2 tablet',
 955: 'amantadine (SYMMETREL) capsule 100 mg',
 956: 'amantadine (SYMMETREL) capsule 100 mg',
 957: 'amantadine (SYMMETREL) capsule 100 mg',
 958: 'amantadine (SYMMETREL) capsule 100 mg',
 959: 'amantadine (SYMMETREL) capsule 100 mg',
 960: 'amantadine (SYMMETREL) capsule 100 mg',
 961: 'amantadine (SYMMETREL) capsule 100 mg',
 962: 'amantadine (SYMMETREL) capsule 100 mg',
 963: 'amantadine (SYMMETREL) capsule 100 mg',
 964: 'amantadine (SYMMETREL) capsule 100 mg',
 965: 'amantadine (SYMMETREL) capsule 100 mg',
 966: 'amantadine (SYMMETREL) capsule 100 mg',
 967: 'amantadine (SYMMETREL) capsule 100 mg',
 968: 'amantadine (SYMMETREL) capsule 100 mg',
 969: 'amantadine (SYMMETREL) capsule 100 mg',
 970: 'amantadine (SYMMETREL) solution 100 mg',
 971: 'amantadine (SYMMETREL) solution 100 mg',
 972: 'amantadine (SYMMETREL) solution 100 mg',
 973: 'amantadi

In [19]:
drug_info_drug_source_value

{0: 'carbidopa-levodopa (SINEMET) 25-100 MG per tablet 2 tablet',
 1: 'carbidopa-levodopa (SINEMET) 25-100 MG per tablet 2 tablet',
 2: 'carbidopa-levodopa (SINEMET) 25-100 MG per tablet 2 tablet',
 3: 'carbidopa-levodopa (SINEMET) 25-100 MG per tablet 2 tablet',
 4: 'carbidopa-levodopa (SINEMET) 25-100 MG per tablet 2 tablet',
 5: 'carbidopa-levodopa (SINEMET) 25-100 MG per tablet 2 tablet',
 6: 'carbidopa-levodopa (SINEMET) 25-100 MG per tablet 2 tablet',
 7: 'amantadine (SYMMETREL) capsule 100 mg',
 8: 'amantadine (SYMMETREL) capsule 100 mg',
 9: 'amantadine (SYMMETREL) capsule 100 mg',
 10: 'amantadine (SYMMETREL) capsule 100 mg',
 11: 'amantadine (SYMMETREL) capsule 100 mg',
 12: 'amantadine (SYMMETREL) capsule 100 mg',
 13: 'amantadine (SYMMETREL) capsule 100 mg',
 14: 'amantadine (SYMMETREL) capsule 100 mg',
 15: 'amantadine (SYMMETREL) capsule 100 mg',
 16: 'amantadine (SYMMETREL) capsule 100 mg',
 17: 'amantadine (SYMMETREL) capsule 100 mg',
 18: 'amantadine (SYMMETREL) capsul

Only 16K+ rows have information regarding medication for Parkinson's

In [20]:
len(pd_drug_info)

16040

The keys of this new dictionary are the row positions in the original DataFrame. These rows contain information regarding Parkinson's medication.

In [21]:
pd_drug_info.keys()

dict_keys([126, 127, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 2353, 2354, 2355, 2438, 2439, 2521, 2522, 2523, 2524, 2525, 2706, 2707, 2708, 2709, 2710, 2881, 2882, 2883, 2884, 2885, 2886, 2887, 2888, 2889, 2890, 2891, 2892, 2893, 2894, 2895, 2896, 2897, 2898, 2899, 2900, 2901, 2902, 2903, 2904, 2905, 2906, 2907, 2908, 2909, 2910, 3496, 3497, 3498, 3499, 3500, 3501, 350

pd_drug_info is a subset of drug_source_value that contains the name of drugs related to PD

In [22]:
pd_drug_info[1299166]

'cabergoline (DOSTINEX) tablet 1 mg'

In [23]:
drug_source_value[1299166]

{'med_display_name': 'cabergoline (DOSTINEX) tablet 1 mg',
 'rxnorm_concat': '| 199703 |',
 'med_dose_unit_desc': 'mg',
 'mar_action': 'GIVEN',
 'med_order_desc': 'CABERGOLINE 0.5 MG PO TABS'}

Here's our new dataset. df_pd is a subset of the original dataset that only contains the rows in which a medication for Parkinson's was recorded

In [24]:
# Keep only the rows flagged as Parkinson's medication.
# The keys of pd_drug_info are row positions in df, so they are used with .iloc.
pd_row_positions = list(pd_drug_info)
df_pd = df.iloc[pd_row_positions]
df_pd.sample(10)

Unnamed: 0,person_id,drug_source_value,quantity,sig,route_source_value,dose_source_value,dose_unit_source_value,note_text
1420997,220757,"{""med_display_name"": ""amantadine (SYMMETREL) c...",2.0,EVERY MORNING,ORAL,200.0,mg,"""[**NAME**], APRN [**DATE**] 4:50 PM Depa..."
1129966,248403,"{""med_display_name"": ""carbidopa-levodopa (SINE...",1.0,5 TIMES DAILY,ORAL,1.0,tablet,"""[**NAME**] RD [**DATE**] 10:51 AM Medical..."
1299886,234554,"{""med_display_name"": ""carbidopa-levodopa (SINE...",0.0,3 TIMES DAILY,ORAL,0.0,tablet,"[**NAME**], PA-C [**DATE**] 12:45 PM We ar..."
2021720,203750,"{""med_display_name"": ""carbidopa-levodopa (SINE...",2.0,5 TIMES DAILY,ORAL,2.0,tablet,"""[**NAME**], MD [**DATE**] 6:50 PM Inpati..."
6736,260680,"{""med_display_name"": ""carbidopa-levodopa (SINE...",1.5,EVERY 2 HOURS WHILE AWAKE,PER G TUBE,1.5,tablet,"""[**NAME**], MD [**DATE**] 2:58 PM Depart..."
1439997,220757,"{""med_display_name"": ""carbidopa-levodopa (SINE...",1.5,3 TIMES DAILY,ORAL,1.5,tablet,"[**NAME**], MD [**DATE**] 5:08 PM Departm..."
1731422,216583,"{""med_display_name"": ""carbidopa-levodopa (SINE...",0.0,2 TIMES DAILY,ORAL,0.0,tablet,"""[**NAME**], DO [**DATE**] 3:02 PM Depart..."
2091427,201023,"{""med_display_name"": ""carbidopa-levodopa-entac...",1.0,5 TIMES DAILY,ORAL,1.0,tablet,"""[**NAME**], MD [**DATE**] 11:07 AM Psychi..."
2277191,179838,"{""med_display_name"": ""carbidopa-levodopa (SINE...",2.5,Every 3 hours while awake,ORAL,2.5,tablet,"""[**NAME**], DO [**DATE**] 3:25 PM Psychi..."
1121582,248844,"{""med_display_name"": ""trihexyphenidyl (ARTANE)...",0.5,2 TIMES DAILY RESP,ORAL,1.0,mg,"[**NAME**], MD [**DATE**] 2:30 PM Accepte..."


Inserting the column drug_info into the pd dataset

In [25]:
# Add the display name as column 2 of df_pd.
# - drop(..., errors="ignore") makes the cell re-runnable: DataFrame.insert raises
#   ValueError if the column already exists.
# - The dict is wrapped in a Series so it is explicit that values are matched to df_pd
#   by index label (the dict keys), not by row order.
df_pd = df_pd.drop(columns="drug_info", errors="ignore")
df_pd.insert(loc=2, column="drug_info", value=pd.Series(pd_drug_info))

In [26]:
df_pd.sample(5)

Unnamed: 0,person_id,drug_source_value,drug_info,quantity,sig,route_source_value,dose_source_value,dose_unit_source_value,note_text
2149848,194868,"{""med_display_name"": ""amantadine (SYMMETREL) c...",amantadine (SYMMETREL) capsule 100 mg,1.0,2 TIMES DAILY,ORAL,100.0,mg,"""[**NAME**], MD [**DATE**] 6:55 PM ..."
16099,260268,"{""med_display_name"": ""pramipexole (MIRAPEX) ta...",pramipexole (MIRAPEX) tablet 0.5 mg,1.0,3 TIMES DAILY,ORAL,0.5,mg,"[**NAME**], MD [**DATE**] 2:09 AM Departm..."
2051895,202019,"{""med_display_name"": ""carbidopa-levodopa (SINE...",carbidopa-levodopa (SINEMET) 25-100 MG per tab...,2.0,4 TIMES DAILY,ORAL,2.0,tablet,"[**NAME**], MD [**DATE**] 7:20 AM Departm..."
2239997,187199,"{""med_display_name"": ""carbidopa-levodopa (SINE...",carbidopa-levodopa (SINEMET) 25-100 MG per tab...,2.0,USER SPECIFIED,ORAL,2.0,tablet,"""[**NAME**], PhD [**DATE**] 10:36 AM REHAB..."
2123253,198429,"{""med_display_name"": ""carbidopa-levodopa (SINE...",carbidopa-levodopa (SINEMET) 25-100 MG per tab...,1.0,DAILY,ORAL,1.0,tablet,"""[**NAME**], RD [**DATE**] 3:23 PM INPATI..."


In [27]:
df_pd.iloc[955]

person_id                                                            257075
drug_source_value         {"med_display_name": "carbidopa-levodopa (SINE...
drug_info                 carbidopa-levodopa (SINEMET) 12.5-125 MG per t...
quantity                                                                0.5
sig                                                           4 TIMES DAILY
route_source_value                                                     ORAL
dose_source_value                                                       0.5
dose_unit_source_value                                               tablet
note_text                 "[**NAME**], MD     [**DATE**]  9:56 PM [**LOC...
Name: 92436, dtype: object

Let's take a look at the drugs actually used in the dataset. **Note**: VS Code truncates the output, so you may not see all the drugs, e.g. cabergoline (1299166).

In [28]:
# Lower-cased display names, one per entry of pd_drug_info and in the same order.
pd_drugs_used = list(map(str.lower, pd_drug_info.values()))
# Distinct names, sorted alphabetically.
pd_drugs_used_unique = sorted(set(pd_drugs_used))

We do the same for the drugs in dsv_drug_exposure

In [29]:
# Lower-cased display names from drug_exposure, in row order.
drugs_used_drug_exposure = list(map(str.lower, drug_info_drug_source_value.values()))
# Distinct names, sorted alphabetically.
drugs_used_drug_exposure_unique = sorted(set(drugs_used_drug_exposure))

**Note:** 1299166 is **NOT** the position in the list but rather the key that contains the element. The length of both structures is 16040.

In [30]:
pd_drug_info[1299166]

'cabergoline (DOSTINEX) tablet 1 mg'

The position in the list that contains item with the key 1299166

In [31]:
list(pd_drug_info.keys()).index(1299166)

7778

In [32]:
pd_drugs_used

['carbidopa-levodopa (sinemet) 25-100 mg per tablet 2 tablet',
 'carbidopa-levodopa (sinemet) 25-100 mg per tablet 2 tablet',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) solution 100 mg',
 'amantadine (symmetrel) solution 100 mg',
 'amantadine (symmetrel) solution 100 mg',
 'amantadine (symmetrel) solution 100 mg',
 'amantadine (symmetrel) solution 100 mg',
 'benztropine (cogentin) tabl

In [33]:
pd_drugs_used_unique

['* rasagiline mesylate (azilect) 1 mg - pharmacist to verify when brought in',
 '* rasagiline mesylate (azilect) tablet 1 mg - patient to bring and pharmacist to verify',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '*amantadine er (gocovri) extended release capsule 137 mg - patient supplied',
 '*carbidopa-levodopa er (rytary) 36.25-145 mg cpcr 1 tablet - patient supplied',
 '*carbidopa-levodopa er (rytary) 48.75-195 mg cpcr 1 capsule - patient supplied',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 200 mg',
 'amantadine (symmetrel) solution 100 mg',
 'amantadine (symmetrel) solution 200 mg',
 'amantadine (symmetrel) solution 50 mg',
 'amantadine (symmetrel) syrup 100 mg',
 'amantadine (symmetrel) syrup 50 mg',
 'amantadine er (gocovri) extended release capsule 137 mg',
 'apomorphine (apokyn) injection 1 ml',
 'benztropine (cogentin) tablet 0.5 mg',
 'benztropine (cogentin) tablet 1 mg',
 'benztropine (cogentin) tablet 2 mg

In [34]:
drugs_used_drug_exposure_unique

['* rasagiline mesylate (azilect) 1 mg - pharmacist to verify when brought in',
 '* rasagiline mesylate (azilect) tablet 1 mg - patient to bring and pharmacist to verify',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '*amantadine er (gocovri) extended release capsule 137 mg - patient supplied',
 '*carbidopa-levodopa er (rytary) 36.25-145 mg cpcr 1 tablet - patient supplied',
 '*carbidopa-levodopa er (rytary) 48.75-195 mg cpcr 1 capsule - patient supplied',
 '*carbidopa-levodopa er 36.25-145 mg cpcr 1 capsule-patient supplied med',
 '*carbidopa-levodopa er 48.75-195 mg cpcr 1 capsule - patient supplied',
 '*carbidopa-levodopa er 48.75-195 mg cpcr 1 capsule-patient supplied',
 '*carbidopa-levodopa er 48.75-195 mg cpcr 1 capsule-patient supplied med ',
 '*ropinirole (requip xl) 24 hr tablet patient supplied',
 'amantadine (symmetrel) capsule 100 mg',
 'amantadine (symmetrel) capsule 200 mg',
 'amantadine (symmetrel) solution 100 mg',
 'amantadine (symmetrel

Using a ***Regex*** to segment from the pd_drugs_used array the drug_name, generic_name and dosage

In [35]:
# Split each lower-cased display name in pd_drugs_used into generic name, brand name and dosage.
#
# drug_names_pattern
#   group 1: generic name - one or more word characters, whitespace or hyphens
#   then one whitespace character, then an optional "(brand name)" captured as group 2
#   ("?" = 0 or 1 occurrence, "+" = 1 or more).
# dosage_pattern
#   a number, optionally with decimals and further "-number" parts (e.g. "25-100"),
#   followed by mg/ml, mg or ml and an optional "/hr".
drug_names_pattern = r"([\w\s-]+)\s(?:\(([\w\s-]+)\)\s*)?"
dosage_pattern = r"\d+(?:\.\d+)?(?:-\d+(?:\.\d+)?)*(?:\s*(?:mg/ml|mg|ml))(?:/hr)?"

# One entry per input string, in input order.
generic_names = []
brand_names = []
dosages = []

for string in pd_drugs_used:
    drug_name_match = re.search(drug_names_pattern, string)
    if drug_name_match is None:
        # No recognisable name: report the string and pad ALL three lists with NaN.
        # Padding only generic_names would leave the lists with different lengths and
        # make the DataFrame construction below fail.
        print(string)
        generic_names.append(np.nan)
        brand_names.append(np.nan)
        dosages.append(np.nan)
        continue

    generic_name, brand_name = drug_name_match.groups()
    generic_names.append(generic_name)
    # The brand group is optional; store NaN when it did not take part in the match.
    brand_names.append(brand_name if brand_name else np.nan)

    # First dosage-looking token anywhere in the string, NaN when there is none.
    dosage_match = re.search(dosage_pattern, string)
    dosages.append(dosage_match.group(0) if dosage_match else np.nan)


# Create DataFrame (default 0..n-1 index, row i describes pd_drugs_used[i])
pd_data = pd.DataFrame({
    "generic_name": generic_names,
    "brand_name": brand_names,
    "dosage": dosages
})

*Regex* for drug_exposure

In [36]:
# Split each lower-cased display name in drugs_used_drug_exposure into generic name,
# brand name and dosage.
#
# drug_names_pattern
#   group 1: generic name - one or more word characters, whitespace or hyphens
#   then one whitespace character, then an optional "(brand name)" captured as group 2
#   ("?" = 0 or 1 occurrence, "+" = 1 or more).
# dosage_pattern
#   a number, optionally with decimals and further "-number" parts (e.g. "25-100"),
#   followed by mg/ml, mg or ml and an optional "/hr".
drug_names_pattern = r"([\w\s-]+)\s(?:\(([\w\s-]+)\)\s*)?"
dosage_pattern = r"\d+(?:\.\d+)?(?:-\d+(?:\.\d+)?)*(?:\s*(?:mg/ml|mg|ml))(?:/hr)?"

# One entry per input string, in input order.
# Note: these module-level lists are rebound here, so after this cell they describe
# drug_exposure.
generic_names = []
brand_names = []
dosages = []

for string in drugs_used_drug_exposure:
    drug_name_match = re.search(drug_names_pattern, string)
    if drug_name_match is None:
        # No recognisable name: report the string and pad ALL three lists with NaN.
        # Padding only generic_names would leave the lists with different lengths and
        # make the DataFrame construction below fail.
        print(string)
        generic_names.append(np.nan)
        brand_names.append(np.nan)
        dosages.append(np.nan)
        continue

    generic_name, brand_name = drug_name_match.groups()
    generic_names.append(generic_name)
    # The brand group is optional; store NaN when it did not take part in the match.
    brand_names.append(brand_name if brand_name else np.nan)

    # First dosage-looking token anywhere in the string, NaN when there is none.
    dosage_match = re.search(dosage_pattern, string)
    dosages.append(dosage_match.group(0) if dosage_match else np.nan)


# Create DataFrame (default 0..n-1 index, row i describes drugs_used_drug_exposure[i])
pd_data_drug_exposure = pd.DataFrame({
    "generic_name": generic_names,
    "brand_name": brand_names,
    "dosage": dosages
})

In [37]:
pd_data.sample(20)

Unnamed: 0,generic_name,brand_name,dosage
12822,carbidopa-levodopa,sinemet,25-100 mg
5346,benztropine,cogentin,1 mg
4879,carbidopa-levodopa,sinemet,25-100 mg
11081,carbidopa-levodopa,sinemet cr,25-100 mg
5793,carbidopa-levodopa,sinemet,25-100 mg
7345,carbidopa-levodopa,sinemet,25-100 mg
1070,carbidopa-levodopa,sinemet,25-250 mg
2532,benztropine,cogentin,1 mg
10550,carbidopa-levodopa,sinemet,25-100 mg
6771,trihexyphenidyl,artane,1 mg


In [38]:
print(pd_data['generic_name'].to_string())

0                                carbidopa-levodopa
1                                carbidopa-levodopa
2                                        amantadine
3                                        amantadine
4                                        amantadine
5                                        amantadine
6                                        amantadine
7                                        amantadine
8                                        amantadine
9                                        amantadine
10                                       amantadine
11                                       amantadine
12                                       amantadine
13                                       amantadine
14                                       amantadine
15                                       amantadine
16                                       amantadine
17                                       amantadine
18                                       amantadine
19          

In [39]:
pd_data_drug_exposure.sample(20)

Unnamed: 0,generic_name,brand_name,dosage
1622,pramipexole,mirapex,1.5 mg
16237,ropinirole,requip,2 mg
1079,carbidopa-levodopa,sinemet,25-250 mg
9615,carbidopa-levodopa,sinemet,25-100 mg
24665,carbidopa-levodopa,sinemet,25-100 mg
8514,carbidopa-levodopa,sinemet cr,25-100 mg
17540,carbidopa-levodopa,sinemet cr,50-200 mg
10244,ropinirole,requip,0.5 mg
15176,carbidopa-levodopa,sinemet,25-100 mg
12442,carbidopa-levodopa,sinemet,25-100 mg


Error or inconsistency. I might look into this later

In [40]:
# Known inconsistency in the source data: the display name of this row says 0.5 mg, but the
# RxCUI code associated with it (885205) shows 1 mg, not 0.5.
# See: https://mor.nlm.nih.gov/RxNav/search?searchBy=RXCUI&searchTerm=885205
df_pd['drug_source_value'].iloc[3013]

'{"med_display_name": "benztropine mesylate (COGENTIN) injection 0.5 mg", "rxnorm_concat": "| 885205 |", "med_dose_unit_desc": "mg", "mar_action": "GIVEN", "med_order_desc": "BENZTROPINE MESYLATE 1 MG/ML IJ SOLN"}'

In [41]:
pd_data.iloc[3013]

generic_name    benztropine mesylate
brand_name                  cogentin
dosage                        0.5 mg
Name: 3013, dtype: object

In [42]:
# For this selegiline row the generic_name still contains the dosage and the form of
# administration. This happens because the display name has no brand name, so the name
# part of the regex also captured the text that normally follows the brand.
pd_data.iloc[13628]

generic_name    selegiline tablet 5 mg
brand_name                         NaN
dosage                            5 mg
Name: 13628, dtype: object

#### Dealing with missing data

In [43]:
# Show every row of pd_data that has at least one missing value (any column).
# to_string() prints the full frame without pandas' row truncation.
print(pd_data[pd_data.isnull().any(axis=1)].to_string())

                                     generic_name brand_name         dosage
3333               carbidopa-levodopa patient own        NaN            NaN
3334               carbidopa-levodopa patient own        NaN            NaN
3335               carbidopa-levodopa patient own        NaN            NaN
5187                           carbidopa-levodopa        NaN  4.63-20 mg/ml
5188                           carbidopa-levodopa        NaN  4.63-20 mg/ml
5189                           carbidopa-levodopa        NaN  4.63-20 mg/ml
5190                           carbidopa-levodopa        NaN  4.63-20 mg/ml
5191                           carbidopa-levodopa        NaN  4.63-20 mg/ml
5192                           carbidopa-levodopa        NaN  4.63-20 mg/ml
5193                           carbidopa-levodopa        NaN  4.63-20 mg/ml
5194                           carbidopa-levodopa        NaN  4.63-20 mg/ml
5195                           carbidopa-levodopa        NaN  4.63-20 mg/ml
5196        

Using the pd_drugs_used list to look up the original strings of the rows with missing values (duopa)

In [44]:
pd_drugs_used[5187:5199]

['**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med']

In [45]:
pd_drugs_used[7346:7390]

['inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodopa-carbidopa intestinal gel 20-5mg/ml',
 'inv levodo

In [46]:
# Fill in the brand name for rows where the regex could not extract one.
# Rows are selected by content instead of hard-coded row positions, which silently point
# at the wrong rows as soon as the input data or the filtering changes.
display_names = pd.Series(pd_drugs_used, index=pd_data.index)  # row i of pd_data <- pd_drugs_used[i]
missing_brand = pd_data['brand_name'].isna()

# Regex couldn't catch the brand name because "(duopa)" sits in a different position
# (after the dosage), so we add it manually.
has_duopa_label = display_names.str.contains("(duopa)", regex=False)

# Patient is taking LCIG (levodopa-carbidopa intestinal gel); most likely it is duopa
# (this is an assumption).
# TODO: Maybe replace this with Dopamine Replacement Therapy (Listen again to Dr. K's audio)
is_intestinal_gel = display_names.str.contains("intestinal gel", regex=False)

pd_data.loc[missing_brand & (has_duopa_label | is_intestinal_gel), 'brand_name'] = 'duopa'
# NOTE: pd_data_drug_exposure is not touched here; it would need the same treatment.

In [47]:
print(pd_data[pd_data.isnull().any(axis=1)].to_string())

                                     generic_name brand_name dosage
3333               carbidopa-levodopa patient own        NaN    NaN
3334               carbidopa-levodopa patient own        NaN    NaN
3335               carbidopa-levodopa patient own        NaN    NaN
7368   inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7369   inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7370   inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7371   inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7372   inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7373   inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7374   inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7375   inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7376   inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7377   inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7378   inv levodopa-carbidopa intestinal gel pum

Looking for evidence of other brand names for Selegiline in our database

In [48]:
# Rows whose brand is one of the two selegiline brand names we are looking for,
# followed by the brands that are actually present.
selegiline_brands = ['eldepryl', 'zelapar']
selegiline = pd_data[pd_data['brand_name'].isin(selegiline_brands)]
selegiline['brand_name'].unique()

array(['eldepryl'], dtype=object)

In [49]:
print(selegiline.to_string())

      generic_name brand_name dosage
432     selegiline   eldepryl   5 mg
433     selegiline   eldepryl   5 mg
434     selegiline   eldepryl   5 mg
435     selegiline   eldepryl   5 mg
436     selegiline   eldepryl   5 mg
437     selegiline   eldepryl   5 mg
438     selegiline   eldepryl   5 mg
439     selegiline   eldepryl   5 mg
440     selegiline   eldepryl   5 mg
441     selegiline   eldepryl   5 mg
442     selegiline   eldepryl   5 mg
443     selegiline   eldepryl   5 mg
444     selegiline   eldepryl   5 mg
445     selegiline   eldepryl   5 mg
446     selegiline   eldepryl   5 mg
447     selegiline   eldepryl   5 mg
6110    selegiline   eldepryl   5 mg
7594    selegiline   eldepryl   5 mg
7595    selegiline   eldepryl   5 mg
7596    selegiline   eldepryl   5 mg
13507   selegiline   eldepryl   5 mg
13508   selegiline   eldepryl   5 mg
13509   selegiline   eldepryl   5 mg
13510   selegiline   eldepryl   5 mg
13511   selegiline   eldepryl   5 mg
13512   selegiline   eldepryl   5 mg
1

Making sure the word "tablet" isn't present in the drug's generic name


In [50]:
# Print every drug_exposure generic name that still contains the word "tablet", with its
# row position. The column is read from pd_data_drug_exposure explicitly instead of the
# module-level generic_names list, which holds whatever the last regex cell produced.
# Non-string entries (NaN for unparsed names) are skipped rather than raising TypeError.
for index, name in enumerate(pd_data_drug_exposure['generic_name']):
    if isinstance(name, str) and 'tablet' in name:
        print(name, index)

carbidopa-levadopa 25-100 mg orally disintegrating tablet 9136
carbidopa-levadopa 25-100 mg orally disintegrating tablet 9137
carbidopa-levadopa 25-100 mg orally disintegrating tablet 9138
carbidopa-levadopa 25-100 mg orally disintegrating tablet 9139
carbidopa-levadopa 25-100 mg orally disintegrating tablet 9140
carbidopa-levadopa 25-100 mg orally disintegrating tablet 9141
carbidopa-levadopa 25-100 mg orally disintegrating tablet 9142
carbidopa-levadopa 25-100 mg orally disintegrating tablet 9143
carbidopa-levadopa 25-100 mg orally disintegrating tablet 9144
carbidopa-levadopa 25-100 mg orally disintegrating tablet 9145
carbidopa-levadopa 25-100 mg orally disintegrating tablet 9146
carbidopa-levadopa 25-100 mg orally disintegrating tablet 9147
rasagiline 1mg tablet- 10477
rasagiline 1mg tablet- 10478
selegiline tablet 5 mg 20843
selegiline tablet 5 mg 20844
selegiline tablet 5 mg 20845
selegiline tablet 5 mg 20846
selegiline tablet 5 mg 20847
selegiline tablet 5 mg 20848


Blindly assuming that the medication taken by the patient was in fact eldepryl. TODO: Listen again to the audio and check for DRT (Dopamine Replacement Therapy)

In [51]:
# Selegiline rows without a brand name: generic_name still carries form and dosage
# (e.g. "selegiline tablet 5 mg"). Rows are selected by content instead of hard-coded
# row positions, which break as soon as the input data or the filtering changes.
unbranded_selegiline = (
    pd_data['generic_name'].str.startswith('selegiline', na=False)
    & pd_data['brand_name'].isna()
)
# Assumption: the medication taken was eldepryl.
pd_data.loc[unbranded_selegiline, 'brand_name'] = 'eldepryl'
# Replacing "selegiline tablet 5 mg" with just "selegiline"
pd_data.loc[unbranded_selegiline, 'generic_name'] = 'selegiline'
# NOTE: pd_data_drug_exposure is not touched here; it would need the same treatment.

For "carbidopa-levodopa patient own med" we aren't sure about the medication (sinemet, sinemet cr, etc) or dosage, so I will leave it as is. Same for the dosage of "duopa"

In [52]:
print(pd_data[pd_data.isnull().any(axis=1)].to_string())

                                    generic_name brand_name dosage
3333              carbidopa-levodopa patient own        NaN    NaN
3334              carbidopa-levodopa patient own        NaN    NaN
3335              carbidopa-levodopa patient own        NaN    NaN
7368  inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7369  inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7370  inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7371  inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7372  inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7373  inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7374  inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7375  inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7376  inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7377  inv levodopa-carbidopa intestinal gel pump      duopa    NaN
7378  inv levodopa-carbidopa intestinal gel pump      duopa   

In [53]:
pd_data.loc[pd_data.index[13628:13640]]

Unnamed: 0,generic_name,brand_name,dosage
13628,selegiline,eldepryl,5 mg
13629,selegiline,eldepryl,5 mg
13630,selegiline,eldepryl,5 mg
13631,selegiline,eldepryl,5 mg
13632,selegiline,eldepryl,5 mg
13633,selegiline,eldepryl,5 mg
13634,selegiline,eldepryl,5 mg
13635,selegiline,eldepryl,5 mg
13636,selegiline,eldepryl,5 mg
13637,selegiline,eldepryl,5 mg


In [54]:
pd_data[pd_data['generic_name'] == 'selegiline']

Unnamed: 0,generic_name,brand_name,dosage
432,selegiline,eldepryl,5 mg
433,selegiline,eldepryl,5 mg
434,selegiline,eldepryl,5 mg
435,selegiline,eldepryl,5 mg
436,selegiline,eldepryl,5 mg
...,...,...,...
13635,selegiline,eldepryl,5 mg
13636,selegiline,eldepryl,5 mg
13637,selegiline,eldepryl,5 mg
13638,selegiline,eldepryl,5 mg


In [55]:
# Place the parsed drug fields right after `drug_info` (positions 3, 4, 5).
# Values are inserted positionally (as plain lists), so pd_data must have
# exactly one row per df_pd row, in the same order.
for position, column in enumerate(("generic_name", "brand_name", "dosage"), start=3):
    df_pd.insert(loc=position, column=column, value=pd_data[column].to_list())

In [56]:
df_pd.sample(20)

Unnamed: 0,person_id,drug_source_value,drug_info,generic_name,brand_name,dosage,quantity,sig,route_source_value,dose_source_value,dose_unit_source_value,note_text
2051876,202019,"{""med_display_name"": ""carbidopa-levodopa (SINE...",carbidopa-levodopa (SINEMET) 25-100 MG per tab...,carbidopa-levodopa,sinemet,25-100 mg,2.0,3 TIMES DAILY,PER G TUBE,2.0,tablet,"[**NAME**], MD, PhD [**DATE**] 10:50 AM De..."
1728510,216990,"{""med_display_name"": ""carbidopa-levodopa (SINE...",carbidopa-levodopa (SINEMET) 25-100 MG per tab...,carbidopa-levodopa,sinemet,25-100 mg,1.0,3 TIMES DAILY,ORAL,1.0,tablet,"[**NAME**], MD [**DATE**] 1:54 PM Departm..."
1420796,220757,"{""med_display_name"": ""amantadine (SYMMETREL) c...",amantadine (SYMMETREL) capsule 100 mg,amantadine,symmetrel,100 mg,1.0,2 TIMES DAILY,ORAL,100.0,mg,"[**NAME**], MD [**DATE**] 12:50 PM Departm..."
92404,257075,"{""med_display_name"": ""carbidopa-levodopa (SINE...",carbidopa-levodopa (SINEMET) 12.5-125 MG per t...,carbidopa-levodopa,sinemet,12.5-125 mg,0.5,4 TIMES DAILY,ORAL,0.5,tablet,"""[**NAME**] RD [**DATE**] 3:32 PM Medica..."
2050378,202116,"{""med_display_name"": ""selegiline (ELDEPRYL) ta...",selegiline (ELDEPRYL) tablet 5 mg,selegiline,eldepryl,5 mg,0.0,EVERY MORNING BEFORE BREAKFAST,ORAL,0.0,mg,[**NAME**] [**DATE**] 9:56 AM Consult rec...
664222,256706,"{""med_display_name"": ""benztropine (COGENTIN) t...",benztropine (COGENTIN) tablet 0.5 mg,benztropine,cogentin,0.5 mg,1.0,2 TIMES DAILY,ORAL,0.5,mg,"[**NAME**], RN [**DATE**] 11:18 AM WOUND C..."
2277062,179838,"{""med_display_name"": ""carbidopa-levodopa (SINE...",carbidopa-levodopa (SINEMET CR) 50-200 MG per ...,carbidopa-levodopa,sinemet cr,50-200 mg,1.0,ONCE,ORAL,1.0,tablet,"""[**NAME**], MD [**DATE**] 5:20 PM Depart..."
770917,255046,"{""med_display_name"": ""benztropine (COGENTIN) t...",benztropine (COGENTIN) tablet 1 mg,benztropine,cogentin,1 mg,1.0,EVERY 6 HOURS PRN,ORAL,1.0,mg,"""[**NAME**], RD [**DATE**] 12:03 PM Medica..."
14600,260382,"{""med_display_name"": ""carbidopa-levodopa (SINE...",carbidopa-levodopa (SINEMET CR) 25-100 MG per ...,carbidopa-levodopa,sinemet cr,25-100 mg,2.0,NIGHTLY,ORAL,2.0,tablet,"""[**NAME**], MD [**DATE**] 1:21 PM Depart..."
867335,252725,"{""med_display_name"": ""carbidopa-levodopa (SINE...",carbidopa-levodopa (SINEMET) 25-100 MG per tab...,carbidopa-levodopa,sinemet,25-100 mg,1.0,3 TIMES DAILY,ORAL,1.0,tablet,"""[**NAME**], RD [**DATE**] 5:25 PM Medic..."


### Dealing with missing data for pd_data_drug_exposure 

In [57]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

                                                    generic_name brand_name         dosage
3573                                       carbidopa-levodopa er        NaN   36.25-145 mg
3574                                       carbidopa-levodopa er        NaN   36.25-145 mg
3575                                       carbidopa-levodopa er        NaN   36.25-145 mg
3576                                       carbidopa-levodopa er        NaN   36.25-145 mg
3577                                       carbidopa-levodopa er        NaN   36.25-145 mg
3578                                       carbidopa-levodopa er        NaN   48.75-195 mg
3579                                       carbidopa-levodopa er        NaN   48.75-195 mg
3580                                       carbidopa-levodopa er        NaN   48.75-195 mg
3581                                       carbidopa-levodopa er        NaN   48.75-195 mg
3582                                       carbidopa-levodopa er        NaN   48.75-195 mg

For carbidopa-levodopa ER (extended release) the dose strengths match those of Rytary, so we will set the brand name accordingly

In [58]:
# Write through a single .iloc call on the frame itself. The chained form
# `df['brand_name'].iloc[...] = ...` assigns into an intermediate Series, which
# raises SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
pd_data_drug_exposure.iloc[3573:3659, pd_data_drug_exposure.columns.get_loc('brand_name')] = 'rytary'

In [59]:
pd_data_drug_exposure[3573:3659]

Unnamed: 0,generic_name,brand_name,dosage
3573,carbidopa-levodopa er,rytary,36.25-145 mg
3574,carbidopa-levodopa er,rytary,36.25-145 mg
3575,carbidopa-levodopa er,rytary,36.25-145 mg
3576,carbidopa-levodopa er,rytary,36.25-145 mg
3577,carbidopa-levodopa er,rytary,36.25-145 mg
...,...,...,...
3654,carbidopa-levodopa er,rytary,36.25-145 mg
3655,carbidopa-levodopa er,rytary,36.25-145 mg
3656,carbidopa-levodopa er,rytary,36.25-145 mg
3657,carbidopa-levodopa er,rytary,48.75-195 mg


In [60]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

                                                    generic_name brand_name         dosage
3962                              carbidopa-levodopa patient own        NaN            NaN
5994                                          carbidopa-levodopa        NaN  4.63-20 mg/ml
5995                                          carbidopa-levodopa        NaN  4.63-20 mg/ml
5996                                          carbidopa-levodopa        NaN  4.63-20 mg/ml
5997                                          carbidopa-levodopa        NaN  4.63-20 mg/ml
5998                                          carbidopa-levodopa        NaN  4.63-20 mg/ml
5999                                          carbidopa-levodopa        NaN  4.63-20 mg/ml
6000                                          carbidopa-levodopa        NaN  4.63-20 mg/ml
6001                                          carbidopa-levodopa        NaN  4.63-20 mg/ml
6002                                          carbidopa-levodopa        NaN  4.63-20 mg/ml

Once we take a look at the original array we see that the regex couldn't catch the brand_name (duopa). We will update this manually

In [61]:
drugs_used_drug_exposure[5994:6009]

['**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidopa-levodopa 4.63-20 mg/ml (duopa) suspension** pt supplied med',
 '**carbidop

In [62]:
# The regex missed "(duopa)" because it follows the strength in these strings;
# set the brand by position. A single .iloc write on the frame avoids the
# chained assignment, which is a no-op under pandas Copy-on-Write.
pd_data_drug_exposure.iloc[5994:6009, pd_data_drug_exposure.columns.get_loc('brand_name')] = 'duopa'

In [63]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

                                                    generic_name brand_name     dosage
3962                              carbidopa-levodopa patient own        NaN        NaN
6475                                                  ropinirole  requip xl        NaN
6476                                                  ropinirole  requip xl        NaN
6477                                                  ropinirole  requip xl        NaN
6478                                                  ropinirole  requip xl        NaN
6479                                                  ropinirole  requip xl        NaN
6480                                                  ropinirole  requip xl        NaN
6481                                                  ropinirole  requip xl        NaN
6482                                                  ropinirole  requip xl        NaN
6658                                                  ropinirole  requip xl        NaN
6659                                       

Duopa is also used as an intestinal gel. We will update these rows accordingly

Duopa is usually presented as either 4.63-20 mg/ml or 5-20 mg/ml. I am going to assume, without further evidence, that the concentration is 5-20 mg/ml

In [64]:
# Label the intestinal-gel rows as Duopa with one positional write on the frame
# (the chained `df[col].iloc[...] = ...` form is a no-op under Copy-on-Write).
# The original second brand write for 8143:8150 was a subset of this range.
pd_data_drug_exposure.iloc[8137:8150, pd_data_drug_exposure.columns.get_loc('brand_name')] = 'duopa'

# Assumed concentration (carbidopa-levodopa), written in the same format as the
# existing '4.63-20 mg/ml' rows so the two can be compared and grouped.
# NOTE(review): as in the original, only positions 8143-8149 receive a dosage;
# confirm that 8137-8142 already carry one.
pd_data_drug_exposure.iloc[8143:8150, pd_data_drug_exposure.columns.get_loc('dosage')] = '5-20 mg/ml'

In [65]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

                                                    generic_name brand_name     dosage
3962                              carbidopa-levodopa patient own        NaN        NaN
6475                                                  ropinirole  requip xl        NaN
6476                                                  ropinirole  requip xl        NaN
6477                                                  ropinirole  requip xl        NaN
6478                                                  ropinirole  requip xl        NaN
6479                                                  ropinirole  requip xl        NaN
6480                                                  ropinirole  requip xl        NaN
6481                                                  ropinirole  requip xl        NaN
6482                                                  ropinirole  requip xl        NaN
6658                                                  ropinirole  requip xl        NaN
6659                                       

Parcopa matches the description of orally disintegrating tablet with strength 25-100mg

In [66]:
# Orally disintegrating 25-100 mg tablets: brand set to Parcopa by position.
# One .iloc write on the frame replaces the chained assignment, which triggers
# SettingWithCopyWarning and does nothing under pandas Copy-on-Write.
pd_data_drug_exposure.iloc[9136:9148, pd_data_drug_exposure.columns.get_loc('brand_name')] = 'parcopa'

In [67]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

                           generic_name brand_name   dosage
3962     carbidopa-levodopa patient own        NaN      NaN
6475                         ropinirole  requip xl      NaN
6476                         ropinirole  requip xl      NaN
6477                         ropinirole  requip xl      NaN
6478                         ropinirole  requip xl      NaN
6479                         ropinirole  requip xl      NaN
6480                         ropinirole  requip xl      NaN
6481                         ropinirole  requip xl      NaN
6482                         ropinirole  requip xl      NaN
6658                         ropinirole  requip xl      NaN
6659                         ropinirole  requip xl      NaN
6660                         ropinirole  requip xl      NaN
6661                         ropinirole  requip xl      NaN
6662                         ropinirole  requip xl      NaN
6663                         ropinirole  requip xl      NaN
8730                 nf selegiline 5 mg 

There's not enough information in the regex output or the full drug description to know the actual dosage or strength of ropinirole. We would have to look deeper into the notes

In [68]:
drugs_used_drug_exposure[6475:6482+1]

['ropinirole (requip xl) tb24 1 tablet',
 'ropinirole (requip xl) tb24 1 tablet',
 'ropinirole (requip xl) tb24 1 tablet',
 'ropinirole (requip xl) tb24 1 tablet',
 'ropinirole (requip xl) tb24 1 tablet',
 'ropinirole (requip xl) tb24 1 tablet',
 'ropinirole (requip xl) tb24 1 tablet',
 'ropinirole (requip xl) tb24 1 tablet']

In [69]:
drugs_used_drug_exposure[6658:6663+1]

['*ropinirole (requip xl) 24 hr tablet patient supplied',
 '*ropinirole (requip xl) 24 hr tablet patient supplied',
 '*ropinirole (requip xl) 24 hr tablet patient supplied',
 '*ropinirole (requip xl) 24 hr tablet patient supplied',
 '*ropinirole (requip xl) 24 hr tablet patient supplied',
 '*ropinirole (requip xl) 24 hr tablet patient supplied']

Dealing with selegiline

In [70]:
# Select every selegiline brand (Eldepryl or Zelapar) from the drug-exposure frame.
# Both conditions must come from pd_data_drug_exposure itself: the original second
# mask was built from `pd_data`, a different frame, so it was combined with and
# applied to rows it does not describe.
selegiline = pd_data_drug_exposure[pd_data_drug_exposure['brand_name'].isin(['eldepryl', 'zelapar'])]
selegiline['brand_name'].unique()

array(['eldepryl'], dtype=object)

In [71]:
# Normalise the remaining selegiline rows (a positional range plus the single row
# 8730) to generic 'selegiline' / brand 'eldepryl'. Each write is one .iloc call on
# the frame; the chained `df[col].iloc[...] = ...` form is a no-op under pandas
# Copy-on-Write and raises SettingWithCopyWarning before that.
for rows in (slice(20843, 20849), 8730):
    pd_data_drug_exposure.iloc[rows, pd_data_drug_exposure.columns.get_loc('generic_name')] = 'selegiline'
    pd_data_drug_exposure.iloc[rows, pd_data_drug_exposure.columns.get_loc('brand_name')] = 'eldepryl'

Dealing with Rasagiline

In [72]:
# Positions 10477-10478 are set to rasagiline / Azilect. Single .iloc writes on the
# frame replace the chained assignments, which do nothing under Copy-on-Write.
pd_data_drug_exposure.iloc[10477:10479, pd_data_drug_exposure.columns.get_loc('generic_name')] = 'rasagiline'
pd_data_drug_exposure.iloc[10477:10479, pd_data_drug_exposure.columns.get_loc('brand_name')] = 'azilect'

In [117]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

                           generic_name brand_name   dosage
3962     carbidopa-levodopa patient own        NaN      NaN
6475                         ropinirole  requip xl      NaN
6476                         ropinirole  requip xl      NaN
6477                         ropinirole  requip xl      NaN
6478                         ropinirole  requip xl      NaN
6479                         ropinirole  requip xl      NaN
6480                         ropinirole  requip xl      NaN
6481                         ropinirole  requip xl      NaN
6482                         ropinirole  requip xl      NaN
6658                         ropinirole  requip xl      NaN
6659                         ropinirole  requip xl      NaN
6660                         ropinirole  requip xl      NaN
6661                         ropinirole  requip xl      NaN
6662                         ropinirole  requip xl      NaN
6663                         ropinirole  requip xl      NaN
10449  pramipexole dihydrochloride tb24 

Figuring out what's going on with pramipexole (brand name Mirapex)

In [74]:
drug_exposure.iloc[10449]

person_id                                                                  222876
drug_exposure_start_datetime                                  2019-05-21 21:22:00
drug_source_value               {"med_display_name": "Pramipexole Dihydrochlor...
dose_source_value                                                             1.5
dose_unit_source_value                                                         mg
route_source_value                                                           ORAL
visit_occurrence_id                                                      34186201
visit_detail_id                                                         8187411.0
Name: 10449, dtype: object

In [75]:
drug_exposure['drug_source_value'].iloc[10449]

'{"med_display_name": "Pramipexole Dihydrochloride TB24 1.5 mg", "rxnorm_concat": "| 901550 |", "med_dose_unit_desc": "mg", "mar_action": "GIVEN", "med_order_desc": "PRAMIPEXOLE DIHYDROCHLORIDE ER 1.5 MG PO TB24"}'

In [76]:
drug_exposure['person_id'].iloc[10449]

222876

In [130]:
pd_data_drug_exposure[(pd_data_drug_exposure['generic_name'] == 'pramipexole')]

Unnamed: 0,generic_name,brand_name,dosage
933,pramipexole,mirapex,1 mg
934,pramipexole,mirapex,1 mg
935,pramipexole,mirapex,1 mg
936,pramipexole,mirapex,1 mg
1273,pramipexole,mirapex,0.5 mg
...,...,...,...
23456,pramipexole,mirapex,0.25 mg
23457,pramipexole,mirapex,0.25 mg
23458,pramipexole,mirapex,0.25 mg
26721,pramipexole,mirapex,1 mg


In [132]:
pd_data_drug_exposure[(pd_data_drug_exposure['generic_name'] == 'pramipexole') & (pd_data_drug_exposure['dosage'] == '1 mg')]

Unnamed: 0,generic_name,brand_name,dosage
933,pramipexole,mirapex,1 mg
934,pramipexole,mirapex,1 mg
935,pramipexole,mirapex,1 mg
936,pramipexole,mirapex,1 mg
1585,pramipexole,mirapex,1 mg
...,...,...,...
18072,pramipexole,mirapex,1 mg
18073,pramipexole,mirapex,1 mg
18074,pramipexole,mirapex,1 mg
26721,pramipexole,mirapex,1 mg


In [133]:
drug_exposure.iloc[933]

person_id                                                                  260872
drug_exposure_start_datetime                                  2021-04-05 22:20:00
drug_source_value               {"med_display_name": "pramipexole (MIRAPEX) ta...
dose_source_value                                                             1.0
dose_unit_source_value                                                         mg
route_source_value                                                           ORAL
visit_occurrence_id                                                      40421749
visit_detail_id                                                         9879425.0
Name: 933, dtype: object

In [134]:
drug_exposure[drug_exposure['person_id'] == 260872]['drug_source_value'].unique()

array(['{"med_display_name": "carbidopa-levodopa (SINEMET) 25-250 MG per tablet 1 tablet", "rxnorm_concat": "| 197445 |", "med_dose_unit_desc": "tablet", "mar_action": "GIVEN", "med_order_desc": "CARBIDOPA-LEVODOPA 25-250 MG PO TABS"}',
       '{"med_display_name": "pramipexole (MIRAPEX) tablet 1 mg", "rxnorm_concat": "| 859044 |", "med_dose_unit_desc": "mg", "mar_action": "GIVEN", "med_order_desc": "PRAMIPEXOLE DIHYDROCHLORIDE 0.5 MG PO TABS"}'],
      dtype=object)

In [143]:
# Third note on file for person 260872.
# Findings: the patient stays below the maximum daily dose (4.5 mg/day).
# Order text in the note: "pramipexole (MIRAPEX) 1 MG PO Tablet. Take 1 tablet by
# mouth 4 times daily. Take at 5 am - 10 am - 3 pm and 8 pm" (a change in dose).
df_pd.loc[df_pd['person_id'] == 260872, 'note_text'].iloc[2]

'"[**NAME**], MD     [**DATE**] 12:39 AM This consult is :In person Consult Note Department of Neurology Admit Date: [**DATE**]  LOS: 0 days PCP: [**NAME**], MD Subjective Reason for Request:  ""Rule out Dementia"" History of Present Illness: [**NAME**] is a 72 y.o. male with a PMH of idiopathic Parkinson\'s disease, REM behavioral disturbance, CAD status post PCI on Plavix who presented to the hospital on [**DATE**] for syncope and altered mental status. This consultation was requested by Hospitalist. History obtained from patient, chart. Patient was found passed out in his front steps of his home where he lives with his son.  There were no signs of trauma.  EMS was called due to concern for syncope.  Upon awakening patient was noted to be combative brought in for evaluation.  He is evaluated with his long-time female friend at bedside.  He appears at baseline and is cooperative. For his female friend, patient is able to care for himself including bathing, cooking, eating, financial m

In [121]:
drug_exposure[drug_exposure['person_id'] == 222876]['drug_source_value'].unique()

array(['{"med_display_name": "carbidopa-levodopa (SINEMET) 25-100 MG per tablet 1.5 tablet", "rxnorm_concat": "| 197444 |", "med_dose_unit_desc": "tablet", "mar_action": "GIVEN BY OTHER", "med_order_desc": "CARBIDOPA-LEVODOPA 25-100 MG PO TABS"}',
       '{"med_display_name": "carbidopa-levodopa (SINEMET) 25-100 MG per tablet 1.5 tablet", "rxnorm_concat": "| 197444 |", "med_dose_unit_desc": "tablet", "mar_action": "GIVEN", "med_order_desc": "CARBIDOPA-LEVODOPA 25-100 MG PO TABS"}',
       '{"med_display_name": "carbidopa-levodopa (SINEMET) 25-100 MG per tablet 1.5 tablet", "rxnorm_concat": "| 197444 |", "med_dose_unit_desc": "tablet", "mar_action": "GIVEN-1ST DOSE EDUCATION", "med_order_desc": "CARBIDOPA-LEVODOPA 25-100 MG PO TABS"}',
       '{"med_display_name": "carbidopa-levodopa (SINEMET) 25-100 MG per tablet 1.5 tablet", "rxnorm_concat": "| 197444 |", "med_dose_unit_desc": "tablet", "mar_action": "HELD", "med_order_desc": "CARBIDOPA-LEVODOPA 25-100 MG PO TABS"}',
       '{"med_dis

In [120]:
# Fifth distinct drug_source_value recorded for person 222876. The positional [4]
# depends on the row order of drug_exposure, because unique() keeps values in
# order of first appearance.
dani = drug_exposure[drug_exposure['person_id'] == 222876]['drug_source_value'].unique()[4]

In [112]:
dani

'{"med_display_name": "pramipexole (MIRAPEX) tablet 0.5 mg", "rxnorm_concat": "| 859044 |", "med_dose_unit_desc": "mg", "mar_action": "GIVEN", "med_order_desc": "PRAMIPEXOLE DIHYDROCHLORIDE 0.5 MG PO TABS"}'

In [109]:
pomar = drug_exposure['drug_source_value'].iloc[0]

In [111]:
len(drug_exposure)

26946

In [116]:
drug_exposure[drug_exposure['drug_source_value'] == dani]['drug_exposure_start_datetime'].unique()

array(['2016-08-23 13:58:00', '2016-08-23 20:38:00',
       '2016-08-24 08:03:00', '2016-08-24 13:44:00',
       '2016-08-24 21:11:00', '2016-08-25 08:37:00',
       '2016-08-25 14:46:00', '2016-08-25 20:02:00',
       '2016-08-26 08:38:00', '2016-08-26 14:23:00',
       '2016-08-26 21:20:00', '2016-08-27 08:00:00',
       '2016-08-27 14:22:00', '2016-08-27 20:12:00',
       '2016-08-28 08:44:00', '2016-08-28 15:00:00',
       '2016-08-28 20:13:00', '2016-08-30 08:24:00',
       '2016-08-30 15:09:00', '2016-08-30 20:44:00',
       '2016-08-31 08:19:00', '2016-08-31 13:17:00',
       '2016-08-31 20:26:00', '2016-09-01 08:16:00',
       '2016-09-01 13:03:00', '2016-09-01 23:11:00',
       '2016-09-02 08:54:00', '2016-09-02 13:38:00',
       '2016-09-02 20:23:00', '2016-09-03 08:19:00',
       '2016-09-03 13:10:00', '2019-03-15 16:14:00',
       '2019-03-15 22:13:00', '2019-03-16 08:39:00',
       '2019-03-16 13:28:00', '2019-03-16 20:46:00',
       '2019-03-17 08:42:00', '2019-03-17 15:1

In [152]:
df_pd[df_pd['brand_name'] == 'mirapex']['dosage'].unique()

array(['1 mg', '0.5 mg', '1.5 mg', '0.25 mg', '0.125 mg', '0.75 mg'],
      dtype=object)

In [153]:
# Brand for the "Pramipexole Dihydrochloride TB24" rows, set by position.
# One .iloc write on the frame replaces the chained assignment, which raises
# SettingWithCopyWarning and is a no-op under pandas Copy-on-Write.
pd_data_drug_exposure.iloc[10449:10477, pd_data_drug_exposure.columns.get_loc('brand_name')] = 'mirapex'

In [192]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

                           generic_name brand_name dosage
3962     carbidopa-levodopa patient own        NaN    NaN
6475                         ropinirole  requip xl    NaN
6476                         ropinirole  requip xl    NaN
6477                         ropinirole  requip xl    NaN
6478                         ropinirole  requip xl    NaN
6479                         ropinirole  requip xl    NaN
6480                         ropinirole  requip xl    NaN
6481                         ropinirole  requip xl    NaN
6482                         ropinirole  requip xl    NaN
6658                         ropinirole  requip xl    NaN
6659                         ropinirole  requip xl    NaN
6660                         ropinirole  requip xl    NaN
6661                         ropinirole  requip xl    NaN
6662                         ropinirole  requip xl    NaN
6663                         ropinirole  requip xl    NaN
10454  pramipexole dihydrochloride tb24    mirapex    NaN
10455  pramipe

In [191]:
drug_exposure.iloc[10454:10460+1]['drug_source_value'].iloc[0]

'{"med_display_name": "Pramipexole Dihydrochloride TB24 1.5 tablet", "rxnorm_concat": "| 901550 |", "med_dose_unit_desc": "tablet", "mar_action": "GIVEN", "med_order_desc": "PRAMIPEXOLE DIHYDROCHLORIDE ER 1.5 MG PO TB24"}'

In [161]:
pd_data_drug_exposure[pd_data_drug_exposure['brand_name'] == 'mirapex']['dosage'].unique()

array(['1 mg', '0.5 mg', '1.5 mg', '0.25 mg', '0.125 mg', nan, '2.25 mg',
       '0.75 mg'], dtype=object)

Rxnorm 901550 is actually 24 HR pramipexole dihydrochloride 1.5 MG Extended Release Oral Tablet ...

In [193]:
# Positions 10454-10460 (inclusive) carry RxNorm 901550, the 1.5 mg extended-release
# tablet, so the strength is filled in by position. A single .iloc write on the
# frame is used because the chained form does nothing under Copy-on-Write.
pd_data_drug_exposure.iloc[10454:10460 + 1, pd_data_drug_exposure.columns.get_loc('dosage')] = '1.5 mg'

In [194]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

                        generic_name brand_name dosage
3962  carbidopa-levodopa patient own        NaN    NaN
6475                      ropinirole  requip xl    NaN
6476                      ropinirole  requip xl    NaN
6477                      ropinirole  requip xl    NaN
6478                      ropinirole  requip xl    NaN
6479                      ropinirole  requip xl    NaN
6480                      ropinirole  requip xl    NaN
6481                      ropinirole  requip xl    NaN
6482                      ropinirole  requip xl    NaN
6658                      ropinirole  requip xl    NaN
6659                      ropinirole  requip xl    NaN
6660                      ropinirole  requip xl    NaN
6661                      ropinirole  requip xl    NaN
6662                      ropinirole  requip xl    NaN
6663                      ropinirole  requip xl    NaN


## TODO: Find out missing dosages for ropinirole and LC

Using the original drug_exposure to fill in the blanks ...

In [158]:
drug_exposure['person_id'].iloc[10455]

222876

In [160]:
df_pd[df_pd['person_id'] == 222876]['note_text'].iloc[0]

'"[**NAME**], MD     [**DATE**]  9:18 AM [**LOCATION_INSTITUTE**]  Palliative and Supportive Care   Initial Consult Requested by: [**NAME**]* Reason for consult:Goals of Care and Symptom Management - Recommendations Palliative Diagnosis: metastatic squamous cell carcinoma HPI:  [**NAME**] is a 73 y.o.male male with metastatic squamous cell carcinoma with mets to right parotid gland s/p parotidectomy and radiation and recent POD with lymphangitic spread to lung and bilateral pleural effusions, presumed sarcoidosis, parkinsons disease, h/o TIA, T8 vertebral fracture, anemia who is admitted [**DATE**]  8:52 AM with new acute PE and worsening metastatic lung disease. Per chart review and discussion with patient, he initially presented from Oak Hammock for worsening chest pain, chills, and respiratory symptoms. CTA showed PE and CT chest revealed worsening lymphangitic spread in lungs with bilateral pleural effusions. Yesterday the patient became more hypoxic even while on non-rebreather. B

Both Requip XL and Duopa come in different strengths. We will skip them for now

### Creating the corpus from medical notes 

#### Using the note_text column for raw data

In [None]:
# corpus_raw = list(df_pd["note_text"])

#### Removing words that don't contain much meaning from our notes

In [None]:
# Substrings intended to be stripped from the notes. The removal loop below is
# currently commented out. Order is preserved from the original one-line list.
words_to_remove = [
    # Department / note-type headers and provider boilerplate
    "Department of Neurosurgery Date of Consult",
    "Department of Orthopedics Consultation Note Date of Consult",
    "Geriatric Medicine Consult Date of Consult",
    "INPATIENT MEDICAL NUTRITION THERAPY",
    "MSW",
    "RN",
    "evidence",
    "Read By",
    "images",
    "report",
    "concur",
    "findings",
    "agree",
    "seen",
    "residents",
    "resident",
    "Resident",
    "unspecified provider",
    "Released Date Time",
    "personally reviewed",
    "D.O",
    "MD",
    "M.D.",
    "Electronically Verified By",
    # De-identification placeholders and report header fields
    "NAME:",
    "[**NAME**]",
    "EXAM DATE:",
    "[**DATE**]",
    "LOC:",
    "[**LOCATION_INSTITUTE**]",
    "[**LOCATION_STREET**]",
    "[**LOCATION_ZIP**]",
    "[**LOCATION_CITY**]",
    "[**CONTACT_PHONE**]",
    "[**LOCATION_OTHER**]",
    "MRN:",
    "[**ID**]",
    "DOB:",
    "** VERIFIED **",
    "ORDERING MD:",
    "ORDER:",
    "ORD. SERVICE:",
    "ORD. LOC:",
    "TECH",
    "RMS# / INV#:",
]
# Disabled alternative: keep only the placeholder/header-field group above,
# i.e. words_to_remove[24:].

# corpus_clean = []
# for item in corpus_raw:
    # for word in words_to_remove:
        # item = item.replace(word, '')
    # corpus_clean.append(item)

#### Taking each word from the cleaned corpus and making it lowercase

In [None]:
# corpus = [word.lower() for word in corpus_clean]
# corpus[989]

#### Adding the pre-processed version of the notes to the DataFrame

In [None]:
# df_pd.loc[:, "note_text"] = corpus
# df_pd

Looking for occurrences of the terms "updrs" or "motor scale" in the database

In [None]:
# keywords = ['updrs', 'motor scale']
# indexes = []
# for i, item in enumerate(corpus):
    # for word in keywords:
        # if word in item:
            # indexes.append(i)    

In [None]:
# indexes

Filtering out those rows

In [None]:
# df_pd.iloc[indexes]

The UPDRS or a motor scale is mentioned for only 4 patients

In [None]:
# df_pd.iloc[indexes]['person_id'].unique()

Seeing info regarding rows where updrs and motor scale are mentioned

In [None]:
# pd.set_option('display.max_columns', None)  
# pd.set_option('display.max_rows', None)
# pd.set_option('max_colwidth', None)

In [None]:
# TODO: Come back here and read the whole thing
# df_pd.iloc[indexes]['note_text']

In [None]:
demographics.sample(5)

In [None]:
demographics_all.sample(5)

Calculating the age of the patients

In [None]:
# Final column layout shared by both demographics frames.
DEMOGRAPHIC_COLUMNS = ['person_id', 'age', 'gender_source_value', 'race_source_value', 'ethnicity_source_value']


def add_age_column(frame, reference):
    """Return a new frame with an `age` column (completed years) instead of `birth_datetime`.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain `birth_datetime` (parseable as '%Y-%m-%d') and the
        columns listed in DEMOGRAPHIC_COLUMNS other than `age`.
    reference : pd.Timestamp
        Date at which the age is evaluated.

    Returns
    -------
    pd.DataFrame
        Columns ordered as DEMOGRAPHIC_COLUMNS. `age` is downcast to the
        smallest integer dtype when no birth date is missing; rows with a
        missing birth date get NaN.

    Notes
    -----
    The previous implementation used `.astype('timedelta64[Y]')`, which
    pandas >= 2.0 rejects (only s/ms/us/ns resolutions are supported).
    Calendar arithmetic is used instead.
    """
    birth = pd.to_datetime(frame["birth_datetime"], format='%Y-%m-%d')
    # One year is subtracted while this year's birthday is still ahead of `reference`.
    birthday_pending = (birth.dt.month > reference.month) | (
        (birth.dt.month == reference.month) & (birth.dt.day > reference.day)
    )
    age = reference.year - birth.dt.year - birthday_pending.astype(int)
    age = pd.to_numeric(age, downcast='integer')
    # Selecting DEMOGRAPHIC_COLUMNS both reorders the frame and drops birth_datetime.
    return frame.assign(age=age)[DEMOGRAPHIC_COLUMNS]


# Use one reference date so both frames are aged consistently.
age_reference = pd.Timestamp('now')
demographics = add_age_column(demographics, age_reference)
demographics_all = add_age_column(demographics_all, age_reference)

demographics

In [None]:
demographics.sample(5)

In [None]:
demographics_all.sample(5)

In [None]:
df_pd

We merge df_pd with demographics on person_id. This is because, initially, df_pd only contains drug information, not patient information

In [None]:
# Patient columns first, then the drug fields, then the note text.
ordered_columns = [
    'person_id', 'age', 'gender_source_value', 'race_source_value', 'ethnicity_source_value',
    'drug_source_value', 'drug_info', 'generic_name', 'brand_name', 'dosage',
    'quantity', 'sig', 'route_source_value', 'dose_source_value', 'dose_unit_source_value',
    'note_text',
]

# Left join: every drug row is kept, with demographics attached where person_id matches.
df_pd = pd.merge(df_pd, demographics, how='left', on='person_id')
df_pd = df_pd.loc[:, ordered_columns]

### Some statistics

In [None]:
df_pd['age'].describe()

In [None]:
demographics_all['age'].describe()

In [None]:
pd_data.describe()

In [None]:
demographics.columns.values

## Plots and statistics

In [None]:
demographics_all[demographics_all['gender_source_value'] == 'UNKNOWN']

In [None]:
demographics_all['gender_source_value'].value_counts(normalize=True)

In [None]:
demographics['gender_source_value'].value_counts(normalize=True)

In [None]:
# Gender distribution (in %) for PD patients vs. all patients, as grouped bars.
gender_counts = demographics['gender_source_value'].value_counts(normalize=True) * 100
# Drop 'UNKNOWN' by label rather than by position ([:2]), so the result does not
# depend on how value_counts happens to order the categories. Percentages are
# still computed over all patients, as before.
gender_counts_all = (
    demographics_all['gender_source_value']
    .value_counts(normalize=True)
    .drop('UNKNOWN', errors='ignore')
    * 100
)

# Align both groups on the gender label; a gender absent from one group becomes 0.
combined_percentages = pd.DataFrame({
    "Parkinson's Patients": gender_counts,
    "All Patients": gender_counts_all
}).fillna(0)

fig, ax = plt.subplots(figsize=(10, 6))

colors = plt.cm.tab20.colors
bar_width = 0.35

# One x slot per gender; the second group is shifted right by one bar width.
x = range(len(combined_percentages.index))
parkinsons_bars = ax.bar(x, combined_percentages["Parkinson's Patients"], width=bar_width, label="Parkinson's Patients", color=colors[0])
all_patients_bars = ax.bar([i + bar_width for i in x], combined_percentages["All Patients"], width=bar_width, label="All Patients", color=colors[1])

ax.set_title("Distribution of Gender: Parkinson's Patients vs All Patients", fontsize=15, fontweight='bold')
ax.set_xlabel('Gender', fontsize=12)
ax.set_ylabel('Percentage', fontsize=12)
ax.legend(fontsize=12)

# Ticks sit between the two bars of each pair.
ax.set_xticks([i + bar_width / 2 for i in x])
ax.set_xticklabels(combined_percentages.index, fontsize=12)

# Write each percentage at the vertical centre of its own bar. The original
# placed the "All Patients" label at the height of the neighbouring PD bar.
for bars in (parkinsons_bars, all_patients_bars):
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height / 2, f"{height:.1f}%", ha='center', va='center', fontsize=12)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Absolute number of rows per gender value in the demographics table
demographics['gender_source_value'].value_counts(normalize=False) 

In [None]:
# Absolute number of rows per gender value among all patients
demographics_all['gender_source_value'].value_counts(normalize=False) 

In [None]:
# Horizontal bar chart of the race distribution (in %) in the demographics table
race_counts = demographics['race_source_value'].value_counts(normalize=True) * 100

# Create a DataFrame from the data
race_df = pd.DataFrame({'Percentage': race_counts})

# Sort the DataFrame by percentage in descending order
race_df = race_df.sort_values(by='Percentage', ascending=False)

# One colour per bar, assigned in the row order of race_df
ax = race_df['Percentage'].plot(kind='barh', figsize=(16, 6), color=plt.cm.tab20.colors)

# Set the title and labels
plt.title('Race Distribution', fontsize=15, fontweight='bold')
plt.xlabel('Percentage', fontsize=12)
plt.ylabel('Race', fontsize=12)

# Add labels to the right of each bar
for i, v in enumerate(race_df['Percentage']):
    plt.text(v + 1, i, f"{v:.1f}%", ha='left', va='center', fontsize=12.5)

# Hide the y-axis tick labels; the legend identifies the bars instead
plt.yticks([])
plt.xticks(fontsize=12)

# Build the legend from race_df.index (the frame that was actually plotted) so each
# colour is paired with the bar it was used for, even if sorting reordered tied rows
legend_patches = [mpatches.Patch(color=color, label=label) for color, label in zip(plt.cm.tab20.colors, race_df.index)]

# Show the custom legend
plt.legend(handles=legend_patches, loc='best')

plt.tight_layout()
plt.show()

In [None]:
# Number of rows per ethnicity value in the demographics table
demographics['ethnicity_source_value'].value_counts()

In [None]:
# Race percentages computed in the race-distribution cell
race_counts

In [None]:
# Distinct person_id counts in the three tables used below
total_patients_df, total_patients_df_pd, total_patients_demographics_all = (
    len(table['person_id'].unique()) for table in (df, df_pd, demographics_all)
)

# Express the df and df_pd patient counts as a percentage of all patients
percentage_pd_patients_df = (total_patients_df / total_patients_demographics_all) * 100
percentage_pd_patients = (total_patients_df_pd / total_patients_demographics_all) * 100

# Bar labels and their values, in plotting order
categories = ['PD Patients from\nWhole Cohort', 'PD Patients on\nMedication']
values = [percentage_pd_patients_df, percentage_pd_patients]

# Horizontal bar chart with one bar per category
plt.figure(figsize=(14, 6))
plt.barh(categories, values, color=['blue', 'green'])
plt.title('Percentage of PD Patients taking in-hospital medication vs. PD Patients overall (relative to total number of all patients)', fontsize=15, fontweight='bold')
plt.xlabel('Percentage of Patients', fontsize=12.5)
plt.ylabel('')
plt.xticks([])

# Write each percentage just to the right of its bar
for bar_position, percentage in enumerate(values):
    plt.text(percentage + 0.0005, bar_position, f"{percentage:.2f}%", ha='left', va='center', fontsize=12.5)

plt.tight_layout()
plt.show()

In [None]:
# Distinct patients in df as a percentage of distinct patients in demographics_all
percentage_pd_patients_df

In [None]:
# Bar chart of ethnicity counts; each bar is annotated with its share of the total
ethnicity = demographics['ethnicity_source_value'].value_counts()
labels = ethnicity.index
colors = plt.cm.tab20(np.arange(len(labels)))

plt.figure(figsize=(12, 8))  # Adjust the values as per your desired size
ax = ethnicity.plot(kind='bar', color=colors, edgecolor='black', linewidth=0.5)

# Share of each ethnicity relative to the total count
total_count = ethnicity.sum()
percentages = [(count / total_count) * 100 for count in ethnicity]
percentage_labels = [f"{percentage:.1f}%" for percentage in percentages]

# Centre the percentage text inside each bar
ax.bar_label(ax.containers[0], labels=percentage_labels, label_type='center', fontsize=12.5)

plt.title('Distribution of Ethnicity', fontsize=15, fontweight='bold')
plt.xlabel('Ethnicity')
plt.ylabel('Count')
plt.xticks(range(len(labels)), labels, rotation=45, ha='right', fontsize=12)
plt.tight_layout()
plt.show()

The approach actually used to deal with this:

In [None]:
# Box plots of age: demographics table vs. demographics_all
age_groups = {
    "Parkinson's Patients": demographics['age'],
    "All Patients": demographics_all['age'],
}

fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(list(age_groups.values()), labels=list(age_groups.keys()), patch_artist=True)

# Title and axis labels
ax.set_title("Distribution of Age: Parkinson's Patients and All Patients", fontsize=15, fontweight='bold')
ax.set_xlabel('Patient Group', fontsize=12)
ax.set_ylabel('Age', fontsize=12)

# Same tick-label size on both axes
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Ages of male patients in each table
is_male_pd = demographics['gender_source_value'] == 'MALE'
is_male_all = demographics_all['gender_source_value'] == 'MALE'
male_pd_patients_by_age = demographics[is_male_pd]['age']
male_pd_patients_by_age_all = demographics_all[is_male_all]['age']

# Box plots of the two male age distributions side by side
age_groups = {
    "Male Parkinson's Patients": male_pd_patients_by_age,
    "All Male Patients": male_pd_patients_by_age_all,
}
fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(list(age_groups.values()), labels=list(age_groups.keys()), patch_artist=True)

# Title and axis labels
ax.set_title("Distribution of Age: Male Parkinson's Patients and All Male Patients", fontsize=15, fontweight='bold')
ax.set_xlabel('Patient Group', fontsize=12)
ax.set_ylabel('Age', fontsize=12)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Ages of female patients in each table
is_female_pd = demographics['gender_source_value'] == 'FEMALE'
is_female_all = demographics_all['gender_source_value'] == 'FEMALE'
female_pd_patients_by_age = demographics[is_female_pd]['age']
female_pd_patients_by_age_all = demographics_all[is_female_all]['age']

# Box plots of the two female age distributions side by side
age_groups = {
    "Female Parkinson's Patients": female_pd_patients_by_age,
    "All Female Patients": female_pd_patients_by_age_all,
}
fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(list(age_groups.values()), labels=list(age_groups.keys()), patch_artist=True)

# Title and axis labels
ax.set_title("Distribution of Age: Female Parkinson's Patients and All Female Patients", fontsize=15, fontweight='bold')
ax.set_xlabel('Patient Group', fontsize=12)
ax.set_ylabel('Age', fontsize=12)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Box plots of age for male vs. female rows of the demographics table
# (uses the age Series built in the two preceding cells)
age_groups = {
    "Male Parkinson's Patients": male_pd_patients_by_age,
    "Female Parkinson's Patients": female_pd_patients_by_age,
}
fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(list(age_groups.values()), labels=list(age_groups.keys()), patch_artist=True)

# Title and axis labels
ax.set_title("Distribution of Age: Male and Female Parkinson's Patients", fontsize=15, fontweight='bold')
ax.set_xlabel('Patient Group', fontsize=12)
ax.set_ylabel('Age', fontsize=12)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Box plots of age for male vs. female rows of demographics_all
# (uses the age Series built in the male/female cells above)
age_groups = {
    "All Male Patients": male_pd_patients_by_age_all,
    "All Female Patients": female_pd_patients_by_age_all,
}
fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(list(age_groups.values()), labels=list(age_groups.keys()), patch_artist=True)

# Title and axis labels
ax.set_title("Distribution of Age: Male and Female Patients", fontsize=15, fontweight='bold')
ax.set_xlabel('Patient Group', fontsize=12)
ax.set_ylabel('Age', fontsize=12)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Relative frequency of each distinct value in pd_data
# (presumably a DataFrame, in which case this counts distinct rows — TODO confirm)
pd_data.value_counts(normalize=True)

In [None]:
# Relative frequency of each brand name in pd_data
pd_data['brand_name'].value_counts(normalize=True)

## Distribution of drugs

In [None]:
# Share (in %) of df_pd rows accounted for by each generic drug name
drug_distribution_count = df_pd['generic_name'].value_counts(normalize=True) * 100
drug_distribution = (
    pd.DataFrame({'Percentage': drug_distribution_count})
    .sort_values(by='Percentage', ascending=False)  # largest share first
)

# Horizontal bars, one colour per drug
ax = drug_distribution['Percentage'].plot(kind='barh', figsize=(18, 10), color=plt.cm.tab20.colors)

# Title and axis labels
plt.title("Distribution of medication (Generic name)", fontsize=15, fontweight='bold')
plt.xlabel('Percentage', fontsize=12.5)
plt.ylabel('')

# Write the percentage just to the right of each bar
for bar_position, share in enumerate(drug_distribution['Percentage']):
    plt.text(share + 0.0005, bar_position, f" {share:.2f}%", ha='left', va='center', fontsize=12.5)

# Label each bar on the y-axis with its drug name
plt.yticks(range(len(drug_distribution)), drug_distribution.index, fontsize=12.5)
plt.xticks(fontsize=12.5)

# Legend handles (currently unused: the legend call below is disabled)
legend_patches = [
    mpatches.Patch(color=patch_color, label=drug_name)
    for patch_color, drug_name in zip(plt.cm.tab20.colors, drug_distribution.index)
]
# plt.legend(handles=legend_patches, loc='best', fontsize=12.5)

plt.tight_layout()
plt.show()


In [None]:
# Sanity check: do pd_data and df_pd have the same number of rows?
len(pd_data) == len(df_pd)

In [None]:
# Inspect the row at (hardcoded) position 5187 of df_pd
df_pd.iloc[5187]

In [None]:
# Inspect the row at the same position in pd_data for comparison
pd_data.iloc[5187]

In [None]:
# Sanity check: both frames contain the same number of rows with brand_name 'duopa'
len(pd_data[pd_data['brand_name'] == 'duopa']) == len(df_pd[df_pd['brand_name'] == 'duopa'])

In [None]:
# value_counts(normalize=True): relative frequency of each unique brand name
medication_data = pd_data['brand_name'].value_counts(normalize=True)
# Quick horizontal bar chart of those frequencies
medication_data.plot(kind='barh')

In [None]:
# Number of distinct patients per brand name
df_pd.groupby('brand_name')['person_id'].nunique()

In [None]:
# Number of distinct brand names in df_pd
df_pd['brand_name'].nunique()

In [None]:
# Column names of df_pd
df_pd.columns

In [None]:
# Fraction of distinct df_pd patients in each race group
# (grouped by 'race_source_value', not by drug; the variable is overwritten
# with the per-drug fractions in the next cell)
drug_percentage = df_pd.groupby('race_source_value')['person_id'].nunique()/ df_pd['person_id'].nunique()
drug_percentage.sort_values(ascending=False)
# drug_percentage

In [None]:
# Percentage of distinct df_pd patients who have at least one row for each generic drug.
# A patient can appear under several drugs, so the percentages need not sum to 100.
drug_percentage = df_pd.groupby('generic_name')['person_id'].nunique()/ df_pd['person_id'].nunique()
drug_popularity_count = drug_percentage * 100
drug_popularity = pd.DataFrame({'Percentage': drug_popularity_count})

# Sort the DataFrame by percentage in descending order
drug_popularity = drug_popularity.sort_values(by='Percentage', ascending=False)

# Create a horizontal bar chart, one colour per drug
ax = drug_popularity['Percentage'].plot(kind='barh', figsize=(18, 10), color=plt.cm.tab20.colors)

# Set the title and labels (title typo "Parkison's" corrected)
plt.title("Proportion of Parkinson's patients receiving each drug (Generic name)", fontsize=15, fontweight='bold')
plt.xlabel('Percentage', fontsize=12.5)
plt.ylabel('')

# Add labels to the right of each bar
for i, v in enumerate(drug_popularity['Percentage']):
    plt.text(v + 0.0005, i, ' ' + f"{v:.1f}%", ha='left', va='center', fontsize=12.5)

# Label each bar on the y-axis with its drug name
plt.yticks(range(len(drug_popularity)), drug_popularity.index, fontsize=12.5)

plt.xticks(fontsize=12.5)

# Legend handles (currently unused: the legend call below is disabled)
legend_patches = [mpatches.Patch(color=color, label=label) for color, label in zip(plt.cm.tab20.colors, drug_popularity.index)]

# Show the custom legend
# plt.legend(handles=legend_patches, loc='best', fontsize=12.5)

plt.tight_layout()
plt.show()


In [None]:
# Percentage of distinct male patients in df_pd with at least one row for each generic drug

# Rows belonging to male patients
male_pd = df_pd[df_pd['gender_source_value'] == 'MALE']

# Distinct male patients per drug, divided by all distinct male patients
male_patients_per_drug = male_pd.groupby('generic_name')['person_id'].nunique()
male_drug_percentage = male_patients_per_drug / male_pd['person_id'].nunique()

# Convert to percent and order from most to least common
male_drug_popularity_count = male_drug_percentage * 100
male_drug_popularity = (
    pd.DataFrame({'Percentage': male_drug_popularity_count})
    .sort_values(by='Percentage', ascending=False)
)

# Horizontal bars, one colour per drug
ax = male_drug_popularity['Percentage'].plot(kind='barh', figsize=(18, 10), color=plt.cm.tab20.colors)

# Title and axis labels
plt.title("Proportion of Male Parkinson's patients receiving each drug (Generic name)", fontsize=15, fontweight='bold')
plt.xlabel('Percentage', fontsize=12.5)
plt.ylabel('')

# Write the percentage just to the right of each bar
for bar_position, share in enumerate(male_drug_popularity['Percentage']):
    plt.text(share + 0.0005, bar_position, f" {share:.1f}%", ha='left', va='center', fontsize=12.5)

# Label each bar on the y-axis with its drug name
plt.yticks(range(len(male_drug_popularity)), male_drug_popularity.index, fontsize=12.5)
plt.xticks(fontsize=12.5)

# Legend handles (currently unused: the legend call below is disabled)
legend_patches = [
    mpatches.Patch(color=patch_color, label=drug_name)
    for patch_color, drug_name in zip(plt.cm.tab20.colors, male_drug_popularity.index)
]
# plt.legend(handles=legend_patches, loc='best', fontsize=12.5)

plt.tight_layout()
plt.show()


In [None]:
# Percentage of distinct female patients in df_pd with at least one row for each generic drug

# Rows belonging to female patients
female_pd = df_pd[df_pd['gender_source_value'] == 'FEMALE']

# Distinct female patients per drug, divided by all distinct female patients
female_patients_per_drug = female_pd.groupby('generic_name')['person_id'].nunique()
female_drug_percentage = female_patients_per_drug / female_pd['person_id'].nunique()

# Convert to percent and order from most to least common
female_drug_popularity_count = female_drug_percentage * 100
female_drug_popularity = (
    pd.DataFrame({'Percentage': female_drug_popularity_count})
    .sort_values(by='Percentage', ascending=False)
)

# Horizontal bars, one colour per drug
ax = female_drug_popularity['Percentage'].plot(kind='barh', figsize=(18, 10), color=plt.cm.tab20.colors)

# Title and axis labels
plt.title("Proportion of Female Parkinson's patients receiving each drug (Generic name)", fontsize=15, fontweight='bold')
plt.xlabel('Percentage', fontsize=12.5)
plt.ylabel('')

# Write the percentage just to the right of each bar
for bar_position, share in enumerate(female_drug_popularity['Percentage']):
    plt.text(share + 0.0005, bar_position, f" {share:.1f}%", ha='left', va='center', fontsize=12.5)

# Label each bar on the y-axis with its drug name
plt.yticks(range(len(female_drug_popularity)), female_drug_popularity.index, fontsize=12.5)
plt.xticks(fontsize=12.5)

# Legend handles (currently unused: the legend call below is disabled)
legend_patches = [
    mpatches.Patch(color=patch_color, label=drug_name)
    for patch_color, drug_name in zip(plt.cm.tab20.colors, female_drug_popularity.index)
]
# plt.legend(handles=legend_patches, loc='best', fontsize=12.5)

plt.tight_layout()
plt.show()

In [None]:
# Per-drug percentages for female patients, as computed in the previous cell
female_drug_popularity['Percentage']

In [None]:
# Grouped horizontal bar chart comparing, per generic drug, the percentage of male
# and of female patients receiving it. Uses male_drug_popularity and
# female_drug_popularity: single-column ('Percentage') DataFrames indexed by drug name.

# Align both genders on the drug name; a drug present for only one gender would
# otherwise be NaN and show up as a "nan%" label, so treat it as 0%
combined_data = pd.concat([male_drug_popularity, female_drug_popularity], axis=1)
combined_data.columns = ['Males', 'Females']
combined_data = combined_data.fillna(0)

# Sort the medications based on the sum of percentages (sorting from higher to lower)
sorted_combined = combined_data.sum(axis=1).sort_values(ascending=False)
male_values = combined_data.loc[sorted_combined.index, 'Males']
female_values = combined_data.loc[sorted_combined.index, 'Females']

# Plotting the data as a horizontal bar plot
plt.figure(figsize=(16, 8))  # Adjust the figure size as needed

# Thickness of each bar; the male and female bars of one drug are stacked next to each other
width = 0.4

# Bar plot for males
ind_male = range(len(sorted_combined))
bars_male = plt.barh(ind_male, male_values, height=width, label='Males', color='tab:gray')

# Bar plot for females (offset by one bar thickness)
ind_female = [i + width for i in ind_male]
bars_female = plt.barh(ind_female, female_values, height=width, label='Females', color='tab:pink')

# Adding percentages at the end of the bars
for i, val in enumerate(male_values):
    plt.text(val, i, f'{val:.2f}%', va='center', fontsize=10.5, color='black')

for i, val in enumerate(female_values):
    plt.text(val, i + width, f'{val:.2f}%', va='center', fontsize=10.5, color='black')

# Adding labels, titles, and customizing the plot
plt.xlabel('Percentage')
plt.ylabel('Medication')
plt.title('Prescribed medication for Male vs. Female Parkinson\'s patients', fontsize=15, fontweight='bold')
plt.yticks([i + width / 2 for i in ind_male], sorted_combined.index, fontsize=10)
plt.gca().invert_yaxis()  # most common drug at the top
plt.legend(loc='lower right')  # lower right is where the bars are shortest
plt.tight_layout()
plt.show()


In [None]:
# Share (in %) of df_pd rows per generic drug name
df_pd['generic_name'].value_counts(normalize=True) * 100

In [None]:
# Current value of drug_percentage (last assigned in the per-generic-name cell above)
drug_percentage

## Calculating LED (Levodopa Equivalent Dose) for Parkinson's patients

In [None]:
# Unique values of dose_source_value, sorted numerically and then formatted to
# 4 significant digits. Sorting happens before formatting so the order is numeric
# rather than the lexicographic order of the strings (e.g. "10.0" before "2.0").
formatted_series = np.sort(df_pd['dose_source_value'].unique())
format_func = np.vectorize(lambda x: '{:.4}'.format(x))
bla = format_func(formatted_series)
bla

In [None]:
# Working frame for the LED calculation: patient id plus the medication columns of
# df_pd, leaving out the demographic columns and note_text. The columns are named
# explicitly rather than sliced by position (iloc[:, 5:-1]) so the selection does not
# silently change if df_pd's column order changes.
led_columns = [
    'person_id', 'drug_source_value', 'drug_info', 'generic_name', 'brand_name',
    'dosage', 'quantity', 'sig', 'route_source_value', 'dose_source_value',
    'dose_unit_source_value',
]
led_df = df_pd[led_columns].copy()

In [None]:
# Display the drug_exposure table loaded at the top of the notebook
drug_exposure

In [None]:
# Display led_df as built above
led_df

In [None]:
# Remove columns from led_df; errors='ignore' makes the cell safe to re-run
# (without it, a second run raises KeyError because the columns are already gone).
# NOTE(review): later cells read drug_source_value, drug_info, quantity and sig from
# led_df, so those presumably come back via the merge with drug_exposure — confirm.
led_df = led_df.drop(columns=['drug_source_value', 'drug_info', 'quantity', 'sig'], errors='ignore')

The duplicates are caused by the note_text column: when a patient has two notes, all the other values are repeated once per note. That information is not needed to calculate the LED.

In [None]:
# All df_pd rows for patient 261006 (example used to inspect duplicates)
df_pd[df_pd['person_id'] == 261006]

In [None]:
# Display led_df after dropping columns
led_df

In [None]:
# Boolean mask: True for rows whose generic_name already appeared in an earlier row
led_df.duplicated(subset=['generic_name'])

In [None]:
# Rows of led_df whose generic_name already appeared in an earlier row
led_df[led_df.duplicated(subset=['generic_name'])]

In [None]:
# Rows that are the first occurrence of their (person_id, start datetime) pair
drug_exposure[~drug_exposure.duplicated(subset=['person_id', 'drug_exposure_start_datetime'])]

In [None]:
# First 30 entries of the mask marking every row whose (person_id, start datetime)
# pair occurs more than once (keep=False flags all occurrences)
drug_exposure.duplicated(subset=['person_id', 'drug_exposure_start_datetime'], keep=False).head(30)

In [None]:
# Group drug_exposure by patient and exposure start time
# NOTE(review): `lunch` is not used anywhere in the visible part of the notebook
lunch = drug_exposure.groupby(['person_id', 'drug_exposure_start_datetime'])

In [None]:
# Every row whose drug_source_value occurs more than once in drug_exposure
drug_exposure[drug_exposure.duplicated(subset=['drug_source_value'], keep=False)]

In [None]:
# Mask of rows whose start datetime is shared with at least one other row
dumb = drug_exposure.duplicated(
    subset=['drug_exposure_start_datetime'],
    keep=False,
)
# Print the flagged rows
flagged_rows = drug_exposure[dumb]
print(flagged_rows)

The original length is that of df_pd. The new length corresponds to the number of unique person_id values.

In [None]:
# Row count of led_df and the generic names it contains, next to those of df_pd.
# The per-patient de-duplication below is disabled, so no "after" length is printed.
print('led_df Len before ' + str(len(led_df)))
print(led_df['generic_name'].unique())
# led_df = led_df.drop_duplicates(subset=['person_id'])
# print('led_df Len after ' + str(len(led_df)))
print(df_pd['generic_name'].unique())

In [None]:
# led_df rows for patient 261006 before the merge with drug_exposure
led_df[led_df['person_id'] == 261006]

After that, we remove further duplicates from drug_exposure, i.e. rows that share both person_id and drug_exposure_start_datetime. We then merge the two datasets.

In [None]:
# Keep one row per (person_id, start datetime) pair and report the row count
# before and after
dedup_key = ['person_id', 'drug_exposure_start_datetime']
print(f"drug_exposure Len before {len(drug_exposure)}")
drug_exposure = drug_exposure.drop_duplicates(subset=dedup_key)
print(f"drug_exposure Len after {len(drug_exposure)}")

In [None]:
# Inner join of led_df with drug_exposure on person_id only.
# NOTE(review): with person_id as the sole key, every led_df row of a patient is
# paired with every drug_exposure row of that patient (many-to-many), so exposures
# are not matched to a specific drug; confirm whether a drug-level key is needed.
led_df = pd.merge(led_df, drug_exposure, on=['person_id'], how='inner')

In [None]:
# led_df rows for patient 261006 after the merge with drug_exposure
led_df[led_df['person_id'] == 261006]

In [None]:
# Keep one row per exposure event. An event is identified by patient AND start time
# (the same key used to de-duplicate drug_exposure); de-duplicating on the timestamp
# alone would also discard rows of different patients that share a timestamp.
led_df = led_df.drop_duplicates(subset=['person_id', 'drug_exposure_start_datetime'])

In [None]:
# Display led_df after merging and de-duplicating
led_df

In [None]:
# Number of rows in drug_exposure
len(drug_exposure)

In [None]:
# Number of rows in led_df, for comparison with drug_exposure
len(led_df)

In [None]:
# Distinct quantity values recorded on rows whose dose_source_value is 0
zero_dose_rows = led_df.loc[
    led_df['dose_source_value'] == 0.0,
    ['person_id', 'drug_info', 'quantity'],
]
zero_dose_rows['quantity'].unique()

In [None]:
# drug_source_value of one zero-dose row of df_pd.
# Note: [14] is a lookup by index label, not by position; it raises KeyError if
# no zero-dose row carries the label 14.
df_pd[df_pd['dose_source_value'] == 0.0]['drug_source_value'][14]

In [None]:
# Inspect the row at (hardcoded) position 38 of drug_exposure
drug_exposure.iloc[38]

In [None]:
# Distinct generic drug names remaining in led_df
led_df['generic_name'].unique()

In [None]:
# drug_source_value of every led_df row whose dose_source_value is 0
led_df[led_df['dose_source_value'] == 0.0]['drug_source_value']

In [None]:
# Parse the JSON string in the first row's drug_source_value and read its 'mar_action' field
json.loads(led_df['drug_source_value'].iloc[0])['mar_action']

This is correct and it works.

In [None]:
# For every led_df row with a zero dose, parse its drug_source_value JSON and print
# the row's position, patient id, 'mar_action' and the raw JSON.
# person_id and the raw JSON are read from the same filtered rows that `dl` is built
# from; indexing the unfiltered led_df by position (as before) printed unrelated rows.
zero_dose_rows = led_df[led_df['dose_source_value'] == 0.0]
dl = [json.loads(x) for x in zero_dose_rows['drug_source_value']]
for index, (person_id, raw_value, parsed) in enumerate(
    zip(zero_dose_rows['person_id'], zero_dose_rows['drug_source_value'], dl)
):
    print(index, person_id, parsed['mar_action'], raw_value)
len(dl)

In [None]:
# Type returned when selecting a single column by name after boolean filtering
type(led_df[led_df['dose_source_value'] == 0.0]['drug_source_value'])

In [None]:
# Type returned when selecting a one-element list of columns with .loc
type(led_df.loc[led_df['dose_source_value'] == 0.0, ['drug_source_value']])

In [None]:
# Parse the drug_source_value JSON of every row with quantity 0 and print its 'mar_action'
zero_quantity_values = led_df[led_df['quantity'] == 0.0]['drug_source_value']
bla = list(map(json.loads, zero_quantity_values))
for parsed_entry in bla:
    print(parsed_entry['mar_action'])

We lose 18.9% of patients because their medication was held.

In [None]:
# Number of zero-dose entries (dl) as a percentage of the rows in df_pd.
# NOTE(review): dl is built from led_df while the denominator is len(df_pd), and both
# count rows rather than patients — confirm this is the intended ratio.
print(len(dl)*100/len(df_pd))

In [None]:
# Distinct dosage strings present in led_df
led_df['dosage'].unique()

In [None]:
def _parse_led_dose(dosage_text):
    """Extract the numeric dose from a dosage string.

    For a hyphenated dosage ("A-B unit") the number after the first hyphen is
    returned; otherwise the first whitespace-separated token is returned.
    Non-string input (e.g. a missing value) yields NaN instead of raising.

    Raises:
        ValueError: if the selected token is not a valid number.
    """
    if not isinstance(dosage_text, str):
        return float('nan')
    if '-' in dosage_text:
        # presumably combination products such as "25-100 mg"; the second
        # component is used — TODO confirm it is the levodopa part
        return float(dosage_text.split('-')[1].split()[0])
    return float(dosage_text.split()[0])


# Numeric dose for every row of led_df, in row order
led_df_dosage = led_df['dosage'].to_list()
led_dosage = [_parse_led_dose(item) for item in led_df_dosage]

In [None]:
# Number of parsed doses (should equal the number of rows in led_df)
len(led_dosage)

In [None]:
# Distinct dose_source_value entries in led_df
led_df['dose_source_value'].unique()

In [None]:
# Add the parsed doses as column "dose_for_led". DataFrame.insert raises ValueError
# if the column already exists, so on a re-run the existing column is overwritten
# instead of inserted again.
if "dose_for_led" in led_df.columns:
    led_df["dose_for_led"] = led_dosage
else:
    led_df.insert(loc=6, column="dose_for_led", value=led_dosage)

In [None]:
# Restrict led_df to the columns needed downstream, in a fixed order
led_column_order = [
    'person_id',
    'drug_exposure_start_datetime',
    'drug_source_value',
    'drug_info',
    'generic_name',
    'brand_name',
    'dosage',
    'dose_for_led',
    'quantity',
    'sig',
    'route_source_value',
    'dose_source_value',
    'dose_unit_source_value',
    'visit_occurrence_id',
    'visit_detail_id',
]
led_df = led_df[led_column_order]

In [None]:
# Dose units of the rows where the parsed dose equals the recorded dose_source_value
led_df[led_df['dose_for_led'] == led_df['dose_source_value']]['dose_unit_source_value'].unique()

In [None]:
# Display led_df with the reordered columns
led_df

Why did I lose so many drugs?

In [None]:
# Generic drug names still present in led_df
led_df['generic_name'].unique()

In [None]:
# Generic drug names present in pd_data, for comparison with led_df
pd_data['generic_name'].unique()

In [None]:
# Levodopa equivalent dose (LED) per administration and summed per patient.
# pandas is already imported as `pd` in the imports cell at the top of the notebook.

# Placeholder conversion factors (multiplier applied to the dose).
# NOTE(review): these are sample values only; replace them with factors from a
# published conversion table (e.g. Tomlinson et al., 2010) before using the results.
conversion_factors = {
    'levodopa': 1.0,
    'pramipexole': 0.25,
    'ropinirole': 0.5,
    # Add more drugs and their conversion factors
}

# Parse the exposure start time into datetime format
led_df = led_df.assign(
    drug_exposure_start_datetime=pd.to_datetime(led_df['drug_exposure_start_datetime'])
)

# Equivalent levodopa dose for each administration: parsed dose ('dose_for_led') times
# the factor looked up by 'generic_name'. These are the columns led_df actually has;
# the previous 'dosage_quantity' / 'drug_name' do not exist in it.
# Drugs without an entry in conversion_factors map to NaN.
# NOTE(review): the dictionary keys must match the generic_name values exactly — confirm.
led_df = led_df.assign(
    levodopa_equivalent_dose=led_df['dose_for_led'] * led_df['generic_name'].map(conversion_factors)
)

# Total LED per patient over ALL recorded administrations (no time window is applied).
# NaN doses are skipped by sum(), so unmapped drugs contribute 0.
led_by_person = led_df.groupby('person_id')['levodopa_equivalent_dose'].sum()

print(led_by_person)
