# Large Language Models can better predict optimal medication change in Parkinson's Disease through Medical notes

## Introduction

TODO

### Importing packages

In [None]:
import numpy as np
import pandas as pd
import json
import re
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as mpatches
import matplotlib.dates as mdates

## Data Pre-processing

### Exploring the data

In [None]:
# Location of the large CSV that holds the notes and drug information
path = "data/HEHE.csv" 
available_memory = 423464092  # Memory in bytes
memory_per_row = 100  # Example memory usage per row in bytes (rough estimate, not measured)
# Budget 75% of the available memory for a single chunk
target_memory_usage = 0.75 * available_memory
# Number of rows per chunk that fits in that budget
chunk_size = int(target_memory_usage / memory_per_row)
chunk_size

Original df that contains notes info

In [None]:
# Stream the CSV in pieces of `chunk_size` rows and stitch them back together.
# 'dose_unit_source_value' is forced to str: without an explicit dtype pandas
# raises "DtypeWarning: Columns (11) have mixed types".
chunk = pd.read_csv(
    path,
    chunksize=chunk_size,
    dtype={'dose_unit_source_value': str},
)
df = pd.concat(chunk)

DataFrame with drug_exposure info

In [None]:
drug_exposure = pd.read_csv('data/TRY.csv')

Loading demographic info

In [None]:
demographics = pd.read_csv('data/DEMOGRAPHICS.csv') # Demographic information for PD patients only
demographics_all = pd.read_csv('data/DEMOGRAPHICS_ALL.csv') # Demographics info for all the patients

In [None]:
df.sample(5)

Calculating patient age

In [None]:
def _add_age_column(frame: pd.DataFrame, today: pd.Timestamp) -> pd.DataFrame:
    """Return a copy of ``frame`` where ``birth_datetime`` is replaced by ``age``.

    Age is the number of completed calendar years between ``birth_datetime``
    and ``today``. It is computed from the year/month/day components instead of
    ``astype('timedelta64[Y]')``, which pandas 2.x no longer accepts (only the
    s/ms/us/ns units are supported for timedelta casts).

    Parameters
    ----------
    frame : DataFrame with at least ``person_id``, ``birth_datetime``,
        ``gender_source_value``, ``race_source_value`` and
        ``ethnicity_source_value`` columns.
    today : reference date the age is measured against.

    Returns
    -------
    DataFrame with columns ``person_id, age, gender_source_value,
    race_source_value, ethnicity_source_value`` (in that order).
    """
    birth = pd.to_datetime(frame["birth_datetime"], format='%Y-%m-%d')

    # True when this year's birthday has not happened yet on `today`
    birthday_pending = (birth.dt.month > today.month) | (
        (birth.dt.month == today.month) & (birth.dt.day > today.day)
    )
    age = today.year - birth.dt.year - birthday_pending.astype(int)

    # birth_datetime is no longer needed once the age is known
    aged = frame.drop("birth_datetime", axis=1)
    # Downcast to the smallest integer dtype (stays float if a birth date is missing)
    aged["age"] = pd.to_numeric(age, downcast='integer')

    return aged[['person_id', 'age', 'gender_source_value', 'race_source_value', 'ethnicity_source_value']]


# One shared reference date so both frames are aged against the same instant.
# NOTE: ages depend on the day the notebook is run.
age_reference_date = pd.Timestamp('now')

demographics = _add_age_column(demographics, age_reference_date)
demographics_all = _add_age_column(demographics_all, age_reference_date)

demographics

Actual len of the whole dataset. It has more than 2 Million rows

In [None]:
len(df)

#### Number of patients in this cohort

In [None]:
df['person_id'].nunique()

#### Dealing with the drug_source_value column

Because the drug_source_value column contains elements as json, we convert them into a dict so that we can use their original keys:

In [None]:
# Parse every JSON string in the column into a Python dict
drug_source_value = df['drug_source_value'].apply(json.loads)

We do the same for the `drug_source_value` column in the drug_exposure database

In [None]:
# Parse every JSON string of the drug_exposure table into a Python dict
dsv_drug_exposure = drug_exposure['drug_source_value'].apply(json.loads)

dsv_drug_exposure has already been filtered through SQL and contains Parkinson's drugs

In [None]:
dsv_drug_exposure

drug_source_value has the same length as the original df. Each position in drug_source_value corresponds to the row at the same position in the original df.

In [None]:
drug_source_value[12] == json.loads(df.iloc[12]['drug_source_value'])

The keys of the dictionary are those used in the drug_source_value column

In [None]:
drug_source_value[0].keys()

In [None]:
drug_source_value[12]

Current medication used for the treatment of Parkinson's disease

In [None]:
# Generic names (lowercase) of the drugs used to treat Parkinson's disease
pd_medication = [
    "carbidopa",
    "levodopa",
    "entacapone",
    "tolcapone",
    "opicapone",
    "pramipexole",
    "ropinirole",
    "apomorphine",
    "rotigotine",
    "selegiline",
    "rasagiline",
    "safinamide",
    "amantadine",
    "istradefylline",
    "trihexyphenidyl",
    "benztropine",
    "bromocriptine",
    "cabergoline",
    "pergolide",
    "lisuride",
    "benserazide",
]

We look for instances of the Parkinson's drugs contained in the pd_medication array in drug_source_value so that we can focus on the medical notes relating to Parkinson's.

In [None]:
# Map row position -> med_display_name for every record whose display name
# contains one of the Parkinson's drug names in pd_medication.
# Each element of drug_source_value is the dict produced by json.loads,
# which is why it can be indexed by key.
pd_drug_info = {}
for i, drug in enumerate(drug_source_value):
    display_name = drug['med_display_name']
    if any(item in display_name for item in pd_medication):
        pd_drug_info[i] = display_name  # the whole dict could be stored instead

Something similar (extracting the med_display_name and using that instead of the whole json)

In [None]:
# Row position -> med_display_name for every drug_exposure record
# (the whole dict could be stored instead of just the display name)
drug_info_drug_source_value = {
    i: drug['med_display_name'] for i, drug in enumerate(dsv_drug_exposure)
}

In [None]:
pd_drug_info

In [None]:
drug_info_drug_source_value

Only 16K+ rows have information regarding medication for Parkinson's

In [None]:
len(pd_drug_info)

The keys of this new dictionary are the row positions in the original DataFrame. These rows contain information regarding Parkinson's medication.

In [None]:
pd_drug_info.keys()

pd_drug_info is a subset of drug_source_value that contains the name of drugs related to PD

In [None]:
pd_drug_info[1299166]

In [None]:
drug_source_value[1299166]

Here's our new database. df_pd is a subset of the original dataset that only contains patients that are taking medication for Parkinson's

In [None]:
# Keep only the rows that mention a Parkinson's drug. The explicit .copy()
# makes df_pd an independent frame: it is mutated further down (column inserts,
# note_text overwrite) and must not be a view/slice of df, which would trigger
# SettingWithCopyWarning or silently fail to update.
df_pd = df.iloc[list(pd_drug_info.keys())].copy()
df_pd.sample(10)

Inserting the column drug_info into the pd dataset

In [None]:
# df_pd was built from the keys of pd_drug_info, so its rows line up one-to-one
# (positionally) with pd_drug_info's values. The previous version inserted
# drug_info_drug_source_value, which describes the separate drug_exposure table.
df_pd.insert(loc=2, column="drug_info", value=list(pd_drug_info.values()))

In [None]:
df_pd.sample(5)

In [None]:
df_pd.iloc[955]

Let's take a look at the drugs actually used in the dataset. **Note**: VS Code truncates the output, so you may not see all the drugs, e.g. Cabergoline (1299166).

In [None]:
pd_drug_info

In [None]:
# Lowercased display names, in the same order as pd_drug_info
pd_drugs_used = []
for display_name in pd_drug_info.values():
    pd_drugs_used.append(display_name.lower())

# Distinct names, alphabetically sorted
pd_drugs_used_unique = sorted(set(pd_drugs_used))

We do the same for the drugs in dsv_drug_exposure

In [None]:
# Lowercased display names from drug_exposure, plus their sorted distinct values
drugs_used_drug_exposure = list(
    map(lambda display_name: display_name.lower(), drug_info_drug_source_value.values())
)
drugs_used_drug_exposure_unique = sorted(set(drugs_used_drug_exposure))

In [None]:
drugs_used_drug_exposure_unique

**Note:** 1299166 is **NOT** the position in the list but rather the dictionary key of that element. The length of both structures is 16040.

In [None]:
pd_drug_info[1299166]

The position in the list that contains item with the key 1299166

In [None]:
list(pd_drug_info.keys()).index(1299166)

In [None]:
pd_drugs_used

In [None]:
pd_drugs_used_unique

In [None]:
drugs_used_drug_exposure_unique

Using a ***Regex*** to segment from the pd_drugs_used array the drug_name, generic_name and dosage

In [None]:
# Regex notes: "?" means 0 or 1 occurrences, "+" means 1 or more.
# drug_names_pattern captures the generic name (word characters, spaces or hyphens)
# and, optionally, a brand name written in parentheses right after it.
drug_names_pattern = r"([\w\s-]+)\s(?:\(([\w\s-]+)\)\s*)?" 
# dosage_pattern matches a strength such as "25-100 mg", "0.5 mg", "4.63-20 mg/ml" or "2 mg/hr"
dosage_pattern = r"\d+(?:\.\d+)?(?:-\d+(?:\.\d+)?)*(?:\s*(?:mg/ml|mg|ml))(?:/hr)?"


def _parse_drug_string(text):
    """Split one lowercase drug description into its parts.

    Returns a ``(generic_name, brand_name, dosage)`` tuple, using ``np.nan`` for
    any part that is absent, or ``None`` when the name pattern does not match
    at all.
    """
    name_match = re.search(drug_names_pattern, text)
    if name_match is None:
        return None

    generic, brand = name_match.groups()
    dosage_match = re.search(dosage_pattern, text)
    return (
        generic,
        brand if brand else np.nan,
        dosage_match.group(0) if dosage_match else np.nan,
    )


# One entry per string in pd_drugs_used, so positions keep matching df_pd rows
generic_names = []
brand_names = []
dosages = []

for string in pd_drugs_used:
    parsed = _parse_drug_string(string)
    if parsed is None:
        # Unparseable string: report it and pad ALL three lists with NaN.
        # Padding only generic_names (as before) leaves the lists with different
        # lengths and makes the DataFrame constructor below raise ValueError.
        print(string)
        parsed = (np.nan, np.nan, np.nan)

    generic_name, brand_name, dosage = parsed
    generic_names.append(generic_name)
    brand_names.append(brand_name)
    dosages.append(dosage)


# Create DataFrame
pd_data = pd.DataFrame({
    "generic_name": generic_names,
    "brand_name": brand_names,
    "dosage": dosages
})

*Regex* for drug_exposure

In [None]:
# Regex notes: "?" means 0 or 1 occurrences, "+" means 1 or more.
# drug_names_pattern captures the generic name (word characters, spaces or hyphens)
# and, optionally, a brand name written in parentheses right after it.
drug_names_pattern = r"([\w\s-]+)\s(?:\(([\w\s-]+)\)\s*)?" 
# dosage_pattern matches a strength such as "25-100 mg", "0.5 mg", "4.63-20 mg/ml" or "2 mg/hr"
dosage_pattern = r"\d+(?:\.\d+)?(?:-\d+(?:\.\d+)?)*(?:\s*(?:mg/ml|mg|ml))(?:/hr)?"

# One entry per string in drugs_used_drug_exposure, so positions keep matching
# the rows of drug_exposure
generic_names_de = []
brand_names_de = []
dosages_de = []

# Extract information and populate the lists
for string in drugs_used_drug_exposure:
    drug_name_match = re.search(drug_names_pattern, string)

    if drug_name_match is None:
        # Unparseable string: report it and pad ALL three lists with NaN.
        # Padding only generic_names_de (as before) leaves the lists with
        # different lengths and makes the DataFrame constructor raise ValueError.
        print(string)
        generic_names_de.append(np.nan)
        brand_names_de.append(np.nan)
        dosages_de.append(np.nan)
        continue

    generic_name, brand_name = drug_name_match.groups()
    generic_names_de.append(generic_name)
    # The brand-name group is optional; store NaN when it is missing
    brand_names_de.append(brand_name if brand_name else np.nan)

    dosage_match = re.search(dosage_pattern, string)
    dosages_de.append(dosage_match.group(0) if dosage_match else np.nan)


# Create DataFrame
pd_data_drug_exposure = pd.DataFrame({
    "generic_name": generic_names_de,
    "brand_name": brand_names_de,
    "dosage": dosages_de
})

In [None]:
pd_data.sample(20)

In [None]:
pd_data[pd_data['generic_name'] == 'benserazide']

In [None]:
pd_data_drug_exposure.sample(20)

Error or inconsistency. I might look into this later

In [None]:
# There's an inconsistency with this. The RxCUI code associated with it (885205) actually shows 1 mg not 0.5. See: https://mor.nlm.nih.gov/RxNav/search?searchBy=RXCUI&searchTerm=885205
df_pd['drug_source_value'].iloc[3013]

In [None]:
pd_data.iloc[3013]

In [None]:
# Notice that for selegiline the generic_name still contains the dosage and form of administration. This is because, for selegiline, the name string doesn't contain the brand name   
pd_data.iloc[13628]

#### Dealing with missing data

In [None]:
# Taking a look at the rows where there are missing values 
print(pd_data[pd_data.isnull().any(axis=1)].to_string())

Using the pd_drugs_used list to fill in the missing values, we realize that the brand name for the drug is actually duopa

In [None]:
pd_drugs_used[5187:5199]

In [None]:
# Duopa is a carbidopa-levodopa intestinal gel used for PD
pd_drugs_used[7346:7390]

In [None]:
# Regex couldn't catch the brand name because it was in a different position, so we add it manually
pd_data.loc[pd_data.index[5187:5199], 'brand_name'] = 'duopa'
# Patient is taking LCIG (levodopa-carbidopa intestinal gel); most likely it is duopa (this was an assumption, later confirmed)
# TODO: Maybe replace this with Dopamine Replacement Therapy (Listen again to Dr. K's audio)
pd_data.loc[pd_data.index[7346:7390], 'brand_name'] = 'duopa'
# pd_data.loc[pd_data.index[7712:7725], 'brand_name'] = 'duopa' # drug_exposure file

In [None]:
print(pd_data[pd_data.isnull().any(axis=1)].to_string())

Looking for evidence of other brand names for Selegiline in our database

In [None]:
# Rows whose brand name is one of the known selegiline brands
selegiline_brands = ['eldepryl', 'zelapar']
selegiline = pd_data[pd_data['brand_name'].isin(selegiline_brands)]
selegiline['brand_name'].unique()

In [None]:
print(selegiline.to_string())

Making sure the word "tablet" isn't present in the drug's generic name


In [None]:
# generic_names holds the names parsed from pd_drugs_used (the notes dataset)
for index, name in enumerate(generic_names):
    # Entries that could not be parsed are NaN (a float); `'tablet' in nan`
    # would raise TypeError, so only inspect real strings
    if isinstance(name, str) and 'tablet' in name:
        print(name, index)

Blindly assuming that the medication taken by the patient was in fact eldepryl. TODO: Listen again to the audio and check for DRT (Dopamine Replacement Therapy)

In [None]:
# Rows 13628-13639 read "selegiline tablet 5 mg" with no brand: assume eldepryl
# and shorten the generic name to plain "selegiline"
selegiline_rows = pd_data.index[13628:13640]
pd_data.loc[selegiline_rows, 'brand_name'] = 'eldepryl'
pd_data.loc[selegiline_rows, 'generic_name'] = 'selegiline'

TODO: For "carbidopa-levodopa patient own med" we aren't sure about the medication (sinemet, sinemet cr, etc) let's look at the rxnorm codes to see if they are informative

In [None]:
print(pd_data[pd_data.isnull().any(axis=1)].to_string())

In [None]:
missing_info_index = pd_data[pd_data.isnull().any(axis=1)].index

In [None]:
# Print the raw drug_source_value of every row that still has missing fields
for row_position in missing_info_index:
    print(df_pd.iloc[row_position]['drug_source_value'])

A quick look into rxnorm codes 197444 and 1599846 reveals that the medication is actually sinemet (25-100 MG carbidopa-levodopa) and duopa (4.63-20 mg/ml carbidopa-levodopa). Let's update this information accordingly

In [None]:
# Let's see how the past rows containing sinemet look like so that the information is similar
pd_data[pd_data['brand_name'] == 'sinemet']

In [None]:
missing_info_index[:3]

In [None]:
# The first three incomplete rows are sinemet (RxNorm 197444)
sinemet_rows = missing_info_index[:3]
sinemet_values = {
    'generic_name': 'carbidopa-levodopa',
    'brand_name': 'sinemet',
    'dosage': '25-100 mg',
}
for column, value in sinemet_values.items():
    pd_data.loc[sinemet_rows, column] = value

In [None]:
pd_data.loc[missing_info_index[:3]]

Taking a look at how the rows containing duopa look like

In [None]:
# Let's see how the past rows containing duopa look like so that the information is similar
pd_data[pd_data['brand_name'] == 'duopa']

Filling in the values for duopa

In [None]:
# The remaining incomplete rows are duopa (RxNorm 1599846)
duopa_rows = missing_info_index[3:]
duopa_values = {
    'generic_name': 'carbidopa-levodopa',
    'brand_name': 'duopa',
    'dosage': '4.63-20 mg/ml',
}
for column, value in duopa_values.items():
    pd_data.loc[duopa_rows, column] = value

In [None]:
pd_data.loc[missing_info_index[3:]]

In [None]:
print(pd_data[pd_data.isnull().any(axis=1)].to_string())

In [None]:
pd_data[pd_data['brand_name'] == 'duopa']

Let's switch the order of inv levodopa-carbidopa intestinal gel to inv carbidopa-levodopa intestinal gel and the dosage too

In [None]:
# Label-based slice (inclusive on both ends): rows 7346..7368
gel_values = {
    'generic_name': 'inv carbidopa-levodopa intestinal gel',
    'dosage': '5-20 mg/ml',
}
for column, value in gel_values.items():
    pd_data.loc[7346:7368, column] = value

In [None]:
pd_data.loc[7346:7368]

In [None]:
pd_data.loc[pd_data.index[13628:13640]]

In [None]:
pd_data[pd_data['generic_name'] == 'selegiline']

In [None]:
print(pd_data[pd_data.isnull().any(axis=1)].to_string())

In [None]:
pd_data[pd.isnull(pd_data['brand_name'])]

In [None]:
pd_data[pd.isnull(pd_data['dosage'])]

In [None]:
pd_data[pd.isnull(pd_data['generic_name'])]

In [None]:
len(pd_data)

In [None]:
# Place the parsed columns right after drug_info, at positions 3, 4 and 5.
# The values are passed as plain lists so they are assigned positionally.
for position, column in enumerate(("generic_name", "brand_name", "dosage"), start=3):
    df_pd.insert(loc=position, column=column, value=pd_data[column].to_list())

In [None]:
df_pd.sample(20)

### Dealing with missing data for pd_data_drug_exposure 

In [None]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

For carbidopa-levodopa er (extended release) the dose strength definitely matches with that of Rytary; we will update the brand name as is

In [None]:
# Single .loc call instead of chained `df[col].iloc[...] = value`: chained
# assignment writes to a temporary under pandas Copy-on-Write and is lost.
pd_data_drug_exposure.loc[pd_data_drug_exposure.index[3573:3659], 'brand_name'] = 'rytary'

In [None]:
pd_data_drug_exposure[3573:3659]

In [None]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

Once we take a look at the original array we see that the regex couldn't catch the brand_name (duopa). We will update this manually

In [None]:
drugs_used_drug_exposure[5994:6009]

In [None]:
# Single .loc call instead of chained `df[col].iloc[...] = value`: chained
# assignment writes to a temporary under pandas Copy-on-Write and is lost.
pd_data_drug_exposure.loc[pd_data_drug_exposure.index[5994:6009], 'brand_name'] = 'duopa'

In [None]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

Duopa is also used as an intestinal gel. We will update these columns accordingly

Duopa is most often presented as either 4.63-20 mg/ml or 5-20 mg/ml. I'm going to blindly assume that the concentration is 5-20 mg/ml

In [None]:
# Positional row ranges, translated to index labels so that a single .loc call
# can be used (chained `df[col].iloc[...] = value` is lost under Copy-on-Write).
de_index = pd_data_drug_exposure.index
all_gel_rows = de_index[8137:8150]
gel_rows = de_index[8137:8143]    # intestinal gel
pump_rows = de_index[8143:8150]   # intestinal gel pump

# Assuming the drug is actually duopa, at the 5-20 mg/ml concentration
pd_data_drug_exposure.loc[all_gel_rows, 'brand_name'] = 'duopa'
pd_data_drug_exposure.loc[all_gel_rows, 'dosage'] = '5-20 mg/ml'

# Changing the order of the generic name from levodopa-carbidopa to carbidopa-levodopa
pd_data_drug_exposure.loc[gel_rows, 'generic_name'] = 'inv carbidopa-levodopa intestinal gel'
pd_data_drug_exposure.loc[pump_rows, 'generic_name'] = 'inv carbidopa-levodopa intestinal gel pump'

In [None]:
pd_data_drug_exposure[pd_data_drug_exposure['brand_name'] == 'duopa']

In [None]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

In [None]:
# Exact generic-name strings produced by the regex that still carry dosage/form
# text, mapped to the clean generic name. Keys must match the data verbatim.
generic_name_fixes = {
    'carbidopa-levadopa 25-100 mg orally disintegrating tablet': 'carbidopa-levodopa',
    'carbidopa-levodopa patient own': 'carbidopa-levodopa',
    'pramipexole dihydrochloride tb24': 'pramipexole',
    'selegiline tablet 5 mg': 'selegiline',
    'rasagiline 1mg tablet-': 'rasagiline',
}

# Whole-column assignment replaces the row-by-row chained
# `df[col].iloc[i] = value`, which is lost under pandas Copy-on-Write.
# Names not listed above (including NaN) are left unchanged.
pd_data_drug_exposure['generic_name'] = pd_data_drug_exposure['generic_name'].map(
    lambda name: generic_name_fixes.get(name, name)
)

In [None]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

In [None]:
pd_data_drug_exposure['generic_name'].unique()

In [None]:
pd_data_drug_exposure['generic_name'].iloc[9136]

Parcopa matches the description of orally disintegrating tablet with strength 25-100mg

In [None]:
# Rows taken from drugs_used_drug_exposure: 25-100 mg orally disintegrating tablets.
# Single .loc call instead of chained `df[col].iloc[...] = value`, which is
# lost under pandas Copy-on-Write.
pd_data_drug_exposure.loc[pd_data_drug_exposure.index[9136:9148], 'brand_name'] = 'parcopa'

TODO: Fix this for selegiline and others

In [None]:
pd_data_drug_exposure.iloc[8730]

In [None]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

There's not enough information in the regex output or the full drug description to know the actual dosage or strength of ropinirole. We would have to look deeper into the notes

In [None]:
drugs_used_drug_exposure[6475:6482+1]

In [None]:
drugs_used_drug_exposure[6658:6663+1]

Dealing with selegiline

In [None]:
# Both conditions must be evaluated on pd_data_drug_exposure. The previous mask
# took the 'zelapar' condition from pd_data, a different frame with a different
# number of rows, so the combined mask was misaligned.
selegiline = pd_data_drug_exposure[pd_data_drug_exposure['brand_name'].isin(['eldepryl', 'zelapar'])]
selegiline['brand_name'].unique()

In [None]:
# Single .loc calls instead of chained `df[col].iloc[...] = value`, which is
# lost under pandas Copy-on-Write.
de_index = pd_data_drug_exposure.index
pd_data_drug_exposure.loc[de_index[20843:20849], 'generic_name'] = 'selegiline'
pd_data_drug_exposure.loc[de_index[20843:20849], 'brand_name'] = 'eldepryl'
pd_data_drug_exposure.loc[de_index[8730], 'generic_name'] = 'selegiline'
pd_data_drug_exposure.loc[de_index[8730], 'brand_name'] = 'eldepryl'

Dealing with Rasagiline

In [None]:
# Single .loc calls instead of chained `df[col].iloc[...] = value`, which is
# lost under pandas Copy-on-Write.
rasagiline_rows = pd_data_drug_exposure.index[10477:10479]
pd_data_drug_exposure.loc[rasagiline_rows, 'generic_name'] = 'rasagiline'
pd_data_drug_exposure.loc[rasagiline_rows, 'brand_name'] = 'azilect'

In [None]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

Figuring out what's going on with pramipexole, a.k.a. Mirapex

In [None]:
drug_exposure.iloc[10449]

In [None]:
drug_exposure['drug_source_value'].iloc[10449]

In [None]:
drug_exposure['person_id'].iloc[10449]

In [None]:
pd_data_drug_exposure[(pd_data_drug_exposure['generic_name'] == 'pramipexole')]

In [None]:
pd_data_drug_exposure[(pd_data_drug_exposure['generic_name'] == 'pramipexole') & (pd_data_drug_exposure['dosage'] == '1 mg')]

In [None]:
drug_exposure.iloc[933]

In [None]:
drug_exposure[drug_exposure['person_id'] == 260872]['drug_source_value'].unique()

In [None]:
# Patient doesn't reach maximum dose a day (4.5  mg/day)
# MD pramipexole (MIRAPEX) 1 MG PO Tablet Take 1 tablet by mouth 4 times daily. Take at 5am-10 am -3 pm and 8 pm change in dose
df_pd[df_pd['person_id'] == 260872]['note_text'].iloc[2]

In [None]:
drug_exposure[drug_exposure['person_id'] == 222876]['drug_source_value'].unique()

In [None]:
dani = drug_exposure[drug_exposure['person_id'] == 222876]['drug_source_value'].unique()[4]

In [None]:
dani

In [None]:
pomar = drug_exposure['drug_source_value'].iloc[0]

In [None]:
len(drug_exposure)

In [None]:
drug_exposure[drug_exposure['drug_source_value'] == dani]['drug_exposure_start_datetime'].unique()

In [None]:
df_pd[df_pd['brand_name'] == 'mirapex']['dosage'].unique()

In [None]:
# Single .loc call instead of chained `df[col].iloc[...] = value`, which is
# lost under pandas Copy-on-Write.
pd_data_drug_exposure.loc[pd_data_drug_exposure.index[10449:10477], 'brand_name'] = 'mirapex'

In [None]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

In [None]:
drug_exposure.iloc[10454:10460+1]['drug_source_value'].iloc[0]

In [None]:
pd_data_drug_exposure[pd_data_drug_exposure['brand_name'] == 'mirapex']['dosage'].unique()

Rxnorm 901550 is actually 24 HR pramipexole dihydrochloride 1.5 MG Extended Release Oral Tablet ...

In [None]:
# Rows 10454..10460 (inclusive) are the 1.5 mg extended-release tablet.
# Single .loc call instead of chained `df[col].iloc[...] = value`, which is
# lost under pandas Copy-on-Write.
pd_data_drug_exposure.loc[pd_data_drug_exposure.index[10454:10460+1], 'dosage'] = '1.5 mg'

In [None]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

Looking at the rxnorm codes for ropinirole (824959, 799056) we find out that the drug strength is 12 and 4 mg 

In [None]:
# 824959
# 799056
drug_exposure['drug_source_value'].iloc[6663]

In [None]:
# Single .loc calls instead of chained `df[col].iloc[...] = value`, which is
# lost under pandas Copy-on-Write.
de_index = pd_data_drug_exposure.index
pd_data_drug_exposure.loc[de_index[6475:6483], 'dosage'] = '12 mg'
pd_data_drug_exposure.loc[de_index[6658:6664], 'dosage'] = '4 mg'

In [None]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

In [None]:
drug_exposure['drug_source_value'].iloc[3962]

Rxnorm code 197444 shows that the drug is Carbidopa-Levodopa 25-100 mg. We will update that drug accordingly

In [None]:
# Looking for a row with carbidopa-levodopa so that I can insert the new register in a similar fashion
pd_data_drug_exposure[pd_data_drug_exposure['generic_name'] == 'carbidopa-levodopa']

In [None]:
# Fill in the single row at position 3962 with one .loc call per column;
# chained `df[col].iloc[...] = value` is lost under pandas Copy-on-Write.
sinemet_row = pd_data_drug_exposure.index[3962]
pd_data_drug_exposure.loc[sinemet_row, 'generic_name'] = 'carbidopa-levodopa'
pd_data_drug_exposure.loc[sinemet_row, 'brand_name'] = 'sinemet'
pd_data_drug_exposure.loc[sinemet_row, 'dosage'] = '25-100 mg'

We have finally dealt with missing values

In [None]:
print(pd_data_drug_exposure[pd_data_drug_exposure.isnull().any(axis=1)].to_string())

Using the original drug_exposure to fill in the blanks ...

In [None]:
drug_exposure['person_id'].iloc[10455]

In [None]:
df_pd[df_pd['person_id'] == 222876]['note_text'].iloc[0]

### Creating the corpus from medical notes 

#### Using the note_text column for raw data

In [None]:
corpus_raw = list(df_pd["note_text"])

#### Removing words that don't contain much meaning from our notes

In [None]:
# Boilerplate phrases, headers and de-identification placeholders that carry
# no clinical meaning and are stripped from every note (plain substring match).
words_to_remove = [
    "Department of Neurosurgery Date of Consult",
    "Department of Orthopedics Consultation Note Date of Consult",
    "Geriatric Medicine Consult Date of Consult",
    "INPATIENT MEDICAL NUTRITION THERAPY",
    "MSW", "RN", "evidence", "Read By", "images", "report", "concur",
    "findings", "agree", "seen", "residents", "resident", "Resident",
    "unspecified provider", "Released Date Time", "personally reviewed",
    "D.O", "MD", "M.D.", "Electronically Verified By",
    "NAME:", "[**NAME**]", "EXAM DATE:", "[**DATE**]", "LOC:",
    "[**LOCATION_INSTITUTE**]", "[**LOCATION_STREET**]", "[**LOCATION_ZIP**]",
    "[**LOCATION_CITY**]", "[**CONTACT_PHONE**]", "[**LOCATION_OTHER**]",
    "MRN:", "[**ID**]", "DOB:", "** VERIFIED **", "ORDERING MD:", "ORDER:",
    "ORD. SERVICE:", "ORD. LOC:", "TECH", "RMS# / INV#:",
]

# Remove longer phrases first. In list order, short tokens such as "MD", "RN"
# or "LOC:" were stripped before the longer phrases that contain them
# ("ORDERING MD:", "MRN:", "ORD. LOC:"), so those phrases could never match
# and left fragments like "ORDERING :" behind.
removal_order = sorted(words_to_remove, key=len, reverse=True)

corpus_clean = []
for item in corpus_raw:
    for word in removal_order:
        item = item.replace(word, '') 
    corpus_clean.append(item)

#### Taking each word from the cleaned corpus and making it lowercase

In [None]:
# Lowercase every cleaned note (one list entry per note, same order)
corpus = []
for note in corpus_clean:
    corpus.append(note.lower())
corpus[989]

#### Adding the pre-processed version of the notes to the DataFrame

In [None]:
df_pd.loc[:, "note_text"] = corpus
df_pd

Looking for occurrences of "updrs", "motor scale total" or "motor examination" in the notes

In [None]:
keywords = ['updrs', 'motor scale total', 'motor examination']

# Positions of the notes that mention at least one keyword. any() records each
# note once; appending inside the keyword loop listed a note once per matching
# keyword, which duplicated rows in df_pd.iloc[indexes].
indexes = [
    i for i, item in enumerate(corpus)
    if any(word in item for word in keywords)
]

In [None]:
indexes

Filtering out those rows

In [None]:
df_pd.iloc[indexes]

Only 4 patients have notes that mention the UPDRS or the motor scale

In [None]:
df_pd.iloc[indexes]['person_id'].unique()

Seeing info regarding rows where updrs and motor scale are mentioned

In [None]:
# pd.set_option('display.max_columns', None)  
# pd.set_option('display.max_rows', None)
# pd.set_option('max_colwidth', None)

In [None]:
# TODO: Come back here and read the whole thing
df_pd.iloc[indexes]['note_text']

In [None]:
demographics.sample(5)

In [None]:
demographics_all.sample(5)

We merge df_pd with demographics on person_id. This is because, initially, df_pd only contains drug information, not patient information

In [None]:
# Attach age/gender/race/ethnicity to every note row.
# how='left' keeps all notes even when a patient has no demographics row;
# validate='many_to_one' raises MergeError if demographics contains a person_id
# more than once, instead of silently duplicating that patient's notes.
df_pd = df_pd.merge(demographics, on='person_id', how='left', validate='many_to_one')

# Keep only the columns used downstream, patient information first
df_pd = df_pd[[
    'person_id', 'age', 'gender_source_value', 'race_source_value', 'ethnicity_source_value',
    'drug_source_value', 'drug_info', 'generic_name', 'brand_name', 'dosage',
    'quantity', 'sig', 'route_source_value', 'dose_source_value', 'dose_unit_source_value',
    'note_text',
]]

In [None]:
df_pd

### Some statistics

In [None]:
df_pd['age'].describe()

In [None]:
demographics_all['age'].describe()

In [None]:
pd_data.describe()

In [None]:
demographics.columns.values

## Plots and statistics

In [None]:
demographics_all[demographics_all['gender_source_value'] == 'UNKNOWN']

In [None]:
demographics_all['gender_source_value'].value_counts(normalize=True)

In [None]:
demographics['gender_source_value'].value_counts(normalize=True)

In [None]:
# Plotting the distribution of gender for PD patients vs. all patients
gender_counts = demographics['gender_source_value'].value_counts(normalize=True) * 100
# Only the two most frequent categories are kept; this leaves out 'UNKNOWN' (only 1 patient)
gender_counts_all = demographics_all['gender_source_value'].value_counts(normalize=True)[:2] * 100

# Combine percentages into a single DataFrame, one row per gender.
# fillna(0) covers a gender that is missing in one of the groups.
combined_percentages = pd.DataFrame({
    "Parkinson's Patients": gender_counts,
    "All Patients": gender_counts_all
}).fillna(0)

fig, ax = plt.subplots(figsize=(10, 6))

# Color map for the bar chart
colors = plt.cm.tab20.colors

# Two bars per gender, side by side
bar_width = 0.35
x = range(len(combined_percentages.index))

parkinsons_bars = ax.bar(
    x,
    combined_percentages["Parkinson's Patients"],
    width=bar_width,
    label="Parkinson's Patients",
    color=colors[0],
)
# Second group is shifted right by one bar width
all_patients_bars = ax.bar(
    [i + bar_width for i in x],
    combined_percentages["All Patients"],
    width=bar_width,
    label="All Patients",
    color=colors[1],
)

# Set plot title and labels
ax.set_title("Distribution of Gender: Parkinson's Patients vs All Patients", fontsize=15, fontweight='bold')
ax.set_xlabel('Gender', fontsize=12)
ax.set_ylabel('Percentage', fontsize=12)
ax.legend(fontsize=12)

# Put each tick midway between the two bars of a gender
ax.set_xticks([i + bar_width / 2 for i in x])
ax.set_xticklabels(combined_percentages.index, fontsize=12)

# Write the percentage in the middle of every bar. Each label is centred on its
# OWN bar's height (the "All Patients" label used to be placed using the
# Parkinson's bar height).
for bar in list(parkinsons_bars) + list(all_patients_bars):
    height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        height / 2,
        f"{height:.1f}%",
        ha='center',
        va='center',
        fontsize=12,
    )

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
demographics['gender_source_value'].value_counts(normalize=False) 

In [None]:
demographics_all['gender_source_value'].value_counts(normalize=False) 

In [None]:
# Percentage of PD patients per race
race_counts = demographics['race_source_value'].value_counts(normalize=True) * 100

# One-column DataFrame, sorted by percentage in descending order
race_df = pd.DataFrame({'Percentage': race_counts})
race_df = race_df.sort_values(by='Percentage', ascending=False)

# Horizontal bar chart; bar k takes the k-th tab20 color
ax = race_df['Percentage'].plot(kind='barh', figsize=(16, 6), color=plt.cm.tab20.colors)

# Set the title and labels
plt.title('Race Distribution', fontsize=15, fontweight='bold')
plt.xlabel('Percentage', fontsize=12)
plt.ylabel('Race', fontsize=12)

# Add the percentage to the right of each bar
for i, v in enumerate(race_df['Percentage']):
    plt.text(v + 1, i, f"{v:.1f}%", ha='left', va='center', fontsize=12.5)

# Remove y-axis labels (the legend identifies the bars instead)
plt.yticks([])
plt.xticks(fontsize=12)

# Legend entries are paired with the index of the PLOTTED frame (race_df), so a
# color always names the bar it was drawn with, even if sorting reorders ties
# relative to race_counts.
legend_patches = [
    mpatches.Patch(color=color, label=label)
    for color, label in zip(plt.cm.tab20.colors, race_df.index)
]
plt.legend(handles=legend_patches, loc='best')

plt.tight_layout()
plt.show()

In [None]:
demographics['ethnicity_source_value'].value_counts()

In [None]:
race_counts

In [None]:
# Number of distinct patients in each dataset (NaN, if present, counts as one value)
total_patients_df = df['person_id'].nunique(dropna=False)
total_patients_df_pd = df_pd['person_id'].nunique(dropna=False)
total_patients_demographics_all = demographics_all['person_id'].nunique(dropna=False)

# Both shares are relative to the total number of patients in demographics_all
percentage_pd_patients_df = (total_patients_df / total_patients_demographics_all) * 100
percentage_pd_patients = (total_patients_df_pd / total_patients_demographics_all) * 100

# Data for the horizontal bar plot
categories = ['PD Patients from\nWhole Cohort', 'PD Patients on\nMedication']
values = [percentage_pd_patients_df, percentage_pd_patients]

plt.figure(figsize=(14, 6))
plt.barh(categories, values, color=['blue', 'green'])

plt.title('Percentage of PD Patients taking in-hospital medication vs. PD Patients overall (relative to total number of all patients)', fontsize=15, fontweight='bold')
plt.xlabel('Percentage of Patients', fontsize=12.5)
plt.ylabel('')
plt.xticks([])

# Write each percentage just past the end of its bar
for i, v in enumerate(values):
    plt.text(v + 0.0005, i, f"{v:.2f}%", ha='left', va='center', fontsize=12.5)

plt.tight_layout()
plt.show()

In [None]:
percentage_pd_patients_df

In [None]:
# Plotting value counts of ethnicity with percentages
ethnicity = demographics['ethnicity_source_value'].value_counts()
labels = ethnicity.index
colors = plt.cm.tab20(np.arange(len(labels))) 

plt.figure(figsize=(12, 8))  # Adjust the values as per your desired size
ax = ethnicity.plot(kind='bar', color=colors, edgecolor='black', linewidth=0.5)

# Share of each ethnicity, in the same order as the bars
total_count = ethnicity.sum()
percentages = [(count / total_count) * 100 for count in ethnicity]
percentage_labels = [f"{percentage:.1f}%" for percentage in percentages]

# Display the percentages inside the bars
ax.bar_label(ax.containers[0], labels=percentage_labels, label_type='center', fontsize=12.5)

plt.title('Distribution of Ethnicity', fontsize=15, fontweight='bold')
plt.xlabel('Ethnicity')
plt.ylabel('Count')
plt.xticks(range(len(labels)), labels, rotation=45, ha='right', fontsize=12)
plt.tight_layout()
plt.show()

Actual actual way to deal with this lol

In [None]:
# Box plot comparing the age distribution of PD patients with all patients
fig, ax = plt.subplots(figsize=(8, 6))

age_groups = [demographics['age'], demographics_all['age']]
group_names = ["Parkinson's Patients", "All Patients"]
ax.boxplot(age_groups, labels=group_names, patch_artist=True)

# Title and axis labels
ax.set_title("Distribution of Age: Parkinson's Patients and All Patients", fontsize=15, fontweight='bold')
ax.set_xlabel('Patient Group', fontsize=12)
ax.set_ylabel('Age', fontsize=12)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Ages of male patients in the PD cohort and in the full cohort, compared with box plots.
male_pd_patients_by_age = demographics.loc[demographics['gender_source_value'] == 'MALE', 'age']
male_pd_patients_by_age_all = demographics_all.loc[demographics_all['gender_source_value'] == 'MALE', 'age']

male_age_by_group = {
    "Male Parkinson's Patients": male_pd_patients_by_age,
    "All Male Patients": male_pd_patients_by_age_all,
}

fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(list(male_age_by_group.values()), labels=list(male_age_by_group), patch_artist=True)

# Title and axis labels
ax.set_title("Distribution of Age: Male Parkinson's Patients and All Male Patients", fontsize=15, fontweight='bold')
ax.set_xlabel('Patient Group', fontsize=12)
ax.set_ylabel('Age', fontsize=12)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

fig.tight_layout()
plt.show()

In [None]:
# Ages of female patients in the PD cohort and in the full cohort, compared with box plots.
female_pd_patients_by_age = demographics.loc[demographics['gender_source_value'] == 'FEMALE', 'age']
female_pd_patients_by_age_all = demographics_all.loc[demographics_all['gender_source_value'] == 'FEMALE', 'age']

female_age_by_group = {
    "Female Parkinson's Patients": female_pd_patients_by_age,
    "All Female Patients": female_pd_patients_by_age_all,
}

fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(list(female_age_by_group.values()), labels=list(female_age_by_group), patch_artist=True)

# Title and axis labels
ax.set_title("Distribution of Age: Female Parkinson's Patients and All Female Patients", fontsize=15, fontweight='bold')
ax.set_xlabel('Patient Group', fontsize=12)
ax.set_ylabel('Age', fontsize=12)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

fig.tight_layout()
plt.show()

In [None]:
# Box plots comparing the ages of male and female patients within the PD cohort.
pd_age_by_sex = {
    "Male Parkinson's Patients": male_pd_patients_by_age,
    "Female Parkinson's Patients": female_pd_patients_by_age,
}

fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(list(pd_age_by_sex.values()), labels=list(pd_age_by_sex), patch_artist=True)

# Title and axis labels
ax.set_title("Distribution of Age: Male and Female Parkinson's Patients", fontsize=15, fontweight='bold')
ax.set_xlabel('Patient Group', fontsize=12)
ax.set_ylabel('Age', fontsize=12)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

fig.tight_layout()
plt.show()

In [None]:
# Box plots comparing the ages of male and female patients across the full cohort.
all_age_by_sex = {
    "All Male Patients": male_pd_patients_by_age_all,
    "All Female Patients": female_pd_patients_by_age_all,
}

fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(list(all_age_by_sex.values()), labels=list(all_age_by_sex), patch_artist=True)

# Title and axis labels
ax.set_title("Distribution of Age: Male and Female Patients", fontsize=15, fontweight='bold')
ax.set_xlabel('Patient Group', fontsize=12)
ax.set_ylabel('Age', fontsize=12)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

fig.tight_layout()
plt.show()

In [None]:
pd_data.value_counts(normalize=True)

In [None]:
pd_data['brand_name'].value_counts(normalize=True)

## Distribution of drugs

In [None]:
# Share of df_pd rows accounted for by each generic drug name, in percent.
drug_distribution_count = df_pd['generic_name'].value_counts(normalize=True).mul(100)
drug_distribution = drug_distribution_count.to_frame('Percentage')

# Highest percentage first
drug_distribution = drug_distribution.sort_values(by='Percentage', ascending=False)

# Horizontal bars, one colour per drug taken from the tab20 palette
ax = drug_distribution['Percentage'].plot(kind='barh', figsize=(18, 10), color=plt.cm.tab20.colors)

ax.set_title("Distribution of medication (Generic name)", fontsize=15, fontweight='bold')
ax.set_xlabel('Percentage', fontsize=12.5)
ax.set_ylabel('')

# Write the value just past the right-hand end of each bar
for bar_position, share in enumerate(drug_distribution['Percentage']):
    ax.text(share + 0.0005, bar_position, f" {share:.2f}%", ha='left', va='center', fontsize=12.5)

# Label each bar with its drug name, in the plotted order
plt.yticks(range(len(drug_distribution)), drug_distribution.index, fontsize=12.5)

plt.xticks(fontsize=12.5)

# Legend handles, only needed if the legend below is re-enabled
legend_patches = [
    mpatches.Patch(color=patch_color, label=drug_name)
    for patch_color, drug_name in zip(plt.cm.tab20.colors, drug_distribution.index)
]

# Show the custom legend
# plt.legend(handles=legend_patches, loc='best', fontsize=12.5)

ax.figure.tight_layout()
plt.show()

In [None]:
len(pd_data) == len(df_pd)

In [None]:
df_pd.iloc[5187]

In [None]:
pd_data.iloc[5187]

In [None]:
len(pd_data[pd_data['brand_name'] == 'duopa']) == len(df_pd[df_pd['brand_name'] == 'duopa'])

In [None]:
# value_counts(normalize=True): relative frequency of each brand name among the rows of pd_data
medication_data = pd_data['brand_name'].value_counts(normalize=True)
medication_data.plot.barh()

In [None]:
df_pd.groupby('brand_name')['person_id'].nunique()

In [None]:
df_pd['brand_name'].nunique()

In [None]:
df_pd.columns

In [None]:
# Share of distinct patients per race group: unique person_ids in each race_source_value
# divided by the number of unique person_ids in df_pd.
# NOTE(review): despite its name, `drug_percentage` holds per-race shares here; the name is
# reassigned to per-drug shares in a later cell.
drug_percentage = df_pd.groupby('race_source_value')['person_id'].nunique()/ df_pd['person_id'].nunique()
drug_percentage.sort_values(ascending=False)
# drug_percentage

In [None]:
# Percentage of distinct PD patients who received each generic drug at least once.
patients_total = df_pd['person_id'].nunique()
drug_percentage = df_pd.groupby('generic_name')['person_id'].nunique() / patients_total
drug_popularity_count = drug_percentage * 100
drug_popularity = drug_popularity_count.to_frame('Percentage')

# Highest percentage first
drug_popularity = drug_popularity.sort_values(by='Percentage', ascending=False)

# Horizontal bars, one colour per drug taken from the tab20 palette
ax = drug_popularity['Percentage'].plot(kind='barh', figsize=(18, 10), color=plt.cm.tab20.colors)

ax.set_title("Proportion of Parkison's patients receiving each drug (Generic name)", fontsize=15, fontweight='bold')
ax.set_xlabel('Percentage', fontsize=12.5)
ax.set_ylabel('')

# Write the value just past the right-hand end of each bar
for bar_position, share in enumerate(drug_popularity['Percentage']):
    ax.text(share + 0.0005, bar_position, f" {share:.1f}%", ha='left', va='center', fontsize=12.5)

# Label each bar with its drug name, in the plotted order
plt.yticks(range(len(drug_popularity)), drug_popularity.index, fontsize=12.5)

plt.xticks(fontsize=12.5)

# Legend handles, only needed if the legend below is re-enabled
legend_patches = [
    mpatches.Patch(color=patch_color, label=drug_name)
    for patch_color, drug_name in zip(plt.cm.tab20.colors, drug_popularity.index)
]

# Show the custom legend
# plt.legend(handles=legend_patches, loc='best', fontsize=12.5)

ax.figure.tight_layout()
plt.show()


In [None]:
# Percentage of distinct male PD patients who received each generic drug at least once.

# Keep only the rows of male patients
male_pd = df_pd.loc[df_pd['gender_source_value'] == 'MALE']

# Unique male patients per drug, relative to all unique male patients
male_patients_total = male_pd['person_id'].nunique()
male_drug_percentage = male_pd.groupby('generic_name')['person_id'].nunique() / male_patients_total

# Express the share in percent and wrap it in a one-column frame
male_drug_popularity_count = male_drug_percentage * 100
male_drug_popularity = male_drug_popularity_count.to_frame('Percentage')

# Highest percentage first
male_drug_popularity = male_drug_popularity.sort_values(by='Percentage', ascending=False)

# Horizontal bars, one colour per drug taken from the tab20 palette
ax = male_drug_popularity['Percentage'].plot(kind='barh', figsize=(18, 10), color=plt.cm.tab20.colors)

ax.set_title("Proportion of Male Parkinson's patients receiving each drug (Generic name)", fontsize=15, fontweight='bold')
ax.set_xlabel('Percentage', fontsize=12.5)
ax.set_ylabel('')

# Write the value just past the right-hand end of each bar
for bar_position, share in enumerate(male_drug_popularity['Percentage']):
    ax.text(share + 0.0005, bar_position, f" {share:.1f}%", ha='left', va='center', fontsize=12.5)

# Label each bar with its drug name, in the plotted order
plt.yticks(range(len(male_drug_popularity)), male_drug_popularity.index, fontsize=12.5)

plt.xticks(fontsize=12.5)

# Legend handles, only needed if the legend below is re-enabled
legend_patches = [
    mpatches.Patch(color=patch_color, label=drug_name)
    for patch_color, drug_name in zip(plt.cm.tab20.colors, male_drug_popularity.index)
]

# Show the custom legend
# plt.legend(handles=legend_patches, loc='best', fontsize=12.5)

ax.figure.tight_layout()
plt.show()


In [None]:
# Percentage of distinct female PD patients who received each generic drug at least once.

# Keep only the rows of female patients
female_pd = df_pd.loc[df_pd['gender_source_value'] == 'FEMALE']

# Unique female patients per drug, relative to all unique female patients
female_patients_total = female_pd['person_id'].nunique()
female_drug_percentage = female_pd.groupby('generic_name')['person_id'].nunique() / female_patients_total

# Express the share in percent and wrap it in a one-column frame
female_drug_popularity_count = female_drug_percentage * 100
female_drug_popularity = female_drug_popularity_count.to_frame('Percentage')

# Highest percentage first
female_drug_popularity = female_drug_popularity.sort_values(by='Percentage', ascending=False)

# Horizontal bars, one colour per drug taken from the tab20 palette
ax = female_drug_popularity['Percentage'].plot(kind='barh', figsize=(18, 10), color=plt.cm.tab20.colors)

ax.set_title("Proportion of Female Parkinson's patients receiving each drug (Generic name)", fontsize=15, fontweight='bold')
ax.set_xlabel('Percentage', fontsize=12.5)
ax.set_ylabel('')

# Write the value just past the right-hand end of each bar
for bar_position, share in enumerate(female_drug_popularity['Percentage']):
    ax.text(share + 0.0005, bar_position, f" {share:.1f}%", ha='left', va='center', fontsize=12.5)

# Label each bar with its drug name, in the plotted order
plt.yticks(range(len(female_drug_popularity)), female_drug_popularity.index, fontsize=12.5)

plt.xticks(fontsize=12.5)

# Legend handles, only needed if the legend below is re-enabled
legend_patches = [
    mpatches.Patch(color=patch_color, label=drug_name)
    for patch_color, drug_name in zip(plt.cm.tab20.colors, female_drug_popularity.index)
]

# Show the custom legend
# plt.legend(handles=legend_patches, loc='best', fontsize=12.5)

ax.figure.tight_layout()
plt.show()

In [None]:
female_drug_popularity['Percentage']

In [None]:
# Grouped horizontal bar chart: percentage of male vs. female PD patients receiving each drug.
# male_drug_popularity and female_drug_popularity are one-column ('Percentage') DataFrames
# indexed by generic drug name.

# Align both frames on the drug name; the columns become 'Males' and 'Females'
combined_data = pd.concat([male_drug_popularity, female_drug_popularity], axis=1)
combined_data.columns = ['Males', 'Females']
# A drug received by only one sex has no row in the other frame and comes out of the concat as
# NaN. That means 0% of that group; left as NaN it would produce a missing bar and a "nan%" label.
combined_data = combined_data.fillna(0.0)

# Order the drugs by the summed male + female percentage, highest first
sorted_combined = combined_data.sum(axis=1).sort_values(ascending=False)
ordered_data = combined_data.loc[sorted_combined.index]

plt.figure(figsize=(16, 8))  # Adjust the figure size as needed

# Bar thickness; each female bar is offset by `width` from the male bar of the same drug
width = 0.4

# Bars for males
ind_male = range(len(sorted_combined))
bars_male = plt.barh(ind_male, ordered_data['Males'], height=width, label='Males', color='tab:gray')

# Bars for females
ind_female = [i + width for i in ind_male]
bars_female = plt.barh(ind_female, ordered_data['Females'], height=width, label='Females', color='tab:pink')

# Write each percentage at the end of its bar
for i, val in enumerate(ordered_data['Males']):
    plt.text(val, i, f'{val:.2f}%', va='center', fontsize=10.5, color='black')

for i, val in enumerate(ordered_data['Females']):
    plt.text(val, i + width, f'{val:.2f}%', va='center', fontsize=10.5, color='black')

# Labels, title and tick names (each tick sits between the two bars of a drug)
plt.xlabel('Percentage')
plt.ylabel('Medication')
plt.title('Prescribed medication for Male vs. Female Parkinson\'s patients', fontsize=15, fontweight='bold')
plt.yticks([i + width / 2 for i in ind_male], sorted_combined.index, fontsize=10)
# Inverted so that the drug with the highest combined percentage is at the top
plt.gca().invert_yaxis()
plt.legend(loc='lower right')  # Lower right is where the bars are shortest after inverting the axis
plt.tight_layout()
plt.show()


In [None]:
df_pd['generic_name'].value_counts(normalize=True) * 100

In [None]:
drug_percentage

## Calculating LED (Levodopa Dose Equivalent) for Parkinson's patients

In [None]:
# Overview of the distinct dose_source_value entries, shown with 4 significant digits.
# The values are sorted numerically BEFORE being formatted: sorting the formatted strings
# would order them lexicographically (e.g. '10.0' ahead of '2.0').
formatted_series = np.sort(df_pd['dose_source_value'].unique())
format_func = np.vectorize(lambda x: '{:.4}'.format(x))
bla = format_func(formatted_series)
bla

In [None]:
drug_exposure

In [None]:
# Rewrite the '1mg' dosage spelling as '1 mg' so it matches the "<number> <unit>" form that the
# led_dose parsing below splits on whitespace (the variable name suggests these are rasagiline rows).
rasagiline_index = pd_data_drug_exposure[pd_data_drug_exposure['dosage'] == '1mg'].index
pd_data_drug_exposure.loc[rasagiline_index, 'dosage'] = '1 mg'
# rasagiline_index holds index LABELS, so the check must use .loc; .iloc would treat them as
# row positions and show the wrong rows whenever the index is not 0..n-1.
pd_data_drug_exposure.loc[rasagiline_index, 'dosage']

In [None]:
# Rewrite the '200mg' dosage spelling as '200 mg' so it matches the "<number> <unit>" form that the
# led_dose parsing below splits on whitespace (the variable name suggests these are entacapone rows).
entacapone_index = pd_data_drug_exposure[pd_data_drug_exposure['dosage'] == '200mg'].index
pd_data_drug_exposure.loc[entacapone_index, 'dosage'] = '200 mg'
# entacapone_index holds index LABELS, so the check must use .loc; .iloc would treat them as
# row positions and show the wrong rows whenever the index is not 0..n-1.
pd_data_drug_exposure.loc[entacapone_index, 'dosage']

In [None]:
pd_data_drug_exposure['dosage'].unique()

In [None]:
pd_data_drug_exposure['dosage'].unique()[0].split('-')[1].split()[0]

In [None]:
# Extract, as text, the numeric strength used for the LED calculation from every dosage string:
#   "A-B unit" -> "B" (the part after the first hyphen, up to the next whitespace)
#   "A unit"   -> "A" (the first whitespace-separated token)
led_dose = [
    (dosage_text.split('-')[1] if '-' in dosage_text else dosage_text).split()[0]
    for dosage_text in pd_data_drug_exposure['dosage']
]

In [None]:
len(pd_data_drug_exposure)

In [None]:
# Distinct parsed doses in sorted order. Sorting has to happen after de-duplicating: a set has no
# defined order, so sorting before building the set has no effect on what is displayed.
sorted(set(led_dose))

In [None]:
# Print every generic name that contains the substring 'levodopa-carbidopa'
for generic_name in pd_data_drug_exposure['generic_name']:
    if 'levodopa-carbidopa' in generic_name:
        print(generic_name)

In [None]:
pd_data_drug_exposure['generic_name'].unique()

In [None]:
pd_data_drug_exposure

In [None]:
drug_exposure

In [None]:
# Build led_df by placing the columns of pd_data_drug_exposure between the first three columns of
# drug_exposure and its remaining columns.
# NOTE(review): axis=1 aligns rows on index labels, so both frames are assumed to share the same
# index — TODO confirm (the len() checks in nearby cells only compare row counts).
led_df = pd.concat([drug_exposure.iloc[:, 0:3], pd_data_drug_exposure, drug_exposure.iloc[:, 3:]], axis=1)

In [None]:
# Add the parsed dose and the drug_info values as new columns at fixed positions.
# The order matters: led_dose is inserted at position 6 first, then drug_info at position 3 shifts
# it one place to the right. DataFrame.insert refuses to add a column name that already exists,
# so this cell cannot be run twice on the same led_df.
led_df.insert(loc=6, column="led_dose", value=led_dose)
led_df.insert(loc=3, column="drug_info", value=drug_info_drug_source_value)
# Give the led_dose and drug_exposure_start_datetime columns proper dtypes
led_df['led_dose'] = pd.to_numeric(led_df['led_dose'], errors='coerce') # Numeric conversion; values that cannot be parsed become NaN
led_df['drug_exposure_start_datetime'] = pd.to_datetime(led_df['drug_exposure_start_datetime'])

In [None]:
len(drug_exposure)

In [None]:
len(pd_data_drug_exposure)

In [None]:
drug_exposure

In [None]:
led_df

This is correct and works as expected.

In [None]:
# Inspect the rows whose recorded dose is 0.0: parse the JSON in drug_source_value and print the
# MAR action next to the patient and the raw value.
held_rows = led_df[led_df['dose_source_value'] == 0.0]
dl = [json.loads(x) for x in held_rows['drug_source_value']]
# person_id and the raw value are taken from the SAME filtered rows as the parsed JSON; looking
# them up by list position in the unfiltered led_df would print unrelated rows.
held_records = zip(held_rows['person_id'], held_rows['drug_source_value'], dl)
for index, (held_person_id, raw_source_value, parsed_source_value) in enumerate(held_records):
    print(index, held_person_id, parsed_source_value['mar_action'], raw_source_value)
len(dl)

In [None]:
type(led_df[led_df['dose_source_value'] == 0.0]['drug_source_value'])

In [None]:
type(led_df.loc[led_df['dose_source_value'] == 0.0, ['drug_source_value']])

In [None]:
# Parse drug_source_value (JSON) for the rows with a dose of 0.0 and print each MAR action
zero_dose_source_values = led_df.loc[led_df['dose_source_value'] == 0.0, 'drug_source_value']
bla = [json.loads(raw_value) for raw_value in zero_dose_source_values]
for parsed_value in bla:
    print(parsed_value['mar_action'])

We lose 18.9% of the medication records (rows, not patients) because the dose is recorded as 0, i.e. the medication was held.

In [None]:
print(len(dl)*100/len(df_pd))

In [None]:
led_df['dosage'].unique()

In [None]:
pd_data_drug_exposure[pd_data_drug_exposure['dosage'] == '0.5 mg']

In [None]:
pd_data_drug_exposure['dosage'].unique()

In [None]:
led_df['dosage'].unique()

In [None]:
set(led_dose)

In [None]:
led_df['generic_name'].unique()

In [None]:
led_df[(led_df['brand_name'] == 'rytary') & (led_df['generic_name'] == 'carbidopa-levodopa')]

In [None]:
led_df[led_df['dose_unit_source_value'] == 'tablet']['dose_source_value']

In [None]:
led_df[led_df['dose_unit_source_value'] == 'each']#['dose_source_value'].unique()

In [None]:
led_df[led_df['dose_unit_source_value'] != 'tablet'].sample(20)

Difference between Rytary and Duopa?
Rytary is carbidopa-levodopa extended release, and Duopa is carbidopa-levodopa enteral suspension.

In [None]:
sorted(led_df['generic_name'].unique())

Setting conversion factors for Parkinson's drugs and calculating LED

In [None]:
# Conversion factors from: https://movementdisorders.onlinelibrary.wiley.com/doi/full/10.1002/mds.29410
# Keys are lower-case generic names without surrounding whitespace; calculate_led normalises the
# name the same way before the lookup. Every key must appear only once: a repeated key in a dict
# literal silently replaces the earlier value.
conversion_factors = {
    'amantadine': 1.0,
    'amantadine er': 1.25,
    'apomorphine': 10.0,
    # trihexyphenidyl and benztropine aren't as unified but the paper proposes a conversion factor of 1
    'benztropine': 1.0,
    'benztropine mesylate': 1.0,
    'bromocriptine': 10.0,
    'cabergoline': 66.7,
    'carbidopa-levodopa': 1.0,
    'inv carbidopa-levodopa intestinal gel': 1.0,
    'inv carbidopa-levodopa intestinal gel pump': 1.0,
    'carbidopa': 0.1,
    'carbidopa-levodopa er': 0.5,
    'carbidopa-levodopa-entacapone': 1.33,
    'entacapone': 1.33, # Double-check this
    'pramipexole': 100.0,
    'pramipexole er': 100.0,
    'trihexyphenidyl': 1.0,
    'rasagiline': 100.0,
    'rasagiline mesylate': 100.0,
    'ropinirole': 20.0,
    'rotigotine': 30.0,
    'selegiline': 10.0, # All the rows relating to selegiline are oral
    'tolcapone': 1.5,
    # NOTE(review): a second 'ropinirole': 0.5 entry used to follow here and overrode the 20.0
    # above. It was removed; if 0.5 was meant for a different drug, add it under that drug's name.
}

def calculate_led(row):
    """Compute the levodopa equivalent dose (LED) for one row of led_df.

    Parameters
    ----------
    row : mapping (a DataFrame row or a dict)
        Must provide 'dose_source_value', 'led_dose' and 'generic_name'.

    Returns
    -------
    float or None
        * None when dose_source_value is 0.0: the drug was not administered but held (see
          mar_action in the drug_source_value column).
        * dose_source_value * led_dose * conversion factor when dose_source_value is present.
        * led_dose * conversion factor when dose_source_value is missing.
        * NaN when the generic name has no entry in conversion_factors, so that an unmapped drug
          is excluded from later means instead of being counted as an LED of 0.

    Notes
    -----
    When led_dose equals dose_source_value, the dose column is taken to already contain the
    strength (probably carbidopa-levodopa), so it is replaced by 1 to avoid multiplying the
    strength by itself.
    """
    dose_source_value = row['dose_source_value']
    led_dose = row['led_dose']

    # Normalise the name so stray whitespace or capitalisation cannot make the lookup miss
    generic_name = row['generic_name']
    if isinstance(generic_name, str):
        generic_name = generic_name.strip().lower()
    conversion_factor = conversion_factors.get(generic_name, np.nan)

    if led_dose == dose_source_value:
        dose_source_value = 1.0
    if dose_source_value == 0.0:
        return None
    if pd.notna(dose_source_value):
        return dose_source_value * led_dose * conversion_factor
    return led_dose * conversion_factor

# Apply calculate_led to every row (axis=1) and store the result in a new 'led' column
led_df['led'] = led_df.apply(calculate_led, axis=1)

Adding diagnosis date

In [None]:
# Load the diagnosis dates and attach one date per patient to led_df.
diagnosis_date = pd.read_csv('data/DIAGNOSIS_DATE.csv')
diagnosis_date["diagnosis_date"] = pd.to_datetime(diagnosis_date["diagnosis_date"], format='%Y-%m-%d')

# Later cells iterate over several diagnosis_date rows per person, so merging the raw table would
# repeat every medication record once per diagnosis row. Collapse to the earliest date per patient.
# NOTE(review): "earliest" is an assumption about which date is wanted — confirm.
first_diagnosis = diagnosis_date.groupby('person_id', as_index=False)['diagnosis_date'].min()

# Dropping a diagnosis_date column left by a previous run keeps this cell re-runnable
led_df = led_df.drop(columns=['diagnosis_date'], errors='ignore').merge(first_diagnosis, on='person_id', how='inner')
led_df = led_df[['person_id', 'drug_exposure_start_datetime', 'diagnosis_date', 'drug_source_value', 'drug_info', 'generic_name', 'brand_name', 'dosage', 'led_dose', 'dose_source_value', 'dose_unit_source_value', 'route_source_value','visit_occurrence_id', 'visit_detail_id', 'led']]

Mean LED by person

In [None]:
# Mean LED over all of a patient's medication records, one bar per patient.
led_by_person = led_df.groupby('person_id')['led'].mean()

fig, ax = plt.subplots(figsize=(15, 10))
led_by_person.plot(kind='bar', ax=ax)
ax.set_xlabel('Person ID')
# The plotted quantity is the per-patient mean, so the labels say "Mean" rather than "Total"
ax.set_ylabel('Mean LED')
ax.set_title('Mean LED per Person')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# First and last administration timestamp per patient
patient_stay = led_df.groupby('person_id')['drug_exposure_start_datetime'].agg(['min', 'max']).reset_index()

# Mean LED at four granularities. These frames are displayed and plotted by later cells, so their
# names and columns are kept as they were.
mean_led_per_patient = led_df.groupby('person_id')['led'].mean().reset_index() # Mean LED per patient
mean_led_per_administration_date= led_df.groupby(['person_id', led_df['drug_exposure_start_datetime'].dt.date])['led'].mean().reset_index()
mean_led_per_drug = led_df.groupby(['person_id', led_df['generic_name']])['led'].mean().reset_index()
mean_led_per_visit = led_df.groupby(['person_id', led_df['visit_occurrence_id']])['led'].mean().reset_index()

# Attach one value per patient, matched on person_id. The per-date, per-drug and per-visit frames
# hold several rows per patient, so copying their 'led' column across by row position would pair
# patients with unrelated rows. Each frame is first averaged per patient (for the last three this
# is the mean of that patient's per-date / per-drug / per-visit means) and then looked up by id.
per_patient_sources = {
    'mean_led_per_patient': mean_led_per_patient,
    'mean_led_per_administration_date': mean_led_per_administration_date,
    'mean_led_per_drug': mean_led_per_drug,
    'mean_led_per_visit': mean_led_per_visit,
}
for column_name, led_means in per_patient_sources.items():
    led_mean_by_person = led_means.groupby('person_id')['led'].mean()
    patient_stay[column_name] = patient_stay['person_id'].map(led_mean_by_person)

# Time between first and last administration, as a timedelta and in years.
# A year is taken as 365.2425 days; dividing by np.timedelta64(1, 'Y') relies on an ambiguous
# unit that recent pandas versions reject.
patient_stay['stay'] = patient_stay['max'] - patient_stay['min']
patient_stay['stay_in_years'] = patient_stay['stay'] / pd.Timedelta(days=365.2425)

# Keep the reporting columns, longest stay first
patient_stay = patient_stay[['person_id', 'stay', 'stay_in_years', 'mean_led_per_patient', 'mean_led_per_administration_date', 'mean_led_per_drug', 'mean_led_per_visit' ]].sort_values(by=['stay_in_years'], ascending=False)

In [None]:
mean_led_per_patient

In [None]:
mean_led_per_administration_date

In [None]:
mean_led_per_drug

In [None]:
mean_led_per_visit

In [None]:
patient_stay

In [None]:
# Patients whose first and last recorded administrations are at least three years apart
long_stay_mask = patient_stay['stay_in_years'] >= 3
years_of_stay = patient_stay.loc[long_stay_mask]
years_of_stay_person_id = years_of_stay['person_id'].tolist()

In [None]:
len(years_of_stay)

In [None]:
mean_led_per_administration_date

In [None]:
len(years_of_stay_person_id)

Mean LED per administration date

In [None]:
mean_led_per_administration_date

In [None]:
mean_led_per_visit

In [None]:
diagnosis_date

In [None]:
led_df['diagnosis_date']

In [None]:
# Print every diagnosis date recorded for patient 253410 together with its condition_poa value
patient_diagnoses = diagnosis_date.loc[diagnosis_date['person_id'] == 253410, ['diagnosis_date', 'condition_poa']]
for diagnosed_on, poa_value in patient_diagnoses.values:
    print(diagnosed_on, poa_value)

In [None]:
diagnosis_date[diagnosis_date['person_id'] == 212715]

In [None]:
led_df[led_df['person_id'] == years_of_stay_person_id[0]].sort_values(by='drug_exposure_start_datetime')

In [None]:
led_df

In [None]:
mean_led_per_administration_date[mean_led_per_administration_date['person_id'] == years_of_stay_person_id[0]].sort_values(by='drug_exposure_start_datetime')

In [None]:
diagnosis_date[diagnosis_date['person_id'].isin(years_of_stay_person_id)]

In [None]:
# pd.options.display.max_rows = 100

In [None]:
diagnosis_date

In [None]:
led_df.loc[led_df['person_id'] == 209423, ['generic_name', 'drug_exposure_start_datetime', 'dosage']].head(20)

In [None]:
# Load the visits (I believe this file has the H&P notes) and parse the visit start as a datetime
visit_occurrence = pd.read_csv('data/CROSSOVER.csv').assign(
    visit_start_datetime=lambda visits: pd.to_datetime(visits["visit_start_datetime"], format='%Y-%m-%d')
)

# visit_occurrence_inpatient = pd.read_csv('data/VISIT_OCCURRENCE_INPATIENT_HOSPITAL.csv')
# visit_occurrence_inpatient["visit_start_date"] = pd.to_datetime(visit_occurrence["visit_start_date"], format='%Y-%m-%d')

# visit_occurrence_outpatient = pd.read_csv('data/VISIT_OCCURRENCE_OUTPATIENT_VISIT.csv')
# visit_occurrence_outpatient["visit_start_date"] = pd.to_datetime(visit_occurrence["visit_start_date"], format='%Y-%m-%d')

In [None]:
visit_occurrence

In [None]:
import matplotlib.lines as mlines

# One panel per long-stay patient: the mean LED per administration date over time, with vertical
# lines marking that patient's diagnosis dates and visits.
n_patients = len(years_of_stay_person_id)
# squeeze=False always returns an array of Axes; with the default, a single patient yields a bare
# Axes object and axes[i] fails. ravel() turns it back into a flat, one-per-patient array.
fig, axes = plt.subplots(n_patients, 1, figsize=(12, 6 * n_patients), squeeze=False)
axes = axes.ravel()

# The legend entries are the same for every panel, so they are built once
poa_legend = mlines.Line2D([], [], color='blue', linestyle='--', label='Present On Admission')
diagnosis_legend = mlines.Line2D([], [], color='red', linestyle='-.', label='Diagnosis Date')
visit_legend = mlines.Line2D([], [], color='gray', linestyle=':', label='Visit')

# Moving-average window for the optional smoothed curve; adjust to control the smoothing level
window_size = 5

for i, person_id in enumerate(years_of_stay_person_id):
    person_data = mean_led_per_administration_date[mean_led_per_administration_date['person_id'] == person_id]

    # Smoothed LED values; only used by the commented-out alternative plot call below
    smoothed_led = person_data['led'].rolling(window=window_size, min_periods=1).mean()

    ax = axes[i]

    # Plot the raw per-date means
    ax.plot(person_data['drug_exposure_start_datetime'], person_data['led'], marker='x', markersize=4, label='Data Points') #-> Normal
    # ax.plot(person_data['drug_exposure_start_datetime'], smoothed_led, marker='x', markersize=4, label='Data Points')
    ax.set_title(f'Person ID {person_id}')

    # One major tick per year, labelled as a full date
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.xaxis.set_major_locator(mdates.YearLocator())

    person_diagnoses = diagnosis_date.loc[diagnosis_date['person_id'] == person_id, ['diagnosis_date', 'condition_poa']]
    for date, condition_poa in person_diagnoses.values:
        # Explicit comparison on purpose: a missing (NaN) flag is truthy but must take the else branch
        if condition_poa == True:
            ax.axvline(x=date, color='blue', linestyle='--', alpha=0.7, linewidth=2)  # Customize color, linestyle, and alpha as needed
        # When only one dashed blue line shows, it is because the earliest date is the POA date
        else:
            ax.axvline(x=date, color='red', linestyle='-.', alpha=0.7, linewidth=2)  # Dash-dot line

    for another_date in visit_occurrence[visit_occurrence['person_id'] == person_id]['visit_start_datetime'].values:
        ax.axvline(x=another_date, color='gray', linestyle=':', alpha=0.7, linewidth=1)

    # Get the generic name for the current person_id
    # generic_name = led_df[led_df['person_id'] == person_id]['generic_name'].values[0]

    # Add the generic name to the legend
    # ax.legend([f'LED ({generic_name})'], loc='upper left')

    ax.legend(handles=[diagnosis_legend, poa_legend, visit_legend])

    ax.set_xlabel('Date')
    # The plotted values are means per administration date, not per year
    ax.set_ylabel('Mean LED per administration date')
    ax.grid(False)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

Patient 249161 is another one to follow up on (further analysis to be decided).

In [None]:
visit_occurrence['person_id'].nunique() == led_df['person_id'].nunique()

In [None]:
# Distinct person_ids having at least one visit with the given visit_concept_id.
# NOTE(review): 8717 / 8756 are presumably the inpatient / outpatient hospital concepts, as the
# variable names suggest — verify against the concept table.
visit_concept = visit_occurrence['visit_concept_id']
inpatient_hospital_ids = visit_occurrence.loc[visit_concept == 8717, 'person_id'].unique()
outpatient_hospital_ids = visit_occurrence.loc[visit_concept == 8756, 'person_id'].unique()

In [None]:
print(len(outpatient_hospital_ids))

In [None]:
led_df[led_df['person_id'].isin(outpatient_hospital_ids)]['generic_name'].unique()

Finding Home Medications

In [1200]:
from collections import Counter

# Extract (generic name, brand name) pairs from the "Home Medications" section
# of each clinical note in `corpus_clean`.
#
# Pipeline:
#   1. keep the notes that contain a "Home Medications" header;
#   2. drop the header match from the text that follows it;
#   3. collect phrases that look like the start of the *next* section
#      ("... Medications ...", "Allergies Allergen Reactions", "Family History")
#      and use them as stop phrases;
#   4. cut each note at its first stop phrase and decide whether the medication
#      list is comma- or asterisk-delimited;
#   5. split the list and pull "generic (BRAND)" pairs out of every entry.

# Delimiter tallies of the most recently processed medication list.
count_comma = 0
count_asterisc = 0
comma_delimiter = False
target_word = 'Home Medications'

# "<generic name> (<BRAND NAME>)": group 1 is the generic name, group 2 the brand.
pattern = r'([\w\s-]+)\s(?:\(([\w\s-]+)\)\s*)'
# Dose expressions such as "12.5 mg" or "5 ml/hr" (defined here, not used in this cell).
dosage = r'\d+(?:\.\d+)?(?:-\d+(?:\.\d+)?)*(?:\s*(?:mg/ml|mg|ml|%))(?:/hr)?'
home_meds_pattern = r"Home Medications:? ([\w\s\d\(\)]+)"

generic_names= []
brand_names = []
target = []
home_meds_note = []
home_meds_portion = []
medications_appearance = ""
bleh = []
after_home_meds = []
rest_note = []
# De-duplicate the notes while keeping their first-seen order. A plain set()
# would give a different order on every kernel restart, making `bleh`,
# `result` and `whats` irreproducible.
temp = list(dict.fromkeys(corpus_clean))

# Step 1: keep the notes that have a "Home Medications" header.
for index, item in enumerate(temp):
    match = re.search(home_meds_pattern, item)
    if match:
        word_index = item.find(target_word)  # position of the header inside the note
        # Everything from the header onwards; later headers containing
        # "Medications" are searched for in this tail.
        home_meds_note.append(item[word_index:])
        home_meds_portion.append(match.group(0))
        bleh.append(index)  # position of the note in `temp` (not in `corpus_clean`)

# Step 2: remove the matched header text from each tail.
result = [note.replace(portion, "") for note, portion in zip(home_meds_note, home_meds_portion)]

words = [word.split(' ') for word in result]

# Step 3: collect candidate stop phrases.
for i, item in enumerate(words):
    for j, word in enumerate(item):
        if "Medications" in word:
            # Up to two words of context on each side. Slicing clamps at both
            # ends of the note, so a hit in the first or last two words neither
            # wraps around via negative indexing nor raises IndexError.
            medications_appearance = " ".join(item[max(j - 2, 0):j + 3])
            target.append(medications_appearance)
        multi_word_phrase = " ".join(item[j:j+3])
        multi_word_phrase_other = " ".join(item[j:j+2])
        if 'Allergies Allergen Reactions' in multi_word_phrase:
            target.append(multi_word_phrase)

        if 'Family History' in multi_word_phrase_other:
            target.append(multi_word_phrase_other)

# Unique stop phrases, most frequent first (ties broken alphabetically).
element_counts = Counter(target)
sorted_target = sorted(target, key=lambda x: (-element_counts[x], x))
target_unique = list(dict.fromkeys(sorted_target))

# Step 4: truncate every note at the first stop phrase it contains.
# `whats` and `ble` are parallel lists: ble[k] is True when whats[k] has more
# commas than asterisks, i.e. the list should be split on ','.
# Notes that contain no stop phrase are skipped.
whats = []
ble = []
for item in result:
    for stop_word in target_unique:
        word_index = item.find(stop_word)
        if word_index == -1:
            continue
        epa = item[:word_index]
        count_comma = epa.count(',')
        count_asterisc = epa.count('*')
        comma_delimiter = count_comma > count_asterisc
        whats.append(epa)
        ble.append(comma_delimiter)
        break

# Step 5: split each medication list and extract "generic (BRAND)" pairs.
# NOTE: the pattern also matches free text followed by a parenthesis, e.g.
# dosing instructions such as "... 2 times daily (after meals)".
for item, uses_comma in zip(whats, ble):
    delimiter = ',' if uses_comma else '*'
    potential_drugs = item.split(delimiter)
    for string in potential_drugs:
        drug_name_match = re.findall(pattern, string)
        if drug_name_match:
            # Tuples are immutable, so the cleaned name is rebuilt separately.
            for match in drug_name_match:
                if "Authorizing Provider" in match[0]:
                    temp_match = match[0].replace("Authorizing Provider", "")
                elif "Historical" in match[0]:
                    temp_match = match[0].replace("Historical", "")
                else:
                    temp_match = match[0]
                generic_name, brand_name = temp_match, match[1]
                generic_names.append(generic_name)
                if brand_name:
                    brand_names.append(brand_name)
                else:
                    brand_names.append(np.nan)

(' amLODIPine', 'NORVASC') (' amLODIPine', 'NORVASC')
(' atorvastatin', 'LIPITOR') (' atorvastatin', 'LIPITOR')
(' benztropine', 'COGENTIN') (' benztropine', 'COGENTIN')
(' carvedilol', 'COREG') (' carvedilol', 'COREG')
(' haloperidol', 'HALDOL') (' haloperidol', 'HALDOL')
(' insulin glargine', 'LANTUS SOLOSTAR') (' insulin glargine', 'LANTUS SOLOSTAR')
(' LIQD Take 60 mLs by mouth 2 times daily', 'after meals') (' LIQD Take 60 mLs by mouth 2 times daily', 'after meals')
(' ticagrelor', 'BRILINTA') (' ticagrelor', 'BRILINTA')
(' Authorizing Provider amitriptyline', 'ELAVIL') ('  amitriptyline', 'ELAVIL')
('  apixaban', 'ELIQUIS') ('  apixaban', 'ELIQUIS')
('  budesonide-formoterol', 'SYMBICORT') ('  budesonide-formoterol', 'SYMBICORT')
('  carbidopa-levodopa', 'SINEMET') ('  carbidopa-levodopa', 'SINEMET')
('  cinacalcet', 'SENSIPAR') ('  cinacalcet', 'SENSIPAR')
('90 MG Tablet Take 1 tablet by mouth daily', 'with dinner') ('90 MG Tablet Take 1 tablet by mouth daily', 'with dinner')
('

len(brand_names)

In [1101]:
# Split the first truncated medication list on '*' to inspect its individual entries.
whats[0].split('*')

['',
 ' amLODIPine (NORVASC) 5 MG tablet Take 5 mg by mouth daily. ',
 ' aspirin 81 MG tablet Take  by mouth daily. ',
 ' atorvastatin (LIPITOR) 20 MG tablet Take 1 Tablet by mouth nightly. Indications: Type II B Hyperlipidemia  30 Tablet  0 ',
 ' benztropine (COGENTIN) 1 MG tablet Take 1 Tablet by mouth 2 times daily. Indications: Extrapyramidal Reaction caused by Medications  60 Tablet  0 ',
 ' carvedilol (COREG) 12.5 MG tablet Take 12.5 mg by mouth 2 times daily. ',
 ' haloperidol (HALDOL) 10 MG tablet Take 1 Tablet by mouth 2 times daily. Indications: Schizophrenia  60 Tablet  0 ',
 ' insulin glargine (LANTUS SOLOSTAR) 100 UNIT/ML injection 20 units SQ in the AM and 25 units SQ in the PM  15 mL  2 ',
 ' lisinopril (PRINIVIL,ZESTRIL) 20 MG tablet Take 20 mg by mouth daily. ',
 ' potassium chloride (K-DUR,KLOR-CON) 20 MEQ tablet Take 20 mEq by mouth daily. ',
 ' protein (PROTEINEX, PRO-STAT) LIQD Take 60 mLs by mouth 2 times daily (after meals).  900 mL  3 ',
 ' ticagrelor (BRILINTA)

In [1110]:
# Regex matches found for the last entry processed by the extraction loop.
drug_name_match

[]

In [1085]:
# Medication-list text of each note, cut at the first stop phrase found.
whats

['* amLODIPine (NORVASC) 5 MG tablet Take 5 mg by mouth daily. * aspirin 81 MG tablet Take  by mouth daily. * atorvastatin (LIPITOR) 20 MG tablet Take 1 Tablet by mouth nightly. Indications: Type II B Hyperlipidemia  30 Tablet  0 * benztropine (COGENTIN) 1 MG tablet Take 1 Tablet by mouth 2 times daily. Indications: Extrapyramidal Reaction caused by Medications  60 Tablet  0 * carvedilol (COREG) 12.5 MG tablet Take 12.5 mg by mouth 2 times daily. * haloperidol (HALDOL) 10 MG tablet Take 1 Tablet by mouth 2 times daily. Indications: Schizophrenia  60 Tablet  0 * insulin glargine (LANTUS SOLOSTAR) 100 UNIT/ML injection 20 units SQ in the AM and 25 units SQ in the PM  15 mL  2 * lisinopril (PRINIVIL,ZESTRIL) 20 MG tablet Take 20 mg by mouth daily. * potassium chloride (K-DUR,KLOR-CON) 20 MEQ tablet Take 20 mEq by mouth daily. * protein (PROTEINEX, PRO-STAT) LIQD Take 60 mLs by mouth 2 times daily (after meals).  900 mL  3 * ticagrelor (BRILINTA) 90 MG tablet Take 1 Tablet by mouth 2 times

In [1062]:
# Note text from the "Home Medications" header onwards, with the header match removed.
result

['* amLODIPine (NORVASC) 5 MG tablet Take 5 mg by mouth daily. * aspirin 81 MG tablet Take  by mouth daily. * atorvastatin (LIPITOR) 20 MG tablet Take 1 Tablet by mouth nightly. Indications: Type II B Hyperlipidemia  30 Tablet  0 * benztropine (COGENTIN) 1 MG tablet Take 1 Tablet by mouth 2 times daily. Indications: Extrapyramidal Reaction caused by Medications  60 Tablet  0 * carvedilol (COREG) 12.5 MG tablet Take 12.5 mg by mouth 2 times daily. * haloperidol (HALDOL) 10 MG tablet Take 1 Tablet by mouth 2 times daily. Indications: Schizophrenia  60 Tablet  0 * insulin glargine (LANTUS SOLOSTAR) 100 UNIT/ML injection 20 units SQ in the AM and 25 units SQ in the PM  15 mL  2 * lisinopril (PRINIVIL,ZESTRIL) 20 MG tablet Take 20 mg by mouth daily. * potassium chloride (K-DUR,KLOR-CON) 20 MEQ tablet Take 20 mEq by mouth daily. * protein (PROTEINEX, PRO-STAT) LIQD Take 60 mLs by mouth 2 times daily (after meals).  900 mL  3 * ticagrelor (BRILINTA) 90 MG tablet Take 1 Tablet by mouth 2 times

In [1063]:
# Same inspection as above: the truncated medication-list text per note.
whats

['* amLODIPine (NORVASC) 5 MG tablet Take 5 mg by mouth daily. * aspirin 81 MG tablet Take  by mouth daily. * atorvastatin (LIPITOR) 20 MG tablet Take 1 Tablet by mouth nightly. Indications: Type II B Hyperlipidemia  30 Tablet  0 * benztropine (COGENTIN) 1 MG tablet Take 1 Tablet by mouth 2 times daily. Indications: Extrapyramidal Reaction caused by Medications  60 Tablet  0 * carvedilol (COREG) 12.5 MG tablet Take 12.5 mg by mouth 2 times daily. * haloperidol (HALDOL) 10 MG tablet Take 1 Tablet by mouth 2 times daily. Indications: Schizophrenia  60 Tablet  0 * insulin glargine (LANTUS SOLOSTAR) 100 UNIT/ML injection 20 units SQ in the AM and 25 units SQ in the PM  15 mL  2 * lisinopril (PRINIVIL,ZESTRIL) 20 MG tablet Take 20 mg by mouth daily. * potassium chloride (K-DUR,KLOR-CON) 20 MEQ tablet Take 20 mEq by mouth daily. * protein (PROTEINEX, PRO-STAT) LIQD Take 60 mLs by mouth 2 times daily (after meals).  900 mL  3 * ticagrelor (BRILINTA) 90 MG tablet Take 1 Tablet by mouth 2 times

In [1038]:
# De-duplicated stop phrases, ordered by descending frequency and then alphabetically.
target_unique

['Current Hospital Medications: * carbidopa-levodopa',
 'daily. Inpatient Medications: * benztropine',
 'Current Inpatient Medications: Scheduled: *',
 '1433 Facility-Administered Medications Ordered in',
 'package. Inpatient Medications: * Adult',
 'Current Facility-Administered Medications: ?* ',
 'daily. Inpatient Medications: ?* acetaminophen',
 'Current Hospital Medications: * aspirin',
 'days. Inpatient Medications: Current Facility-Administered',
 'of hyperglygemia Medications: Insulin NPH',
 'Current Facility-Administered Medications: * ',
 'Current Hospital Medications: Current Facility-Administered',
 ' Inpatient Medications: Scheduled: ',
 'linezolid.  Medications reviewed and',
 'of hyperglycemia. Medications: Regular Insulin.',
 'Management: Current Medications: 70/30 mix',
 'bedtime. Current Medications: * acetaminophen',
 'Current Hospital Medications: * amiodarone',
 'Antimicrobials: Hospital Medications  ',
 'of Staphylococcus Medications: * amLODIPine',
 ' Hospital Me

In [1039]:
# All collected stop-phrase candidates, duplicates included.
target

['Current Hospital Medications: Scheduled: *',
 'Current Hospital Medications: Scheduled: *',
 'Current Hospital Medications: * amantadine',
 ' Inpatient Medications: Scheduled: *',
 'Current Hospital Medications: * aspirin',
 'Current Inpatient Medications: Scheduled: *',
 'Current Hospital Medications: Scheduled: *',
 'Current Hospital Medications: Scheduled: Allergies',
 'Current Hospital Medications: Scheduled: *',
 ' Inpatient Medications: Scheduled: *',
 'Current Inpatient Medications: Scheduled: *',
 'Current Inpatient Medications: Scheduled: Continuous',
 'Current Hospital Medications: Scheduled: *',
 'Current Inpatient Medications: Scheduled: *',
 'Current Hospital Medications: Scheduled: *',
 '-Administered Medications Medication Dose',
 'Current Hospital Medications: Scheduled: *',
 'Current Hospital Medications: Scheduled: *',
 'Current Hospital Medications: @MEDSSCHEDULEDANE@ @MEDSINFUSIONSANE@',
 ' Inpatient Medications: Scheduled: *',
 'Current Hospital Medications: Sche

In [985]:
# Clear the stop-phrase collections before re-running the extraction cell.
# They are reset to empty lists rather than 0 so the names keep their type and
# later cells that test membership (`item in target`) still run.
target = []
target_unique = []

In [987]:
# Inspect the current value of target_unique.
target_unique

['Allergies Allergen Reactions',
 'Family History',
 'Current Hospital Medications: * carbidopa-levodopa',
 'daily. Inpatient Medications: * benztropine',
 'Current Inpatient Medications: Scheduled: *',
 '1433 Facility-Administered Medications Ordered in',
 'package. Inpatient Medications: * Adult',
 'Current Facility-Administered Medications: ?* ',
 'daily. Inpatient Medications: ?* acetaminophen',
 'Current Hospital Medications: * aspirin',
 'days. Inpatient Medications: Current Facility-Administered',
 'of hyperglygemia Medications: Insulin NPH',
 'Current Facility-Administered Medications: * ',
 'Current Hospital Medications: Current Facility-Administered',
 ' Inpatient Medications: Scheduled: ',
 'linezolid.  Medications reviewed and',
 'of hyperglycemia. Medications: Regular Insulin.',
 'Management: Current Medications: 70/30 mix',
 'bedtime. Current Medications: * acetaminophen',
 'Current Hospital Medications: * amiodarone',
 'Antimicrobials: Hospital Medications  ',
 'of Staph

In [None]:
# Keep the leading entries of `result` up to, but not including, the first
# entry that is itself one of the collected stop phrases in `target`.
cutoff = next(
    (position for position, text in enumerate(result) if text in target),
    len(result),
)
extracted_strings = list(result[:cutoff])

In [None]:
# Inspect the entries kept by the previous cell.
extracted_strings

In [None]:
# Each entry of `result` split on single spaces.
# NOTE(review): this displays the full token lists of every note; consider a slice such as words[:5].
words

In [None]:
# Note text from the "Home Medications" header to the end of each note.
# NOTE(review): this displays full note text; consider a slice before sharing the notebook.
home_meds_note

In [None]:
# The same tails with the matched "Home Medications" header text removed.
result

In [None]:
# Collect every distinct piece of text matched by the "Home Medications"
# header pattern across the entries of `notes_dict`.
pattern = r"Home Medications:? ([\w\s\d\(\)]+)"

unique_variations = set()

for variation in notes_dict:
    match = re.search(pattern, variation)
    if match is None:
        continue
    matched_text = match.group(0)
    print(matched_text)
    unique_variations.add(matched_text)

# List form of the set, for indexing and display.
unique_variations_list = [*unique_variations]

In [None]:
# Distinct "Home Medications" matches collected in the previous cell.
unique_variations_list

In [None]:
# Distinct items obtained by iterating notes_dict.
# NOTE(review): if notes_dict is a dict this iterates its keys — confirm that is intended.
list(set(notes_dict))

In [None]:
# First six positions (within `temp`) of notes that contain a "Home Medications" header.
bleh[:6]

Allergies Allergen Reactions, Physical Exam, Family History, Allergies Allergen Reactions

In [None]:
# `bleh` stores positions in `temp` (the de-duplicated note list built in the
# extraction cell), so the lookup must go through `temp`, not `corpus_clean`.
temp[bleh[60]]

In [None]:
# Distinct notes in the corpus.
# NOTE(review): this displays every unique note in full; consider len(set(corpus_clean)) or a small sample instead.
set(corpus_clean)

In [None]:
# Bar plot of the share of visits per visit type (log-scaled y axis).
# Each bar is annotated with the concept domain ("Visit", "Procedure", ...).

# Distinct visit concept ids present in the data (not used by the plot itself).
unique_visits = visit_occurrence['visit_concept_id'].unique()

# visit_concept_id -> "<name> - <domain>"
label = {
    8717: 'Inpatient Hospital - Visit',
    8756: 'Outpatient Hospital - Visit',
    4004517: 'Ambulatory surgery - Procedure',
    8870: 'Emergency Room (Hospital) - Visit',
    581477: 'Office Visit - Visit',
    4203722: 'Patient encounter procedure - Procedure',
    44791812: 'Clinical support - Procedure',
    38004515: 'Hospital - Visit',
    0: 'Ungroupable - MDC',
    9202: 'Outpatient Visit - Visit'
}
value_counts = visit_occurrence['visit_concept_id'].value_counts(normalize=True)

# Build the labels in the same order as `value_counts` (most frequent first).
# Using the dict's own order would attach names to the wrong bars and fail
# whenever the data does not contain exactly the ids listed above.
visit_label = []
visit_type = []
for concept_id in value_counts.index:
    description = label.get(concept_id, f'Unknown ({concept_id}) - Unknown')
    name, _, kind = description.partition('-')
    visit_label.append(name.strip())
    visit_type.append(kind.strip())

# Create a bar plot
plt.figure(figsize=(10, 6))  # Adjust the figure size if needed
bars = plt.bar(visit_label, value_counts)

# Adding labels and title
plt.xlabel('visit_concept_id')
plt.ylabel('Normalized Count (Log scale)')
plt.title('Distribution of Patient encounters')
plt.yscale('log')

for bar, v_type in zip(bars, visit_type):
    x_pos = bar.get_x() + bar.get_width() / 2  # horizontal centre of the bar
    y_pos = bar.get_height() / 0.9  # slightly above the bar top on the log axis
    plt.text(x_pos, y_pos, v_type, ha='center')

# Show the plot
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Summary statistics of the number of distinct visit start times per patient.
(
    visit_occurrence
    .groupby('person_id')['visit_start_datetime']
    .nunique()
    .describe()
)

In [None]:
# First 15 visit rows (start time and visit id) recorded for patient 175605.
visit_occurrence.loc[
    visit_occurrence['person_id'] == 175605,
    ['visit_start_datetime', 'visit_occurrence_id'],
].head(15)

In [None]:
# Number of entries in years_of_stay_person_id (defined in an earlier cell).
len(years_of_stay_person_id)

In [None]:
# Order the LED records chronologically by drug exposure start (oldest first).
led_df = led_df.sort_values('drug_exposure_start_datetime')

In [None]:
# Diagnosis date, exposure start and drug name for patient 258899.
# This person probably had restless leg syndrome or Parkinsonism.
led_df.loc[
    led_df['person_id'] == 258899,
    ['diagnosis_date', 'drug_exposure_start_datetime', 'generic_name'],
]

In [None]:
# Inspect mean_led_per_administration_date (computed in an earlier cell).
mean_led_per_administration_date

In [None]:
# Summary statistics of mean_led_per_administration_date.
mean_led_per_administration_date.describe()

In [None]:
# Number of distinct visits per patient in the LED table, largest first.
(
    led_df
    .groupby('person_id')['visit_occurrence_id']
    .nunique()
    .sort_values(ascending=False)
)

In [None]:
# Look up drug_source_value at index label 0.
# NOTE(review): after the sort above, label 0 is not necessarily the first row; use .iloc[0] if the first row is intended — confirm.
led_df['drug_source_value'][0]