# Characterization of Parkinson's Disease through clustering of Medical notes

## Introduction

### Importing packages

In [None]:
import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.tokenize import RegexpTokenizer
from gap_statistic import OptimalK

## Data Pre-processing

### Exploring the data

In [None]:
# Location of the CSV export holding the clinical notes.
path = "data/PD.csv"
# Maximum number of rows per chunk handed back by the CSV reader.
chunksize = 50_000_000
# With `chunksize` set, read_csv returns an iterator of DataFrames instead of
# a single frame. Using it as a context manager guarantees the underlying
# file handle is released even if parsing fails part-way through.
# NOTE: concatenating every chunk still materialises the whole file in memory.
with pd.read_csv(path, chunksize=chunksize) as chunk:
    df = pd.concat(chunk)
# Display ten random rows as a quick sanity check.
df.sample(10)

Unnamed: 0,person_id,birth_datetime,gender_source_value,race_source_value,ethnicity_source_value,note_title,note_text
0,271,1932-02-09 00:00:00,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"""[**NAME**], MD [**DATE**] 9:36 AM Geriat..."
1,1798,1928-06-16 00:00:00,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"[**NAME**], MD [**DATE**] 2:57 PM Departm..."
2,1798,1928-06-16 00:00:00,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"[**NAME**], MD [**DATE**] 7:32 PM Departm..."
3,1798,1928-06-16 00:00:00,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"""[**NAME**], MD [**DATE**] 10:44 PM Geriat..."
4,1798,1928-06-16 00:00:00,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"""[**NAME**], MD [**DATE**] 5:13 PM ..."
...,...,...,...,...,...,...,...
4242,258985,1946-12-18 00:00:00,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"[**NAME**], MD [**DATE**] 7:36 AM Departm..."
4243,258985,1946-12-18 00:00:00,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"[**NAME**], RN [**DATE**] 12:12 PM Referra..."
4244,258985,1946-12-18 00:00:00,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"""[**NAME**], PA-C [**DATE**] 7:11 PM Depa..."
4245,258985,1946-12-18 00:00:00,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"[**NAME**], MD [**DATE**] 10:26 PM Pt seen..."


### Calculating the age of each patient from their birth date

In [None]:
# Parse the birth dates. No explicit `format` is given: the stored values
# include a time part (e.g. "1932-02-09 00:00:00"), which does not match a
# date-only '%Y-%m-%d' pattern.
df["birth_datetime"] = pd.to_datetime(df["birth_datetime"])

# Compute the age in completed calendar years as of today.
# Casting a timedelta to 'timedelta64[Y]' is not supported by pandas >= 2.0
# (only s/ms/us/ns resolutions are allowed) and, where it did work, it divided
# by an average year length rather than counting birthdays.
today = pd.Timestamp("now")
birth = df["birth_datetime"].dt
# True for patients whose birthday has not yet occurred this calendar year.
birthday_pending = (birth.month > today.month) | (
    (birth.month == today.month) & (birth.day > today.day)
)
df["age"] = today.year - birth.year - birthday_pending.astype(int)
# Store the age in the smallest integer dtype that fits (stays float if any
# birth date is missing).
df["age"] = pd.to_numeric(df["age"], downcast="integer")

# Keep only the columns used from here on, in a fixed order. This also drops
# "birth_datetime", which is no longer needed once the age is known.
df = df[['person_id', 'age', 'gender_source_value', 'race_source_value', 'ethnicity_source_value', 'note_title', 'note_text']]
df

Unnamed: 0,person_id,age,gender_source_value,race_source_value,ethnicity_source_value,note_title,note_text
0,271,90,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"""[**NAME**], MD [**DATE**] 9:36 AM Geriat..."
1,1798,94,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"[**NAME**], MD [**DATE**] 2:57 PM Departm..."
2,1798,94,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"[**NAME**], MD [**DATE**] 7:32 PM Departm..."
3,1798,94,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"""[**NAME**], MD [**DATE**] 10:44 PM Geriat..."
4,1798,94,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"""[**NAME**], MD [**DATE**] 5:13 PM ..."
...,...,...,...,...,...,...,...
4242,258985,75,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"[**NAME**], MD [**DATE**] 7:36 AM Departm..."
4243,258985,75,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"[**NAME**], RN [**DATE**] 12:12 PM Referra..."
4244,258985,75,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"""[**NAME**], PA-C [**DATE**] 7:11 PM Depa..."
4245,258985,75,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,"[**NAME**], MD [**DATE**] 10:26 PM Pt seen..."


#### Some statistics on the age of the patients

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the patients' ages.
df['age'].describe()

count    4247.000000
mean       78.561808
std        11.600322
min        26.000000
25%        65.000000
50%        86.000000
75%        88.000000
max        99.000000
Name: age, dtype: float64

### Creating the corpus from medical notes 

#### Using the note_text column for raw data

In [None]:
# Pull the note bodies out of the DataFrame as a plain Python list (one entry per row).
corpus_raw = df["note_text"].tolist()

#### Removing boilerplate phrases and de-identification placeholders that carry little meaning

In [None]:
# Boilerplate headers, signatures, credentials and de-identification
# placeholders to strip from every note.
# NOTE(review): matching is case-sensitive and by plain substring, so short
# entries also hit inside longer words (e.g. "agree" inside "disagree",
# "TECH" inside "TECHNIQUE"). Confirm this is intended.
words_to_remove = [
    "Department of Neurosurgery Date of Consult",
    "Department of Orthopedics Consultation Note Date of Consult",
    "Geriatric Medicine Consult Date of Consult",
    "INPATIENT MEDICAL NUTRITION THERAPY",
    "MSW",
    "RN",
    "evidence",
    "Read By",
    "images",
    "report",
    "concur",
    "findings",
    "agree",
    "seen",
    "residents",
    "resident",
    "Resident",
    "unspecified provider",
    "Released Date Time",
    "personally reviewed",
    "D.O",
    "MD",
    "M.D.",
    "Electronically Verified By",
    "NAME:",
    "[**NAME**]",
    "EXAM DATE:",
    "[**DATE**]",
    "LOC:",
    "[**LOCATION_INSTITUTE**]",
    "[**LOCATION_STREET**]",
    "[**LOCATION_ZIP**]",
    "[**LOCATION_CITY**]",
    "[**CONTACT_PHONE**]",
    "[**LOCATION_OTHER**]",
    "MRN:",
    "[**ID**]",
    "DOB:",
    "** VERIFIED **",
    "ORDERING MD:",
    "ORDER:",
    "ORD. SERVICE:",
    "ORD. LOC:",
    "TECH",
    "RMS# / INV#:",
]

# Optional (disabled): also strip wordcloud's generic English stop words.
# words_to_remove = words_to_remove + list(STOPWORDS)


def _strip_phrases(text, phrases):
    """Return *text* with every occurrence of each phrase in *phrases* removed."""
    for phrase in phrases:
        text = text.replace(phrase, "")
    return text


# Apply longer phrases first. Otherwise a short entry such as "MD" or "LOC:"
# is removed before "ORDERING MD:" / "ORD. LOC:" get a chance to match, and
# fragments like "ORDERING :" are left behind in the text.
_phrases_longest_first = sorted(words_to_remove, key=len, reverse=True)
corpus_clean = [_strip_phrases(note, _phrases_longest_first) for note in corpus_raw]

In [None]:
# Show the note at positional row 989 as currently stored in the DataFrame
# (still the raw text at this point), for comparison with its cleaned version.
df['note_text'].iloc[989]

"[**NAME**], MSW     [**DATE**] 11:22 AM Consult received.  Patient's preference is for Lakeside or St [**NAME**]"

#### Converting each note in the cleaned corpus to lowercase

In [None]:
# Lowercase every cleaned note (each element is a whole note, not a single word).
corpus = list(map(str.lower, corpus_clean))
# Spot-check note 989 after cleaning and lowercasing.
corpus[989]

",       11:22 am consult received.  patient's preference is for lakeside or st "

#### Adding the pre-processed version of the notes to the DataFrame

In [None]:
# Overwrite the raw notes with their cleaned, lowercased versions.
# `corpus` was built row by row from this same column, so it lines up with
# the DataFrame's rows positionally.
df["note_text"] = corpus
# Display the updated DataFrame.
df

Unnamed: 0,person_id,age,gender_source_value,race_source_value,ethnicity_source_value,note_title,note_text
0,271,90,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,""", 9:36 am : current patient location:..."
1,1798,94,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,", 2:57 pm department of radiology divis..."
2,1798,94,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,", 7:32 pm department of surgery date o..."
3,1798,94,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,""", 10:44 pm : subjective: reason for co..."
4,1798,94,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,""", 5:13 pm rheumatology in..."
...,...,...,...,...,...,...,...
4242,258985,75,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,", 7:36 am : requesting physician: s..."
4243,258985,75,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,", 12:12 pm referral made to wellcare con..."
4244,258985,75,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,""", pa-c 7:11 pm : subjective: reason ..."
4245,258985,75,FEMALE,WHITE,NOT HISPANIC,order_narative: CONSULT,", 10:26 pm pt and examined. full h&p t..."
