## MIMIC NLP Assignment
### Read the MIMIC-III files ([DIAGNOSES_ICD](https://physionet.org/content/mimiciii/1.4/DIAGNOSES_ICD.csv.gz), [NOTEEVENTS](https://physionet.org/content/mimiciii/1.4/NOTEEVENTS.csv.gz), [D_ICD_DIAGNOSES](https://physionet.org/content/mimiciii/1.4/D_ICD_DIAGNOSES.csv.gz))

Note: After download, make sure to unzip the files. 

1. Install pandas

In [None]:
! pip install pandas

2. Read the downloaded files into pandas DataFrames.

In [None]:
import pandas as pd

# Load the three unzipped MIMIC-III CSV files from the current working directory.
# diag_df: contents of DIAGNOSES_ICD.csv (the ICD9_CODE and SUBJECT_ID columns are used below).
diag_df = pd.read_csv('DIAGNOSES_ICD.csv')
# notes_df: contents of NOTEEVENTS.csv (joined to the diagnoses on SUBJECT_ID below).
notes_df = pd.read_csv('NOTEEVENTS.csv')
# d_diag_df: contents of D_ICD_DIAGNOSES.csv.
d_diag_df = pd.read_csv('D_ICD_DIAGNOSES.csv')

3. Find the most commonly diagnosed disease and convert it to a list

In [None]:
# Print the column names, dtypes and non-null counts of the diagnoses table.
diag_df.info()

# How many of the most frequent ICD-9 codes to keep; 1 keeps only the single
# most common diagnosis.
TOP_N_DISEASES = 1

# value_counts() tallies the rows per ICD9_CODE; nlargest() keeps the highest tallies.
disease = diag_df['ICD9_CODE'].value_counts().nlargest(TOP_N_DISEASES)

# The index of the tally Series holds the codes themselves.
disease_list = disease.index.tolist()
print(disease_list)

4. Create a new CSV (disease_notes.csv) file with the notes belonging to the disease list

In [None]:
# Keep only the diagnosis rows whose code is in the selected disease list.
has_target_code = diag_df['ICD9_CODE'].isin(disease_list)
disease_df = diag_df[has_target_code]

# Attach the note rows that share a SUBJECT_ID with the selected diagnoses.
# NOTE(review): the join key is SUBJECT_ID alone, so every note row with that
# SUBJECT_ID is attached to every matching diagnosis row — confirm this is the
# intended granularity.
merged_df = disease_df.merge(notes_df, on='SUBJECT_ID', how='inner')

# NOTE(review): dropna() without `subset` discards a row when ANY column is
# missing, not only the note text — confirm that this filter is intended.
merged_df = merged_df.dropna()

merged_df.to_csv('disease_notes.csv', index=False)

## Spacy Assignment

- Install spacy and the model

In [None]:
! pip install -U pip setuptools wheel
! pip install -U spacy
! python -m spacy download en_core_web_sm

- Preprocessing function for the clinical notes to reduce the noise. 

In [None]:

import re
import string

def preprocess1(x):
    """Strip boilerplate and noise from a single clinical-note string.

    The literal patterns ('dr.', 'admission date:', ...) are lower-case and
    matched case-sensitively, so they only fire on text that has already
    been lower-cased.

    Args:
        x: Text of one note.

    Returns:
        The cleaned text: substitutions applied in order, all digits removed,
        and every run of whitespace collapsed to one space with no leading
        or trailing whitespace.
    """
    # Raw strings keep the regex escapes out of Python's own string-escape
    # handling, which warns about unknown escapes in ordinary literals.
    substitutions = (
        (r'\[(.*?)\]', ''),        # remove de-identified brackets
        (r'[0-9]+\.', ''),         # remove "1." "2." since the segmenter segments based on this
        (r'dr\.', 'doctor'),       # expand the abbreviation so its period is gone
        (r'm\.d\.', 'md'),
        (r'admission date:', ''),
        (r'discharge date:', ''),
        (r'--|__|==', ''),         # drop separator/underline pairs
    )

    y = x
    for pattern, replacement in substitutions:
        y = re.sub(pattern, replacement, y)

    # Remove all remaining digits, then normalise whitespace.
    y = y.translate(str.maketrans("", "", string.digits))
    return " ".join(y.split())

def preprocessing(df_notes):
    """Clean the 'TEXT' column of a notes DataFrame.

    Missing texts become a single space, newlines and carriage returns are
    replaced by spaces, each text is stripped and lower-cased, and the result
    is passed through ``preprocess1``.

    Args:
        df_notes: DataFrame with a 'TEXT' column; it is modified in place.

    Returns:
        The same DataFrame object, with the cleaned 'TEXT' column.
    """
    text = df_notes['TEXT'].fillna(' ')

    # Flatten line breaks so each note becomes a single line.
    for line_break in ('\n', '\r'):
        text = text.str.replace(line_break, ' ')

    text = text.apply(str.strip).str.lower()

    df_notes['TEXT'] = text.apply(preprocess1)
    return df_notes

- Load the model and read the medical notes from the disease_notes.csv file into the untrained model, after running it through the preprocessing function

In [None]:
import spacy
import pandas as pd

# Load the en_core_web_sm pipeline that was downloaded in the install cell.
nlp = spacy.load("en_core_web_sm")
# Re-read the exported notes and clean their TEXT column with preprocessing().
merged_notes_df = pd.read_csv('disease_notes.csv')
merged_notes_df = preprocessing(merged_notes_df)
# Plain Python list of cleaned note strings, one entry per CSV row.
notes = merged_notes_df['TEXT'].tolist()
print(len(notes))

- Print each note

In [None]:
# Parse every note and keep the resulting Doc objects in `doc` for the cells below.
# nlp.pipe() streams the texts through the pipeline in batches and yields the
# Docs in input order, avoiding one separate pipeline call per note.
doc = []
for parsed_note in nlp.pipe(notes):
    doc.append(parsed_note)
    print(parsed_note)
    print('*************************************************************************************************************')

- Tokenisation

In [None]:
# Print each token of each parsed note next to its part-of-speech tag (token.pos_).
for parsed_note in doc:
    for token in parsed_note:
        print(token.text, token.pos_)
    print('**************************************************')

In [None]:
# Per note, collect the verbatim token texts, leaving out punctuation and whitespace tokens.
token_without_punct = []
for parsed_note in doc:
    kept_tokens = [
        token.orth_
        for token in parsed_note
        if not token.is_punct and not token.is_space
    ]
    token_without_punct.append(kept_tokens)
    print(kept_tokens)
    print('*******************************************************************************************')

- Named Entity Recognition (NER)

In [None]:
# Named Entity Recognition: list each entity with its character span and label.
for parsed_note in doc:
    for entity in parsed_note.ents:
        print(entity.text, entity.start_char, entity.end_char, entity.label_)
    print('**************************************************')


- Entity Visualization

In [None]:
# Entity Visualizer
from spacy import displacy
for i in range(len(doc)):
  displacy.render(doc[i], style="ent", jupyter=True)
  print('*********************************************************************************************************************************************************************')

In [None]:
# Sentence segmentation: number the sentences of each note starting from 1.
for parsed_note in doc:
    for sentence_number, sentence in enumerate(parsed_note.sents, start=1):
        print(f"Sentence number {sentence_number}:{sentence}")
    print('*******************************************************************************************')

In [None]:
# Dependency tree: pass the list of sentence spans of each note to displaCy's "dep" renderer.
for parsed_note in doc:
    displacy.render(list(parsed_note.sents), style="dep", jupyter=True)

## SciSpacy Assignment

- Install scispacy and the models

In [None]:
# Install scispaCy and its models.
# NOTE(review): every model archive below is pinned to release v0.5.4, while
# spacy (-U, i.e. latest) and scispacy are installed without a version pin —
# confirm that the versions pip resolves are compatible with the v0.5.4 models.
! pip install -U spacy
! pip install scispacy

# Model packages; en_core_sci_md and en_ner_bc5cdr_md are the ones imported further down.
! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz
! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_craft_md-0.5.4.tar.gz
! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_jnlpba_md-0.5.4.tar.gz
! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz
! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bionlp13cg_md-0.5.4.tar.gz
! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz

- Load the untrained model, read the clinical notes and preprocess them from disease_notes.csv

In [None]:
import spacy
import pandas as pd
import en_core_sci_md

# Replace `nlp` with the en_core_sci_md pipeline installed above.
nlp = en_core_sci_md.load()
# Re-read the exported notes from disk.
merged_notes_df = pd.read_csv('disease_notes.csv')

# Clean the TEXT column with the preprocessing() helper defined earlier.
merged_notes_df = preprocessing(merged_notes_df)

# Plain Python list of cleaned note strings, one entry per CSV row.
notes = merged_notes_df['TEXT'].tolist()
print(len(notes))



- Print the notes

In [None]:
# Parse every note with the currently loaded pipeline and keep the Docs in `doc`.
# nlp.pipe() processes the texts in batches and yields the Docs in input order,
# avoiding one separate pipeline call per note.
doc = []
for parsed_note in nlp.pipe(notes):
    doc.append(parsed_note)
    print(parsed_note)
    print('*************************************************************************************************************')

- Tokenization

In [None]:
# Print each token of each parsed note next to its part-of-speech tag (token.pos_).
for parsed_note in doc:
    for token in parsed_note:
        print(token.text, token.pos_)
    print('**************************************************')

In [None]:
# Per note, collect the verbatim token texts, leaving out punctuation and whitespace tokens.
token_without_punct = []
for parsed_note in doc:
    kept_tokens = [
        token.orth_
        for token in parsed_note
        if not token.is_punct and not token.is_space
    ]
    token_without_punct.append(kept_tokens)
    print(kept_tokens)
    print('*******************************************************************************************')

- Named Entity Recognition (NER)

In [None]:
# Re-parse the notes and list each entity with its character span and label.
# nlp.pipe() batches the texts instead of invoking the pipeline once per note;
# `doc` is rebuilt so the visualisation cell that follows uses these Docs.
doc = []
for parsed_note in nlp.pipe(notes):
    doc.append(parsed_note)
    for entity in parsed_note.ents:
        print(entity.text, entity.start_char, entity.end_char, entity.label_)
    print("****************************************************************************************************")

- Entity Visualization

In [None]:
# Entity Visualizer
from spacy import displacy
for i in range(len(doc)):
  displacy.render(doc[i], style="ent", jupyter=True)
  print("************************************************************************************************************************************************")

In [None]:
# Switch to the en_ner_bc5cdr_md pipeline and visualise its entities note by note.
import en_ner_bc5cdr_md
nlp = en_ner_bc5cdr_md.load()

doc = []
for note in notes:
    parsed_note = nlp(note)
    doc.append(parsed_note)
    # Render only the note just parsed. Passing the whole `doc` list here would
    # re-render every earlier note again on each iteration.
    displacy.render(parsed_note, style="ent", jupyter=True)
    print("*******************************************************************************************************************************************")

## Word2Vec Assignment

- Install gensim and matplotlib

In [None]:
! pip install gensim
! pip install matplotlib

In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import re

from gensim.models import word2vec

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

- Load the CORE untrained model and read in the disease_notes.csv file after preprocessing

In [None]:
import spacy
import pandas as pd

# Switch `nlp` back to the en_core_web_sm pipeline for the Word2Vec section.
nlp = spacy.load("en_core_web_sm")
# Re-read the exported notes and clean their TEXT column with preprocessing().
merged_notes_df = pd.read_csv('disease_notes.csv')
merged_notes_df = preprocessing(merged_notes_df)
# Plain Python list of cleaned note strings, one entry per CSV row.
notes = merged_notes_df['TEXT'].tolist()
print(len(notes))

- Build corpus

In [None]:
# Build the corpus from the entities the spaCy pipeline extracts from the notes.
# The corpus is a list of lists: one inner list per note, holding the text of
# every entity found in that note (an empty list when none are found).
corpus = []
for note in notes:
    note_entities = nlp(note).ents
    corpus.append([entity.text for entity in note_entities])


print(corpus)

- Create word2vec embeddings

In [None]:
from gensim.models import Word2Vec
# Train a Word2Vec model on the entity corpus; each inner list of `corpus` is
# one training "sentence" and each entity string is one vocabulary key.
# min_count=1 keeps keys that occur only once.
# NOTE(review): no seed/workers are fixed, so vectors may differ between runs — confirm if reproducibility matters.
model1 = Word2Vec(corpus, min_count=1)

In [None]:
# Display the model's vocabulary as a mapping from key to its index.
model1.wv.key_to_index

In [None]:
# Display the embedding vector stored for the key 'fentanyl'.
# NOTE(review): presumably fails with KeyError when 'fentanyl' is not one of
# the entity strings in the corpus — verify against model1.wv.key_to_index.
model1.wv['fentanyl']

In [None]:
# Display the 10 vocabulary keys most similar to 'fentanyl'.
# NOTE(review): presumably requires 'fentanyl' to be in the vocabulary — verify.
model1.wv.most_similar('fentanyl', topn=10)

## TSNE plots  

In [None]:
def tsne_plot(model, words, preTrained=False, perplexity=30):
    """Project the embeddings of *words* to 2-D with t-SNE and plot them.

    Args:
        model: Embedding source. Indexed directly (``model[word]``) when
            ``preTrained`` is true, otherwise through ``model.wv[word]``.
        words: Iterable of keys to look up; each key also labels its point.
        preTrained: Selects which of the two lookups above is used.
        perplexity: Upper bound for the t-SNE perplexity. It is lowered to
            ``len(words) - 1`` when there are too few words, because
            scikit-learn requires the perplexity to be below the number of
            samples.

    Raises:
        ValueError: If fewer than two words are supplied.
    """
    labels = list(words)
    if len(labels) < 2:
        raise ValueError("tsne_plot needs at least two words to build a t-SNE projection")

    lookup = model if preTrained else model.wv
    vectors = np.array([lookup[word] for word in labels])

    # The iteration count is left at the library default (1000, the value that
    # used to be passed explicitly) because its keyword was renamed from
    # `n_iter` to `max_iter` in newer scikit-learn releases.
    tsne_model = TSNE(
        perplexity=min(perplexity, len(labels) - 1),
        early_exaggeration=12,
        n_components=2,
        init='pca',
        random_state=23,
    )
    new_values = tsne_model.fit_transform(vectors)

    plt.figure(figsize=(16, 16))
    # One scatter call per point so each point takes the next colour of the
    # default colour cycle.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

## tSNE plots for the **untrained** Model
Notice how noisy the scatter plot is

In [None]:
# All keys of the locally trained model's vocabulary (also reused by the
# pre-trained comparison further down).
vocabs = model1.wv.key_to_index.keys()
# Materialise the keys view as a NumPy array of strings for plotting.
new_v = np.array(list(vocabs))
# preTrained defaults to False, so vectors are read through model1.wv.
tsne_plot(model1,new_v)

## tSNE plot for a **pre-trained** model
Notice the word embeddings are better clustered together. 

In [None]:
# Load pre-trained word embeddings ("glove-wiki-gigaword-50") through the gensim downloader.
import gensim
import gensim.downloader as api

info = api.info()  # show info about available models/datasets (the result is not used below)
pretrained_model= api.load("glove-wiki-gigaword-50")  # download the model and return as object ready for use

In [None]:
# Display the pre-trained model's nearest neighbours of "heart".
pretrained_model.most_similar("heart")

In [None]:
# Split the local vocabulary: keys known to the pre-trained model are kept for
# plotting, the rest are printed so the missing ones are visible.
corpus_in_pretrained_model = []
for word in vocabs:
    if word not in pretrained_model:
        print(word)
        continue
    corpus_in_pretrained_model.append(word)

In [None]:
# Plot only the keys found in the pre-trained model; True selects the direct model[word] lookup.
tsne_plot(pretrained_model,corpus_in_pretrained_model,True)

## MedSpacy Assignment

- Install medspacy and import the relevant modules

In [None]:
! pip install medspacy

In [None]:
import sys
import spacy
import medspacy

from medspacy.ner import TargetMatcher, TargetRule
from medspacy.visualization import visualize_ent, visualize_dep

- Load the model

In [None]:
import pandas as pd
# Load en_core_web_sm with its "ner" component disabled, then hand the
# pipeline to medspacy.load().
# NOTE(review): confirm that the installed medspacy.load() accepts an `nlp=` keyword.
nlp = spacy.load("en_core_web_sm", disable={"ner"})
nlp = medspacy.load(nlp=nlp)

# Re-read the exported notes and clean their TEXT column with preprocessing().
merged_notes_df = pd.read_csv('disease_notes.csv')
merged_notes_df = preprocessing(merged_notes_df)
# NOTE(review): unlike the earlier sections, `notes` stays a pandas Series here
# (no .tolist()); the later notes[i] lookups are therefore label-based and
# assume the index runs 0..n-1 — confirm.
notes = merged_notes_df['TEXT']
# Display the names of the components in the resulting pipeline.
nlp.pipe_names


- Add target rules into the pipe

In [None]:
from medspacy.ner import TargetRule

# Literals registered with the target matcher; all of them share the CONDITION label.
condition_literals = [
    'Hypertension',
    'heart failure',
    'pna',
    'diabetes',
    'stroke',
    'hx of stroke',
    'hx of diabetes',
]
target_matcher = nlp.get_pipe('medspacy_target_matcher')
target_matcher.add([TargetRule(literal, 'CONDITION') for literal in condition_literals])

# Sample sentence for a quick manual check:
# doc = nlp('Patient has hx of stroke. Mother diagnosed with diabetes. No evidence of pna.')

# Run every note through the pipeline and list the matched entities,
# framed by separator lines.
for note in notes:
    doc = nlp(note)
    print('*************************************************************************************************************')
    for entity in doc.ents:
        print(entity.text, entity.start_char, entity.end_char, entity.label_)
    print('*************************************************************************************************************')



- Visualize

In [None]:
# Render the matched entities of every note with medspaCy's entity visualizer.
for note in notes:
    medspacy.visualization.visualize_ent(nlp(note))