<a href="https://colab.research.google.com/github/spyrosviz/Medical-Specialty-Classification-from-medical-transcription-text-NLP/blob/main/Text_Classification_NLP_clinical_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Medical Transcriptions**


In [None]:
# Import dependencies

import pandas as pd
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re

In [None]:
# Download the NLTK resources requested here: the WordNet data (used by WordNetLemmatizer),
# the stopword lists and the omw-1.4 package
nltk.download(['wordnet','stopwords','omw-1.4'])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

### **Parse csv and explore data**

In [None]:
# Parse csv data and summarise it

df = pd.read_csv('mtsamples.csv')
df.info()

# For every specialty print its name, the length of that name and its number of rows.
# One groupby pass replaces re-filtering the whole frame once per specialty;
# sort=False keeps the specialties in order of first appearance, as unique() did.
specialty_counts = df.groupby('medical_specialty',sort=False).size()
for medical_specialty,n_rows in specialty_counts.items():
  print(medical_specialty,len(medical_specialty),n_rows)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         4999 non-null   int64 
 1   description        4999 non-null   object
 2   medical_specialty  4999 non-null   object
 3   sample_name        4999 non-null   object
 4   transcription      4966 non-null   object
 5   keywords           3931 non-null   object
dtypes: int64(1), object(5)
memory usage: 234.5+ KB
 Allergy / Immunology 21 7
 Bariatrics 11 18
 Cardiovascular / Pulmonary 27 372
 Neurology 10 223
 Dentistry 10 27
 Urology 8 158
 General Medicine 17 259
 Surgery 8 1103
 Speech - Language 18 9
 SOAP / Chart / Progress Notes 30 166
 Sleep Medicine 15 20
 Rheumatology 13 10
 Radiology 10 273
 Psychiatry / Psychology 24 53
 Podiatry 9 47
 Physical Medicine - Rehab 26 21
 Pediatrics - Neonatal 22 70
 Pain Management 16 62
 Orthopedic 11 355
 Ophthalmology 14 83
 O

**Medical specialty will be the target variable. Because the categories are heavily imbalanced, and in order to reduce computation, we will restrict the task to three specialties: the model will predict Cardiovascular / Pulmonary, Gastroenterology or Neurology from the transcription text. We will also drop the rows that contain N/A values in the transcription column.**

### **Preprocess data and create X and y variables**

In [None]:
specialties = ['Neurology','Cardiovascular / Pulmonary','Gastroenterology']

# Built as one chain without inplace mutation:
#  - errors='ignore' tolerates 'keywords' having been dropped by a previous run of this cell
#  - medical_specialty values carry an extra whitespace character, so they are stripped
#  - only the text and the target of the three selected specialties are kept
#  - the final copy() makes df an independent frame, so adding columns to it later
#    does not trigger pandas' SettingWithCopyWarning
df = (
    df.drop(columns='keywords',errors='ignore')
      .dropna(axis=0)
      .assign(medical_specialty=lambda frame: frame['medical_specialty'].str.strip())
      .loc[lambda frame: frame['medical_specialty'].isin(specialties),['transcription','medical_specialty']]
      .copy()
)
print(df.head(5))

                                        transcription  \
3   2-D M-MODE: , ,1.  Left atrial enlargement wit...   
4   1.  The left ventricular cavity size and wall ...   
7   2-D ECHOCARDIOGRAM,Multiple views of the heart...   
9   DESCRIPTION:,1.  Normal cardiac chambers size....   
11  2-D STUDY,1. Mild aortic stenosis, widely calc...   

             medical_specialty  
3   Cardiovascular / Pulmonary  
4   Cardiovascular / Pulmonary  
7   Cardiovascular / Pulmonary  
9   Cardiovascular / Pulmonary  
11  Cardiovascular / Pulmonary  


In [None]:
lemmatizer = WordNetLemmatizer()
# Load the English stopwords once into a set instead of calling stopwords.words()
# for every single token; set membership is also O(1)
ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))

def clean_text(text,lemmatize=False,output_is_tokenized=True,punct_replacement=''):
  """Normalize one transcription: replace punctuation, lowercase, tokenize, drop stopwords.

  Parameters
  ----------
  text : str
      Raw transcription text.
  lemmatize : bool, default False
      If true, every remaining token is passed through the WordNet lemmatizer.
  output_is_tokenized : bool, default True
      If true return the list of tokens, otherwise the tokens joined by single spaces.
  punct_replacement : str, default ''
      String substituted for every character of string.punctuation. The default deletes
      punctuation, which glues together words separated only by punctuation
      ('size.,2' becomes 'size2'); pass ' ' to keep such words apart.

  Returns
  -------
  list of str or str
      The cleaned tokens, or the space-joined string when output_is_tokenized is false.
  """
  punct_table = str.maketrans({char: punct_replacement for char in string.punctuation})
  text_punct_removed = text.translate(punct_table)
  # Split on runs of non-word characters; re.split yields '' at the edges when the text
  # starts or ends with a separator, so empty tokens are discarded
  tokens = [token for token in re.split(r'\W+',text_punct_removed.lower()) if token]
  # Remove stopwords
  tokens = [token for token in tokens if token not in ENGLISH_STOPWORDS]
  # Lemmatize tokens
  if lemmatize:
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
  return tokens if output_is_tokenized else ' '.join(tokens)

## **1. TFIDF vectorizer**

In [None]:
# Prepare data for tfidf

# TfidfVectorizer expects one string per document. clean_text returns a token list by default,
# so the joined-string output has to be requested explicitly.
df['cleaned_transcription'] = df['transcription'].apply(lambda x: clean_text(x,lemmatize=True,output_is_tokenized=False))
df.head(5)

Unnamed: 0,transcription,medical_specialty,cleaned_transcription
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,2d mmode 1 left atrial enlargement left atrial...
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,1 left ventricular cavity size wall thickness ...
7,"2-D ECHOCARDIOGRAM,Multiple views of the heart...",Cardiovascular / Pulmonary,2d echocardiogrammultiple view heart great ves...
9,"DESCRIPTION:,1. Normal cardiac chambers size....",Cardiovascular / Pulmonary,description1 normal cardiac chamber size2 norm...
11,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Cardiovascular / Pulmonary,2d study1 mild aortic stenosis widely calcifie...


In [None]:
# Hold out 20% of the rows as a test set (fixed seed so the split is repeatable)

from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    df['cleaned_transcription'],
    df['medical_specialty'],
    test_size=0.2,
    random_state=21,
)
X_train,X_test,y_train,y_test = split_parts

X_train[:10]

4878    reason referral chest pain possible syncopal s...
2878    history present illness patient 61yearold righ...
3542    procedure flexible sigmoidoscopypreoperative d...
4636    operative procedure1 thromboendarterectomy rig...
4775    preoperative diagnoses1 acute coronary artery ...
4880    preoperative diagnosis critical left carotid s...
4849    subjective review medical record show patient ...
4940    procedure diagnostic fiberoptic bronchoscopyan...
2896    report 18channel recording obtained using stan...
4871    discovered new twave abnormality ekg course fo...
Name: cleaned_transcription, dtype: object

In [None]:
# Learn the TF-IDF vocabulary and weights on the training texts only, then encode both splits

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_train_vector = tfidf.fit_transform(X_train)
X_test_vector = tfidf.transform(X_test)

In [None]:
# Create Random Forest model and test accuracy

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# random_state seeds the forest so the reported accuracy is reproducible between runs
rf = RandomForestClassifier(n_estimators=100,max_depth=100,n_jobs=-1,random_state=21)
rf.fit(X_train_vector,y_train.values)
y_pred = rf.predict(X_test_vector)
accuracy = round(accuracy_score(y_test,y_pred) * 100,1)
print(f'TFIDF accuracy is {accuracy}%')

TFIDF accuracy is 89.6%


## **2. WordVector**

In [None]:
# Prepare data for wordvector: keep clean_text's tokenized, non-lemmatized output

def _tokenize_for_word2vec(transcription):
  """Return the cleaned token list of one transcription."""
  return clean_text(transcription,lemmatize=False,output_is_tokenized=True)

df['cleaned_transcription'] = df['transcription'].map(_tokenize_for_word2vec)
df.head(5)

Unnamed: 0,transcription,medical_specialty,cleaned_transcription
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,"[2d, mmode, 1, left, atrial, enlargement, left..."
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,"[1, left, ventricular, cavity, size, wall, thi..."
7,"2-D ECHOCARDIOGRAM,Multiple views of the heart...",Cardiovascular / Pulmonary,"[2d, echocardiogrammultiple, views, heart, gre..."
9,"DESCRIPTION:,1. Normal cardiac chambers size....",Cardiovascular / Pulmonary,"[description1, normal, cardiac, chambers, size..."
11,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Cardiovascular / Pulmonary,"[2d, study1, mild, aortic, stenosis, widely, c..."


In [None]:
# Split the tokenized transcriptions 80/20 with the same seed as the other sections

from sklearn.model_selection import train_test_split

tokenized_docs = df['cleaned_transcription']
specialty_labels = df['medical_specialty']
X_train,X_test,y_train,y_test = train_test_split(tokenized_docs,specialty_labels,test_size=0.2,random_state=21)

X_train[:10]

4878    [reason, referral, chest, pain, possible, sync...
2878    [history, present, illness, patient, 61yearold...
3542    [procedure, flexible, sigmoidoscopypreoperativ...
4636    [operative, procedure1, thromboendarterectomy,...
4775    [preoperative, diagnoses1, acute, coronary, ar...
4880    [preoperative, diagnosis, critical, left, caro...
4849    [subjective, review, medical, record, shows, p...
4940    [procedure, diagnostic, fiberoptic, bronchosco...
2896    [report, 18channel, recording, obtained, using...
4871    [discovered, new, twave, abnormalities, ekg, c...
Name: cleaned_transcription, dtype: object

In [None]:
!pip install gensim
import gensim
import numpy as np

# Dimensionality of the word vectors, and of the zero vector used for empty documents
VECTOR_SIZE = 100

# NOTE: `size` and `index2word` are the gensim 3.x names; gensim >= 4 renamed them to
# `vector_size` and `index_to_key`
wv_model = gensim.models.Word2Vec(X_train,size=VECTOR_SIZE,window=5,min_count=2)

vocabulary_learned = wv_model.wv.index2word
# Set copy of the vocabulary: membership tests against the list are O(vocabulary) per word
vocabulary_lookup = set(vocabulary_learned)

def average_word_vectors(transcription):
  """Return the mean of the learned word vectors of one tokenized transcription.

  Words outside the learned vocabulary are skipped. A transcription without any known
  word is represented by a zero vector of length VECTOR_SIZE.
  """
  word_vectors = [wv_model.wv[word] for word in transcription if word in vocabulary_lookup]
  if word_vectors:
    return np.mean(word_vectors,axis=0)
  return np.zeros(VECTOR_SIZE)

# Each document is averaged on its own. Stacking the per-document matrices into one array
# first would build a ragged array, which numpy warns about and newer releases reject.
x_sets = [X_train,X_test]
x_sets_vectorized = [
    [average_word_vectors(transcription) for transcription in x_set]
    for x_set in x_sets
]

X_train_vect = x_sets_vectorized[0]
X_test_vect = x_sets_vectorized[1]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


  wv_vector = np.array([


In [None]:
# random_state seeds the forest so the reported accuracy is reproducible between runs
rf = RandomForestClassifier(n_estimators=100,max_depth=100,n_jobs=-1,random_state=21)
rf.fit(X_train_vect,y_train.values)
y_pred = rf.predict(X_test_vect)
accuracy = round(accuracy_score(y_test,y_pred) * 100,1)
print(f'Word2Vec accuracy is {accuracy}%')

Word2Vec accuracy is 66.5%


## **3. Doc2Vec**

In [None]:
# Prepare data for docvector: keep clean_text's tokenized, non-lemmatized output

def _tokenize_for_doc2vec(transcription):
  """Return the cleaned token list of one transcription."""
  return clean_text(transcription,lemmatize=False,output_is_tokenized=True)

df['cleaned_transcription'] = df['transcription'].map(_tokenize_for_doc2vec)
df.head(5)

Unnamed: 0,transcription,medical_specialty,cleaned_transcription
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,"[2d, mmode, 1, left, atrial, enlargement, left..."
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,"[1, left, ventricular, cavity, size, wall, thi..."
7,"2-D ECHOCARDIOGRAM,Multiple views of the heart...",Cardiovascular / Pulmonary,"[2d, echocardiogrammultiple, views, heart, gre..."
9,"DESCRIPTION:,1. Normal cardiac chambers size....",Cardiovascular / Pulmonary,"[description1, normal, cardiac, chambers, size..."
11,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Cardiovascular / Pulmonary,"[2d, study1, mild, aortic, stenosis, widely, c..."


In [None]:
# Split the tokenized transcriptions 80/20 (seed 21, as in the other sections)

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(
    df['cleaned_transcription'],
    df['medical_specialty'],
    random_state=21,
    test_size=0.2,
)

X_train[:10]

4878    [reason, referral, chest, pain, possible, sync...
2878    [history, present, illness, patient, 61yearold...
3542    [procedure, flexible, sigmoidoscopypreoperativ...
4636    [operative, procedure1, thromboendarterectomy,...
4775    [preoperative, diagnoses1, acute, coronary, ar...
4880    [preoperative, diagnosis, critical, left, caro...
4849    [subjective, review, medical, record, shows, p...
4940    [procedure, diagnostic, fiberoptic, bronchosco...
2896    [report, 18channel, recording, obtained, using...
4871    [discovered, new, twave, abnormalities, ekg, c...
Name: cleaned_transcription, dtype: object

In [None]:
# Prepare data for doc2vec

!pip install gensim
import gensim

TaggedDocument = gensim.models.doc2vec.TaggedDocument

# Doc2Vec trains on TaggedDocument objects; each token list is tagged with its position in its own set
tagged_docs_train = [TaggedDocument(tokens,[position]) for position,tokens in enumerate(X_train)]
tagged_docs_test = [TaggedDocument(tokens,[position]) for position,tokens in enumerate(X_test)]

# Only the training documents are used to fit the model
doc2vec_model = gensim.models.Doc2Vec(tagged_docs_train,vector_size=100,window=5,min_count=2)

# Embed every transcription, train and test alike, by inference from its tokens
X_train_vect = []
for tagged_doc in tagged_docs_train:
  X_train_vect.append(doc2vec_model.infer_vector(tagged_doc.words))

X_test_vect = []
for tagged_doc in tagged_docs_test:
  X_test_vect.append(doc2vec_model.infer_vector(tagged_doc.words))


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# random_state seeds the forest so the reported accuracy is reproducible between runs
rf = RandomForestClassifier(n_estimators=100,max_depth=100,n_jobs=-1,random_state=21)
rf.fit(X_train_vect,y_train.values)
y_pred = rf.predict(X_test_vect)
accuracy = round(accuracy_score(y_test,y_pred) * 100,1)
print(f'Doc2Vec accuracy is {accuracy}%')

Doc2Vec accuracy is 70.1%


## **4. Count Vectorizer**

In [None]:
# Prepare data for count vectorizer: lemmatized tokens joined back into one string per document

def _clean_for_count_vectorizer(transcription):
  """Return the cleaned, lemmatized transcription as a single string."""
  return clean_text(transcription,lemmatize=True,output_is_tokenized=False)

df['cleaned_transcription'] = df['transcription'].map(_clean_for_count_vectorizer)
df.head(5)

Unnamed: 0,transcription,medical_specialty,cleaned_transcription
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,2d mmode 1 left atrial enlargement left atrial...
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,1 left ventricular cavity size wall thickness ...
7,"2-D ECHOCARDIOGRAM,Multiple views of the heart...",Cardiovascular / Pulmonary,2d echocardiogrammultiple view heart great ves...
9,"DESCRIPTION:,1. Normal cardiac chambers size....",Cardiovascular / Pulmonary,description1 normal cardiac chamber size2 norm...
11,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Cardiovascular / Pulmonary,2d study1 mild aortic stenosis widely calcifie...


In [None]:
# Split the cleaned texts 80/20 (seed 21, as in the other sections)

from sklearn.model_selection import train_test_split

cleaned_texts = df['cleaned_transcription']
specialty_labels = df['medical_specialty']
X_train,X_test,y_train,y_test = train_test_split(cleaned_texts,specialty_labels,test_size=0.2,random_state=21)

X_train[:10]

4878    reason referral chest pain possible syncopal s...
2878    history present illness patient 61yearold righ...
3542    procedure flexible sigmoidoscopypreoperative d...
4636    operative procedure1 thromboendarterectomy rig...
4775    preoperative diagnoses1 acute coronary artery ...
4880    preoperative diagnosis critical left carotid s...
4849    subjective review medical record show patient ...
4940    procedure diagnostic fiberoptic bronchoscopyan...
2896    report 18channel recording obtained using stan...
4871    discovered new twave abnormality ekg course fo...
Name: cleaned_transcription, dtype: object

In [None]:
# Learn the vocabulary on the training texts only, then count words in both splits

from sklearn.feature_extraction.text import CountVectorizer

cvec = CountVectorizer()
X_train_vector = cvec.fit_transform(X_train)
X_test_vector = cvec.transform(X_test)

In [None]:
# Create Random Forest model and test accuracy

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# random_state seeds the forest so the reported accuracy is reproducible between runs
rf = RandomForestClassifier(n_estimators=100,max_depth=100,n_jobs=-1,random_state=21)
rf.fit(X_train_vector,y_train.values)
y_pred = rf.predict(X_test_vector)
accuracy = round(accuracy_score(y_test,y_pred) * 100,1)
print(f'CountVectorizer accuracy is {accuracy}%')

CountVectorizer accuracy is 89.0%


## **5. N-grams**

In [None]:
# Prepare data for the n-gram count vectorizer: lemmatized tokens joined into one string per document

def _clean_for_ngrams(transcription):
  """Return the cleaned, lemmatized transcription as a single string."""
  return clean_text(transcription,lemmatize=True,output_is_tokenized=False)

df['cleaned_transcription'] = df['transcription'].map(_clean_for_ngrams)
df.head(5)

Unnamed: 0,transcription,medical_specialty,cleaned_transcription
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,2d mmode 1 left atrial enlargement left atrial...
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,1 left ventricular cavity size wall thickness ...
7,"2-D ECHOCARDIOGRAM,Multiple views of the heart...",Cardiovascular / Pulmonary,2d echocardiogrammultiple view heart great ves...
9,"DESCRIPTION:,1. Normal cardiac chambers size....",Cardiovascular / Pulmonary,description1 normal cardiac chamber size2 norm...
11,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Cardiovascular / Pulmonary,2d study1 mild aortic stenosis widely calcifie...


In [None]:
# Split the cleaned texts 80/20 (seed 21, as in the other sections)

from sklearn.model_selection import train_test_split

split_parts = train_test_split(df['cleaned_transcription'],df['medical_specialty'],test_size=0.2,random_state=21)
X_train,X_test,y_train,y_test = split_parts

X_train[:10]

4878    reason referral chest pain possible syncopal s...
2878    history present illness patient 61yearold righ...
3542    procedure flexible sigmoidoscopypreoperative d...
4636    operative procedure1 thromboendarterectomy rig...
4775    preoperative diagnoses1 acute coronary artery ...
4880    preoperative diagnosis critical left carotid s...
4849    subjective review medical record show patient ...
4940    procedure diagnostic fiberoptic bronchoscopyan...
2896    report 18channel recording obtained using stan...
4871    discovered new twave abnormality ekg course fo...
Name: cleaned_transcription, dtype: object

In [None]:
# Fit count vectorizer to X_train and transform X_train and X_test

from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(2,3): features are bigrams and trigrams only, no single words
ngrams = CountVectorizer(ngram_range=(2,3))
X_train_vector = ngrams.fit_transform(X_train)
X_test_vector = ngrams.transform(X_test)
# Preview only the first learned n-grams: dumping the whole vocabulary floods the notebook.
# get_feature_names_out() replaces get_feature_names(), which is deprecated and removed in
# scikit-learn 1.2.
ngrams.get_feature_names_out()[:50]



['00 silk',
 '00 silk suture',
 '000 chromic',
 '000 chromic subcutaneous',
 '000 silk',
 '000 silk used',
 '0007 agarose',
 '0007 agarose gel',
 '001 followed',
 '001 followed 004',
 '0014 bmw',
 '0014 bmw wire',
 '0014 guide',
 '0014 guide wire',
 '0014 guidewire',
 '0014 guidewire ballooned',
 '0014 universal',
 '0014 universal wire',
 '0014inch asahi',
 '0014inch asahi soft',
 '0014inch wire',
 '0014inch wire advanced',
 '0018inch stiff',
 '0018inch stiff wire',
 '0025 glidewire',
 '0025 glidewire obtained',
 '0025 guide',
 '0025 guide wire',
 '0025 mg',
 '0025 mg vytorin',
 '002impression middleaged',
 '002impression middleaged white',
 '003 followed',
 '003 followed 018',
 '0035 glidewire',
 '0035 glidewire across',
 '0035 guide',
 '0035 guide wire',
 '0035 guidewire',
 '0035 guidewire coronary',
 '0035inch jwire',
 '0035inch jwire left',
 '0035inch wire',
 '0035inch wire left',
 '0035inch wireangiographic',
 '0035inch wireangiographic findings1',
 '0038 inch',
 '0038 inch 10',
 

In [None]:
# Create Random Forest model and test accuracy

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# random_state seeds the forest so the reported accuracy is reproducible between runs
rf = RandomForestClassifier(n_estimators=100,max_depth=100,n_jobs=-1,random_state=21)
rf.fit(X_train_vector,y_train.values)
y_pred = rf.predict(X_test_vector)
accuracy = round(accuracy_score(y_test,y_pred) * 100,1)
print(f'N-grams accuracy is {accuracy}%')

N-grams accuracy is 68.9%


## **6. Recurrent Neural Networks**

In [None]:
# Convert medical_specialty labels from string to numeric

df = df.replace({'medical_specialty':{'Neurology':0,'Cardiovascular / Pulmonary':1,'Gastroenterology':2}})
# Shuffle data. The shuffle is seeded: an unseeded shuffle changes the row order on every run,
# which in turn changes what the seeded train/test split below selects.
df = df.sample(frac=1,random_state=21)
df.head(5)

Unnamed: 0,transcription,medical_specialty
2863,"REASON FOR VISIT:, Followup status post L4-L5...",0
4695,"ADMISSION DIAGNOSES:,1. Pneumonia, likely sec...",1
3507,"PREOPERATIVE DIAGNOSIS: , Appendicitis.,POSTOP...",2
2806,"SOCIAL HISTORY, FAMILY HISTORY, AND PAST MEDIC...",0
2840,"CC:, Left sided weakness.,HX:, 74 y/o RHF awok...",0


In [None]:
# Split the raw transcriptions 80/20 and one-hot encode the targets

from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

X_train,X_test,y_train_labels,y_test_labels = train_test_split(
    df['transcription'],
    df['medical_specialty'],
    test_size=0.2,
    random_state=21,
)

# Three classes, so each integer label becomes a row of length 3: the targets get shape (n_rows,3),
# the form used by the rnn model later. Alternatively one hot encode the target variable initially
y_train = to_categorical(y_train_labels,num_classes=3)
y_test = to_categorical(y_test_labels,num_classes=3)

y_train.shape

(654, 3)

In [None]:
# Turn the transcription texts into fixed-length integer sequences for the RNN

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The word -> integer mapping is learned from the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Encode both splits with that mapping, then pad_sequences each transcription to exactly
# 100 integers so that all of them have the same length
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_seq,maxlen=100)

X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_seq,maxlen=100)

X_train_padded[0]

array([8128,   11, 1843,    5, 6395, 1450,  156,  257,   11, 4701, 8129,
         19,    1, 6395, 2590,   11,    9, 8130, 8131, 6396,   11,  127,
         39, 1357,   11, 1765, 3592,  271,    4,  317,    1,   10,  239,
         63, 2366, 1057,   27,   68,    8,   26, 1057,   11,   24,   33,
        899,  658, 2591,    2,  394,    4,  272,    2,  763,  602,    7,
         26, 1605,   11,   59, 2262,    4,   75, 1912,  845,   29,   49,
         29, 1844,   11,  127,  846,  329, 1115,  195,   13, 6397, 6398,
         18,    1,   14, 1115,    2,   24,   75,  996,    8,   25, 1115,
          1,  430,    4,   26,  271,    4,  317,    9,  193,  235,   91,
       3905], dtype=int32)

In [None]:
# Create RNN model: embedding -> bidirectional LSTM -> dense -> 3-way softmax

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,Bidirectional,LSTM,Dropout

# One embedding row per word known to the tokenizer, plus one extra row
vocabulary_size = len(tokenizer.index_word)+1

rnn = Sequential()
rnn.add(Embedding(vocabulary_size,32))
rnn.add(Bidirectional(LSTM(32,dropout=0.2,recurrent_dropout=0.2)))
rnn.add(Dense(32,activation='relu'))
rnn.add(Dense(3,activation='softmax'))

rnn.summary()


Model: "sequential_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_20 (Embedding)    (None, None, 32)          395424    
                                                                 
 bidirectional_11 (Bidirecti  (None, 64)               16640     
 onal)                                                           
                                                                 
 dense_48 (Dense)            (None, 32)                2080      
                                                                 
 dense_49 (Dense)            (None, 3)                 99        
                                                                 
Total params: 414,243
Trainable params: 414,243
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Compile the rnn and train it on the padded sequences

adam = tf.keras.optimizers.Adam(learning_rate=0.001)
rnn.compile(optimizer=adam,loss='categorical_crossentropy',metrics=['accuracy'])

# The held-out test split is passed as the validation data
history = rnn.fit(
    X_train_padded,
    y_train,
    batch_size=32,
    epochs=15,
    validation_data=(X_test_padded,y_test),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## **7. Fine tune Bert model-Transformers**

In [None]:
# Convert medical_specialty labels from string to numeric
# (values that are not one of the three strings below are left unchanged by replace)

df = df.replace({'medical_specialty':{'Neurology':0,'Cardiovascular / Pulmonary':1,'Gastroenterology':2}})
# Shuffle data. The shuffle is seeded: an unseeded shuffle changes the row order on every run,
# which in turn changes what the seeded train/test split below selects.
df = df.sample(frac=1,random_state=21)
df.head(5)

Unnamed: 0,transcription,medical_specialty
2863,"REASON FOR VISIT:, Followup status post L4-L5...",0
2876,"PREOPERATIVE DIAGNOSIS:, Right occipital arte...",0
2824,"CC:, Left hemiplegia.,HX: , A 58 y/o RHF awoke...",0
4638,"PREOPERATIVE DIAGNOSES:,1. Non-small-cell car...",1
2944,"CHIEF COMPLAINT: , Worsening seizures.,HISTORY...",0


In [None]:
# Split the raw transcriptions 80/20 and one-hot encode the targets

from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

raw_texts = df['transcription']
numeric_labels = df['medical_specialty']
X_train,X_test,y_train,y_test = train_test_split(raw_texts,numeric_labels,test_size=0.2,random_state=21)

# Three classes, so each integer label becomes a row of length 3: the targets get shape (n_rows,3),
# the form used when fitting the pretrained model below. Alternatively one hot encode the target variable initially
y_train,y_test = to_categorical(y_train,3),to_categorical(y_test,3)

y_train.shape

(654, 3)

In [None]:
# Prepare and vectorize transcription texts for pretrained model using Tensorflow's Tokenizer
# NOTE(review): the integer ids produced here come from a Keras Tokenizer fitted on X_train,
# i.e. from a vocabulary built on this dataset. A pretrained checkpoint normally expects ids
# from its own tokenizer - confirm these arrays are really what the model should be fed.

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Instantiate Tokenizer and fit on X_train data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Transform X_train and X_test data to numeric vectors (a number is assigned to each word in each transcription)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Because each transcription has different number of words, we use pad_sequences to convert them to equal size vectors
# (the second positional argument, 100, is the target sequence length)
X_train_padded = pad_sequences(X_train_seq,100)
X_test_padded = pad_sequences(X_test_seq,100)

# Only the last expression of a cell is displayed, so the first line below shows nothing
X_train_seq[0]
X_train_padded[0]

array([  25,   65,   20,  549,   94,   37,  141,   45,  177,   75,    2,
       4304, 2390,    4,   85,   75,  154,   81,  371,  240,    2,  177,
         75,  366, 3635,    8,    1, 4305,    4,  230, 8266,  141,    1,
         10,    9,   41, 3955,    8,  280,  843,   19,   21,   65,  792,
         49,  216, 4306,  202,  146,    7,  179, 5484,   28,   16,  138,
         41, 3956,  220,  231,   23,   49,   69,  762,    6,  179,  479,
          4,    1,  654, 1013,   20,  400,  986,   33, 1168, 5488,    4,
          1, 1134,  155,   19,    1, 1863,  209,   38,   49,   69,  408,
          1,  549,   94,   13,    1,   65,  458,   12,  202, 1134,    9,
       1868], dtype=int32)

In [None]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m86.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


ModuleNotFoundError: ignored

In [None]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.losses import CategoricalCrossentropy

In [None]:
checkpoint = 'emilyalsentzer/Bio_ClinicalBERT'
# from_pt=True converts the PyTorch weights of the checkpoint; num_labels=3 sizes the new classification head
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=3,from_pt=True)
# The targets are one-hot encoded and the loss is told to expect raw logits
loss = CategoricalCrossentropy(from_logits=True)
# Fine-tune with a small learning rate: optimizer='adam' would use Adam's default of 1e-3,
# which is far too large for updating pretrained BERT weights
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(loss=loss,optimizer=optimizer,metrics=['accuracy'])

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Bio_ClinicalBERT has to be fed token ids produced by its own tokenizer, together with the
# matching attention mask. The ids in X_train_padded come from a Keras Tokenizer fitted on this
# dataset, so they index a different vocabulary and are meaningless to the pretrained weights.
from transformers import AutoTokenizer

bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def encode_for_bert(texts,max_length=100):
  """Tokenize raw transcriptions with the checkpoint's own tokenizer.

  Every text is padded or truncated to max_length tokens. Returns a plain dict of numpy
  arrays (input_ids, attention_mask, ...) that can be passed to model.fit as the inputs.
  """
  encoded = bert_tokenizer(list(texts),padding='max_length',truncation=True,
                           max_length=max_length,return_tensors='np')
  return dict(encoded)

X_train_bert = encode_for_bert(X_train)
X_test_bert = encode_for_bert(X_test)

model.fit(X_train_bert,y_train,batch_size=32,epochs=5,validation_data=(X_test_bert,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f2a18f39310>