### Common Machine Learning and Deep Learning Methods for Clinical Text Classification

- [Article Link](https://towardsdatascience.com/common-machine-learning-and-deep-learning-methods-for-clinical-text-classification-188473477a32)

In [4]:
# Library imports:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer 

from imblearn.over_sampling import SMOTE

import warnings
import pandas_profiling
import plotly.express as px
import matplotlib.pyplot as plt

%matplotlib inline
warnings.filterwarnings('ignore')

In [6]:
# Location of the MTSamples CSV. Kept in one named constant because it is a
# machine-specific absolute path that has to be edited to run the notebook elsewhere.
MTSAMPLES_CSV = r'E:\random_data\mtsamples.csv'

# The first CSV column is used as the row index (index_col=0).
data = pd.read_csv(MTSAMPLES_CSV, index_col=0)

In [7]:
# Build a pandas-profiling report for the raw dataframe, write it to an HTML file
# (relative path, so it lands in the current working directory), and leave the
# report object as the last expression so the notebook renders it.
profiling = pandas_profiling.ProfileReport(data)
profiling.to_file("Dataframe_Report.html")
profiling

Summarize dataset: 100%|██████████| 20/20 [00:04<00:00,  4.19it/s, Completed]                         
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.26it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.97it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 334.23it/s]




In [8]:
# Specialty labels that are excluded from the dataset before modelling.
EXCLUDED_SPECIALTIES = [
    'SOAP / Chart / Progress Notes',
    'Office Notes',
    'Consult - History and Phy.',
    'Emergency Room Reports',
    'Discharge Summary',
    'Letters',
]
# A specialty is kept only if it has strictly more than this many rows.
MIN_SAMPLES_PER_SPECIALTY = 100

# Work on an explicit copy so the column assignment below modifies an independent
# frame instead of a slice of `data` (avoids SettingWithCopyWarning).
filtered_data = data[['transcription', 'medical_specialty']].copy()
# Vectorised strip of leading/trailing whitespace; missing values pass through
# unchanged instead of raising as str.strip() would.
filtered_data['medical_specialty'] = filtered_data['medical_specialty'].str.strip()

# True for rows whose label is in the exclusion list.
mask = filtered_data['medical_specialty'].isin(EXCLUDED_SPECIALTIES)
filtered_data = filtered_data[~mask]

# Drop every specialty that does not have enough rows.
data_categories = filtered_data.groupby(filtered_data['medical_specialty'])
filtered_data_categories = data_categories.filter(
    lambda group: group.shape[0] > MIN_SAMPLES_PER_SPECIALTY
)
# Displayed as the cell output: number of rows per remaining specialty.
filtered_data_categories['medical_specialty'].value_counts()

Surgery                       1103
Cardiovascular / Pulmonary     372
Orthopedic                     355
Radiology                      273
General Medicine               259
Gastroenterology               230
Neurology                      223
Obstetrics / Gynecology        160
Urology                        158
Name: medical_specialty, dtype: int64

In [9]:
# Shuffle all rows (frac=1.0 keeps every row). The fixed random_state makes the
# shuffled order identical on every run, so downstream results are reproducible.
data = filtered_data_categories.sample(frac=1.0, random_state=42)

In [6]:
# Profile the filtered and shuffled dataframe, write the report to its own HTML
# file (relative path, so it lands in the current working directory), and leave
# the report object as the last expression so the notebook renders it.
profiling = pandas_profiling.ProfileReport(data)
profiling.to_file("Dataframe_Report_Cleansed.html")
profiling

Summarize dataset: 100%|██████████| 18/18 [00:10<00:00,  1.65it/s, Completed]                        
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.37s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.68it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 96.59it/s]




In [7]:
# Drop rows that have no transcription. The explicit copy makes `df` an
# independent frame, so adding columns to it later cannot trigger
# SettingWithCopyWarning or write through to `data`.
df = data.dropna(subset=['transcription']).copy()

In [8]:
#import scispacy
import spacy
# import en_ner_bionlp13cg_md

# Load the spaCy pipeline that medical_entities() uses for entity extraction.
# NOTE(review): presumably the scispaCy "en_ner_bionlp13cg_md" model, which has to
# be installed separately — confirm before running in a fresh environment.
nlp = spacy.load("en_ner_bionlp13cg_md")

def medical_entities(text):
    """Return the entity mentions found in ``text`` as one space-separated string.

    The text is run through the module-level ``nlp`` pipeline and the text of
    every entity in the resulting ``doc.ents`` is kept in order; repeated
    mentions are preserved. A document with no entities yields an empty string.
    """
    return ' '.join(mention.text for mention in nlp(text).ents)

# Run entity extraction on every transcription and store the space-joined
# entity text in a new column.
df['transcription_med'] = df['transcription'].apply(medical_entities)

In [9]:
# Preview the first rows, including the transcription_med column.
df.head()

Unnamed: 0,transcription,medical_specialty,transcription_med
882,"PREOPERATIVE DIAGNOSIS:, Bilateral ear lacera...",Surgery,ear ear ear laceration ear laceration Xylocain...
4650,"PREOPERATIVE DIAGNOSIS: , Malignant pleural ef...",Cardiovascular / Pulmonary,Malignant pleural Malignant pleural patient pa...
2612,"HISTORY OF PRESENT ILLNESS: , This is the case...",Obstetrics / Gynecology,pelvic vaginal patient hCG hCG patient corpus ...
3309,"SUBJECTIVE:, This 3-year-old male is brought ...",General Medicine,"stomach celery Bowel airway murmurs.,Abdomen o..."
2858,"CHIEF COMPLAINT:, Headache.,HPI: , This is a ...",Neurology,photophobia BP oropharynx sclera Abdomen DTRs ...


In [11]:
import re

def clean_text(text):
    """Normalise a text string.

    Steps, applied in order:
      1. lowercase the text;
      2. delete anything enclosed in square brackets;
      3. delete URLs (``http://``, ``https://`` or ``www.`` prefixes);
      4. delete angle-bracket tags such as ``<b>``;
      5. delete every character in ``string.punctuation``;
      6. delete newline characters;
      7. delete every word that contains a digit.

    Matches are deleted, not replaced with a space, so words separated only by
    punctuation or a newline are fused (``"a.,b"`` becomes ``"ab"``).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # The patterns are raw string literals so that regex escapes such as \[ \S \w \d
    # are not processed as (invalid) Python string escapes.
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Clean the extracted entity text and store the result in a new column
df['transcription_med_cleaned'] = df['transcription_med'].apply(clean_text)

In [None]:
# Build tokenizer

# https://medium.com/swlh/language-modelling-with-nltk-20eac7e70853