In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm

# Let pandas show up to 200 characters per cell so long note texts are readable when frames are displayed
pd.options.display.max_colwidth = 200

import warnings
# Suppress all warnings for the session. NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print at most the first five file paths of every directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files[:5]:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install https://med7.s3.eu-west-2.amazonaws.com/en_core_med7_lg.tar.gz

## Contents

* [Read the dataset](#datasetreading)
* [Data Cleaning](#regex)
* [Stemming & Lemmatization](#stemming)
* [Tokenization](#tokenization)
* [Stop word removal](#stopword)

<a id='datasetreading'></a>

### Read the dataset

The dataset comes from https://www.kaggle.com/c/medical-notes/data. It contains 800 anonymised, transcribed medical reports, each labelled with a disease category (specialty). For more information, browse the original source: https://www.mtsamples.com/

In [None]:
# Directory holding the medical note files that are read further down.
# NOTE: the name `dir` shadows the built-in of the same name; it is kept because a later cell reads it.
dir = '/kaggle/input/nlp-specialization-data/Medical_Notes/Medical_Notes'
note_files = os.listdir(dir)
print("Total {} files in directory".format(len(note_files)))

In [None]:
# The labels file has no header row: column 0 is the note file name, column 1 its label.
labels_path = "/kaggle/input/nlp-specialization-data/Labels_Medical_Notes.csv"
labels = pd.read_csv(labels_path, header=None)
labels.columns = ['file', 'label']

In [None]:
# Preview the first five (file, label) rows; five is the default for head().
labels.head()

In [None]:
# Read each medical note listed in `labels` together with its label (disease category).
texts = []
classes = []

# Iterate the two columns directly instead of looking each row up twice with iloc.
for note_name, note_label in tqdm(zip(labels['file'], labels['label']), total=labels.shape[0]):
    filename = os.path.join(dir, note_name)
    # The context manager closes the handle as soon as the note is read; the original
    # left one open file object per note for the garbage collector to clean up.
    # errors='ignore' drops bytes that cannot be decoded instead of raising.
    with open(filename, 'r', errors='ignore') as note_file:
        # readlines() keeps each trailing newline; joining with " " reproduces the original text exactly.
        text = " ".join(note_file.readlines())
    texts.append(text)
    classes.append(note_label)

# Build the frame in one step; column order is text, label.
data = pd.DataFrame({'text': texts, 'label': classes})

In [None]:
# (rows, columns) of the assembled frame.
print(data.shape)

In [None]:
# Preview the first five notes with their labels; five is the default for head().
data.head()

<a id='regex'></a>

### Basic data cleaning

Natural language in its raw form can contain a lot of noise. We need to clean the data before using any statistical/machine learning model. Below are a few techniques for cleaning text data.

* Using RegEx (regular expressions) to identify the irrelevant text sections for removal
* Standardizing/normalizing texts like - abbreviations, spelling mistakes
* For social media data - remove smileys and email IDs if this information is not relevant for downstream analysis


In [None]:
# Use the fourth raw note as the running example for the cleaning steps.
sample_text = data['text'].iloc[3]
print(sample_text)

In [None]:
import re

We replace newline characters ("\n") with spaces and remove HTML tags from the texts

In [None]:
def remove_html(text):
    """Replace every newline with a space, then delete anything that looks like a tag.

    A "tag" is the shortest run starting at '<' and ending at the next '>'.
    Returns the resulting string.
    """
    single_line = " ".join(text.split("\n"))
    # Non-greedy match so each tag is removed separately instead of everything between the first '<' and last '>'.
    return re.sub('<.*?>', r'', single_line)

In [None]:
# First cleaning step on the sample note: newlines become spaces and HTML tags are dropped.
sample_text_processed = remove_html(sample_text)
print (sample_text_processed)

Remove all the headings from text

In [None]:
def remove_headings(text):
    """Delete every run of word characters that is immediately followed by a colon.

    Only the word attached to the colon is removed: "CHIEF COMPLAINT: x" becomes
    "CHIEF  x". Any other "word:" occurrence (for example the hour in "10:30")
    is removed as well. Returns the resulting string.
    """
    # Raw string literal: in a normal literal the backslash-w sequence is an invalid
    # escape that triggers a DeprecationWarning/SyntaxWarning on current Python versions.
    return re.sub(r'\w+:', '', text)

In [None]:
# Second cleaning step: strip the "word:" heading markers from the already tag-free sample.
sample_text_processed = remove_headings(sample_text_processed)
print (sample_text_processed)

Remove `&quot` entities, replace multiple consecutive spaces with a single space, and trim leading/trailing whitespace

In [None]:
def replace_mult_spaces(text):
    """Drop every "&quot" occurrence, squeeze runs of spaces to one space and trim both ends.

    Only the space character is squeezed; the final strip() removes any
    leading/trailing whitespace. Returns the resulting string.
    """
    without_quot = "".join(text.split("&quot"))
    squeezed = re.sub(' +', r' ', without_quot)
    return squeezed.strip()

In [None]:
# Third cleaning step: remove "&quot" and collapse repeated spaces in the sample.
sample_text_processed = replace_mult_spaces(sample_text_processed)
print (sample_text_processed)

Remove other special characters: parentheses, `!`, `@`, `&` and `;`

In [None]:
def replace_other_chars(text):
    """Delete every occurrence of the characters ( ) ! @ & ; from the text and return it."""
    return re.sub(r'[()!@&;]', r'', text)

In [None]:
# Fourth cleaning step: drop the remaining special characters from the sample.
sample_text_processed = replace_other_chars(sample_text_processed)
print (sample_text_processed)

Putting everything together in a function and apply the cleaning on all the texts. Further, convert everything into lower case.

In [None]:
def clean_text(text):
    """Run the full cleaning pipeline on one note and return it lower-cased.

    Steps, in order: strip newlines/HTML tags, strip "word:" headings, remove
    "&quot" and normalise spaces, delete the remaining special characters,
    normalise spaces once more, lower-case.
    """
    text = remove_html(text)
    text = remove_headings(text)
    # Must run before replace_other_chars: it removes "&quot" while the '&' is still present.
    text = replace_mult_spaces(text)
    text = replace_other_chars(text)
    # Deleting characters can bring back doubled or leading/trailing spaces
    # (e.g. "a ( b" -> "a  b"), so normalise whitespace again.
    text = replace_mult_spaces(text)
    return text.lower()

In [None]:
# Clean every note; the raw text stays available in the 'text' column.
data['clean_text'] = data['text'].apply(clean_text)

<a id='eda'></a>

### Basic descriptive analysis on the texts

In [None]:
import matplotlib.pyplot as plt

# Character counts per note. The raw text is drawn first so that the legend order
# ["before cleaning", "after cleaning"] matches the plotted series; the original drew
# clean_text first, which swapped the two labels. alpha keeps the overlapping bars visible.
data.text.apply(len).plot.hist(alpha=0.6)
data.clean_text.apply(len).plot.hist(alpha=0.6)
plt.title("Distribution of total number of characters in the clinical notes")
plt.legend(["before cleaning","after cleaning"])
plt.show()

In [None]:
# Word counts per note (whitespace split). The raw text is drawn first so that the legend
# order ["before cleaning", "after cleaning"] matches the plotted series; the original drew
# clean_text first, which swapped the two labels. alpha keeps the overlapping bars visible.
data.text.apply(lambda x: len(x.split())).plot.hist(alpha=0.6)
data.clean_text.apply(lambda x: len(x.split())).plot.hist(alpha=0.6)
plt.title("Distribution of total number of words in the clinical notes")
plt.legend(["before cleaning","after cleaning"])
plt.show()

<a id='stemming'></a>

### Stemming and Lemmatization

Stemming changes word into its root stem. 

<img src = https://miro.medium.com/max/359/1*l65c30sY9fQsWPKIckqmCQ.png>

However, the root stem may not be a valid dictionary word. Lemmatization, on the other hand, reduces a word to its dictionary form (lemma), which requires a higher level of linguistic abstraction.

<img src = https://devopedia.org/images/article/227/6785.1570815200.png>


In [None]:
# Switch the running example to the second cleaned note.
sample_text = data['clean_text'].iloc[1]
print(sample_text)

In [None]:
import nltk

def simple_stemmer(text):
    """Stem every whitespace-separated word with NLTK's English Snowball stemmer.

    Returns the stems joined by single spaces.
    """
    stemmer = nltk.stem.SnowballStemmer('english')
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

In [None]:
# Stem the sample note and show the result.
stemmed_text = simple_stemmer(sample_text)
print (stemmed_text)

In [None]:
import spacy
import en_core_med7_lg #en_core_web_sm

# Load the Med7 model installed by the pip cell near the top; `nlp` is called on text by the cells below.
nlp = en_core_med7_lg.load()

def simple_lemmatizer(text):
    """Replace every token of the text with its lemma and return them joined by spaces.

    Tokens whose lemma is the placeholder '-PRON-' keep their original text.
    """
    doc = nlp(text)
    lemmas = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [None]:
# Lemmatize the sample note and show the result.
lemmatized_text = simple_lemmatizer(sample_text)
print (lemmatized_text)

<a id='tokenization'></a>

### Tokenization

Tokenization splits a text into tokens (words). Typically, words are split on blank spaces, but a tokenizer can also split words joined by other characters.

In [None]:
# Tokenize the second cleaned note and print each token's text next to its `pos_` attribute.
sample_text = data['clean_text'].iloc[1]
doc = nlp(sample_text)
for tok in doc:
    print(tok.text, tok.pos_)

<a id='stopword'></a>

### Stop word removal

Let us first see the most frequent words in the dataset

In [None]:
# Concatenate all cleaned notes, split on whitespace and show the 20 most frequent words.
corpus_words = " ".join(data['clean_text'].values).split()
pd.Series(corpus_words).value_counts().head(20)

The top 10 words by frequency are common English words such as articles, conjunctions, prepositions, etc. These words often do not play a significant role in downstream applications, so we remove them to reduce model complexity.

In [None]:
# NLTK's English stop word list; used by lemmatize_and_remove_stopwords below.
stopword_list = nltk.corpus.stopwords.words('english')

# Show the first ten entries as a sanity check.
print (stopword_list[:10])

In [None]:
def lemmatize_and_remove_stopwords(text):
    """Lemmatize the text and drop every token that is an English stop word.

    Tokens whose lemma is the placeholder '-PRON-' keep their original text.
    The stop word check is case-insensitive; surviving tokens keep their own
    casing. Returns the remaining tokens joined by single spaces.
    """
    doc = nlp(text)
    tokens = [word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in doc]
    # Set membership is O(1) per token; scanning the list for every token was needlessly slow.
    stopwords = set(stopword_list)
    filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    return ' '.join(filtered_tokens)

In [None]:
# Display the sample note before stop word removal, for comparison with the next cell.
sample_text

In [None]:
# Lemmatize the sample note, remove its stop words and show the result.
sample_text_processed = lemmatize_and_remove_stopwords(sample_text)
print (sample_text_processed)

In [None]:
# Overwrite the cleaned column with its lemmatized, stop-word-free version (runs the pipeline once per note).
data['clean_text'] = data['clean_text'].apply(lemmatize_and_remove_stopwords)

In [None]:
# Histogram of per-note word counts (whitespace split) of the processed text.
word_counts = data['clean_text'].apply(lambda note: len(note.split()))
ax = word_counts.plot.hist()
ax.set_title("Distribution of total number of words in the texts")
plt.show()

In [None]:
#data.to_csv("clinical_notes_cleaned.csv",index=False)

### References for further reading

<strong> NLP overview - </strong> https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72

<strong> Regular Expressions - </strong> https://regex101.com/ 

<strong> Spacy - </strong> https://spacy.io/usage/spacy-101

<strong> NLTK - </strong> https://www.nltk.org/book/

