## Imports 

In [1]:
!pip install pandas
!pip install nltk

Collecting pandas
  Using cached pandas-2.1.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
Collecting tzdata>=2022.1
  Using cached tzdata-2023.3-py2.py3-none-any.whl (341 kB)
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.1.4 tzdata-2023.3
Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Using cached regex-2023.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
Collecting click
  Using cached click-8.1.7-py3-none-any.whl (97 kB)
Collecting joblib
  Using cached joblib-1.3.2-py3-none-any.whl (302 kB)
Installing collected packages: regex, joblib, click, nltk
Successfully installed click-8.1.7 joblib-1.3.2 nltk-3.8.1 regex-2023.10.3


In [32]:
import re

import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer

# Download the NLTK data packages this notebook relies on, in a fixed order.
# Packages that are already present are simply reported as up to date.
for resource in ('wordnet', 'punkt', 'stopwords'):
    nltk.download(resource)

# English stopwords as a set, for fast membership tests when filtering tokens.
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Labels file plus the train/test CSVs, all read from the working directory.
csv_paths = ('medical_tc_labels.csv', 'medical_tc_train.csv', 'medical_tc_test.csv')
df_labels, df_train, df_test = map(pd.read_csv, csv_paths)

Text preprocessing involves standardising text and discarding information that is not useful to our classifier.

In [37]:
# Use the abstract in the row labelled 1 as a running example.
sample_doc = df_train.loc[1, 'medical_abstract']
print(sample_doc)

Neuropeptide Y and neuron-specific enolase levels in benign and malignant pheochromocytomas. Neuron-specific enolase (NSE) is the isoform of enolase, a glycolytic enzyme found in the neuroendocrine system. Neuropeptide Y (NPY) is a peptide recently discovered in the peripheral and central nervous systems. Serum NSE and plasma NPY levels have been reported to be increased in some patients with pheochromocytoma. The authors evaluated whether the measurement of these molecules could help to discriminate between benign and malignant forms of pheochromocytoma. The NSE levels were normal in all patients with benign pheochromocytoma (n = 13) and elevated in one half of those with malignant pheochromocytoma (n = 13). Plasma NPY levels were on the average significantly higher in the malignant (177.1 +/- 38.9 pmol/l, n = 16) than in the benign forms of the disease (15.7 +/- 389 pmol/l, n = 24). However, there was no difference in the percentage of patients with elevated NPY levels. These results

Clearly the case of a character is irrelevant, so we make all characters lower case.

In [38]:
# NOTE: sample_doc is reassigned in place by each preprocessing cell, so these
# cells must be run in order, starting from the cell that loads sample_doc.
sample_doc = sample_doc.lower()
print(sample_doc)

neuropeptide y and neuron-specific enolase levels in benign and malignant pheochromocytomas. neuron-specific enolase (nse) is the isoform of enolase, a glycolytic enzyme found in the neuroendocrine system. neuropeptide y (npy) is a peptide recently discovered in the peripheral and central nervous systems. serum nse and plasma npy levels have been reported to be increased in some patients with pheochromocytoma. the authors evaluated whether the measurement of these molecules could help to discriminate between benign and malignant forms of pheochromocytoma. the nse levels were normal in all patients with benign pheochromocytoma (n = 13) and elevated in one half of those with malignant pheochromocytoma (n = 13). plasma npy levels were on the average significantly higher in the malignant (177.1 +/- 38.9 pmol/l, n = 16) than in the benign forms of the disease (15.7 +/- 389 pmol/l, n = 24). however, there was no difference in the percentage of patients with elevated npy levels. these results

We tokenize the document so that we can work with individual words (tokens).

In [39]:
# After this cell sample_doc is a list of tokens rather than a string, so
# re-running this cell on its own would feed that list back into the
# tokenizer; re-run from the cell that loads sample_doc instead.
sample_doc = nltk.word_tokenize(sample_doc)
print(sample_doc)

['neuropeptide', 'y', 'and', 'neuron-specific', 'enolase', 'levels', 'in', 'benign', 'and', 'malignant', 'pheochromocytomas', '.', 'neuron-specific', 'enolase', '(', 'nse', ')', 'is', 'the', 'isoform', 'of', 'enolase', ',', 'a', 'glycolytic', 'enzyme', 'found', 'in', 'the', 'neuroendocrine', 'system', '.', 'neuropeptide', 'y', '(', 'npy', ')', 'is', 'a', 'peptide', 'recently', 'discovered', 'in', 'the', 'peripheral', 'and', 'central', 'nervous', 'systems', '.', 'serum', 'nse', 'and', 'plasma', 'npy', 'levels', 'have', 'been', 'reported', 'to', 'be', 'increased', 'in', 'some', 'patients', 'with', 'pheochromocytoma', '.', 'the', 'authors', 'evaluated', 'whether', 'the', 'measurement', 'of', 'these', 'molecules', 'could', 'help', 'to', 'discriminate', 'between', 'benign', 'and', 'malignant', 'forms', 'of', 'pheochromocytoma', '.', 'the', 'nse', 'levels', 'were', 'normal', 'in', 'all', 'patients', 'with', 'benign', 'pheochromocytoma', '(', 'n', '=', '13', ')', 'and', 'elevated', 'in', 'one

Numbers and special characters are not specific to classes and so will not provide information about what class a document belongs to. Therefore, we remove all tokens that do not consist solely of letters. Note that this also drops hyphenated tokens such as 'neuron-specific'.

In [40]:
# Keep only tokens made up entirely of the letters a-z / A-Z.
sample_doc = list(filter(re.compile('^[a-zA-Z]+$').match, sample_doc))
print(sample_doc)

['neuropeptide', 'y', 'and', 'enolase', 'levels', 'in', 'benign', 'and', 'malignant', 'pheochromocytomas', 'enolase', 'nse', 'is', 'the', 'isoform', 'of', 'enolase', 'a', 'glycolytic', 'enzyme', 'found', 'in', 'the', 'neuroendocrine', 'system', 'neuropeptide', 'y', 'npy', 'is', 'a', 'peptide', 'recently', 'discovered', 'in', 'the', 'peripheral', 'and', 'central', 'nervous', 'systems', 'serum', 'nse', 'and', 'plasma', 'npy', 'levels', 'have', 'been', 'reported', 'to', 'be', 'increased', 'in', 'some', 'patients', 'with', 'pheochromocytoma', 'the', 'authors', 'evaluated', 'whether', 'the', 'measurement', 'of', 'these', 'molecules', 'could', 'help', 'to', 'discriminate', 'between', 'benign', 'and', 'malignant', 'forms', 'of', 'pheochromocytoma', 'the', 'nse', 'levels', 'were', 'normal', 'in', 'all', 'patients', 'with', 'benign', 'pheochromocytoma', 'n', 'and', 'elevated', 'in', 'one', 'half', 'of', 'those', 'with', 'malignant', 'pheochromocytoma', 'n', 'plasma', 'npy', 'levels', 'were', 'o

Tokens of length 1 are now either genuine one-letter words (not class specific) or artefacts of our previous steps (not real words). In both cases we can remove them.

In [41]:
# Keep tokens that are at least two characters long.
sample_doc = list(filter(lambda tok: len(tok) > 1, sample_doc))
print(sample_doc)

['neuropeptide', 'and', 'enolase', 'levels', 'in', 'benign', 'and', 'malignant', 'pheochromocytomas', 'enolase', 'nse', 'is', 'the', 'isoform', 'of', 'enolase', 'glycolytic', 'enzyme', 'found', 'in', 'the', 'neuroendocrine', 'system', 'neuropeptide', 'npy', 'is', 'peptide', 'recently', 'discovered', 'in', 'the', 'peripheral', 'and', 'central', 'nervous', 'systems', 'serum', 'nse', 'and', 'plasma', 'npy', 'levels', 'have', 'been', 'reported', 'to', 'be', 'increased', 'in', 'some', 'patients', 'with', 'pheochromocytoma', 'the', 'authors', 'evaluated', 'whether', 'the', 'measurement', 'of', 'these', 'molecules', 'could', 'help', 'to', 'discriminate', 'between', 'benign', 'and', 'malignant', 'forms', 'of', 'pheochromocytoma', 'the', 'nse', 'levels', 'were', 'normal', 'in', 'all', 'patients', 'with', 'benign', 'pheochromocytoma', 'and', 'elevated', 'in', 'one', 'half', 'of', 'those', 'with', 'malignant', 'pheochromocytoma', 'plasma', 'npy', 'levels', 'were', 'on', 'the', 'average', 'signifi

We now remove stopwords as they carry no information about the class.

In [42]:
# Drop every token that appears in the English stopword set.
sample_doc = list(filter(lambda tok: tok not in en_stop, sample_doc))
print(sample_doc)

['neuropeptide', 'enolase', 'levels', 'benign', 'malignant', 'pheochromocytomas', 'enolase', 'nse', 'isoform', 'enolase', 'glycolytic', 'enzyme', 'found', 'neuroendocrine', 'system', 'neuropeptide', 'npy', 'peptide', 'recently', 'discovered', 'peripheral', 'central', 'nervous', 'systems', 'serum', 'nse', 'plasma', 'npy', 'levels', 'reported', 'increased', 'patients', 'pheochromocytoma', 'authors', 'evaluated', 'whether', 'measurement', 'molecules', 'could', 'help', 'discriminate', 'benign', 'malignant', 'forms', 'pheochromocytoma', 'nse', 'levels', 'normal', 'patients', 'benign', 'pheochromocytoma', 'elevated', 'one', 'half', 'malignant', 'pheochromocytoma', 'plasma', 'npy', 'levels', 'average', 'significantly', 'higher', 'malignant', 'benign', 'forms', 'disease', 'however', 'difference', 'percentage', 'patients', 'elevated', 'npy', 'levels', 'results', 'show', 'determination', 'serum', 'nse', 'may', 'useful', 'distinguishing', 'malignant', 'benign', 'pheochromocytoma', 'measurement', 

Now we lemmatize, which reduces words to their base form.

In [43]:
# One lemmatizer instance, applied to every token in order.
lemmatizer = WordNetLemmatizer()
sample_doc = list(map(lemmatizer.lemmatize, sample_doc))
print(sample_doc)

['neuropeptide', 'enolase', 'level', 'benign', 'malignant', 'pheochromocytoma', 'enolase', 'nse', 'isoform', 'enolase', 'glycolytic', 'enzyme', 'found', 'neuroendocrine', 'system', 'neuropeptide', 'npy', 'peptide', 'recently', 'discovered', 'peripheral', 'central', 'nervous', 'system', 'serum', 'nse', 'plasma', 'npy', 'level', 'reported', 'increased', 'patient', 'pheochromocytoma', 'author', 'evaluated', 'whether', 'measurement', 'molecule', 'could', 'help', 'discriminate', 'benign', 'malignant', 'form', 'pheochromocytoma', 'nse', 'level', 'normal', 'patient', 'benign', 'pheochromocytoma', 'elevated', 'one', 'half', 'malignant', 'pheochromocytoma', 'plasma', 'npy', 'level', 'average', 'significantly', 'higher', 'malignant', 'benign', 'form', 'disease', 'however', 'difference', 'percentage', 'patient', 'elevated', 'npy', 'level', 'result', 'show', 'determination', 'serum', 'nse', 'may', 'useful', 'distinguishing', 'malignant', 'benign', 'pheochromocytoma', 'measurement', 'plasma', 'npy'

Now we can create a function that applies these techniques to a document.

In [47]:
def preprocess(doc, min_length=2):
    """Turn a raw document string into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase the text, tokenize it with
    ``nltk.word_tokenize``, keep only tokens made up entirely of the letters
    a-z / A-Z, drop tokens shorter than ``min_length``, drop tokens found in
    ``en_stop``, and lemmatize what remains with ``WordNetLemmatizer``.

    Args:
        doc: The document text to preprocess.
        min_length: Minimum number of characters a token needs in order to be
            kept. The default of 2 removes single-character tokens.

    Returns:
        A list of token strings in their original document order.
    """
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()

    # Lowercase, then tokenize.
    toks = nltk.word_tokenize(doc.lower())

    # Remove tokens that don't exclusively contain letters (this rejects
    # numbers, punctuation and hyphenated tokens), tokens that are too short,
    # and stopwords. Stopwords are removed before lemmatizing.
    toks = [
        tok for tok in toks
        if re.match('^[a-zA-Z]+$', tok)
        and len(tok) >= min_length
        and tok not in en_stop
    ]

    # Lemmatize.
    return [lemmatizer.lemmatize(tok) for tok in toks]

In [48]:
# Run the whole pipeline on the same abstract (row labelled 1) used in the
# step-by-step walkthrough.
sample_doc0 = preprocess(df_train.loc[1, 'medical_abstract'])
print(sample_doc0)

['neuropeptide', 'enolase', 'level', 'benign', 'malignant', 'pheochromocytoma', 'enolase', 'nse', 'isoform', 'enolase', 'glycolytic', 'enzyme', 'found', 'neuroendocrine', 'system', 'neuropeptide', 'npy', 'peptide', 'recently', 'discovered', 'peripheral', 'central', 'nervous', 'system', 'serum', 'nse', 'plasma', 'npy', 'level', 'reported', 'increased', 'patient', 'pheochromocytoma', 'author', 'evaluated', 'whether', 'measurement', 'molecule', 'could', 'help', 'discriminate', 'benign', 'malignant', 'form', 'pheochromocytoma', 'nse', 'level', 'normal', 'patient', 'benign', 'pheochromocytoma', 'elevated', 'one', 'half', 'malignant', 'pheochromocytoma', 'plasma', 'npy', 'level', 'average', 'significantly', 'higher', 'malignant', 'benign', 'form', 'disease', 'however', 'difference', 'percentage', 'patient', 'elevated', 'npy', 'level', 'result', 'show', 'determination', 'serum', 'nse', 'may', 'useful', 'distinguishing', 'malignant', 'benign', 'pheochromocytoma', 'measurement', 'plasma', 'npy'