We will implement the bag-of-words model using two methods of text embedding and a naive Bayes classifier, attempt to tune them to obtain the best performance, and compare the results.

## imports/functions/reading in data

In [None]:
import numpy as np
import sklearn
import pandas as pd
import nltk
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def preprocess(doc, min_len=2):
    """Turn a raw document into a list of cleaned, lemmatized tokens.

    The document is lowercased and tokenized with NLTK. A token is kept only
    if it is at least ``min_len`` characters long, consists solely of the
    letters a-z/A-Z, and is not in the English stopword set ``en_stop``.
    Surviving tokens are lemmatized with WordNet.

    Args:
        doc: The document text as a string.
        min_len: Minimum token length to keep. The default of 2 discards
            single-character tokens.

    Returns:
        A list of lemmatized token strings, in document order.
    """
    # Build the lemmatizer once per document instead of once per token.
    lemmatize = WordNetLemmatizer().lemmatize
    toks = nltk.word_tokenize(doc.lower())
    # One pass: length filter, letters-only filter (fullmatch requires the
    # whole token to match), stopword filter, then lemmatize.
    return [
        lemmatize(tok)
        for tok in toks
        if len(tok) >= min_len
        and re.fullmatch('[a-zA-Z]+', tok)
        and tok not in en_stop
    ]

def preprocess4(doc, min_len=5):
    """Clean a document like ``preprocess`` but also drop short words.

    The document is lowercased and tokenized with NLTK. A token is kept only
    if it is at least ``min_len`` characters long, consists solely of the
    letters a-z/A-Z, and is not in the English stopword set ``en_stop``.
    Surviving tokens are lemmatized with WordNet.

    Args:
        doc: The document text as a string.
        min_len: Minimum token length to keep. The default of 5 removes every
            token of length 4 or less.

    Returns:
        A list of lemmatized token strings, in document order.
    """
    # Build the lemmatizer once per document instead of once per token.
    lemmatize = WordNetLemmatizer().lemmatize
    toks = nltk.word_tokenize(doc.lower())
    # One pass: length filter, letters-only filter (fullmatch requires the
    # whole token to match), stopword filter, then lemmatize.
    return [
        lemmatize(tok)
        for tok in toks
        if len(tok) >= min_len
        and re.fullmatch('[a-zA-Z]+', tok)
        and tok not in en_stop
    ]
def dummy_tok(doc):
    """Identity tokenizer: hand back ``doc`` exactly as it was received.

    Used as the vectorizers' ``tokenizer`` in this notebook, where the
    preprocessor has already produced the list of tokens.
    """
    return doc
def performance(conf_mat, class_names=None):
    """Print per-class precision and recall, then the overall accuracy.

    Rows of ``conf_mat`` are treated as the true classes and columns as the
    predicted classes: false positives for class ``i`` are the rest of
    column ``i``, and false negatives are the rest of row ``i``.

    Args:
        conf_mat: Square confusion matrix (NumPy array or nested sequence).
        class_names: Optional sequence of class names, matched to the matrix
            rows by position. When omitted, the ``condition_name`` column of
            the module-level ``df_labels`` frame is used.

    Returns:
        None. The report is written to standard output.

    Raises:
        ValueError: If fewer names than classes are available.
    """
    conf_mat = np.asarray(conf_mat)
    if class_names is None:
        class_names = df_labels['condition_name']
    class_names = list(class_names)

    n_classes = conf_mat.shape[0]
    if len(class_names) < n_classes:
        raise ValueError(
            f"need {n_classes} class names, got {len(class_names)}")

    true_pos = np.diag(conf_mat)
    n_predicted = conf_mat.sum(axis=0)  # column sums: tp + fp
    n_actual = conf_mat.sum(axis=1)     # row sums: tp + fn

    for name, tp, n_pred, n_true in zip(
            class_names, true_pos, n_predicted, n_actual):
        # A class that was never predicted (or never occurs) scores 0
        # instead of dividing by zero.
        precision = tp / n_pred if n_pred != 0 else 0
        recall = tp / n_true if n_true != 0 else 0
        print(str(name).upper())
        print(f"Precision: {precision:.2f}, Recall: {recall:.2f}")

    total = conf_mat.sum()
    accuracy_tot = true_pos.sum() / total if total != 0 else 0
    print(f"Total Accuracy: {accuracy_tot:.3f}")
    return

In [None]:
# Load the train/test splits and the table that maps each condition_label id
# to its condition_name. Paths are relative to the working directory.
df_train = pd.read_csv('medical_tc_train.csv')
df_test = pd.read_csv('medical_tc_test.csv')
df_labels = pd.read_csv('medical_tc_labels.csv')

In [None]:
print(df_train.shape)
print(df_test.shape)

(11550, 2)
(2888, 2)


## CountVectorizer

In [None]:
corp_train = df_train['medical_abstract']
corp_test = df_test['medical_abstract']

We begin with a basic embedder, the count vectorizer. It builds a vocabulary of all words in the training corpus and returns a word-frequency vector for each document.

In [None]:
# preprocess() already returns a list of tokens, so the tokenizer is just the
# identity function dummy_tok.
count = CountVectorizer(preprocessor=preprocess, tokenizer=dummy_tok)

# Learn the vocabulary from the training corpus and embed it in one step.
count_mat_train = count.fit_transform(corp_train)
print(count_mat_train.shape)



(11550, 28079)


Then transform the test data using the vocabulary built from the training data.

In [None]:
count_mat_test = count.transform(corp_test)
print(count_mat_test.shape)

(2888, 28079)


In [None]:
labels_train = df_train['condition_label']
labels_test = df_test['condition_label']

We use a multinomial naive bayes classifier as it can handle our large dataset efficiently.

In [None]:
classifier = MultinomialNB()

Fit the classifier using the embedded training data.

In [None]:
classifier.fit(count_mat_train, labels_train)

We make predictions on the embedded test data and compare with test data labels.

In [None]:
labels_pred = classifier.predict(count_mat_test)

In [None]:
conf_mat = confusion_matrix(labels_test, labels_pred)
print(conf_mat)

[[490  40  35  14  54]
 [ 37 188   6  10  58]
 [ 28   8 240  32  77]
 [ 10  12  33 466  89]
 [171 141 131 207 311]]


In [None]:
performance(conf_mat)

NEOPLASMS
Precision: 0.67, Recall: 0.77
DIGESTIVE SYSTEM DISEASES
Precision: 0.48, Recall: 0.63
NERVOUS SYSTEM DISEASES
Precision: 0.54, Recall: 0.62
CARDIOVASCULAR DISEASES
Precision: 0.64, Recall: 0.76
GENERAL PATHOLOGICAL CONDITIONS
Precision: 0.53, Recall: 0.32
Total Accuracy: 0.587


Seems to have good predictions for classes 1-4, but class 5 is causing problems.

In [None]:
print(df_labels)

   condition_label                   condition_name
0                1                        neoplasms
1                2        digestive system diseases
2                3          nervous system diseases
3                4          cardiovascular diseases
4                5  general pathological conditions


This is to be expected since abstracts belonging to 'general pathological conditions' will be less correlated than abstracts belonging to a more specific class.

In [None]:
class_counts = df_train['condition_label'].value_counts()
print(class_counts)

5    3844
1    2530
4    2441
3    1540
2    1195
Name: condition_label, dtype: int64


The general class is also the dominant class, so our overall accuracy will be quite poor.

## CountVectorizer, short words removed

We might expect that in the medical context, short words contain less information than longer ones. Therefore we will experiment with removing tokens of length 4 or less to improve performance.

In [None]:
count1 = CountVectorizer(preprocessor=preprocess4, tokenizer=dummy_tok)

count_mat_train1 = count1.fit_transform(corp_train)
print(count_mat_train1.shape)



(11550, 24875)


In [None]:
count_mat_test1 = count1.transform(corp_test)
print(count_mat_test1.shape)

(2888, 24875)


In [None]:
classifier1 = MultinomialNB()

In [None]:
classifier1.fit(count_mat_train1, labels_train)

In [None]:
labels_pred1 = classifier1.predict(count_mat_test1)

In [None]:
conf_mat1 = confusion_matrix(labels_test, labels_pred1)
print(conf_mat1)

[[501  36  32  14  50]
 [ 37 187   5   7  63]
 [ 30  11 240  33  71]
 [ 14  12  32 461  91]
 [168 148 131 207 307]]


In [None]:
performance(conf_mat1)

NEOPLASMS
Precision: 0.67, Recall: 0.79
DIGESTIVE SYSTEM DISEASES
Precision: 0.47, Recall: 0.63
NERVOUS SYSTEM DISEASES
Precision: 0.55, Recall: 0.62
CARDIOVASCULAR DISEASES
Precision: 0.64, Recall: 0.76
GENERAL PATHOLOGICAL CONDITIONS
Precision: 0.53, Recall: 0.32
Total Accuracy: 0.587


We see no change in performance, but a slight improvement in efficiency, so we will proceed using this preprocessor.

## Tf-Idf

Now we will use tf-idf embedding. This method uses word frequency as before, but also takes into account the rarity of a word across the corpus. This may lead to more information being captured and improved performance for the classifier.

In [None]:
tfidf = TfidfVectorizer(preprocessor=preprocess4, tokenizer=dummy_tok)
tfidf_mat_train = tfidf.fit_transform(corp_train)



In [None]:
tfidf_mat_test = tfidf.transform(corp_test)
print(tfidf_mat_test.shape)

(2888, 24875)


In [None]:
classifier2 = MultinomialNB()

In [None]:
classifier2.fit(tfidf_mat_train, labels_train)

In [None]:
labels_pred2 = classifier2.predict(tfidf_mat_test)

In [None]:
conf_mat2 = confusion_matrix(labels_test, labels_pred2)
print(conf_mat2)

[[434   0   0   9 190]
 [ 30   6   1   7 255]
 [ 21   0  12  24 328]
 [  6   0   2 364 238]
 [111   1   7 140 702]]


In [None]:
performance(conf_mat2)

NEOPLASMS
Precision: 0.72, Recall: 0.69
DIGESTIVE SYSTEM DISEASES
Precision: 0.86, Recall: 0.02
NERVOUS SYSTEM DISEASES
Precision: 0.55, Recall: 0.03
CARDIOVASCULAR DISEASES
Precision: 0.67, Recall: 0.60
GENERAL PATHOLOGICAL CONDITIONS
Precision: 0.41, Recall: 0.73
Total Accuracy: 0.526


The classifier is heavily underpredicting classes 2 and 3, which we saw earlier are the least frequent in the dataset. We can attempt to fix this by setting a uniform prior on the classes, instead of the classifier learning the prior probabilities from the data.

In [None]:
classifier3 = MultinomialNB(fit_prior=False)

In [None]:
classifier3.fit(tfidf_mat_train, labels_train)

In [None]:
labels_pred3 = classifier3.predict(tfidf_mat_test)

In [None]:
conf_mat3 = confusion_matrix(labels_test, labels_pred3)
print(conf_mat3)

[[486   9   7  14 117]
 [ 42  42   2  11 202]
 [ 30   1  87  37 230]
 [ 11   2   7 435 155]
 [144  26  42 185 564]]


In [None]:
performance(conf_mat3)

NEOPLASMS
Precision: 0.68, Recall: 0.77
DIGESTIVE SYSTEM DISEASES
Precision: 0.53, Recall: 0.14
NERVOUS SYSTEM DISEASES
Precision: 0.60, Recall: 0.23
CARDIOVASCULAR DISEASES
Precision: 0.64, Recall: 0.71
GENERAL PATHOLOGICAL CONDITIONS
Precision: 0.44, Recall: 0.59
Total Accuracy: 0.559


We can see a slight improvement, but the classifier is still underpredicting the underrepresented classes, which suggests overfitting. We can conduct a grid search to optimize the smoothing parameter alpha to counter this.

## gridsearch

We create a pipeline containing the classifier.

In [None]:
# Single-step pipeline: the classifier is registered under the name 'clf', so
# its parameters are addressed in the grid as 'clf__<param>'.
pipeline = Pipeline([
    ('clf', MultinomialNB(fit_prior=False))])

We check values of alpha between 0 and 3 in increments of 0.1, starting from 0.1.

In [None]:
# Candidate smoothing values 0.1, 0.2, ..., 3.0. Dividing an integer range by
# 10 avoids the accumulated floating-point error of np.arange with a float
# step (e.g. 2.5000000000000004) and keeps the 3.0 endpoint in the grid.
params = np.arange(1, 31) / 10
param_grid = {'clf__alpha': params}

In [None]:
# Cross-validated (cv=5) search over param_grid, scored by accuracy.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

We feed our training data to the grid search which returns the optimal value.

In [None]:
grid_search.fit(tfidf_mat_train, labels_train)
print(grid_search.best_params_)

{'clf__alpha': 0.2}


Now let's see how this value of alpha affects our performance.

In [None]:
classifier4 = MultinomialNB(fit_prior=False, alpha=0.2)

In [None]:
classifier4.fit(tfidf_mat_train, labels_train)

In [None]:
labels_pred4 = classifier4.predict(tfidf_mat_test)

In [None]:
conf_mat4 = confusion_matrix(labels_test, labels_pred4)
print(conf_mat4)

[[497  29  35  17  55]
 [ 39 166   5  11  78]
 [ 32  12 213  38  90]
 [ 13  10  22 484  81]
 [170 126 120 220 325]]


In [None]:
performance(conf_mat4)

NEOPLASMS
Precision: 0.66, Recall: 0.79
DIGESTIVE SYSTEM DISEASES
Precision: 0.48, Recall: 0.56
NERVOUS SYSTEM DISEASES
Precision: 0.54, Recall: 0.55
CARDIOVASCULAR DISEASES
Precision: 0.63, Recall: 0.79
GENERAL PATHOLOGICAL CONDITIONS
Precision: 0.52, Recall: 0.34
Total Accuracy: 0.583


We see greatly improved results. Motivated by this, we also conduct a gridsearch to find the optimal smoothing parameter for the count vectorizer, also using the uniform prior to help with the unbalanced data.

In [None]:
grid_search.fit(count_mat_train1, labels_train)
print(grid_search.best_params_)

{'clf__alpha': 2.5000000000000004}


In [None]:
classifier5 = MultinomialNB(fit_prior=False, alpha=2.5)

In [None]:
classifier5.fit(count_mat_train1, labels_train)

In [None]:
labels_pred5 = classifier5.predict(count_mat_test1)

In [None]:
conf_mat5 = confusion_matrix(labels_test, labels_pred5)
print(conf_mat5)

[[512  23  26  14  58]
 [ 41 158   5   9  86]
 [ 32   7 213  36  97]
 [ 15   6  18 465 106]
 [166 107 110 205 373]]


In [None]:
performance(conf_mat5)

NEOPLASMS
Precision: 0.67, Recall: 0.81
DIGESTIVE SYSTEM DISEASES
Precision: 0.52, Recall: 0.53
NERVOUS SYSTEM DISEASES
Precision: 0.57, Recall: 0.55
CARDIOVASCULAR DISEASES
Precision: 0.64, Recall: 0.76
GENERAL PATHOLOGICAL CONDITIONS
Precision: 0.52, Recall: 0.39
Total Accuracy: 0.596


Again we see improved results.

## Removing the general class

Now we will remove the general class, and compare our optimal approaches for each embedder. This gives us a better idea of the effectiveness of each method.

In [None]:
# Keep only the abstracts whose label is not 5 (the general class) in both
# splits, and renumber the remaining rows from 0.
df_filt_train = (
    df_train[df_train['condition_label'] != 5].reset_index(drop=True)
)
df_filt_test = (
    df_test[df_test['condition_label'] != 5].reset_index(drop=True)
)

In [None]:
corp_filt_train = df_filt_train['medical_abstract']
corp_filt_test = df_filt_test['medical_abstract']
labels_filt_train = df_filt_train['condition_label']
labels_filt_test = df_filt_test['condition_label']

## Count Vectorizer final

In [None]:
count1 = CountVectorizer(preprocessor=preprocess4, tokenizer=dummy_tok)

count_mat_train1 = count1.fit_transform(corp_filt_train)
print(count_mat_train1.shape)



(7706, 21403)


In [None]:
count_mat_test1 = count1.transform(corp_filt_test)
print(count_mat_test1.shape)

(1927, 21403)


In [None]:
classifier6 = MultinomialNB(fit_prior=False,alpha = 2.5)

In [None]:
classifier6.fit(count_mat_train1, labels_filt_train)

In [None]:
labels_filt_pred1 = classifier6.predict(count_mat_test1)

In [None]:
conf_mat6 = confusion_matrix(labels_filt_test, labels_filt_pred1)
print(conf_mat6)

[[541  29  36  27]
 [ 48 221  11  19]
 [ 38  16 283  48]
 [ 23  20  34 533]]


In [None]:
performance(conf_mat6)

NEOPLASMS
Precision: 0.83, Recall: 0.85
DIGESTIVE SYSTEM DISEASES
Precision: 0.77, Recall: 0.74
NERVOUS SYSTEM DISEASES
Precision: 0.78, Recall: 0.74
CARDIOVASCULAR DISEASES
Precision: 0.85, Recall: 0.87
Total Accuracy: 0.819


## Tf-Idf final

In [None]:
tfidf1 = TfidfVectorizer(preprocessor=preprocess4, tokenizer=dummy_tok)
tfidf_mat_train1 = tfidf1.fit_transform(corp_filt_train)
print(tfidf_mat_train1.shape)



(7706, 21403)


In [None]:
tfidf_mat_test1 = tfidf1.transform(corp_filt_test)
print(tfidf_mat_test1.shape)

(1927, 21403)


In [None]:
classifier7 = MultinomialNB(fit_prior=False,alpha = .2)

In [None]:
classifier7.fit(tfidf_mat_train1, labels_filt_train)

In [None]:
labels_filt_pred2 = classifier7.predict(tfidf_mat_test1)

In [None]:
conf_mat7 = confusion_matrix(labels_filt_test, labels_filt_pred2)
print(conf_mat7)

[[517  43  42  31]
 [ 48 222   8  21]
 [ 39  18 278  50]
 [ 19  19  37 535]]


In [None]:
performance(conf_mat7)

NEOPLASMS
Precision: 0.83, Recall: 0.82
DIGESTIVE SYSTEM DISEASES
Precision: 0.74, Recall: 0.74
NERVOUS SYSTEM DISEASES
Precision: 0.76, Recall: 0.72
CARDIOVASCULAR DISEASES
Precision: 0.84, Recall: 0.88
Total Accuracy: 0.805


Count vectorizer is slightly more efficient and slightly outperforms tf-idf. Overall both embedders benefit from their simplicity and the method on the whole produces good predictions, when tuned, with good time efficiency.