## imports/functions/reading in

In [1]:
import numpy as np
import sklearn
import pandas as pd
import nltk
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
def preprocess(doc):
        #lowercases document
        doc = doc.lower()
        #removes any non-letter characters
        #tokenize
        toks = nltk.word_tokenize(doc)
        #remove tokens of lenth <= n (can be varied)
        toks = [tok for tok in toks if len(tok) > 1]
        #remove tokens that dont exclusively contain letters
        toks = [tok for tok in toks if re.match('^[a-zA-Z]+$', tok)]
        #remove stopwords
        toks = [tok for tok in toks if tok not in en_stop]
        #lemmatize
        toks = [WordNetLemmatizer().lemmatize(tok) for tok in toks]
        return toks

def preprocess4(doc):
        #lowercases document
        doc = doc.lower()
        #removes any non-letter characters
        #tokenize
        toks = nltk.word_tokenize(doc)
        #remove tokens of lenth <= n (can be varied)
        toks = [tok for tok in toks if len(tok) > 4]
        #remove tokens that dont exclusively contain letters
        toks = [tok for tok in toks if re.match('^[a-zA-Z]+$', tok)]
        #remove stopwords
        toks = [tok for tok in toks if tok not in en_stop]
        #lemmatize
        toks = [WordNetLemmatizer().lemmatize(tok) for tok in toks]
        return toks
def tok(doc):
  return doc
def performance(conf_mat):
  TP = 0
  for i in range(conf_mat.shape[0]):
    Class = df_labels['condition_name'][i]
    tp = conf_mat[i, i]
    fp = np.sum(conf_mat[:, i]) - tp
    fn = np.sum(conf_mat[i, :]) - tp
    tn = np.sum(conf_mat) - (tp + fp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    TP += tp
    print(Class.upper())
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")
  accuracy_tot = TP / np.sum(conf_mat)
  print(f"Total Accuracy: {accuracy_tot:.3f}")
  return

In [3]:
df_train = pd.read_csv('medical_tc_train.csv')
df_test = pd.read_csv('medical_tc_test.csv')
df_labels = pd.read_csv('medical_tc_labels.csv')

In [4]:
print(df_train.shape)
print(df_test.shape)

(11550, 2)
(2888, 2)


In [5]:
df = pd.concat([df_train,df_test],axis = 0)
df.reset_index(drop=True, inplace=True)

## CountVectorizer

In [165]:
corp_train = df_train['medical_abstract']
corp_test = df_test['medical_abstract']

In [166]:
count = CountVectorizer(preprocessor=preprocess, tokenizer=tok)

count_mat_train = count.fit_transform(corp_train)
print(count_mat_train.shape)



(11550, 28079)


In [167]:
count_mat_test = count.transform(corp_test)
print(count_mat_test.shape)

(2888, 28079)


In [168]:
labels_train = df_train['condition_label']
labels_test = df_test['condition_label']

In [169]:
classifier = MultinomialNB()

In [170]:
classifier.fit(count_mat_train, labels_train)

In [171]:
labels_pred = classifier.predict(count_mat_test)

In [172]:
conf_mat = confusion_matrix(labels_test, labels_pred)
print(conf_mat)

[[490  40  35  14  54]
 [ 37 188   6  10  58]
 [ 28   8 240  32  77]
 [ 10  12  33 466  89]
 [171 141 131 207 311]]


In [173]:
performance(conf_mat)

NEOPLASMS
Accuracy: 0.87, Precision: 0.67, Recall: 0.77
DIGESTIVE SYSTEM DISEASES
Accuracy: 0.89, Precision: 0.48, Recall: 0.63
NERVOUS SYSTEM DISEASES
Accuracy: 0.88, Precision: 0.54, Recall: 0.62
CARDIOVASCULAR DISEASES
Accuracy: 0.86, Precision: 0.64, Recall: 0.76
GENERAL PATHOLOGICAL CONDITIONS
Accuracy: 0.68, Precision: 0.53, Recall: 0.32
Total Accuracy: 0.587


Seems to be a decent classifier for the classes 1-4 but 5 causing some trouble.

In [174]:
print(df_labels)

   condition_label                   condition_name
0                1                        neoplasms
1                2        digestive system diseases
2                3          nervous system diseases
3                4          cardiovascular diseases
4                5  general pathological conditions


This is to be expected since general pathological conditions is a more general class of abstracts

## CountVectorizer, short words removed

Now we will remove tokens of length 4 or less

In [227]:
count1 = CountVectorizer(preprocessor=preprocess4, tokenizer=tok)

count_mat_train1 = count1.fit_transform(corp_train)
print(count_mat_train1.shape)



(11550, 24875)


In [228]:
count_mat_test1 = count1.transform(corp_test)
print(count_mat_test1.shape)

(2888, 24875)


In [229]:
classifier1 = MultinomialNB()

In [230]:
classifier1.fit(count_mat_train1, labels_train)

In [231]:
labels_pred1 = classifier1.predict(count_mat_test1)

In [232]:
conf_mat1 = confusion_matrix(labels_test, labels_pred1)
print(conf_mat1)

[[501  36  32  14  50]
 [ 37 187   5   7  63]
 [ 30  11 240  33  71]
 [ 14  12  32 461  91]
 [168 148 131 207 307]]


In [233]:
performance(conf_mat1)

NEOPLASMS
Accuracy: 0.87, Precision: 0.67, Recall: 0.79
DIGESTIVE SYSTEM DISEASES
Accuracy: 0.89, Precision: 0.47, Recall: 0.63
NERVOUS SYSTEM DISEASES
Accuracy: 0.88, Precision: 0.55, Recall: 0.62
CARDIOVASCULAR DISEASES
Accuracy: 0.86, Precision: 0.64, Recall: 0.76
GENERAL PATHOLOGICAL CONDITIONS
Accuracy: 0.68, Precision: 0.53, Recall: 0.32
Total Accuracy: 0.587


We see no change in performance, but an improvement in efficiency. We could make the interpretation that most medical terms that are specific to each class (and therefore hold more information) are 'long' and so removing short words won't hinder our classifier.

## Tf-Idf

In [183]:
tfidf = TfidfVectorizer(preprocessor=preprocess4, tokenizer=tok)
tfidf_mat_train = tfidf.fit_transform(corp_train)



In [184]:
tfidf_mat_test = tfidf.transform(corp_test)
print(tfidf_mat_test.shape)

(2888, 24875)


In [185]:
classifier2 = MultinomialNB()

In [186]:
classifier2.fit(tfidf_mat_train, labels_train)

In [187]:
labels_pred2 = classifier2.predict(tfidf_mat_test)

In [188]:
conf_mat2 = confusion_matrix(labels_test, labels_pred2)
print(conf_mat2)

[[434   0   0   9 190]
 [ 30   6   1   7 255]
 [ 21   0  12  24 328]
 [  6   0   2 364 238]
 [111   1   7 140 702]]


In [189]:
performance(conf_mat2)

NEOPLASMS
Accuracy: 0.87, Precision: 0.72, Recall: 0.69
DIGESTIVE SYSTEM DISEASES
Accuracy: 0.90, Precision: 0.86, Recall: 0.02
NERVOUS SYSTEM DISEASES
Accuracy: 0.87, Precision: 0.55, Recall: 0.03
CARDIOVASCULAR DISEASES
Accuracy: 0.85, Precision: 0.67, Recall: 0.60
GENERAL PATHOLOGICAL CONDITIONS
Accuracy: 0.56, Precision: 0.41, Recall: 0.73
Total Accuracy: 0.526


seem to be underprediciting the underrepresented classes, we can attempt to fix this by setting a uniform prior on the classes, instead of the classifier learning the prior probabilities from the data.

In [190]:
classifier3 = MultinomialNB(fit_prior=False)

In [191]:
classifier3.fit(tfidf_mat_train, labels_train)

In [192]:
labels_pred3 = classifier3.predict(tfidf_mat_test)

In [193]:
conf_mat3 = confusion_matrix(labels_test, labels_pred3)
print(conf_mat3)

[[486   9   7  14 117]
 [ 42  42   2  11 202]
 [ 30   1  87  37 230]
 [ 11   2   7 435 155]
 [144  26  42 185 564]]


In [194]:
performance(conf_mat3)

NEOPLASMS
Accuracy: 0.87, Precision: 0.68, Recall: 0.77
DIGESTIVE SYSTEM DISEASES
Accuracy: 0.90, Precision: 0.53, Recall: 0.14
NERVOUS SYSTEM DISEASES
Accuracy: 0.88, Precision: 0.60, Recall: 0.23
CARDIOVASCULAR DISEASES
Accuracy: 0.85, Precision: 0.64, Recall: 0.71
GENERAL PATHOLOGICAL CONDITIONS
Accuracy: 0.62, Precision: 0.44, Recall: 0.59
Total Accuracy: 0.559


 We can see a slight improvement, but the classifier is still overfitting. Can conduct a gridsearch to optimize the smoothing parameter alpha to prevent this.

## gridsearch

In [195]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [196]:
pipeline = Pipeline([
    ('clf', MultinomialNB(fit_prior=False))])

In [224]:
params = np.arange(.1, 2.6, 0.1)
param_grid = {
    'clf__alpha': params}

In [207]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

In [236]:
grid_search.fit(tfidf_mat_train, labels_train)
print(grid_search.best_params_)

{'clf__alpha': 0.2}


In [219]:
classifier4 = MultinomialNB(fit_prior=False, alpha=0.2)

In [220]:
classifier4.fit(tfidf_mat_train, labels_train)

In [221]:
labels_pred4 = classifier4.predict(tfidf_mat_test)

In [222]:
conf_mat = confusion_matrix(labels_test, labels_pred)
print(conf_mat)

[[490  40  35  14  54]
 [ 37 188   6  10  58]
 [ 28   8 240  32  77]
 [ 10  12  33 466  89]
 [171 141 131 207 311]]


In [214]:
performance(conf_mat)

NEOPLASMS
Accuracy: 0.87, Precision: 0.67, Recall: 0.77
DIGESTIVE SYSTEM DISEASES
Accuracy: 0.89, Precision: 0.48, Recall: 0.63
NERVOUS SYSTEM DISEASES
Accuracy: 0.88, Precision: 0.54, Recall: 0.62
CARDIOVASCULAR DISEASES
Accuracy: 0.86, Precision: 0.64, Recall: 0.76
GENERAL PATHOLOGICAL CONDITIONS
Accuracy: 0.68, Precision: 0.53, Recall: 0.32
Total Accuracy: 0.587


Motivated by this, we can also conduct a gridsearch to find the optimal smoothing parameter for the count vectorizer, keeping the uniform prior to improve representation

In [237]:
grid_search.fit(count_mat_train1, labels_train)
print(grid_search.best_params_)

{'clf__alpha': 1.8000000000000003}


In [238]:
classifier5 = MultinomialNB(fit_prior=False, alpha=1.8)

In [240]:
classifier5.fit(count_mat_train1, labels_train)

In [241]:
labels_pred5 = classifier5.predict(count_mat_test1)

In [242]:
conf_mat5 = confusion_matrix(labels_test, labels_pred5)
print(conf_mat5)

[[504  30  31  14  54]
 [ 39 175   5   7  73]
 [ 31   8 235  34  77]
 [ 15   9  25 467  94]
 [168 128 121 207 337]]


In [243]:
performance(conf_mat5)

NEOPLASMS
Accuracy: 0.87, Precision: 0.67, Recall: 0.80
DIGESTIVE SYSTEM DISEASES
Accuracy: 0.90, Precision: 0.50, Recall: 0.59
NERVOUS SYSTEM DISEASES
Accuracy: 0.89, Precision: 0.56, Recall: 0.61
CARDIOVASCULAR DISEASES
Accuracy: 0.86, Precision: 0.64, Recall: 0.77
GENERAL PATHOLOGICAL CONDITIONS
Accuracy: 0.68, Precision: 0.53, Recall: 0.35
Total Accuracy: 0.595


## removing general class

Now we will remove the general class, and compare our optimal approaches for each embedder

In [244]:
df_filt_train = df_train[df_train['condition_label'] != 5]
df_filt_train.reset_index(drop=True, inplace=True)
df_filt_test = df_test[df_test['condition_label'] != 5]
df_filt_test.reset_index(drop=True, inplace=True)

In [245]:
corp_filt_train = df_filt_train['medical_abstract']
corp_filt_test = df_filt_test['medical_abstract']
labels_filt_train = df_filt_train['condition_label']
labels_filt_test = df_filt_test['condition_label']

In [246]:
count_filt = CountVectorizer(preprocessor=preprocess4, tokenizer=tok)

count_mat_filt_train = count_filt.fit_transform(corp_filt_train)
print(count_mat_filt_train.shape)



(7706, 21403)


In [123]:
count_mat_filt_test = count_filt.transform(corp_filt_test)
print(count_mat_filt_test.shape)

(1927, 21403)


In [253]:
classifier6 = MultinomialNB(fit_prior=False,alpha = 1.8)

In [254]:
classifier6.fit(count_mat_filt_train, labels_filt_train)

In [255]:
labels_filt_pred1 = classifier6.predict(count_mat_filt_test)

In [256]:
conf_mat6 = confusion_matrix(labels_filt_test, labels_filt_pred1)
print(conf_mat6)

[[525  40  44  24]
 [ 45 228  10  16]
 [ 37  16 284  48]
 [ 21  21  39 529]]


In [257]:
performance(conf_mat6)

NEOPLASMS
Accuracy: 0.89, Precision: 0.84, Recall: 0.83
DIGESTIVE SYSTEM DISEASES
Accuracy: 0.92, Precision: 0.75, Recall: 0.76
NERVOUS SYSTEM DISEASES
Accuracy: 0.90, Precision: 0.75, Recall: 0.74
CARDIOVASCULAR DISEASES
Accuracy: 0.91, Precision: 0.86, Recall: 0.87
Total Accuracy: 0.813


## tfidf filtered

In [280]:
tfidf1 = TfidfVectorizer(preprocessor=preprocess4, tokenizer=tok)
tfidf_mat_train = tfidf1.fit_transform(corp_filt_train)
print(tfidf_mat_train.shape)



(7706, 21403)


In [283]:
tfidf_mat_filt_test = tfidf1.transform(corp_filt_test)
print(tfidf_mat_filt_test.shape)

(1927, 21403)


In [303]:
classifier7 = MultinomialNB(fit_prior=False,alpha = .2)

In [304]:
classifier7.fit(tfidf_mat_filt_train, labels_filt_train)

In [305]:
labels_filt_pred2 = classifier7.predict(tfidf_mat_filt_test)

In [306]:
conf_mat7 = confusion_matrix(labels_filt_test, labels_filt_pred2)
print(conf_mat7)

[[517  43  42  31]
 [ 48 222   8  21]
 [ 39  18 278  50]
 [ 19  19  37 535]]


In [307]:
performance(conf_mat7)

NEOPLASMS
Accuracy: 0.88, Precision: 0.83, Recall: 0.82
DIGESTIVE SYSTEM DISEASES
Accuracy: 0.92, Precision: 0.74, Recall: 0.74
NERVOUS SYSTEM DISEASES
Accuracy: 0.90, Precision: 0.76, Recall: 0.72
CARDIOVASCULAR DISEASES
Accuracy: 0.91, Precision: 0.84, Recall: 0.88
Total Accuracy: 0.805
