## imports/functions/reading in

In [1]:
import numpy as np
import sklearn
import pandas as pd
import nltk
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
# Fetch the NLTK data used below: WordNet for the lemmatizer, Punkt for
# word_tokenize, and the stopword lists.
for resource in ('wordnet', 'punkt', 'stopwords'):
    nltk.download(resource)
# English stopwords as a set, for fast membership tests during preprocessing.
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [96]:
# A token is kept only if it consists exclusively of the letters a-z / A-Z.
_LETTERS_ONLY = re.compile('^[a-zA-Z]+$')


def preprocess(doc, min_len=2):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase the text, tokenize it with
    ``nltk.word_tokenize``, drop tokens shorter than ``min_len`` characters,
    drop tokens that are not made up exclusively of letters, drop English
    stopwords (``en_stop``), and lemmatize what is left with WordNet.

    Parameters
    ----------
    doc : str
        Raw text of a single document.
    min_len : int, default 2
        Minimum token length to keep. The default reproduces the original
        ``len(tok) > 1`` filter.

    Returns
    -------
    list of str
        The surviving tokens, lemmatized, in document order.
    """
    # One lemmatizer per call rather than one per token.
    lemmatizer = WordNetLemmatizer()
    kept = [
        tok
        for tok in nltk.word_tokenize(doc.lower())
        if len(tok) >= min_len
        and _LETTERS_ONLY.match(tok)
        and tok not in en_stop
    ]
    return [lemmatizer.lemmatize(tok) for tok in kept]


def preprocess4(doc):
    """Same pipeline as ``preprocess``, but also drops tokens of 4 characters or fewer."""
    return preprocess(doc, min_len=5)

def performance(conf_mat, class_names=None):
    """Print one-vs-rest accuracy, precision and recall per class, plus total accuracy.

    Parameters
    ----------
    conf_mat : array-like of shape (n_classes, n_classes)
        Confusion matrix where entry [i, j] counts samples of true class i
        predicted as class j (the layout produced by
        ``sklearn.metrics.confusion_matrix``).
    class_names : sequence of str, optional
        Display name for each row/column of ``conf_mat``, in the same order.
        When omitted, names are read from the notebook-level
        ``df_labels['condition_name']`` column, as before.

    Raises
    ------
    ValueError
        If fewer class names than confusion-matrix rows are available.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    conf_mat = np.asarray(conf_mat)
    if class_names is None:
        # Positional order, so this also works if df_labels has a non-default index.
        class_names = df_labels['condition_name'].tolist()
    else:
        class_names = list(class_names)

    n_classes = conf_mat.shape[0]
    if len(class_names) < n_classes:
        raise ValueError(
            f"got {len(class_names)} class names for a confusion matrix with {n_classes} classes"
        )

    total = conf_mat.sum()
    correct = 0
    for i in range(n_classes):
        tp = conf_mat[i, i]
        fp = conf_mat[:, i].sum() - tp  # predicted as class i, but truly another class
        fn = conf_mat[i, :].sum() - tp  # truly class i, but predicted as another class
        tn = total - (tp + fp + fn)
        # Guard every ratio so an empty row, column or matrix prints 0.00 instead of nan.
        accuracy = (tp + tn) / total if total != 0 else 0
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = tp / (tp + fn) if (tp + fn) != 0 else 0
        correct += tp
        print(class_names[i].upper())
        print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")
    accuracy_tot = correct / total if total != 0 else 0
    print(f"Total Accuracy: {accuracy_tot:.2f}")
    return

In [4]:
# Load the train split, the test split and the label-id -> name lookup table.
df_train, df_test, df_labels = map(
    pd.read_csv,
    ('medical_tc_train.csv', 'medical_tc_test.csv', 'medical_tc_labels.csv'),
)

In [5]:
# (rows, columns) of the train split, then of the test split.
print(df_train.shape, df_test.shape, sep='\n')

(11550, 2)
(2888, 2)


In [6]:
# Stack train and test into one frame with a fresh 0..n-1 index.
df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

## Term frequency with Naive Bayes

In [27]:
# Raw abstract text of each split; these are the documents fed to the vectorizers.
corp_train, corp_test = (split['medical_abstract'] for split in (df_train, df_test))

In [84]:
# `preprocess` already returns the final token list for a document, so it is
# passed as the analyzer: a callable analyzer receives the raw document and
# returns its features. The previous preprocessor=/tokenizer=tok wiring relied
# on a `tok` helper that is not defined anywhere in this notebook, so the cell
# failed on a fresh kernel.
count = CountVectorizer(analyzer=preprocess)

count_mat_train = count.fit_transform(corp_train)
print(count_mat_train.shape)

(11550, 28079)


In [79]:
# Term -> column-index mapping learned by the fitted CountVectorizer.
# NOTE(review): the recorded vocabulary size differs from the column count
# printed for count_mat_train, so the outputs appear to come from different
# runs — re-run the notebook from a fresh kernel to confirm.
vocabulary = count.vocabulary_
print(len(vocabulary))

24875


In [36]:
# Vectorize the test abstracts with the vectorizer fitted on the training
# corpus (transform only, no refit), so both matrices share the same columns.
count_mat_test = count.transform(corp_test)
print(count_mat_test.shape)

(2888, 28079)


In [37]:
# Target class ids for each split.
labels_train, labels_test = (split['condition_label'] for split in (df_train, df_test))

In [38]:
# Multinomial Naive Bayes with scikit-learn's default settings.
classifier = MultinomialNB()

In [39]:
# Fit on the training term-count matrix and the training condition labels.
classifier.fit(count_mat_train, labels_train)

In [40]:
# Predicted condition label for every test abstract (count features).
labels_pred = classifier.predict(count_mat_test)

In [41]:
# Confusion matrix with true labels passed first and predictions second:
# rows are true classes, columns are predicted classes.
conf_mat = confusion_matrix(labels_test, labels_pred)
print(conf_mat)

[[490  40  35  14  54]
 [ 37 188   6  10  58]
 [ 28   8 240  32  77]
 [ 10  12  33 466  89]
 [171 141 131 207 311]]


In [48]:
# Per-class and overall metrics for the term-count Naive Bayes model.
performance(conf_mat)

NEOPLASMS
Accuracy: 0.87, Precision: 0.67, Recall: 0.77
DIGESTIVE SYSTEM DISEASES
Accuracy: 0.89, Precision: 0.48, Recall: 0.63
NERVOUS SYSTEM DISEASES
Accuracy: 0.88, Precision: 0.54, Recall: 0.62
CARDIOVASCULAR DISEASES
Accuracy: 0.86, Precision: 0.64, Recall: 0.76
GENERAL PATHOLOGICAL CONDITIONS
Accuracy: 0.68, Precision: 0.53, Recall: 0.32
Total Accuracy: 0.59


This seems to be a decent classifier for classes 1-4, but class 5 is causing some trouble.

In [None]:
# Show the label-id -> condition-name lookup table.
print(df_labels)

   condition_label                   condition_name
0                1                        neoplasms
1                2        digestive system diseases
2                3          nervous system diseases
3                4          cardiovascular diseases
4                5  general pathological conditions


This is to be expected, since general pathological conditions is a more general class of abstracts.

## Term frequency with tokens of length 4 or less removed

In [97]:
# `preprocess4` already returns the final token list for a document, so it is
# passed as the analyzer: a callable analyzer receives the raw document and
# returns its features. The previous preprocessor=/tokenizer=tok wiring relied
# on a `tok` helper that is not defined anywhere in this notebook.
count4 = CountVectorizer(analyzer=preprocess4)

bow_mat_train4 = count4.fit_transform(corp_train)
print(bow_mat_train4.shape)



(11550, 18880)


In [101]:
# Vectorize the test abstracts with the already-fitted count4 vectorizer
# (transform only, no refit).
bow_mat_test4 = count4.transform(corp_test)
print(bow_mat_test4.shape)

(2888, 18880)


In [102]:
# Separate Multinomial Naive Bayes model for the long-token-only features.
classifier1 = MultinomialNB()

In [103]:
# Fit on the training counts built with preprocess4 and the training labels.
classifier1.fit(bow_mat_train4, labels_train)

In [104]:
# Predicted condition label for every test abstract (long-token counts).
labels_pred4 = classifier1.predict(bow_mat_test4)

In [105]:
# Confusion matrix for the long-token model: rows are true classes, columns
# are predicted classes.
conf_mat4 = confusion_matrix(labels_test, labels_pred4)
print(conf_mat4)

[[483  38  37  17  58]
 [ 40 178   7  11  63]
 [ 29  14 236  32  74]
 [ 14   7  34 451 104]
 [168 136 131 195 331]]


In [106]:
# Per-class and overall metrics for the long-token Naive Bayes model.
performance(conf_mat4)

NEOPLASMS
Accuracy: 0.86, Precision: 0.66, Recall: 0.76
DIGESTIVE SYSTEM DISEASES
Accuracy: 0.89, Precision: 0.48, Recall: 0.60
NERVOUS SYSTEM DISEASES
Accuracy: 0.88, Precision: 0.53, Recall: 0.61
CARDIOVASCULAR DISEASES
Accuracy: 0.86, Precision: 0.64, Recall: 0.74
GENERAL PATHOLOGICAL CONDITIONS
Accuracy: 0.68, Precision: 0.53, Recall: 0.34
Total Accuracy: 0.58


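We see essentially no change in performance (total accuracy 0.58 vs. 0.59 before; only recall on the general class improves slightly, from 0.32 to 0.34), and no meaningful gain in efficiency since the classifier is already so fast.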

## Term Frequency-Inverse Document Frequency

In [113]:
# `preprocess4` already returns the final token list for a document, so it is
# passed as the analyzer: a callable analyzer receives the raw document and
# returns its features. The previous preprocessor=/tokenizer=tok wiring relied
# on a `tok` helper that is not defined anywhere in this notebook.
tfidf = TfidfVectorizer(analyzer=preprocess4)
tfidf_mat_train = tfidf.fit_transform(corp_train)



In [119]:
# Vectorize the test abstracts with the already-fitted TF-IDF vectorizer
# (transform only, no refit).
tfidf_mat_test = tfidf.transform(corp_test)
print(tfidf_mat_test.shape)

(2888, 18880)


In [117]:
# Separate Multinomial Naive Bayes model for the TF-IDF features.
classifier2 = MultinomialNB()

In [118]:
# Fit on the training TF-IDF matrix and the training labels.
classifier2.fit(tfidf_mat_train, labels_train)

In [121]:
# Predicted condition label for every test abstract (TF-IDF features).
# NOTE: this reuses the name `labels_pred`, overwriting the predictions of the
# term-count model from the earlier section.
labels_pred = classifier2.predict(tfidf_mat_test)

In [122]:
# Confusion matrix for the TF-IDF model: rows are true classes, columns are
# predicted classes.
# NOTE: this reuses the name `conf_mat`, overwriting the term-count model's
# confusion matrix from the earlier section.
conf_mat = confusion_matrix(labels_test, labels_pred)
print(conf_mat)

[[405   0   3  12 213]
 [ 31   5   1  10 252]
 [ 22   0  21  23 319]
 [  6   0   1 362 241]
 [116   0   5 139 701]]


In [123]:
# Overall accuracy computed by scikit-learn from the test labels and the
# current predictions.
accuracy = accuracy_score(labels_test, labels_pred)
print(accuracy)

0.5173130193905817


In [124]:
# Per-class and overall metrics for the TF-IDF Naive Bayes model.
performance(conf_mat)

NEOPLASMS
Accuracy: 0.86, Precision: 0.70, Recall: 0.64
DIGESTIVE SYSTEM DISEASES
Accuracy: 0.90, Precision: 1.00, Recall: 0.02
NERVOUS SYSTEM DISEASES
Accuracy: 0.87, Precision: 0.68, Recall: 0.05
CARDIOVASCULAR DISEASES
Accuracy: 0.85, Precision: 0.66, Recall: 0.59
GENERAL PATHOLOGICAL CONDITIONS
Accuracy: 0.56, Precision: 0.41, Recall: 0.73
Total Accuracy: 0.52


The classifier seems to be underpredicting the underrepresented classes; this needs to be fixed.

## to do

filter extremes from dictionary, fix classifier in tfidf, look at removing general category