## imports/functions/reading in

In [None]:
import numpy as np
import sklearn
import pandas as pd
import nltk
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def preprocess(data,n):
    data_proc = []
    for doc in data['medical_abstract']:
        #lowercases document
        doc = doc.lower()
        #removes any non-letter characters
        doc = re.sub(r'\b[^a-zA-Z]+\b', ' ', doc)
        #tokenize
        toks = nltk.word_tokenize(doc)
        #remove tokens of lenth <= n (can be varied)
        toks = [tok for tok in toks if len(tok) > n]
        #remove stopwords
        toks = [tok for tok in toks if tok not in en_stop]
        #lemmatize
        toks = [WordNetLemmatizer().lemmatize(tok) for tok in toks]
        data_proc.append(toks)
    return data_proc
def dummy_preprocess(data):
  return data
def performance(conf_mat):
  TP = 0
  for i in range(conf_mat.shape[0]):
    Class = df_labels['condition_name'][i]
    tp = conf_mat[i, i]
    fp = np.sum(conf_mat[:, i]) - tp
    fn = np.sum(conf_mat[i, :]) - tp
    tn = np.sum(conf_mat) - (tp + fp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    TP += tp
    print(Class)
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")
  accuracy_tot = TP / np.sum(conf_mat)
  print(f"Total Accuracy: {accuracy_tot:.2f}")
  return

In [None]:
df_train = pd.read_csv('medical_tc_train.csv')
df_test = pd.read_csv('medical_tc_test.csv')
df_labels = pd.read_csv('medical_tc_labels.csv')

In [None]:
print(df_train.shape)
print(df_test.shape)

(11550, 2)
(2888, 2)


In [None]:
df = pd.concat([df_train,df_test],axis = 0)
df.reset_index(drop=True, inplace=True)

## BoW with Naive Bayes

In [None]:
docs_proc1 = preprocess(df,1)

In [None]:
docs_clean1 = []
for doc in docs_proc:
    doc_clean = ' '.join(doc)
    docs_clean1.append(doc_clean)
print(docs_clean1[0])
len(docs_clean1)

tissue change around loose prosthesis canine model investigate effect antiinflammatory agent aseptically loosened prosthesis provided mean investigating vivo vitro activity cell associated loosening process seven dog cell isolated maintained culture sufficient period time biologic activity could studied well effect different agent added cell vivo vitro biologic response determined interleukin prostaglandin e2 activity paralleled roentgenographic appearance loosening technetium image observation made time revision surgery correlation clinical roentgenographic histologic biochemical loosening indicates canine model suitable investigating mechanism prosthetic failure canine model permit study possible nonsurgical therapeutic intervention ultimate hope stopping slowing loosening process


14438

In [None]:
count1 = CountVectorizer()

bow_mat1 = count1.fit_transform(docs_clean1)
print(bow_mat1.shape)
type(bow_mat1)

(14438, 33092)


scipy.sparse._csr.csr_matrix

In [None]:
bow_mat1_train = bow_mat1[:11550]
bow_mat1_test = bow_mat1[11550:]

In [None]:
print(bow_mat1_train.shape)
print(bow_mat1_test.shape)

(11550, 33092)
(2888, 33092)


In [None]:
labels_train = df_train['condition_label']
labels_test = df_test['condition_label']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

In [None]:
classifier1 = MultinomialNB()

In [None]:
classifier1.fit(bow_mat1_train, labels_train)

In [None]:
labels_pred1 = classifier1.predict(bow_mat1_test)

In [None]:
conf_mat1 = confusion_matrix(labels_test, labels_pred1)
print(conf_mat1)

[[487  38  37  14  57]
 [ 37 182   7  10  63]
 [ 30   8 244  31  72]
 [ 13  12  34 466  85]
 [165 153 143 209 291]]


In [None]:
accuracy1 = accuracy_score(labels_test, labels_pred1)
print(accuracy1)

0.5782548476454293


seems to be a decent classifier for the classes 1-4 but 5 causing some trouble

In [None]:
performance(conf_mat1)

neoplasms
Accuracy: 0.86, Precision: 0.67, Recall: 0.77
digestive system diseases
Accuracy: 0.89, Precision: 0.46, Recall: 0.61
nervous system diseases
Accuracy: 0.87, Precision: 0.52, Recall: 0.63
cardiovascular diseases
Accuracy: 0.86, Precision: 0.64, Recall: 0.76
general pathological conditions
Accuracy: 0.67, Precision: 0.51, Recall: 0.30
Total Accuracy: 0.58


In [None]:
print(df_labels)

   condition_label                   condition_name
0                1                        neoplasms
1                2        digestive system diseases
2                3          nervous system diseases
3                4          cardiovascular diseases
4                5  general pathological conditions


to be expected since general pathological conditions is a more general class of abstracts

## BoW, Naive Bayes with length 4 (or less) tokens removed

lets see if removing short words reduces noise

In [None]:
docs_proc4 = []
for doc in docs_proc1:
  doc = [tok for tok in doc if len(tok) > 4]
  docs_proc4.append(doc)
print(docs_proc4[0])

['tissue', 'change', 'around', 'loose', 'prosthesis', 'canine', 'model', 'investigate', 'effect', 'antiinflammatory', 'agent', 'aseptically', 'loosened', 'prosthesis', 'provided', 'investigating', 'vitro', 'activity', 'associated', 'loosening', 'process', 'seven', 'isolated', 'maintained', 'culture', 'sufficient', 'period', 'biologic', 'activity', 'could', 'studied', 'effect', 'different', 'agent', 'added', 'vitro', 'biologic', 'response', 'determined', 'interleukin', 'prostaglandin', 'activity', 'paralleled', 'roentgenographic', 'appearance', 'loosening', 'technetium', 'image', 'observation', 'revision', 'surgery', 'correlation', 'clinical', 'roentgenographic', 'histologic', 'biochemical', 'loosening', 'indicates', 'canine', 'model', 'suitable', 'investigating', 'mechanism', 'prosthetic', 'failure', 'canine', 'model', 'permit', 'study', 'possible', 'nonsurgical', 'therapeutic', 'intervention', 'ultimate', 'stopping', 'slowing', 'loosening', 'process']


In [None]:
docs_clean4 = []
for doc in docs_proc4:
    doc_clean = ' '.join(doc)
    docs_clean4.append(doc_clean)

In [None]:
count4 = CountVectorizer()

bow_mat4 = count4.fit_transform(docs_clean4)
print(bow_mat4.shape)

(14438, 27794)


In [None]:
bow_mat4_train = bow_mat4[:11550]
bow_mat4_test = bow_mat4[11550:]

In [None]:
classifier4 = MultinomialNB()

In [None]:
classifier4.fit(bow_mat4_train, labels_train)

In [None]:
labels_pred4 = classifier4.predict(bow_mat4_test)

In [None]:
conf_mat4 = confusion_matrix(labels_test, labels_pred4)
print(conf_mat4)

[[498  36  35  13  51]
 [ 40 182   5   7  65]
 [ 30  11 245  33  66]
 [ 15  13  32 467  83]
 [166 160 136 207 292]]


In [None]:
accuracy4 = accuracy_score(labels_test, labels_pred4)
print(accuracy4)

0.5831024930747922


In [None]:
performance(conf_mat4)

neoplasms
Accuracy: 0.87, Precision: 0.66, Recall: 0.79
digestive system diseases
Accuracy: 0.88, Precision: 0.45, Recall: 0.61
nervous system diseases
Accuracy: 0.88, Precision: 0.54, Recall: 0.64
cardiovascular diseases
Accuracy: 0.86, Precision: 0.64, Recall: 0.77
general pathological conditions
Accuracy: 0.68, Precision: 0.52, Recall: 0.30


## tfidf implementation

lets implement tf-idf trandformation on the bow matrix

In [None]:
vectorizer = TfidfVectorizer()
tfidf_mat = vectorizer.fit_transform(docs_clean1)

In [None]:
tfidf_mat_train = tfidf_mat[:11550]
tfidf_mat_test = tfidf_mat[11550:]

In [None]:
classifier = MultinomialNB()

In [None]:
classifier.fit(tfidf_mat_train, labels_train)

In [None]:
labels_pred = classifier.predict(tfidf_mat_test)

In [None]:
conf_mat = confusion_matrix(labels_test, labels_pred)
print(conf_mat)

[[409   0   3  11 210]
 [ 29   5   1   8 256]
 [ 22   0  21  24 318]
 [  5   0   2 356 247]
 [117   0   6 135 703]]


In [None]:
accuracy = accuracy_score(labels_test, labels_pred)
print(accuracy)

0.5114265927977839


In [None]:
performance(conf_mat)

neoplasms
Accuracy: 0.87, Precision: 0.72, Recall: 0.64
digestive system diseases
Accuracy: 0.90, Precision: 1.00, Recall: 0.00
nervous system diseases
Accuracy: 0.87, Precision: 0.50, Recall: 0.01
cardiovascular diseases
Accuracy: 0.85, Precision: 0.67, Recall: 0.56
general pathological conditions
Accuracy: 0.54, Precision: 0.40, Recall: 0.75
Total Accuracy: 0.51


seem to be underprediciting underrepresented classes need to fix this