In [None]:
from sklearn.metrics import *
import numpy as np
import torch
import torch.nn as nn
from collections import defaultdict
import math

top_50_diag = ['V34', '530', '820', '276', '532', '345', '560', '851', '780', '491', '965', '805', '482', '432', '396', '291', '519', '997', '197', '437', '415', '440', '433', '562', '577', '801', '162', '571', '430', '198', '584', '507', '578', '434', '486', '998', '441', '250', '427', '852', '431', 'V31', '996', '428', '518', '424', '410', '038', '414', 'V30']

top_50_dict = {}

for i in range(len(top_50_diag)):
	top_50_dict[top_50_diag[i]] = i

symptom_cnts = {}

Y = []
X = []
def y_vec(diag):
	vec = np.zeros((50,))
	idx = top_50_dict[diag]
	vec[idx] = 1
	return vec
	
symp_to_diags = defaultdict(set)
diag_to_symps = defaultdict(set)


In [None]:
"""
Used when run through google colab
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/Colab Notebooks/symptoms.txt'
"""
file_path = 'symptoms.txt'

Mounted at /content/drive


# New Section

In [None]:
with open(file_path, newline='') as input:
	lines = input.readlines()
	for line in lines:
		words = line.split("|")
		diag = words[0]
		symps = words[1:]
		s = []
		for symp in symps:
			if (symp=="Discharge, body substance"):
				continue
			symp = symp.lower()
			if symp in symptom_cnts:
				symptom_cnts[symp] += 1
			else:
				symptom_cnts[symp] = 1
			s.append(symp)
			symp_to_diags[symp].add(diag)
			diag_to_symps[diag].add(symp)
		if len(s) > 2:
			Y.append(y_vec(diag))
			X.append(s)
		
symp_tfidf = {}
for symptom in symp_to_diags.keys():
	tfidf = np.zeros((50,))
	for i in range(len(top_50_diag)):
		tf = len(diag_to_symps[top_50_diag[i]])
		idf = math.log(50/len(symp_to_diags[symptom]), 10)
		tfidf[i] = tf*idf
	symp_tfidf[symptom] = tfidf

filler = np.zeros((50,))
for i in range(len(X)):
	sample = X[i]
	new_sample = []
	n = len(sample)
	for j in range(50):
		if j < n:
			symp = sample[j]
			new_sample.append(symp_tfidf[symp])
		else:
			new_sample.append(np.zeros((50,)))
	X[i] = new_sample

In [None]:
split = int(len(X)*0.8)
X_train = torch.from_numpy(np.asarray(X[:split], dtype=np.float32))
Y_train = torch.from_numpy(np.asarray(Y[:split], dtype=np.float32))
X_test = torch.from_numpy(np.asarray(X[split:], dtype=np.float32))
Y_test = torch.from_numpy(np.asarray(Y[split:], dtype=np.float32))

In [None]:
class Net(nn.Module):
	def __init__(self):
		super(Net, self).__init__()
		self.lstm = torch.nn.LSTM(input_size=50, hidden_size=100, num_layers=2, dropout=0.8, bidirectional=True)
		self.sigmoid = torch.nn.Sigmoid()

	def forward(self, x):
		h0 = torch.randn(4, 100, dtype=torch.float32) 
		c0 = torch.randn(4, 100, dtype=torch.float32)
		output, (hidden_states, cell_states) = self.lstm(x, (h0,c0))
		output = torch.sum(output, dim=1)
		return self.sigmoid(output)
		
		
model = Net()

In [None]:
print(X_train.shape)
print(X_test.shape)
print(Y_test[0])


torch.Size([30788, 50, 50])
torch.Size([7698, 50, 50])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


In [None]:
criterion = None
optimizer = None

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
def classification_metrics(Y_score, Y_pred, Y_true):
    acc, auc, precision, recall, f1score = accuracy_score(Y_true, Y_pred), \
                                           roc_auc_score(Y_true, Y_score), \
                                           precision_score(Y_true, Y_pred), \
                                           recall_score(Y_true, Y_pred), \
                                           f1_score(Y_true, Y_pred)
    return acc, auc, precision, recall, f1score

In [None]:
def evaluate(model, X, Y):
    model.eval()
    all_y_true = torch.LongTensor()
    all_y_pred = torch.LongTensor()
    all_y_score = torch.FloatTensor()
    for i in range(len(X)):
        # pass the input through the model
        y_hat = model(X[i])
        y_pred = (y_hat > 0.2).int()
        all_y_true = torch.cat((all_y_true, Y[i].to('cpu')), dim=0)
        all_y_pred = torch.cat((all_y_pred,  y_pred.to('cpu')), dim=0)
        all_y_score = torch.cat((all_y_score,  y_hat.to('cpu')), dim=0)
        
    acc, auc, precision, recall, f1 = classification_metrics(all_y_score.detach().numpy(), 
                                                             all_y_pred.detach().numpy(), 
                                                             all_y_true.detach().numpy())
    print(f"acc: {acc:.3f}, auc: {auc:.3f}, precision: {precision:.3f}, recall: {recall:.3f}, f1: {f1:.3f}")
    return

In [None]:
print("model perfomance before training:")
evaluate(model, X_train, Y_train)
evaluate(model, X_test, Y_test)

model perfomance before training:
acc: 0.158, auc: 0.546, precision: 0.020, recall: 0.867, f1: 0.040
acc: 0.181, auc: 0.542, precision: 0.021, recall: 0.875, f1: 0.041


In [None]:
n_epochs = 5

# prep model for training
model.train()

for epoch in range(n_epochs):
    
    train_loss = 0
    for i in range(len(X_train)):
        optimizer.zero_grad()
        y_hat = model(X_train[i])
        loss = criterion(y_hat, Y_train[i])
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        
    train_loss = train_loss / len(X_train)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))
    evaluate(model, X_train, Y_train)
    evaluate(model, X_test, Y_test)

Epoch: 1 	Training Loss: 0.092390
acc: 0.980, auc: 0.731, precision: 0.134, recall: 0.001, f1: 0.002
acc: 0.980, auc: 0.722, precision: 0.000, recall: 0.000, f1: 0.000
Epoch: 2 	Training Loss: 0.087664
acc: 0.978, auc: 0.731, precision: 0.264, recall: 0.049, f1: 0.082
acc: 0.978, auc: 0.728, precision: 0.226, recall: 0.035, f1: 0.061
Epoch: 3 	Training Loss: 0.085584
acc: 0.978, auc: 0.717, precision: 0.246, recall: 0.056, f1: 0.092
acc: 0.978, auc: 0.745, precision: 0.204, recall: 0.040, f1: 0.066
Epoch: 4 	Training Loss: 0.085041
acc: 0.978, auc: 0.712, precision: 0.254, recall: 0.059, f1: 0.095
acc: 0.978, auc: 0.742, precision: 0.216, recall: 0.042, f1: 0.071
Epoch: 5 	Training Loss: 0.084946
acc: 0.977, auc: 0.723, precision: 0.247, recall: 0.067, f1: 0.106
acc: 0.977, auc: 0.747, precision: 0.215, recall: 0.052, f1: 0.084


In [None]:
#saved_model_path = '/content/drive/MyDrive/Colab Notebooks/TF-IDF.pth'
saved_model_path = 'TF-IDF.pth'
torch.save(model.state_dict(), saved_model_path)