In [25]:
import nltk
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import os
import torch
import re
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer ,LancasterStemmer
import string

from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, multilabel_confusion_matrix

In [2]:
base_path = "pubmed-rct/PubMed_200k_RCT/"


def _read_split(file_name):
    """Return every line of `file_name` (located under `base_path`) as a list of strings."""
    with open(os.path.join(base_path, file_name), "r") as handle:
        return handle.readlines()


# One raw list of lines per split; line terminators are kept by readlines().
train_data = _read_split("train.txt")
dev_data = _read_split("dev.txt")
test_data = _read_split("test.txt")

In [3]:
def _is_sentence_line(line):
    """True for lines that are neither a '###' header line nor a line starting with a newline."""
    return not (line.startswith("###") or line.startswith("\n"))


# Keep only the lines that pass the predicate, for each split.
train_data_cleaned = [line for line in train_data if _is_sentence_line(line)]
dev_data_cleaned = [line for line in dev_data if _is_sentence_line(line)]
test_data_cleaned = [line for line in test_data if _is_sentence_line(line)]

# Rebind the working names to the filtered lists.
train_data, dev_data, test_data = (
    train_data_cleaned,
    dev_data_cleaned,
    test_data_cleaned,
)

print(len(train_data))

2211861


In [4]:
def _split_label_sentence(lines):
    """Split each line at its first tab and return the rows as a 2-column byte-string array.

    Splitting with ``maxsplit=1`` guarantees at most two fields per row, so a
    sentence that itself contains a tab character cannot produce a ragged row.
    """
    rows = [line.split("\t", 1) for line in lines]
    return np.asarray(rows, 'S')


train_data = _split_label_sentence(train_data)
dev_data = _split_label_sentence(dev_data)
test_data = _split_label_sentence(test_data)

# Transposing the (n_rows, 2) arrays yields the first column (bound to *_Y)
# and the second column (bound to *_X) of each split.
train_Y , train_X = train_data.T
dev_Y , dev_X = dev_data.T
test_Y , test_X = test_data.T

In [5]:
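# Disabled sanity check: reports any row of train_data whose length is not 2
# and prints the smallest and largest row length seen.
# max_range = 0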
# min_range = 10
# for x  in train_data:
#     if(len(x) != 2):
#         print(x)
#     max_range = max(max_range , len(x))
#     min_range = min(min_range , len(x))

# print(min_range, max_range)

In [6]:
# Show the distinct label values before encoding.
unique_labels = np.unique(train_Y)
print("The labels in the dataset are :",unique_labels)

# The encoder is given a single-column 2-D array, so reshape the flat label vector.
train_Y = train_Y.reshape(-1, 1)

# Fit the encoder on the training labels, then encode those same labels.
ohe = OneHotEncoder()
train_Y = ohe.fit(train_Y).transform(train_Y)
print(train_Y.shape)

The labels in the dataset are : [b'BACKGROUND' b'CONCLUSIONS' b'METHODS' b'OBJECTIVE' b'RESULTS']
(2211861, 5)


In [7]:
# Turn the NumPy array of training sentences into a plain Python list
# (the preprocessing cell below overwrites its entries one by one).
train_X = train_X.tolist()

In [8]:
# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
print(train_X[:2])
train_X_split = []
lancaster = LancasterStemmer()

# Rewrite every training sentence in place: decode, mask digits, tokenize,
# drop stop words / punctuation tokens, stem, and re-join with single spaces.
for i, raw_sentence in tqdm(enumerate(train_X), total=len(train_X)):
    # Every run of digits becomes a single '#'.
    train_X[i] = re.sub(r'\d+', '#', raw_sentence.decode("utf-8"))
    word_tokens = word_tokenize(train_X[i])

    filtered_sentence = []
    for w in word_tokens:
        lowered = w.lower()
        # Skip stop words (compared in lower case) and tokens found in string.punctuation.
        if lowered in stop_words or w in string.punctuation:
            continue
        filtered_sentence.append(lancaster.stem(lowered))

    train_X[i] = " ".join(filtered_sentence)

print(train_X[:2])

[b'The emergence of HIV as a chronic condition means that people living with HIV are required to take more responsibility for the self-management of their condition , including making physical , emotional and social adjustments .\n', b'This paper describes the design and evaluation of Positive Outlook , an online program aiming to enhance the self-management skills of gay men living with HIV .\n']


100%|██████████| 2211861/2211861 [09:48<00:00, 3760.75it/s]

['emerg hiv chronic condit mean peopl liv hiv requir tak respons self-management condit includ mak phys emot soc adjust', 'pap describ design evalu posit outlook onlin program aim enh self-management skil gay men liv hiv']





In [9]:
# Spot-check the first five preprocessed training sentences.
print(train_X[:5])

['emerg hiv chronic condit mean peopl liv hiv requir tak respons self-management condit includ mak phys emot soc adjust', 'pap describ design evalu posit outlook onlin program aim enh self-management skil gay men liv hiv', 'study design random control tri men liv hiv austral assign eith interv group us car control group', 'interv group particip onlin group program posit outlook', 'program bas self-efficacy the us self-management approach enh skil confid abl man psychosoc issu assocy hiv dai lif']


In [10]:
# Fit the TF-IDF vocabulary on the preprocessed training sentences and replace
# the list of strings with its TF-IDF matrix. The fitted `vectorizer` is kept
# so that other splits can be transformed with the same vocabulary.
# (The previous bare `vectorizer.get_feature_names_out()` call was removed:
# its result was discarded, so it only cost time and memory.)
vectorizer = TfidfVectorizer()
train_X = vectorizer.fit_transform(train_X)
print(train_X.shape)

(2211861, 112026)


In [44]:
class SparseDataset(Dataset):
    """Dataset that densifies one row of a (sparse) feature/label matrix per item.

    Args:
        mat_csc: 2-D feature matrix with one sample per row. SciPy sparse
            matrices of any format are accepted and converted to CSR once,
            because items are fetched by row; dense 2-D arrays also work.
        label: 2-D label matrix with one row per sample, sparse or dense.
        device: device string (or ``torch.device``) that returned tensors are
            moved to. Defaults to ``'cpu'``.

    Each item is a ``(data, label)`` pair of flat ``float32`` tensors.
    """

    def __init__(self, mat_csc, label, device='cpu'):
        self.dim = mat_csc.shape
        self.device = torch.device(device)

        # Row slicing is what __getitem__ does, so normalise sparse inputs to
        # CSR up front (a no-op when the matrix is already CSR).
        self.data = mat_csc.tocsr() if hasattr(mat_csc, "tocsr") else mat_csc
        self.label = label.tocsr() if hasattr(label, "tocsr") else label

    def __len__(self):
        """Number of samples (rows of the feature matrix)."""
        return self.dim[0]

    def _row_to_tensor(self, row):
        """Densify a single row (sparse or dense) into a flat float32 tensor on `self.device`."""
        if hasattr(row, "toarray"):
            row = row.toarray()
        # torch.tensor always copies, so the result never aliases the source matrix.
        return torch.tensor(row, dtype=torch.float32).flatten().to(self.device)

    def __getitem__(self, idx):
        """Return the `idx`-th sample as a ``(data, label)`` pair of 1-D float tensors."""
        data = self._row_to_tensor(self.data[idx])
        label = self._row_to_tensor(self.label[idx])
        return data , label

In [56]:
# Widths of the fully connected stack: input = number of TF-IDF features,
# followed by five hidden layers; the final projection has 5 outputs.
layer_widths = [train_X.shape[1], 2048, 1024, 512, 256, 32]

layers = []
for fan_in, fan_out in zip(layer_widths[:-1], layer_widths[1:]):
    layers.append(torch.nn.Linear(fan_in, fan_out))
    layers.append(torch.nn.ReLU())
# Output layer; no Softmax module is appended, the raw outputs go to the loss.
layers.append(torch.nn.Linear(layer_widths[-1], 5))

model = torch.nn.Sequential(*layers)

# Training samples are densified row by row and placed on the GPU by the dataset.
dataset = SparseDataset(mat_csc=train_X, label=train_Y, device="cuda")
dataloader = DataLoader(dataset, batch_size=256)

model.to("cuda")

# Summed (not averaged) cross-entropy per batch, optimised with RMSprop.
loss_fn = torch.nn.CrossEntropyLoss(reduction='sum')
learning_rate = 1e-3
optimizer = torch.optim.RMSprop(params=model.parameters(), lr=learning_rate)


In [58]:
# Number of batches in each running-loss window shown on the progress bar.
LOG_EVERY = 50

for e in range(5):
    print("Starting Epoch %d"%e)
    pbar = tqdm(dataloader)
    window_loss = 0.0      # sum of batch losses in the current window
    window_batches = 0     # number of batches accumulated in the current window
    for x,y in pbar:
        # Forward pass and loss for this batch.
        y_pred = model(x)
        loss = loss_fn(y_pred,y)

        # Standard step: clear old gradients, backpropagate, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        window_loss += loss.item()
        window_batches += 1

        # Report the mean batch loss once a full window has been collected.
        # The divisor is the actual number of batches in the window; the old
        # `ctr % 49` check fired after 1 batch and then every 49 batches while
        # always dividing by 50, so the displayed value was not a true average.
        if window_batches == LOG_EVERY:
            pbar.set_postfix({'Average loss(last 50 batches)': window_loss / window_batches})
            window_loss = 0.0
            window_batches = 0


Starting Epoch 0


100%|██████████| 8641/8641 [26:14<00:00,  5.49it/s, Average loss(last 50 batches) =132]


Starting Epoch 1


100%|██████████| 8641/8641 [26:29<00:00,  5.44it/s, Average loss(last 50 batches) =97.6]


Starting Epoch 2


100%|██████████| 8641/8641 [26:29<00:00,  5.44it/s, Average loss(last 50 batches) =67.2]


Starting Epoch 3


100%|██████████| 8641/8641 [26:38<00:00,  5.41it/s, Average loss(last 50 batches) =43.6]


Starting Epoch 4


100%|██████████| 8641/8641 [26:40<00:00,  5.40it/s, Average loss(last 50 batches) =31.3]


In [59]:
# Persist the learned parameters. `state_dict` must be *called*: passing the
# bound method itself would serialise the method object instead of the
# parameter dictionary.
torch.save(model.state_dict(), "first_model.pt")

In [63]:
dev_X = dev_X.tolist()
print(dev_X[:2])
dev_X_split = []
lancaster = LancasterStemmer()

# Apply the same text normalisation steps to every dev sentence, in place.
for i in tqdm(range(len(dev_X))):
    decoded = dev_X[i].decode("utf-8")
    # Every run of digits becomes a single '#'.
    dev_X[i] = re.sub(r'\d+', '#', decoded)
    word_tokens = word_tokenize(dev_X[i])
    # Lower-cased tokens that are neither stop words nor found in string.punctuation.
    kept_tokens = (
        w.lower()
        for w in word_tokens
        if not (w.lower() in stop_words or w in string.punctuation)
    )
    dev_X[i] = " ".join(lancaster.stem(token) for token in kept_tokens)

print(dev_X[:2])

# Reuse the already-fitted vectorizer and label encoder; nothing is re-fitted here.
dev_X = vectorizer.transform(dev_X)
dev_Y = ohe.transform(dev_Y.reshape(-1, 1))

[b'Adrenergic activation is thought to be an important determinant of outcome in subjects with chronic heart failure ( CHF ) , but baseline or serial changes in adrenergic activity have not been previously investigated in a large patient sample treated with a powerful antiadrenergic agent .\n', b'Systemic venous norepinephrine was measured at baseline , 3 months , and 12 months in the beta-Blocker Evaluation of Survival Trial ( BEST ) , which compared placebo treatment with the beta-blocker/sympatholytic agent bucindolol .\n']


100%|██████████| 28932/28932 [00:07<00:00, 3742.34it/s]


['adrenerg act thought import determin outcom subject chronic heart fail chf baselin ser chang adrenerg act prevy investig larg paty sampl tre pow antiadrenerg ag', 'system ven norepinephrin meas baselin month month beta-blocker evalu surv tri best comp placebo tre beta-blocker/sympatholytic ag bucindolol']


In [94]:
# Dev samples stay on the CPU and are served in order, 256 at a time.
dev_dataset = SparseDataset(mat_csc=dev_X, label=dev_Y, device="cpu")
dev_dataloader = DataLoader(dev_dataset, batch_size=256)

In [95]:
# Collect the model outputs for the whole dev set on the CPU.
results = []
model.to("cpu")
model.eval()
# Inference only: disabling autograd avoids keeping a computation graph alive
# for every batch stored in `results`.
with torch.no_grad():
    for x,y in tqdm(dev_dataloader):
        y_pred = model(x)
        results.append(y_pred)
# Stack the per-batch outputs into one (n_samples, n_outputs) tensor.
results = torch.vstack(results)

100%|██████████| 114/114 [01:01<00:00,  1.85it/s]


In [126]:
# Index of the highest-scoring output for every row of `results`.
predicted_class = torch.argmax(results, dim=1)

# (row, predicted column) pairs, kept under the original name. The row count
# is taken from `results` instead of being hard-coded.
max_idx = torch.stack(
    [torch.arange(results.shape[0], device=results.device), predicted_class]
).T

# One-hot float matrix with the same shape as `results`, built in a single
# vectorised call instead of a per-element Python loop.
one_hot = torch.nn.functional.one_hot(
    predicted_class, num_classes=results.shape[1]
).float()

In [121]:
# Macro-averaged F1 followed by accuracy, both on the one-hot dev labels vs. one-hot predictions.
print(
    f1_score(y_true=dev_Y, y_pred=one_hot.numpy(), average="macro")
)
print(
    accuracy_score(y_true=dev_Y, y_pred=one_hot.numpy())
)

0.39119415240920735
0.41255357389741465


In [127]:

# One 2x2 confusion matrix per output column (see the printed result below).
print(
    multilabel_confusion_matrix(y_true=dev_Y, y_pred=one_hot.numpy())
)

[[[22847  3510]
  [ 1499  1076]]

 [[18186  6350]
  [ 1355  3041]]

 [[18890   483]
  [ 7207  2352]]

 [[21338  5169]
  [ 1029  1396]]

 [[17471  1484]
  [ 5906  4071]]]
