In [3]:
import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

In [4]:
df = pd.read_csv("/home/careinfolab/Dr_Luo/Rohan/ICD_Codes/Dataset/icd10-codes-and-descriptions/Codes&Desc_cleaned.csv")

In [5]:
df.describe()

Unnamed: 0,ICD_Code,Description
count,71704,71704
unique,71704,71704
top,A00.0,cholera due to vibrio cholerae 01 biovar cholerae
freq,1,1


In [6]:
df.head(10)

Unnamed: 0,ICD_Code,Description
0,A00.0,cholera due to vibrio cholerae 01 biovar cholerae
1,A00.1,cholera due to vibrio cholerae 01 biovar eltor
2,A00.9,cholera unspecified
3,A01.00,typhoid fever unspecified typhoid fever
4,A01.01,typhoid meningitis typhoid fever
5,A01.02,typhoid fever with heart involvement typhoid f...
6,A01.03,typhoid pneumonia typhoid fever
7,A01.04,typhoid arthritis typhoid fever
8,A01.05,typhoid osteomyelitis typhoid fever
9,A01.09,typhoid fever with other complications typhoid...


In [7]:
corpus = (df['ICD_Code'] + " " + df['Description']).tolist()

with open("corpus.txt", "w", encoding="utf-8") as f:
    for line in corpus:
        f.write(line + "\n")


In [8]:
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(special_tokens=["[PAD]", "[UNK]", "[SOS]", "[EOS]"])

tokenizer.train(["corpus.txt"], trainer)

tokenizer.post_processor = TemplateProcessing(
    single="[SOS] $A [EOS]",
    pair="[SOS] $A [EOS] [SOS] $B [EOS]",
    special_tokens=[
        ("[SOS]", tokenizer.token_to_id("[SOS]")),
        ("[EOS]", tokenizer.token_to_id("[EOS]")),
    ],
)






In [9]:
def tokenize_text(text):
    return tokenizer.encode(text)

df['code_tokens'] = df['ICD_Code'].apply(lambda x: tokenize_text(x).ids)
df['desc_tokens'] = df['Description'].apply(lambda x: tokenize_text(x).ids)


In [10]:
df.iloc[0]

ICD_Code                                                   A00.0
Description    cholera due to vibrio cholerae 01 biovar cholerae
code_tokens                                  [2, 13998, 4, 5, 3]
desc_tokens    [2, 15427, 302, 136, 12988, 15428, 353, 15921,...
Name: 0, dtype: object

In [11]:
df.iloc[0].to_list()[3]

[2, 15427, 302, 136, 12988, 15428, 353, 15921, 15428, 3]

In [42]:
from torch.nn.utils.rnn import pad_sequence
import torch

def pad_batch(batch):
    tensor_batch = [torch.tensor(seq) for seq in batch]
    return pad_sequence(tensor_batch, batch_first=True, padding_value=tokenizer.token_to_id("[PAD]"))


In [43]:
code_padded = pad_batch(df['code_tokens'].tolist())
desc_padded = pad_batch(df['desc_tokens'].tolist())

In [44]:
print(f"The total count of ICD Codes={len(code_padded)}")
print(f"The total count of Code Descriptions={len(desc_padded)}")

The total count of ICD Codes=71704
The total count of Code Descriptions=71704


In [45]:
print(f"The tokens in each ICD Codes={len(code_padded[3])}")
print(f"The tokens in each Code Descriptions={len(desc_padded[3])}")

The tokens in each ICD Codes=5
The tokens in each Code Descriptions=60


In [46]:
df.iloc[6]

ICD_Code                                A01.03
Description    typhoid pneumonia typhoid fever
code_tokens              [2, 8278, 4, 1209, 3]
desc_tokens     [2, 7741, 2821, 7741, 3267, 3]
Name: 6, dtype: object

In [47]:
desc_padded[6]

tensor([   2, 7741, 2821, 7741, 3267,    3,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [50]:
code_padded[6]

tensor([   2, 8278,    4, 1209,    3])

In [16]:
decoded1 = tokenizer.decode(desc_padded[6].tolist())
decoded2 = tokenizer.decode(code_padded[6].tolist())
print(decoded1)
print(decoded2)

typhoid pneumonia typhoid fever
A01 . 03


In [17]:
tokenizer.save("bpe_tokenizer.json")

from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("bpe_tokenizer.json")


In [18]:
df

Unnamed: 0,ICD_Code,Description,code_tokens,desc_tokens
0,A00.0,cholera due to vibrio cholerae 01 biovar cholerae,"[2, 13998, 4, 5, 3]","[2, 15427, 302, 136, 12988, 15428, 353, 15921,..."
1,A00.1,cholera due to vibrio cholerae 01 biovar eltor,"[2, 13998, 4, 6, 3]","[2, 15427, 302, 136, 12988, 15428, 353, 15921,..."
2,A00.9,cholera unspecified,"[2, 13998, 4, 14, 3]","[2, 15427, 97, 3]"
3,A01.00,typhoid fever unspecified typhoid fever,"[2, 8278, 4, 606, 3]","[2, 7741, 3267, 97, 7741, 3267, 3]"
4,A01.01,typhoid meningitis typhoid fever,"[2, 8278, 4, 353, 3]","[2, 7741, 4503, 7741, 3267, 3]"
...,...,...,...,...
71699,Z99.12,encounter for respirator ventilator dependence...,"[2, 10685, 4, 276, 3]","[2, 103, 139, 9590, 10800, 1474, 1501, 1181, 2..."
71700,Z99.2,dependence on renal dialysis dependence on ren...,"[2, 10685, 4, 7, 3]","[2, 1474, 70, 1312, 2676, 1474, 70, 1312, 2676..."
71701,Z99.3,dependence on wheelchair dependence on wheelchair,"[2, 10685, 4, 8, 3]","[2, 1474, 70, 6083, 1474, 70, 6083, 3]"
71702,Z99.81,dependence on supplemental oxygen dependence o...,"[2, 10685, 4, 636, 3]","[2, 1474, 70, 21645, 6971, 1474, 70, 119, 1547..."


In [19]:
df1 = df.copy()

In [20]:
df1

Unnamed: 0,ICD_Code,Description,code_tokens,desc_tokens
0,A00.0,cholera due to vibrio cholerae 01 biovar cholerae,"[2, 13998, 4, 5, 3]","[2, 15427, 302, 136, 12988, 15428, 353, 15921,..."
1,A00.1,cholera due to vibrio cholerae 01 biovar eltor,"[2, 13998, 4, 6, 3]","[2, 15427, 302, 136, 12988, 15428, 353, 15921,..."
2,A00.9,cholera unspecified,"[2, 13998, 4, 14, 3]","[2, 15427, 97, 3]"
3,A01.00,typhoid fever unspecified typhoid fever,"[2, 8278, 4, 606, 3]","[2, 7741, 3267, 97, 7741, 3267, 3]"
4,A01.01,typhoid meningitis typhoid fever,"[2, 8278, 4, 353, 3]","[2, 7741, 4503, 7741, 3267, 3]"
...,...,...,...,...
71699,Z99.12,encounter for respirator ventilator dependence...,"[2, 10685, 4, 276, 3]","[2, 103, 139, 9590, 10800, 1474, 1501, 1181, 2..."
71700,Z99.2,dependence on renal dialysis dependence on ren...,"[2, 10685, 4, 7, 3]","[2, 1474, 70, 1312, 2676, 1474, 70, 1312, 2676..."
71701,Z99.3,dependence on wheelchair dependence on wheelchair,"[2, 10685, 4, 8, 3]","[2, 1474, 70, 6083, 1474, 70, 6083, 3]"
71702,Z99.81,dependence on supplemental oxygen dependence o...,"[2, 10685, 4, 636, 3]","[2, 1474, 70, 21645, 6971, 1474, 70, 119, 1547..."


In [21]:
df1['code_padded'] = [list(row) for row in code_padded]
df1['desc_padded'] = [list(row) for row in desc_padded]
df1 = df1.drop(columns=['ICD_Code','Description','code_tokens','desc_tokens'])

In [17]:
df.head(10)

Unnamed: 0,ICD_Code,Description,code_tokens,desc_tokens
0,A00.0,cholera due to vibrio cholerae 01 biovar cholerae,"[2, 13998, 4, 5, 3]","[2, 15427, 302, 136, 12988, 15428, 353, 15921,..."
1,A00.1,cholera due to vibrio cholerae 01 biovar eltor,"[2, 13998, 4, 6, 3]","[2, 15427, 302, 136, 12988, 15428, 353, 15921,..."
2,A00.9,cholera unspecified,"[2, 13998, 4, 14, 3]","[2, 15427, 97, 3]"
3,A01.00,typhoid fever unspecified typhoid fever,"[2, 8278, 4, 606, 3]","[2, 7741, 3267, 97, 7741, 3267, 3]"
4,A01.01,typhoid meningitis typhoid fever,"[2, 8278, 4, 353, 3]","[2, 7741, 4503, 7741, 3267, 3]"
5,A01.02,typhoid fever with heart involvement typhoid f...,"[2, 8278, 4, 301, 3]","[2, 7741, 3267, 110, 1184, 1806, 7741, 3267, 3]"
6,A01.03,typhoid pneumonia typhoid fever,"[2, 8278, 4, 1209, 3]","[2, 7741, 2821, 7741, 3267, 3]"
7,A01.04,typhoid arthritis typhoid fever,"[2, 8278, 4, 1171, 3]","[2, 7741, 729, 7741, 3267, 3]"
8,A01.05,typhoid osteomyelitis typhoid fever,"[2, 8278, 4, 991, 3]","[2, 7741, 1378, 7741, 3267, 3]"
9,A01.09,typhoid fever with other complications typhoid...,"[2, 8278, 4, 501, 3]","[2, 7741, 3267, 110, 119, 932, 7741, 3267, 3]"


In [25]:
type(df1['code_padded'].iloc[0])

list

In [97]:
df1.to_csv("/home/careinfolab/Dr_Luo/Rohan/ICD_Codes/Dataset/icd10-codes-and-descriptions/Tokens.csv")

In [3]:
import torch

In [44]:
rnn = torch.load('/home/careinfolab/Dr_Luo/Rohan/ICD_Codes/Results/LSTM/checkpoints/best_model.pt')

  rnn = torch.load('/home/careinfolab/Dr_Luo/Rohan/ICD_Codes/Results/LSTM/checkpoints/best_model.pt')


In [45]:
print(rnn)

SeqModel(
  (embedding): Embedding(25932, 174)
  (rnn): LSTM(
    (lstm): LSTM(174, 63, batch_first=True)
    (h2o): Linear(in_features=63, out_features=25248, bias=True)
  )
)


In [46]:
input_tensor = torch.tensor([   2, 7741, 2821, 7741, 3267,    3,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [47]:
input_tensor.shape

torch.Size([60])

In [48]:
input_tensor = input_tensor.unsqueeze(0).to('cuda')  # Shape becomes [1, 60]

In [None]:
input_tensor = input_tensor.to('cuda')

batch_size = input_tensor.size(0)

hidden = rnn.rnn.init_zero_hidden(batch_size)

rnn.eval()
with torch.no_grad():
    preds = rnn(input_tensor, hidden)

print(preds.shape)


torch.Size([1, 60, 25248])


In [50]:
preds.argmax(dim=2)

tensor([[   2, 8278,    4,    6,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3]],
       device='cuda:0')

In [51]:
pred_logits = preds[:, :5, :]
predictions = pred_logits.argmax(dim=2)

In [52]:
pred_logits

tensor([[[-0.4132, -0.4132, 20.8666,  ..., -0.4132, -0.4324, -0.3989],
         [-0.2012, -0.2012,  2.4396,  ..., -0.2012,  0.5271,  0.4974],
         [-1.2904, -1.2904, -0.6422,  ..., -1.2904, -0.2593, -0.2817],
         [-1.2113, -1.2113,  2.9291,  ..., -1.2113, -0.3049, -0.3084],
         [-0.7764, -0.7764,  7.8817,  ..., -0.7764,  0.0397,  0.0469]]],
       device='cuda:0')

ICD_Code                                A01.03
Description    typhoid pneumonia typhoid fever
code_tokens              [2, 8278, 4, 1209, 3]
desc_tokens     [2, 7741, 2821, 7741, 3267, 3]
Name: 6, dtype: object

In [53]:
predictions

tensor([[   2, 8278,    4,    6,    3]], device='cuda:0')

In [54]:
original = torch.tensor([2, 8278, 4, 1209, 3])
tokenizer = Tokenizer.from_file("bpe_tokenizer.json")
tokenizer.decode(original.tolist())

'A01 . 03'

In [55]:
tokenizer.decode(predictions[0].tolist())

'A01 . 1'

## ICD to Desc

In [12]:
from torch.nn.utils.rnn import pad_sequence
import torch

def pad_batch(batch, max_length=None):
    tensor_batch = [torch.tensor(seq) for seq in batch]
    if max_length is None:
        return pad_sequence(tensor_batch, batch_first=True, padding_value=tokenizer.token_to_id("[PAD]"))
    else:
        padded = pad_sequence(tensor_batch, batch_first=True, padding_value=tokenizer.token_to_id("[PAD]"))
        if padded.size(1) < max_length:
            pad_size = max_length - padded.size(1)
            pad_tensor = torch.full((padded.size(0), pad_size), tokenizer.token_to_id("[PAD]"))
            padded = torch.cat([padded, pad_tensor], dim=1)
        return padded

desc_padded = pad_batch(df['desc_tokens'].tolist())
max_len = desc_padded.size(1)

code_padded = pad_batch(df['code_tokens'].tolist(), max_length=max_len)


In [13]:
print(f"The tokens in each ICD Codes={len(code_padded[3])}")
print(f"The tokens in each Code Descriptions={len(desc_padded[3])}")

The tokens in each ICD Codes=60
The tokens in each Code Descriptions=60


In [14]:
decoded1 = tokenizer.decode(desc_padded[6].tolist())
decoded2 = tokenizer.decode(code_padded[6].tolist())
print(decoded1)
print(decoded2)

typhoid pneumonia typhoid fever
A01 . 03


In [15]:
df2 = df.copy()

In [16]:
df2['code_padded'] = [list(row) for row in code_padded]
df2['desc_padded'] = [list(row) for row in desc_padded]
df2 = df2.drop(columns=['ICD_Code','Description','code_tokens','desc_tokens'])

In [18]:
df2

Unnamed: 0,code_padded,desc_padded
0,"[tensor(2), tensor(13998), tensor(4), tensor(5...","[tensor(2), tensor(15427), tensor(302), tensor..."
1,"[tensor(2), tensor(13998), tensor(4), tensor(6...","[tensor(2), tensor(15427), tensor(302), tensor..."
2,"[tensor(2), tensor(13998), tensor(4), tensor(1...","[tensor(2), tensor(15427), tensor(97), tensor(..."
3,"[tensor(2), tensor(8278), tensor(4), tensor(60...","[tensor(2), tensor(7741), tensor(3267), tensor..."
4,"[tensor(2), tensor(8278), tensor(4), tensor(35...","[tensor(2), tensor(7741), tensor(4503), tensor..."
...,...,...
71699,"[tensor(2), tensor(10685), tensor(4), tensor(2...","[tensor(2), tensor(103), tensor(139), tensor(9..."
71700,"[tensor(2), tensor(10685), tensor(4), tensor(7...","[tensor(2), tensor(1474), tensor(70), tensor(1..."
71701,"[tensor(2), tensor(10685), tensor(4), tensor(8...","[tensor(2), tensor(1474), tensor(70), tensor(6..."
71702,"[tensor(2), tensor(10685), tensor(4), tensor(6...","[tensor(2), tensor(1474), tensor(70), tensor(2..."


In [19]:
df2.to_csv("/home/careinfolab/Dr_Luo/Rohan/ICD_Codes/Dataset/icd10-codes-and-descriptions/Tokens_ICD2Desc.csv")