In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data_dir = '/content/drive/MyDrive/CSE 576/Project_DialogSystems'

In [None]:
import pickle
handle = open(f"{data_dir}/train_conversations.pickle", "rb")
conversations = pickle.load(handle)
handle.close()

In [None]:
len(conversations)

9199

In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 5.3 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 68.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 57.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
import torch
import pickle
from torch.utils.data import Dataset

class MDDialDataset(Dataset):
  def __init__(self, save_path, conversations=None, tokenizer=None, use_saved=False, max_length=300):
    self.input_ids = []
    self.attention_masks = []
    self.inputs = conversations

    if use_saved:
      handle = open(f"{save_path}/input_ids.pickle", "rb")
      self.input_ids = pickle.load(handle)
      handle.close()
      handle = open(f"{save_path}/attention_masks.pickle", "rb")
      self.attention_masks = pickle.load(handle)
      handle.close()
    else:
      for index in range(len(self.inputs)):
        conversation = self.inputs[index]
        if conversation[1] == 1:
          chat = conversation[0]
          complete = [tokenizer.encode(x) + [tokenizer.eos_token_id] for x in chat]
          complete = [item for tokens in complete for item in tokens]
          if len(complete) < max_length:
            for _ in range(len(complete), max_length):
              complete.append(tokenizer.pad_token_id)
          attention = [0 for _ in range(len(complete))]
          attention_len = 0
          for turn in chat:
            tokens = tokenizer.encode(turn) + [tokenizer.eos_token_id]
            self.input_ids.append(torch.tensor(complete))
            attention = attention[:attention_len] + [1 for _ in range(len(tokens))] + attention[attention_len + len(tokens):]
            attention_len += len(tokens)
            self.attention_masks.append(torch.tensor(attention))
      # handle = open(f"{save_path}/input_ids.pickle", "wb")
      handle = open(f"{save_path}/gpt3_input_ids.pickle", "wb")
      pickle.dump(self.input_ids, handle)
      handle.close()
      # handle = open(f"{save_path}/attention_masks.pickle", "wb")
      handle = open(f"{save_path}/gpt3_attention_masks.pickle", "wb")
      pickle.dump(self.attention_masks, handle)
      handle.close()

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.attention_masks[idx]

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "microsoft/DialoGPT-small"
special_tokens_dict = {'pad_token': '[PAD]', 'eos_token': '[END]'}
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens(special_tokens_dict)
model = AutoModelForCausalLM.from_pretrained(f"{data_dir}/results_dialo_gpt_with_eos/checkpoint-trained-gpt3-dialogs").cuda()
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 768)

In [None]:
import torch
from torch.utils.data import random_split
torch.manual_seed(49)

dataset = MDDialDataset(data_dir, conversations=conversations, tokenizer=tokenizer)
#dataset = MDDialDataset(data_dir, use_saved=True)
train_size = int(0.9 * len(dataset))
train_dataset, eval_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

NameError: ignored

In [None]:
#tokenizer.decode(train_dataset[0][0][:123])
tokenizer.decode(train_dataset[0][0])

"Recently, I am experiencing Lumbago and Fever[END]Is it? Then do you experience Twitch?[END]Not that I know of[END]In that case, do you have any Rash?[END]What about Suppuration?[END]No, I never had anything like that[END]Is it? Then do you experience Cough?[END]Oh, do you have any Fear of cold?[END]Yes, sometimes[END]I see. Do you have a family history of Asthma?[END]No, I don't think so[END]Is it? Then do you experience Chills?[END]Yes, sometimes[END]Oh, do you have any Cyanosis?[END]What is that?[END]It's a medical condition where your skin turns blue from lack of oxygen in the blood.[END]No, I don't think so[END]Well, based on the symptoms you're experiencing, it's possible that you have Pneumonia.[END][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PA

In [None]:
# Clear unused memory
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
train_size, len(dataset) - train_size

(399, 45)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
  output_dir=f'{data_dir}/results_dialo_gpt_with_eos',
  num_train_epochs=10, 
  logging_steps=100, 
  save_steps=1000,
  per_device_train_batch_size=4, 
  per_device_eval_batch_size=4,
  warmup_steps=0,
  weight_decay=0.05, 
  logging_dir='./logs', 
  report_to = 'none'
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=lambda data: {
        'input_ids': torch.stack([f[0] for f in data]),
        'attention_mask': torch.stack([f[1] for f in data]),
        'labels': torch.stack([f[0] for f in data])
    }
)
trainer.train()

PyTorch: setting up devices
***** Running training *****
  Num examples = 399
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1000
  Number of trainable parameters = 124441344


Step,Training Loss
100,0.0621
200,0.0468
300,0.0379
400,0.0333
500,0.0312
600,0.0287
700,0.0275
800,0.0283
900,0.0282
1000,0.0293


Saving model checkpoint to /content/drive/MyDrive/CSE 576/Project_DialogSystems/results_dialo_gpt_with_eos/checkpoint-1000
Configuration saved in /content/drive/MyDrive/CSE 576/Project_DialogSystems/results_dialo_gpt_with_eos/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/CSE 576/Project_DialogSystems/results_dialo_gpt_with_eos/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/CSE 576/Project_DialogSystems/results_dialo_gpt_with_eos/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/CSE 576/Project_DialogSystems/results_dialo_gpt_with_eos/checkpoint-1000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1000, training_loss=0.03532235860824585, metrics={'train_runtime': 388.2904, 'train_samples_per_second': 10.276, 'train_steps_per_second': 2.575, 'total_flos': 610872192000000.0, 'train_loss': 0.03532235860824585, 'epoch': 10.0})

In [None]:
disease_encodings = {}
for disease in [
    'Esophagitis', 
    'Enteritis', 
    'Asthma', 
    'Coronary heart disease', 
    'Pneumonia', 
    'Rhinitis', 
    'Thyroiditis', 
    'Traumatic brain injury',
    'Dermatitis',
    'External otitis',
    'Conjunctivitis',
    'Mastitis'
]:
  encoded = tokenizer.encode(disease)
  disease_encodings[disease] = encoded

In [None]:
from transformers import StoppingCriteria, StoppingCriteriaList

class DiagnosisStoppingCriteria(StoppingCriteria):
  def __init__(self, diseases, max_length):
    self.diseases = diseases
    self.max_length = max_length

  def isSublist(self, a,b):
    for i in range(len(b)-len(a)+1):
        if b[i:i+len(a)] == a:
            return True
    return False

  def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
    if len(input_ids) > self.max_length:
      return True
    else:
      for encoding in self.diseases.values():
        if self.isSublist(encoding, input_ids):
          return True

In [None]:
input_seq = tokenizer("Recently, I am experiencing Burning sensation behind the breastbone[END]", return_tensors="pt")

In [None]:
generated = model.generate(
        input_seq['input_ids'].cuda(),
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
        top_p=0.92, top_k = 50,
        stopping_criteria=StoppingCriteriaList([DiagnosisStoppingCriteria(disease_encodings, 1000)])
    )

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [None]:
tokenizer.decode(generated[0])

'Recently, I am experiencing Burning sensation behind the breastbone[END]In that case, do you have any Nausea?[END]Well not in my knowledge[END]In that case, do you have any Vomiting?[END]Not that I know of[END]In that case, do you have any Bloating?[END]Well not in my knowledge[END]Oh, do you have any Stomach ache?[END]Yes, sometimes[END]In that case, you have Esophagitis.[END][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD

In [13]:
import json
from tqdm import tqdm
handle = open(f"{data_dir}/test.json")
data = json.load(handle)
positive = 0
total = 0
diseases = [
    'Esophagitis', 
    'Enteritis', 
    'Asthma', 
    'Coronary heart disease', 
    'Pneumonia', 
    'Rhinitis', 
    'Thyroiditis', 
    'Traumatic brain injury',
    'Dermatitis',
    'External otitis',
    'Conjunctivitis',
    'Mastitis'
]
disease_counts={}
for dialog in tqdm(data):
  disease_gold = data[dialog]['disease_tag']
  if disease_gold not in disease_counts:
    disease_counts[disease_gold] = {
        "positive": 0,
        "total": 0
    }
  prompt = data[dialog]['dialogs'][0]['patient']
  input_seq = tokenizer(f"{prompt}[END]", return_tensors="pt")
  generated = model.generate(
        input_seq['input_ids'].cuda(), max_length=300,
        pad_token_id=tokenizer.eos_token_id,
        top_p=0.92, top_k = 50
    )
  decoded = tokenizer.decode(generated[0]).split("[END] [PAD]")
  for disease in diseases:
    if disease in decoded[0] and disease == disease_gold:
        disease_counts[disease_gold]['positive'] += 1
        positive += 1
  total += 1
  disease_counts[disease_gold]['total'] += 1

positive/total

  0%|          | 0/235 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  0%|          | 1/235 [00:02<11:17,  2.90s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  1%|          | 2/235 [00:05<11:21,  2.92s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  1%|▏         | 3/235 [00:08<11:13,  2.90s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  2%|▏         | 4/235 [00:11<11:11,  2.91s/it]A decoder-only architecture is being used, but right-padding was detected! For co

0.5531914893617021

In [None]:
accuracy = positive / total
accuracy

0.6042553191489362

In [7]:
positive, total, positive/total # MDDial Only

(145, 235, 0.6170212765957447)

In [11]:
positive, total, positive/total # MDDial + GPT-3 Generated

(130, 235, 0.5531914893617021)

In [8]:
disease_counts # MDDial Only

{'Esophagitis': {'positive': 14, 'total': 27},
 'Enteritis': {'positive': 14, 'total': 24},
 'Asthma': {'positive': 11, 'total': 19},
 'Coronary heart disease': {'positive': 7, 'total': 19},
 'Pneumonia': {'positive': 9, 'total': 20},
 'Rhinitis': {'positive': 6, 'total': 15},
 'Thyroiditis': {'positive': 9, 'total': 19},
 'Traumatic brain injury': {'positive': 14, 'total': 19},
 'Dermatitis': {'positive': 20, 'total': 20},
 'External otitis': {'positive': 14, 'total': 17},
 'Conjunctivitis': {'positive': 16, 'total': 21},
 'Mastitis': {'positive': 11, 'total': 15}}

In [12]:
disease_counts # MDDial + GPT-3 Generated

{'Esophagitis': {'positive': 7, 'total': 27},
 'Enteritis': {'positive': 17, 'total': 24},
 'Asthma': {'positive': 9, 'total': 19},
 'Coronary heart disease': {'positive': 8, 'total': 19},
 'Pneumonia': {'positive': 6, 'total': 20},
 'Rhinitis': {'positive': 10, 'total': 15},
 'Thyroiditis': {'positive': 7, 'total': 19},
 'Traumatic brain injury': {'positive': 14, 'total': 19},
 'Dermatitis': {'positive': 18, 'total': 20},
 'External otitis': {'positive': 12, 'total': 17},
 'Conjunctivitis': {'positive': 15, 'total': 21},
 'Mastitis': {'positive': 7, 'total': 15}}

In [None]:
diseases = [
    'Esophagitis', 
    'Enteritis', 
    'Asthma', 
    'Coronary heart disease', 
    'Pneumonia', 
    'Rhinitis', 
    'Thyroiditis', 
    'Traumatic brain injury',
    'Dermatitis',
    'External otitis',
    'Conjunctivitis',
    'Mastitis'
]
for idx, conv in enumerate(conversations):
  if conv[1] == 1:
    print(conv[0][0], conv[0][-1])

Recently, I am experiencing Burning sensation behind the breastbone Ok, this means you might be having Esophagitis.
Hi Doctor, I am having Chest tightness In that case, you have Esophagitis.
I have Nausea and Diarrhea Ok, this means you might be having Esophagitis.
Hi Doctor, I am having Chest tightness In that case, you have Esophagitis.
Hi Doctor, I am having Chest tightness In that case, you have Esophagitis.
I have been feeling Hard to swallow In that case, you have Esophagitis.
I have been feeling Expectoration and Chest tightness This could probably be Esophagitis.
I have been feeling Burning sensation behind the breastbone and Bloating I believe you are having Esophagitis.
Recently, I am experiencing Acid reflux In that case, you have Esophagitis.
I have been feeling Acid reflux and Burning sensation behind the breastbone In that case, you have Esophagitis.
I have been feeling Acid reflux and Cough This could probably be Esophagitis.
Hi Doctor, I am having Acid reflux I believe 