In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

data_dir = '/content/drive/MyDrive/CSE 576/Project_DialogSystems'
# df_train = pd.read_csv(f"{data_dir}/train.csv")
# df_test = pd.read_csv(f"{data_dir}/test.csv")

In [None]:
import pickle
handle = open(f"{data_dir}/gpt3_conversations.pickle", "rb")
conversations = pickle.load(handle)
handle.close()

In [5]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 8.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 76.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 55.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
import torch
import pickle
from torch.utils.data import Dataset

class MDDialDataset(Dataset):
  def __init__(self, save_path, conversations=None, tokenizer=None, use_saved=False, max_length=300):
    self.input_ids = []
    self.attention_masks = []
    self.inputs = conversations

    if use_saved:
      handle = open(f"{save_path}/gpt3_input_ids.pickle", "rb")
      self.input_ids = pickle.load(handle)
      handle.close()
      handle = open(f"{save_path}/gpt3_attention_masks.pickle", "rb")
      self.attention_masks = pickle.load(handle)
      handle.close()
    else:
      for index in range(len(self.inputs)):
        conversation = self.inputs[index]
        if conversation[1] == 1:
          chat = conversation[0]
          complete = [tokenizer.encode(x) + [tokenizer.eos_token_id] for x in chat]
          complete = [item for tokens in complete for item in tokens]
          if len(complete) < max_length:
            for _ in range(len(complete), max_length):
              complete.append(tokenizer.pad_token_id)
          attention = [0 for _ in range(len(complete))]
          attention_len = 0
          for turn in chat:
            tokens = tokenizer.encode(turn) + [tokenizer.eos_token_id]
            self.input_ids.append(torch.tensor(complete))
            attention = attention[:attention_len] + [1 for _ in range(len(tokens))] + attention[attention_len + len(tokens):]
            attention_len += len(tokens)
            self.attention_masks.append(torch.tensor(attention))
      # handle = open(f"{save_path}/input_ids.pickle", "wb")
      handle = open(f"{save_path}/gpt3_input_ids.pickle", "wb")
      pickle.dump(self.input_ids, handle)
      handle.close()
      # handle = open(f"{save_path}/attention_masks.pickle", "wb")
      handle = open(f"{save_path}/gpt3_attention_masks.pickle", "wb")
      pickle.dump(self.attention_masks, handle)
      handle.close()

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.attention_masks[idx]

In [10]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_name = "gpt2"
special_tokens_dict = {'pad_token': '[PAD]', 'eos_token': '[END]'}
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens(special_tokens_dict)
model = GPT2LMHeadModel.from_pretrained(f'{data_dir}/results_gpt2_new/checkpoint-2000').cuda()
# model = GPT2LMHeadModel.from_pretrained(model_name).cuda()
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 768)

In [None]:
from torch.utils.data import random_split
torch.manual_seed(49)

#dataset = MDDialDataset(data_dir, conversations=conversations, tokenizer=tokenizer)
dataset = MDDialDataset(data_dir, use_saved=True)
train_size = int(0.9 * len(dataset))
train_dataset, eval_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [None]:
# Clear unused memory
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
train_size, len(dataset) - train_size

(399, 45)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
  output_dir=f'{data_dir}/results_gpt2_new',
  num_train_epochs=20, 
  logging_steps=100, 
  save_steps=2000,
  per_device_train_batch_size=4, 
  per_device_eval_batch_size=4,
  warmup_steps=0,
  weight_decay=0.05, 
  logging_dir='./logs', 
  report_to = 'none'
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=lambda data: {
        'input_ids': torch.stack([f[0] for f in data]),
        'attention_mask': torch.stack([f[1] for f in data]),
        'labels': torch.stack([f[0] for f in data])
    }
)
trainer.train()

***** Running training *****
  Num examples = 399
  Num Epochs = 20
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 2000
  Number of trainable parameters = 124441344


Step,Training Loss
100,0.7801
200,0.246
300,0.1398
400,0.0987
500,0.078
600,0.0632
700,0.0552
800,0.0488
900,0.0444
1000,0.04


Step,Training Loss
100,0.7801
200,0.246
300,0.1398
400,0.0987
500,0.078
600,0.0632
700,0.0552
800,0.0488
900,0.0444
1000,0.04


Saving model checkpoint to /content/drive/MyDrive/CSE 576/Project_DialogSystems/results_gpt2_new/checkpoint-2000
Configuration saved in /content/drive/MyDrive/CSE 576/Project_DialogSystems/results_gpt2_new/checkpoint-2000/config.json
Model weights saved in /content/drive/MyDrive/CSE 576/Project_DialogSystems/results_gpt2_new/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/CSE 576/Project_DialogSystems/results_gpt2_new/checkpoint-2000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/CSE 576/Project_DialogSystems/results_gpt2_new/checkpoint-2000/special_tokens_map.json
added tokens file saved in /content/drive/MyDrive/CSE 576/Project_DialogSystems/results_gpt2_new/checkpoint-2000/added_tokens.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2000, training_loss=0.09483755600452423, metrics={'train_runtime': 784.3242, 'train_samples_per_second': 10.174, 'train_steps_per_second': 2.55, 'total_flos': 1221744384000000.0, 'train_loss': 0.09483755600452423, 'epoch': 20.0})

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_name = "gpt2"
special_tokens_dict = {'pad_token': '[PAD]', 'eos_token': '[END]'}
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens(special_tokens_dict)
model = GPT2LMHeadModel.from_pretrained(f'{data_dir}/results').cuda()
# model = GPT2LMHeadModel.from_pretrained(model_name).cuda()
model.resize_token_embeddings(len(tokenizer))

generated = tokenizer("Patient: Recently, I am experiencing Burning sensation behind the breastbone, Doctor: ?", return_tensors="pt")

In [None]:
generated = tokenizer("Recently, I am experiencing Burning sensation behind the breastbone[END]", return_tensors="pt")
sample_outputs = model.generate(input_ids=generated['input_ids'].cuda(), max_length=300, pad_token_id=tokenizer.eos_token_id,
        top_p=0.92, top_k = 50)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [None]:
tokenizer.decode(sample_outputs[0])

'Recently, I am experiencing Burning sensation behind the breastbone [END] In that case, do you have any Nausea? [END] Not that I know of [END] What about Fever? [END] Well not in my knowledge [END] Oh, do you have any Vomiting? [END] Well not in my knowledge [END] In that case, do you have any Bloating? [END] Yes, sometimes [END] I believe you are having from Esophagitis. [END] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [None]:
model.save_pretrained(f'{data_dir}/results')

Configuration saved in /content/drive/MyDrive/CSE 576/Project_DialogSystems/results/config.json
Model weights saved in /content/drive/MyDrive/CSE 576/Project_DialogSystems/results/pytorch_model.bin


In [None]:
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
import json
from tqdm import tqdm
handle = open(f"{data_dir}/test.json")
data = json.load(handle)
positive = 0
total = 0
diseases = [
    'Esophagitis', 
    'Enteritis', 
    'Asthma', 
    'Coronary heart disease', 
    'Pneumonia', 
    'Rhinitis', 
    'Thyroiditis', 
    'Traumatic brain injury',
    'Dermatitis',
    'External otitis',
    'Conjunctivitis',
    'Mastitis'
]
disease_counts={}
for dialog in tqdm(data):
  disease_gold = data[dialog]['disease_tag']
  if disease_gold not in disease_counts:
    disease_counts[disease_gold] = {
        "positive": 0,
        "total": 0
    }
  prompt = data[dialog]['dialogs'][0]['patient']
  input_seq = tokenizer(f"{prompt}[END]", return_tensors="pt")
  generated = model.generate(
        input_seq['input_ids'].cuda(), max_length=300,
        pad_token_id=tokenizer.eos_token_id,
        top_p=0.92, top_k = 50
    )
  decoded = tokenizer.decode(generated[0]).split("[END] [PAD]")
  for disease in diseases:
    if disease in decoded[0] and disease == disease_gold:
        disease_counts[disease_gold]['positive'] += 1
        positive += 1
  total += 1
  disease_counts[disease_gold]['total'] += 1

positive/total

  0%|          | 0/235 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  0%|          | 1/235 [00:02<11:13,  2.88s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  1%|          | 2/235 [00:05<11:14,  2.89s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  1%|▏         | 3/235 [00:08<11:11,  2.90s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  2%|▏         | 4/235 [00:11<11:09,  2.90s/it]A decoder-only architecture is being used, but right-padding was detected! For co

0.6170212765957447

In [8]:
positive, total, positive/total # MDDial Only

(134, 235, 0.5702127659574469)

In [14]:
positive, total, positive/total # MDDial + GPT-3 Generated

(145, 235, 0.6170212765957447)

In [13]:
disease_counts

{'Esophagitis': {'positive': 9, 'total': 27},
 'Enteritis': {'positive': 17, 'total': 24},
 'Asthma': {'positive': 9, 'total': 19},
 'Coronary heart disease': {'positive': 6, 'total': 19},
 'Pneumonia': {'positive': 5, 'total': 20},
 'Rhinitis': {'positive': 8, 'total': 15},
 'Thyroiditis': {'positive': 14, 'total': 19},
 'Traumatic brain injury': {'positive': 13, 'total': 19},
 'Dermatitis': {'positive': 20, 'total': 20},
 'External otitis': {'positive': 15, 'total': 17},
 'Conjunctivitis': {'positive': 18, 'total': 21},
 'Mastitis': {'positive': 11, 'total': 15}}