# **INSTALLATIONS**

In [34]:
! pip install -q transformers[torch] datasets
!pip install evaluate rouge_score



In [2]:
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset
from datasets import DatasetDict
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

# **LOADING DATASET**

In [4]:
dataset = load_dataset('csv', data_files={'data': '/content/drive/MyDrive/clinical-reports-summarizer-dataset.csv'})

Generating data split: 0 examples [00:00, ? examples/s]

In [5]:
train_data, val_data = train_test_split(dataset['data'], test_size=0.2, random_state=42)

In [6]:
dataset

DatasetDict({
    data: Dataset({
        features: ['TEXT', 'SUMMARY'],
        num_rows: 1074
    })
})

In [7]:
type(train_data)

dict

In [8]:
train_data = pd.DataFrame(train_data)
val_data = pd.DataFrame(val_data)

In [9]:
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

In [10]:
dataset = DatasetDict({
    'train': train_dataset,
    'test': val_dataset
})

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['TEXT', 'SUMMARY'],
        num_rows: 859
    })
    test: Dataset({
        features: ['TEXT', 'SUMMARY'],
        num_rows: 215
    })
})

In [12]:
example = dataset["train"][0]
for key in example:
    print("KEY: \"{}\"".format(key))
    print("VAL corresponding to the key-\"{}\"\n \"{}\"".format(key, example[key]))
    print("\n")

KEY: "TEXT"
VAL corresponding to the key-"TEXT"
 "Clinical History/Diagnosis: Left lung CA. Source of Specimen(s). A: Portion of 6th rib. B: Lymph Node, level 9. C: Lymph Node, level 11. D: Left lung. E: Lymph Node, level 5. F: Lymph Node, level 7. Gross Description: The specimen is received in six parts. Source of Tissue: 1. Labeled "portion 6th rib". Gross Description: Received fresh and consists of a segment of rib. measuring 2 x 1.4 X 1cm. No gross suspicious lesions are found. It is. submitted entirely in 1A following decalcification. Designation of Sections: 1A. Summary of Sections: @. Source of Tissue: 2. Labeled "level 9 lymph node". Gross Description: Received fresh and consists of a single grayish-black. lymph node measuring 0.9 x 0.7 x 0.7cm. It is submitted entirely in 2A. Designation of Sections: 2A. Summary of Sections: @. Source of Tissue: 3. Labeled "level 11 lymph node". Gross Description: Received fresh and consists of an irregular, ragged. fragment of black tissue me

# **TOKENIZATION AND FINETUNING**

In [13]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [14]:
print(example)

{'TEXT': 'Clinical History/Diagnosis: Left lung CA. Source of Specimen(s). A: Portion of 6th rib. B: Lymph Node, level 9. C: Lymph Node, level 11. D: Left lung. E: Lymph Node, level 5. F: Lymph Node, level 7. Gross Description: The specimen is received in six parts. Source of Tissue: 1. Labeled "portion 6th rib". Gross Description: Received fresh and consists of a segment of rib. measuring 2 x 1.4 X 1cm. No gross suspicious lesions are found. It is. submitted entirely in 1A following decalcification. Designation of Sections: 1A. Summary of Sections: @. Source of Tissue: 2. Labeled "level 9 lymph node". Gross Description: Received fresh and consists of a single grayish-black. lymph node measuring 0.9 x 0.7 x 0.7cm. It is submitted entirely in 2A. Designation of Sections: 2A. Summary of Sections: @. Source of Tissue: 3. Labeled "level 11 lymph node". Gross Description: Received fresh and consists of an irregular, ragged. fragment of black tissue measuring 1.5 x 1 x 0.7cm. It is submitted

In [15]:
tokenized_text = tokenizer(example['TEXT'])
for key in tokenized_text:
    print(key)
    print(tokenized_text[key])

Token indices sequence length is longer than the specified maximum sequence length for this model (1143 > 512). Running this sequence through the model will result in indexing errors


input_ids
[14067, 5528, 87, 23770, 6715, 7, 159, 10, 14298, 5084, 3087, 5, 9149, 13, 3, 7727, 23, 904, 599, 7, 137, 71, 10, 9731, 1575, 13, 431, 189, 3, 6520, 5, 272, 10, 5225, 7656, 465, 221, 6, 593, 5835, 205, 10, 5225, 7656, 465, 221, 6, 593, 7806, 309, 10, 14298, 5084, 5, 262, 10, 5225, 7656, 465, 221, 6, 593, 3594, 377, 10, 5225, 7656, 465, 221, 6, 593, 4306, 17969, 7726, 10, 37, 19622, 19, 1204, 16, 1296, 1467, 5, 9149, 13, 332, 13159, 10, 1300, 16229, 15, 26, 96, 31656, 431, 189, 3, 6520, 1280, 17969, 7726, 10, 24083, 26, 1434, 11, 3, 6848, 13, 3, 9, 5508, 13, 3, 6520, 5, 11297, 204, 3, 226, 3, 14912, 3, 4, 209, 75, 51, 5, 465, 8690, 21641, 110, 2865, 33, 435, 5, 94, 19, 5, 5776, 4585, 16, 209, 188, 826, 20, 10379, 2420, 5, 1642, 257, 13, 5568, 7, 10, 209, 188, 5, 20698, 13, 5568, 7, 10, 3320, 5, 9149, 13, 332, 13159, 10, 1682, 16229, 15, 26, 96, 4563, 668, 25049, 150, 221, 1280, 17969, 7726, 10, 24083, 26, 1434, 11, 3, 6848, 13, 3, 9, 712, 9954, 1273, 18, 19699, 5, 25049, 150, 

In [16]:
def preprocess_function(examples):
  inputs = ["summarize: " + doc for doc in examples["TEXT"]]
  model_inputs = tokenizer(inputs, max_length=512, truncation=True)
  labels = tokenizer(text_target=examples["SUMMARY"], max_length=512, truncation=True)
  model_inputs["labels"] = labels["input_ids"]

  return model_inputs

In [17]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Map:   0%|          | 0/215 [00:00<?, ? examples/s]

In [18]:
tokenized_dataset['test'][0]['TEXT']

'Clinical History/Diagnosis: Not entered. Source of Specimen(s): A: CYTOLOGY. B: L11 LYMPH NODES. C: CYTOLOGY. D: LEFT LOWER LOBE. E: BRONCHIAL L11 NODES. F: NODE @ RESECTION MARGIN. G: LEVEL 7 LN. H: PULMONARY ARTERY LEVEL 5 LN. Gross Description: GROSS DESCRIPTION: Received in eight parts. SOURCE OF TISSUE: 1. CYTOLOGY. SOURCE OF TISSUE: 2. Labeled #2, "L11 lymph nodes". GROSS DESCRIPTION: Received fresh are three gray-black, anthracotic-stained tissue. fragments, 0.4 to 0.8 cm. in greatest dimension. They are submitted in toto in one block. DESIGNATION OF SECTIONS: Block 2. SUMMARY OF SECTIONS: undesignated-3. SOURCE OF TISSUE: 3. CYTOLOGY. SOURCE OF TISSUE: 4. Labeled #4, "left lower lobe". FROZEN SECTION DIAGNOSIS: 4FSA-4FSB - RESERVE CELL HYPERPLASIA. NO. TUMOR SEEN. GROSS DESCRIPTION: Received fresh for frozen section evaluation is a 235 gram, 12.5 X. 10.5 X 4.5 cm. left lower lobectomy of lung. It is covered by a pink to red-purple, focally. anthracotic-stained pleura. There is

In [19]:
tokenized_dataset['test'][0]['SUMMARY']

"The patient's condition was diagnosed as squamous cell carcinoma located in the left lower lobe of the lung. The tumor measured 5 cm and extended partially into the pleura but not through it. Examination of the bronchial and vascular resection margins revealed no tumor involvement. Three lymph nodes around the bronchus and four lymph nodes from the L11 area were all negative for tumor. A lymph node at the resection margin, as well as two level 7 lymph nodes and one pulmonary artery level 5 lymph node, were also found to be negative for tumor."

In [20]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="t5-small")

In [22]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [23]:
training_args = Seq2SeqTrainingArguments(
    output_dir="clinical-reports-summarizer",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
)

In [24]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [25]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.757287
2,No log,2.527283
3,No log,2.459255
4,No log,2.439344


TrainOutput(global_step=216, training_loss=3.107117264359086, metrics={'train_runtime': 104.1118, 'train_samples_per_second': 33.003, 'train_steps_per_second': 2.075, 'total_flos': 465034429857792.0, 'train_loss': 3.107117264359086, 'epoch': 4.0})

In [26]:
trainer.save_model("clinical-reports-summarizer")

In [27]:
text = dataset['test'][100]['TEXT']
text = "summarize: " + text
text

'summarize: Surgery date: REVISED REPORT (Addendum/Procedure included). DIAGNOSIS: Site Pancreas head C25.0. A. Liver, right lobe, biopsy: Scant liver parenchyma with capsular. fibrosis. Rare cluster of atypical cells of uncertain significance. B. Lymph node, common hepatic, biopsy: Involved by metastatic. adenocarcinoma. C. Common hepatic duct, margin, excision: Negative for tumor. D. Pancreatic body, margin, excision: Intraductal papillary. mucinous neoplasm with high-grade dysplasia, present in the. pancreatic duct. No invasive carcinoma. E. Portal vein, adventitia, biopsy: Negative for tumor. F. Head pancreas, gallbladder, duodenum, common bile duct, portion. jejunum, Whipple resection: Invasive moderately differentiated. ductal adenocarcinoma on a background of extensive intraductal. papillary mucinous neoplasm with high-grade dysplasia, forming a. solid 4.6 x 2.3 x 2.0 cm mass located in the pancreatic head. The. tumor extends beyond pancreas to involve the peripancreatic soft. t

In [28]:
from transformers import pipeline

summarizer = pipeline("summarization", model="clinical-reports-summarizer")
pred = summarizer(text)
pred

Token indices sequence length is longer than the specified maximum sequence length for this model (1473 > 512). Running this sequence through the model will result in indexing errors


[{'summary_text': 'a biopsy of the pancreatic duct was performed on the right lobe liver bx . the biopsy revealed a rare cluster of atypical cells with capsular fibrosis . The biopsy was performed in the adenocarcinoma. The tumor is 0.1 cm from the portal vein groove. The patient has had no prior treatment.'}]

# **ROGUE SCORES**

In [30]:
import evaluate
rouge = evaluate.load("rouge")


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [33]:
preds = [pred[0]['summary_text']]

labels = [dataset['test'][100]['SUMMARY']]

rouge.compute(predictions=preds, references=labels, use_stemmer=True)

{'rouge1': 0.3448275862068965,
 'rouge2': 0.09790209790209792,
 'rougeL': 0.23448275862068965,
 'rougeLsum': 0.23448275862068965}

### **ROGUE SCORE OBTAINED IS:**
  **ROGUE1**: 0.3448275862068965 <br>
  **ROGUE2**: 0.09790209790209792<br>
  **ROGUEL**: 0.23448275862068965<br>
  **ROUGELsum**: 0.23448275862068965