In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
import torch
# Fetch the NLTK resources used by preprocess_text: the English stop-word list,
# WordNet for lemmatization, and the Punkt models behind word_tokenize.
# Recent NLTK releases load the tokenizer models from 'punkt_tab' instead of
# 'punkt', so both are requested. nltk.download() returns False (it does not
# raise) when a package cannot be fetched.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)


[nltk_data] Downloading package stopwords to C:\Users\Rimjhim
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Rimjhim
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Rimjhim
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Forward slashes keep the path portable across operating systems and avoid
# the invalid "\M" escape sequence that the backslash form placed inside a
# regular (non-raw) string literal.
dataset = pd.read_excel("dataset/MEDICSUMM.xlsx")


In [3]:
# Display the loaded frame (REPORT / SUMMARY text pairs) as the cell output.
dataset

Unnamed: 0,REPORT,SUMMARY
0,Thyroid Stimulating Hormone (TSH): 2.5 mIU/L (...,TSH: 2.5 mIU/L (Normal) - Normal thyroid stimu...
1,Thyroid Stimulating Hormone (TSH): 0.1 mIU/L (...,TSH: 0.1 mIU/L (Low) - Low thyroid stimulating...
2,Hemoglobin (Hb): 13.0 g/dL (Low)\nRed Blood Ce...,Hemoglobin (Hb): 13.0 g/dL (Low) - Indicates l...
3,Sodium (Na): 140 mEq/L (Normal)\nPotassium (K)...,Sodium: 140 mEq/L (Normal)\nPotassium: 4.0 mEq...
4,Albumin: 4.5 g/dL (Normal)\nTotal Protein: 7.0...,Albumin: 4.5 g/dL (Normal)\nTotal Protein: 7.0...
5,Total Cholesterol: 200 mg/dL (Borderline High)...,Total Cholesterol: 200 mg/dL (Borderline High)...
6,Thyroid Stimulating Hormone (TSH): 2.5 mIU/L (...,TSH: 2.5 mIU/L (Normal) - Normal thyroid stimu...
7,Troponin I: 0.02 ng/mL (Normal)\nCreatine Kina...,Troponin I: 0.02 ng/mL (Normal) - Normal tropo...
8,HIV: Negative\nSyphilis: Negative\nChlamydia: ...,HIV: Negative - No evidence of HIV infection.\...
9,Prothrombin Time (PT): 12 seconds (Normal)\nAc...,PT: 12 seconds (Normal) - Normal prothrombin t...


In [4]:
# Load the two CMS32 short/long description tables. Both are later turned
# into short-description -> long-description lookup dictionaries.
# NOTE(review): "SG" / "DX" presumably stand for procedure and diagnosis
# codes -- confirm against the data source.
df_sg, df_dx = [
    pd.read_excel(workbook_path)
    for workbook_path in (
        "dataset/CMS32_DESC_LONG_SHORT_SG.xlsx",
        "dataset/CMS32_DESC_LONG_SHORT_DX.xlsx",
    )
]


In [5]:
# Build one lookup per table: key = 'SHORT DESCRIPTION', value = 'LONG DESCRIPTION'.
# When a short description occurs more than once, the last row wins.
difficult_words_sg, difficult_words_dx = (
    description_table.set_index('SHORT DESCRIPTION')['LONG DESCRIPTION'].to_dict()
    for description_table in (df_sg, df_dx)
)

In [6]:
def preprocess_text(text):
    """Normalize free text into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase the text, drop every character that is not an
    ASCII letter or whitespace (this removes digits and punctuation), tokenize
    with NLTK's ``word_tokenize``, discard English stop words, and lemmatize
    each remaining token with ``WordNetLemmatizer``.

    Args:
        text: The text to clean. ``None`` and float NaN (e.g. an empty
            spreadsheet cell read by pandas) are treated as empty text; any
            other non-string value is converted with ``str()`` first.

    Returns:
        str: The surviving tokens joined by single spaces; ``''`` when the
        input is missing or nothing survives the filtering.
    """
    # Missing values would otherwise fail on .lower() with AttributeError.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercasing
    text = str(text).lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords, then lemmatize what is left
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    ]
    # Join tokens back into text
    return ' '.join(lemmatized_tokens)


In [7]:
def replace_difficult_words(text, difficult_words):
    """Replace each known term in ``text`` with its explanation.

    Matching is case-sensitive and whole-word only: a term is replaced only
    when it is not directly preceded or followed by a word character, so a
    short term is never rewritten inside a longer word. All terms are applied
    in a single pass, so text inserted as an explanation is not expanded again
    by another entry.

    Args:
        text: The string to rewrite. Returned unchanged when empty.
        difficult_words: Mapping of term -> explanation. Entries with an empty
            term, or whose term or explanation is not a string (for example
            NaN), are ignored.

    Returns:
        str: ``text`` with every whole-word occurrence of a term replaced by
        its explanation.
    """
    usable = {
        term: explanation
        for term, explanation in difficult_words.items()
        if isinstance(term, str) and term and isinstance(explanation, str)
    }
    if not text or not usable:
        return text

    # Longest terms first, so that when one term contains another the longer
    # one is tried first by the alternation.
    ordered_terms = sorted(usable, key=len, reverse=True)
    pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(term) for term in ordered_terms) + r')(?!\w)'
    )
    # A callable replacement keeps backslashes in explanations literal.
    return pattern.sub(lambda match: usable[match.group(0)], text)

In [8]:
def preprocess_reports(reports):
    """Clean every report and expand known terms from both lookup tables.

    Each item is first normalized with ``preprocess_text`` and then passed
    through ``replace_difficult_words`` twice: once with
    ``difficult_words_sg`` and once with ``difficult_words_dx``, in that order.

    Args:
        reports: Iterable of report texts.

    Returns:
        list: One processed string per input item, in input order.
    """
    # NOTE(review): replacement runs on text that preprocess_text has already
    # lowercased and stripped of non-letters, so only dictionary terms written
    # in that form can match -- confirm this ordering is intended.
    def _process_one(raw_report):
        cleaned = preprocess_text(raw_report)
        for glossary in (difficult_words_sg, difficult_words_dx):
            cleaned = replace_difficult_words(cleaned, glossary)
        return cleaned

    return [_process_one(raw_report) for raw_report in reports]


In [9]:
# Add a processed counterpart for each raw text column of the dataset.
for raw_column, processed_column in (
    ('REPORT', 'processed_report'),
    ('SUMMARY', 'processed_summary'),
):
    dataset[processed_column] = preprocess_reports(dataset[raw_column])


In [10]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split the same from run to run.
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

In [11]:
# Tokenization
# Load the tokenizer published with the 't5-small' checkpoint (downloaded on
# first use); the same checkpoint name is used for the model below.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Encoding inputs and outputs
def _encode_column(frame, column):
    """Tokenize one text column into padded, truncated PyTorch tensors."""
    return tokenizer(frame[column].tolist(), padding=True, truncation=True, return_tensors="pt")


# Reports are the model inputs, summaries are the targets.
train_inputs = _encode_column(train_data, 'processed_report')
train_outputs = _encode_column(train_data, 'processed_summary')
test_inputs = _encode_column(test_data, 'processed_report')
test_outputs = _encode_column(test_data, 'processed_summary')


In [13]:
# Load the pretrained 't5-small' sequence-to-sequence model (downloaded on
# first use) that is fine-tuned in the training cell.
model = T5ForConditionalGeneration.from_pretrained('t5-small')

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [14]:
# Model training
# torch.optim.AdamW is used in place of the deprecated transformers.AdamW;
# eps and weight_decay are set explicitly to the values the transformers
# class used by default so the optimizer behaves the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Padding positions in the targets are set to -100, the label value the loss
# ignores, so the model is not trained to predict pad tokens.
train_labels = train_outputs['input_ids'].clone()
train_labels[train_labels == tokenizer.pad_token_id] = -100

model.train()
# Each iteration is one full-batch gradient step over the whole training set.
for epoch in range(3):
    optimizer.zero_grad()
    outputs = model(
        input_ids=train_inputs['input_ids'],
        # Without the mask the encoder attends to padding tokens as if they
        # were real input.
        attention_mask=train_inputs['attention_mask'],
        labels=train_labels,
    )
    loss = outputs.loss
    loss.backward()
    optimizer.step()



In [15]:
# Inspect the token ids of the encoded training reports; per the tokenizer
# call above, shorter rows are padded to a common length.
print(train_inputs['input_ids'])

tensor([[ 7102,   208,  2841,     3,     7,    63, 18118,   159,  2841,     3,
           524,   521,  2258,    26,    23,     9,  2841,     3,  5307,   127,
            52,    88,     9,  2841,  8868,  2841,     3,  2248,   794,     1,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  792, 16462,  5453,    26,    40,  4947,   747,   306,     3,   107,
            26,    40, 16462,  5453,    26,    40,  1389,     3,    40,    26,


In [16]:
# Model evaluation
model.eval()
with torch.no_grad():
    # Generate outputs. The test batch is padded, so the attention mask is
    # passed along to keep the padding positions out of the encoder's view.
    test_outputs_predicted = model.generate(
        input_ids=test_inputs['input_ids'],
        attention_mask=test_inputs['attention_mask'],
        max_length=150,
    )

    # Convert test_outputs_predicted to list of token IDs if necessary
    if isinstance(test_outputs_predicted, torch.Tensor):
        test_outputs_predicted = test_outputs_predicted.tolist()
    elif isinstance(test_outputs_predicted[0], torch.Tensor):
        test_outputs_predicted = [tensor.tolist() for tensor in test_outputs_predicted]

# Decode predictions: one string per test report, special tokens removed
decoded_predictions = [
    tokenizer.decode(token_ids, skip_special_tokens=True)
    for token_ids in test_outputs_predicted
]

print(decoded_predictions)


['serum test normal dheasulfate serum test normal dheasulfate serum test normal dheasulfate serum test normal dheasulfate serum test normal dheasulfate serum test normal dheasulfate serum test normal dheasulfate serum test normal dheasulfate serum test normal dheasulfate serum test normal dheasulfate serum test', 'Prothrombin time pt second normal activated partial thromboplastin time aptt second normal international normalized ratio inr normal interpretation normal coagulation panel', 'thyroid stimulating hormone thyroid stimulating hormone tsh miul normal free thyroxine ngdl normal total triiodothyronine ngdl normal interpretation normal thyroid panel']


In [17]:
def generate_summary(input_text, max_length=50, num_beams=4):
    """Generate a summary of ``input_text`` with the fine-tuned model.

    The text goes through the same pipeline as the training reports:
    ``preprocess_reports`` (text cleaning plus term expansion) followed by
    tokenization with truncation enabled.

    Args:
        input_text: Raw report text to summarize.
        max_length: Maximum length, in tokens, of the generated summary.
        num_beams: Number of beams used for beam search.

    Returns:
        str: The decoded summary with special tokens removed.
    """
    # Preprocess the input text exactly as the training inputs were prepared,
    # so the model sees inference text in the form it was trained on.
    preprocessed_input = preprocess_reports([input_text])[0]

    # Tokenization; truncation matches the training-time tokenizer call and
    # keeps over-long documents within the tokenizer's length limit.
    input_tokenized = tokenizer(preprocessed_input, truncation=True, return_tensors="pt")

    # Model inference
    model.eval()
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_tokenized['input_ids'],
            attention_mask=input_tokenized['attention_mask'],
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
        )

    # Decode the summary
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [18]:
MEDICAL_DOCUMENT = """ 
duplications of the alimentary tract are well - known but rare congenital malformations that can occur anywhere in the gastrointestinal ( gi ) tract from the tongue to the anus . while midgut duplications are the most common , foregut duplications such as oesophagus , stomach , and parts 1 and 2 of the duodenum account for approximately one - third of cases . 
 they are most commonly seen either in the thorax or abdomen or in both as congenital thoracoabdominal duplications . 
 cystic oesophageal duplication ( ced ) , the most common presentation , is often found in the lower third part ( 60 - 95% ) and on the right side [ 2 , 3 ] . hydatid cyst ( hc ) is still an important health problem throughout the world , particularly in latin america , africa , and mediterranean areas . 
 turkey , located in the mediterranean area , shares this problem , with an estimated incidence of 20/100 000 . 
 most commonly reported effected organ is liver , but in children the lungs are the second most frequent site of involvement [ 4 , 5 ] . in both ced and hc , the presentation depends on the site and the size of the cyst . 
 hydatid cysts are far more common than other cystic intrathoracic lesions , especially in endemic areas , so it is a challenge to differentiate ced from hc in these countries . here , 
 we present a 7-year - old girl with intrathoracic cystic mass lesion , who had been treated for hydatid cyst for 9 months , but who turned out to have oesophageal cystic duplication . 
 a 7-year - old girl was referred to our clinic with coincidentally established cystic intrathoracic lesion during the investigation of aetiology of anaemia . 
 the child was first admitted with loss of vision in another hospital ten months previously . 
 the patient 's complaints had been attributed to pseudotumour cerebri due to severe iron deficiency anaemia ( haemoglobin : 3 g / dl ) . 
 chest radiography and computed tomography ( ct ) images resulted in a diagnosis of cystic intrathoracic lesion ( fig . 
 the cystic mass was accepted as a type 1 hydatid cyst according to world health organization ( who ) classification . 
 after 9 months of medication , no regression was detected in ct images , so the patient was referred to our department . 
 an ondirect haemagglutination test result was again negative . during surgery , after left thoracotomy incision , a semi - mobile cystic lesion , which was almost seven centimetres in diameter , with smooth contour , was found above the diaphragm , below the lung , outside the pleura ( fig . 
 the entire fluid in the cyst was aspirated ; it was brown and bloody ( fig . 
 2 ) . the diagnosis of cystic oesophageal duplication was considered , and so an attachment point was searched for . 
 it was below the hiatus , on the lower third left side of the oesophagus , and it also was excised completely through the hiatus . 
 pathologic analysis of the specimen showed oesophageal mucosa with an underlying proper smooth muscle layer . 
 computed tomography image of the cystic intrathoracic lesion cystic lesion with brownish fluid in the cyst 
 compressible organs facilitate the growth of the cyst , and this has been proposed as a reason for the apparent prevalence of lung involvement in children . diagnosis is often incidental and can be made with serological tests and imaging [ 5 , 7 ] . 
 laboratory investigations include the casoni and weinberg skin tests , indirect haemagglutination test , elisa , and the presence of eosinophilia , but can be falsely negative because children may have a poor serological response to eg . 
 false - positive reactions are related to the antigenic commonality among cestodes and conversely seronegativity can not exclude hydatidosis . 
 false - negative results are observed when cysts are calcified , even if fertile [ 4 , 8 ] . in our patient iha levels were negative twice . 
 due to the relatively non - specific clinical signs , diagnosis can only be made confidently using appropriate imaging . 
 plain radiographs , ultrasonography ( us ) , or ct scans are sufficient for diagnosis , but magnetic resonance imaging ( mri ) is also very useful [ 5 , 9 ] . 
 computed tomography demonstrates cyst wall calcification , infection , peritoneal seeding , bone involvement fluid density of intact cysts , and the characteristic internal structure of both uncomplicated and ruptured cysts [ 5 , 9 ] . 
 the conventional treatment of hydatid cysts in all organs is surgical . in children , small hydatid cysts of the lungs 
 respond favourably to medical treatment with oral administration of certain antihelminthic drugs such as albendazole in certain selected patients . 
 the response to therapy differs according to age , cyst size , cyst structure ( presence of daughter cysts inside the mother cysts and thickness of the pericystic capsule allowing penetration of the drugs ) , and localization of the cyst . in children , small cysts with thin pericystic capsule localised in the brain and lungs respond favourably [ 6 , 11 ] . 
 respiratory symptoms are seen predominantly in cases before two years of age . in our patient , who has vision loss , the asymptomatic duplication cyst was found incidentally . 
 the lesion occupied the left hemithorax although the most common localisation reported in the literature is the lower and right oesophagus . 
 the presentation depends on the site and the size of the malformations , varying from dysphagia and respiratory distress to a lump and perforation or bleeding into the intestine , but cysts are mostly diagnosed incidentally . 
 if a cystic mass is suspected in the chest , the best technique for evaluation is ct . 
 magnetic resonance imaging can be used to detail the intimate nature of the cyst with the spinal canal . 
 duplications should have all three typical signs : first of all , they should be attached to at least one point of the alimentary tract ; second and third are that they should have a well - developed smooth muscle coat , and the epithelial lining of duplication should represent some portions of alimentary tract , respectively [ 2 , 10 , 12 ] . in summary , the cystic appearance of both can cause a misdiagnosis very easily due to the rarity of cystic oesophageal duplications as well as the higher incidence of hydatid cyst , especially in endemic areas . 
"""

In [22]:
# Persist the fine-tuned model to 'summaryModel.joblib' and read it back.
# NOTE(review): joblib persistence is pickle-based; loading such a file can
# execute arbitrary code, so only load files from a trusted source.
# NOTE(review): this cell's execution count (22) is higher than that of the
# following cell (20), so the notebook was not run top to bottom.
from joblib import dump, load
dump(model, 'summaryModel.joblib')


# NOTE(review): loaded_model is not referenced by any other visible cell;
# generate_summary reads the global `model`.
loaded_model = load('summaryModel.joblib')

In [20]:
# Summarize the sample MEDICAL_DOCUMENT text and print the generated summary.
summary=generate_summary(MEDICAL_DOCUMENT)
print(summary)

oesophageal duplication ced hc presentation depends site size cyst hydatid cyst far common cystic intrathoracic lesion especially endemic area challenge differentiated hc country
