In [9]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
nltk.download('punkt')

def preprocess_text(text):
    preprocessed_text = text.lower()
    return preprocessed_text
def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
def compute_tfidf(sentences):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    return tfidf_matrix

In [12]:
def rank_sentences(tfidf_matrix):
    sentence_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return sentence_similarity

In [13]:
def generate_summary(sentences, sentence_similarity, top_n=5):
    ranked_sentences = ((sentence_similarity[i][j], i, j) for i in range(len(sentence_similarity)) for j in range(len(sentence_similarity[0])))
    selected_sentences = []
    for top_sentence in sorted(ranked_sentences, key=lambda x: x[0])[-top_n:]:
        sentence_index = top_sentence[1]
        selected_sentences.append(sentences[sentence_index])
    for sentence in selected_sentences:
        print(sentence)
    summary = ' '.join(selected_sentences)
    return summary

In [14]:
text = "Patient: John DoeDate: June 15, 2023Chief Complaint: Severe chest pain and difficulty breathing.History of Present Illness:The patient, a 45-year-old male, presented to the emergency department with complaints of sudden onset severe chest pain and difficulty breathing. He describes the pain as a crushing sensation in the center of his chest that radiates to his left arm. The symptoms started approximately two hours ago while he was resting at home. He denies any associated symptoms such as nausea, vomiting, or diaphoresis. No known triggering factors.Past Medical History:The patient has a history of hypertension and hyperlipidemia. He takes medications for both conditions regularly. No history of previous heart attacks, strokes, or other cardiovascular events. No known drug allergies.Social History:The patient is a nonsmoker and denies any alcohol or illicit drug use. He works as an office manager and reports moderate stress levels.Family History:His father had a heart attack at the age of 65. No other significant family history of cardiovascular diseases.Physical Examination:Vital signs: Blood pressure 140/90 mmHg, heart rate 90 bpm, respiratory rate 18 breaths per minute, oxygen saturation 98% on room air.General appearance: The patient appears anxious and is in moderate distress due to chest pain.Cardiovascular: Regular rate and rhythm, no murmurs or gallops, normal S1 and S2 sounds.Respiratory: Decreased breath sounds on the left lung base.Abdominal: Soft and non-tender, no organomegaly.Neurological: Alert and oriented, no focal neurological deficits.Diagnostic Tests:Electrocardiogram (ECG): Shows ST-segment elevation in leads II, III, and aVF suggestive of inferior wall myocardial infarction.Troponin levels: Elevated, consistent with myocardial injury.Assessment:The patient presents with symptoms and ECG findings indicative of an acute myocardial infarction, specifically involving the inferior wall.Plan:The patient will be admitted to the coronary care unit for further management. Treatment will include oxygen supplementation, aspirin, nitroglycerin, and statin therapy. A coronary angiogram will be performed to assess the extent of coronary artery disease and determine the need for percutaneous coronary intervention (PCI) or coronary artery bypass graft (CABG) surgery."

preprocessed_text = preprocess_text(text)
sentences = tokenize_sentences(preprocessed_text)
tfidf_matrix = compute_tfidf(sentences)
sentence_similarity = rank_sentences(tfidf_matrix)

summary = generate_summary(sentences, sentence_similarity)

word_count = len(summary.split())
print('\n')
print("Word count in summary:", word_count)


no known triggering factors.past medical history:the patient has a history of hypertension and hyperlipidemia.
he takes medications for both conditions regularly.
a coronary angiogram will be performed to assess the extent of coronary artery disease and determine the need for percutaneous coronary intervention (pci) or coronary artery bypass graft (cabg) surgery.
patient: john doedate: june 15, 2023chief complaint: severe chest pain and difficulty breathing.history of present illness:the patient, a 45-year-old male, presented to the emergency department with complaints of sudden onset severe chest pain and difficulty breathing.
no known drug allergies.social history:the patient is a nonsmoker and denies any alcohol or illicit drug use.


Word count in summary: 104


In [15]:
word_count = len(text.split())
print("Word count in medical record:", word_count)

Word count in medical record: 321


In [17]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import PyPDF2

nltk.download('punkt')

def preprocess_text(text):
    preprocessed_text = text.lower()
    return preprocessed_text

def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

def compute_tfidf(sentences):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    return tfidf_matrix

def rank_sentences(tfidf_matrix):
    sentence_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return sentence_similarity

def generate_summary(sentences, sentence_similarity, top_n=5):
    ranked_sentences = ((sentence_similarity[i][j], i, j) for i in range(len(sentence_similarity)) for j in range(len(sentence_similarity[0])))
    selected_sentences = []
    for top_sentence in sorted(ranked_sentences, key=lambda x: x[0])[-top_n:]:
        sentence_index = top_sentence[1]
        selected_sentences.append(sentences[sentence_index])
    for sentence in selected_sentences:
        print(sentence)
    summary = ' '.join(selected_sentences)
    return summary

def process_medical_records(file_paths):
    for file_path in file_paths:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            medical_record_text = ''
            for page in pdf_reader.pages:
                medical_record_text += page.extract_text()
            preprocessed_text = preprocess_text(medical_record_text)
            sentences = tokenize_sentences(preprocessed_text)
            tfidf_matrix = compute_tfidf(sentences)
            sentence_similarity = rank_sentences(tfidf_matrix)
            summary = generate_summary(sentences, sentence_similarity)
            print()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
pdf_file_paths = ['medical_record1.pdf', 'medical_record2.pdf', 'medical_record3.pdf','medical_record4.pdf']

process_medical_records(pdf_file_paths)

physical examination: neurological examination reveals no focal deﬁcits.
investigations: cognitive screening tests, such as the mini-mental state examination (mmse), show mild cognitive impairment.
brain imaging (e.g., mri) may be performed to rule out other causes.
treatment plan: the patient will be referred to a neurologist or geriatrician for further evaluation and management.
medications to manage symptoms and slow disease progression, such as cholinesterase inhibitors, may be prescribed.

the patient reports pain, stiﬀness, and swelling in both knees, especially after prolonged periods of activity or in the morning.
medical history: the patient has a history of osteoarthritis.
physical examination: joint examination reveals tenderness, crepitus, and limited range of motion in the aﬀected knees.
surgical options, such as knee replacement, may be discussed if conservative measures are ineﬀective.
investigations: x-rays show joint space narrowing and osteophyte formation in the knee

In [3]:
import os
import pdfplumber
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

def process_medical_records(pdf_file_paths):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    for file_path in pdf_file_paths:
        text = extract_text(file_path)
        sentences = tokenize_sentences(text)

        if len(sentences) > 0:
            summary = generate_summary(sentences, tokenizer, model, device)
            print("Medical Record:", file_path)
            print("Summary:", summary)
            print()

def extract_text(file_path):
    with pdfplumber.open(file_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
        return text

def tokenize_sentences(text):
    sentences = text.split('. ')
    return sentences

def generate_summary(sentences, tokenizer, model, device, top_n=5):
    encoded_inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    input_ids = encoded_inputs['input_ids'].to(device)
    attention_mask = encoded_inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    sentence_embeddings = outputs.last_hidden_state[:, 0, :]
    similarity_matrix = cosine_similarity(sentence_embeddings, sentence_embeddings)

    ranked_sentences = sorted(((similarity_matrix[i][j], i, j) for j in range(len(similarity_matrix[0])) for i in range(len(similarity_matrix))), reverse=True)

    selected_sentences = []
    for _, i, j in ranked_sentences:
        if sentences[i] not in selected_sentences:
            selected_sentences.append(sentences[i])
            if len(selected_sentences) == top_n:
                break

    summary = '. '.join(selected_sentences)
    return summary

In [4]:
pdf_file_paths = ['medical_record1.pdf', 'medical_record2.pdf', 'medical_record3.pdf']
process_medical_records(pdf_file_paths)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Medical Record: medical_record1.pdf
Summary: Cognitive stimulation activities and support for the
patient and family will be provided. Brain imaging (e.g., MRI) may be performed to rule out other
causes.
Diagnosis: The patient is diagnosed with mild cognitive impairment, possibly due to Alzheimer's
disease.
Treatment Plan: The patient will be referred to a neurologist or geriatrician for further evaluation
and management. Regular follow-up appointments will be scheduled to assess
cognitive function and adjust treatment as needed.. No other abnormal
findings.
Investigations: Cognitive screening tests, such as the Mini-Mental State Examination (MMSE),
show mild cognitive impairment. Family members have noticed personality changes and
withdrawal from social activities.
Medical History: No significant medical history

Medical Record: medical_record2.pdf
Summary: Intra-articular
injections, such as corticosteroids or hyaluronic acid, may be considered. No other abnormal findings.
Investigat