In [None]:
# Install into the environment of the running kernel (%pip) rather than whichever
# `pip` happens to be first on the shell PATH (!pip). A single call lets pip resolve
# the version constraints of all packages together.
# NOTE: the scispaCy model loaded later ("en_core_sci_md") is a separate package that
# `scispacy` does not install; install the model release that matches your scispacy
# version (see the scispaCy README) before running spacy.load.
%pip install PyMuPDF scispacy spacy transformers sentence-transformers

In [None]:
# Import necessary libraries
import fitz  # PyMuPDF for PDF processing
import spacy
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

In [None]:
# ---- STEP 1: PDF Text Extraction ----

In [None]:
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file using PyMuPDF (fitz).

    Extraction is best-effort: if the file cannot be opened, or a page fails
    part-way through, the error is printed and whatever text was collected
    before the failure is returned (an empty string if nothing was read).

    :param pdf_path: Path to the PDF file
    :return: Extracted text as a string, pages concatenated in document order
    """
    page_texts = []
    try:
        # The context manager closes the document even when a page raises;
        # a trailing doc.close() would be skipped on any mid-extraction error.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page_texts.append(page.get_text("text"))
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
    # Join once instead of growing a string with += for every page.
    return "".join(page_texts)

In [None]:
# ---- STEP 2: Medical Named Entity Recognition (NER) ----
# Load the SciSpacy model for NER

In [None]:
# Load the scispaCy pipeline once at module level; extract_medical_entities reuses it.
# NOTE(review): per the scispaCy model documentation, `en_core_sci_md` tags every span
# with the generic label 'ENTITY', while `en_ner_bc5cdr_md` emits 'DISEASE' and
# 'CHEMICAL' - confirm that any label filter applied downstream matches the loaded model.
nlp = spacy.load("en_core_sci_md")  # or 'en_ner_bc5cdr_md' for typed disease/chemical entities

def extract_medical_entities(text, labels=None):
    """
    Extracts medical entities from text using the loaded SciSpacy NER model.

    Which labels appear depends on the model bound to `nlp`: `en_core_sci_md`
    tags every span with the generic label 'ENTITY', while `en_ner_bc5cdr_md`
    emits 'DISEASE' and 'CHEMICAL'. A fixed filter of
    ['DISEASE', 'SYMPTOM', 'DRUG'] therefore matches nothing with the default
    model, so by default every entity the model reports is kept and filtering
    is opt-in.

    :param text: Input text (medical records)
    :param labels: Optional label or iterable of labels to keep
                   (e.g. ['DISEASE', 'CHEMICAL']); None keeps all entities
    :return: List of (entity_text, entity_label) tuples in document order
    """
    # Nothing to analyse: avoid running the model on missing or blank input.
    if not text or not text.strip():
        return []

    if labels is None:
        wanted = None
    elif isinstance(labels, str):
        # A bare string would otherwise be split into single characters by set().
        wanted = {labels}
    else:
        wanted = set(labels)

    doc = nlp(text)
    return [
        (ent.text, ent.label_)
        for ent in doc.ents
        if wanted is None or ent.label_ in wanted
    ]

In [None]:
# ---- STEP 3: Summarization of Medical Text ----
# Summarization pipeline from Hugging Face

In [None]:
# Cached default summarization pipeline; created on first use so that defining
# this cell stays cheap and the model is only downloaded when actually needed.
_default_summarizer = None


def _get_default_summarizer():
    """Builds the Hugging Face summarization pipeline on first use and caches it."""
    global _default_summarizer
    if _default_summarizer is None:
        # Name the model explicitly so the choice is pinned and visible.
        _default_summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    return _default_summarizer


def summarize_medical_text(text, max_length=200, summarizer=None, chunk_size=1000):
    """
    Summarizes long medical text using a transformer-based model.

    The text is cut into consecutive chunks of `chunk_size` characters (not
    tokens), each chunk is summarized independently, and the chunk summaries
    are joined with single spaces. Chunk boundaries are purely positional, so
    a sentence may be split across two chunks.

    :param text: Input medical text
    :param max_length: Maximum length (in model tokens) of each chunk summary
    :param summarizer: Optional callable with the Hugging Face summarization
                       pipeline interface; when None a default pipeline is
                       created on first use and reused afterwards
    :param chunk_size: Number of characters per chunk; must be positive
    :return: Summarized text ("" when the input is empty or only whitespace)
    :raises ValueError: if chunk_size is not positive
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text or not text.strip():
        return ""
    if summarizer is None:
        summarizer = _get_default_summarizer()

    # Never request a minimum longer than the maximum the caller allowed.
    min_length = min(50, max_length)

    chunk_summaries = []
    for start in range(0, len(text), chunk_size):
        chunk = text[start:start + chunk_size]
        if not chunk.strip():
            # Whitespace-only chunks carry no content worth sending to the model.
            continue
        result = summarizer(
            chunk,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,  # guard against chunks longer than the model's input limit
        )
        chunk_summaries.append(result[0]['summary_text'].strip())
    # Separate chunk summaries so sentences from adjacent chunks do not run together.
    return " ".join(chunk_summaries)

In [None]:
# ---- STEP 4: Relevance Matching ----
# Load pre-trained sentence embedding model

In [None]:
# Sentence-embedding model, loaded once at module level and reused by
# match_relevant_info_to_case for both of its encode() calls.
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def match_relevant_info_to_case(medical_summary, court_case_prompt):
    """
    Scores how closely a medical summary relates to a court case prompt.

    Both texts are embedded with the module-level sentence-embedding model and
    the two embeddings are compared by cosine similarity.

    :param medical_summary: Summarized medical information
    :param court_case_prompt: Court case description or prompt
    :return: Relevance score (cosine similarity) as a plain scalar
    """
    # Same encoding options for both texts: return tensors, not numpy arrays
    encode_options = {"convert_to_tensor": True}
    summary_vec = embedding_model.encode(medical_summary, **encode_options)
    prompt_vec = embedding_model.encode(court_case_prompt, **encode_options)

    # The similarity comes back wrapped in a tensor; unwrap it to a scalar
    return util.pytorch_cos_sim(summary_vec, prompt_vec).item()

In [None]:
# ---- STEP 5: Putting it All Together ----

In [None]:
def process_medical_pdf_for_court_case(pdf_path, court_case_prompt):
    """
    Main function to process a medical PDF for relevant information related to a court case.

    Runs the pipeline in order: text extraction, entity extraction,
    summarization, and similarity scoring of the summary against the prompt.
    When no text can be extracted (for example the file could not be read or
    contains no extractable text), the models are not run and an empty result
    with a relevance score of 0.0 is returned, because a similarity computed
    from an empty summary carries no information.

    :param pdf_path: Path to the medical PDF
    :param court_case_prompt: Court case prompt describing the relevant details
    :return: Dict with keys "summarized_text" (str), "medical_entities" (list)
             and "relevance_score" (float)
    """
    # Step 1: Extract text from the PDF
    pdf_text = extract_text_from_pdf(pdf_path)

    # Nothing usable was extracted: skip the models rather than score noise.
    if not pdf_text or not pdf_text.strip():
        return {
            "summarized_text": "",
            "medical_entities": [],
            "relevance_score": 0.0
        }

    # Step 2: Extract medical entities from the text
    medical_entities = extract_medical_entities(pdf_text)

    # Step 3: Summarize the extracted medical text
    summarized_text = summarize_medical_text(pdf_text)

    # Step 4: Match the summarized medical data to the court case prompt
    # (an empty summary cannot be meaningfully compared, so it scores 0.0)
    if summarized_text and summarized_text.strip():
        relevance_score = match_relevant_info_to_case(summarized_text, court_case_prompt)
    else:
        relevance_score = 0.0

    # Final output
    return {
        "summarized_text": summarized_text,
        "medical_entities": medical_entities,
        "relevance_score": relevance_score
    }


In [None]:
# ---- Example Usage ----
# Demo inputs: point pdf_path at the medical PDF to analyze before running this cell.
pdf_path = "path_to_medical_pdf.pdf"
court_case_prompt = "The patient is involved in a car accident case and is suffering from multiple fractures and PTSD."

# Execute the full pipeline on the demo inputs
result = process_medical_pdf_for_court_case(pdf_path, court_case_prompt)

# Report every part of the result under its heading, one item per line
print(
    "Summarized Medical Information:",
    result['summarized_text'],
    "\nExtracted Medical Entities:",
    result['medical_entities'],
    "\nRelevance Score to the Court Case:",
    result['relevance_score'],
    sep="\n",
)