### Install Required Packages

In [None]:
!pip install rouge_score
!pip install bert-score
!pip install nltk
!pip install python-docx

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_score
from rouge_score import rouge_scorer
from scipy.spatial.distance import jensenshannon
from difflib import SequenceMatcher
import numpy as np
from docx import Document
from bert_score import score as bert_score
#from moverscore import word_mover_score
import nltk
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score

In [None]:
# Download the NLTK data packages this notebook relies on.
# 'punkt_tab' / 'punkt': tokenizer data (presumably what word_tokenize loads;
# which of the two is needed likely depends on the installed NLTK version — verify).
nltk.download('punkt_tab')
nltk.download('punkt')
# 'wordnet' / 'omw-1.4': WordNet data (presumably used by meteor_score for
# synonym matching — verify).
nltk.download('wordnet')
nltk.download('omw-1.4')

### Example: Inclusion/Exclusion Criteria
Ground truth is pulled from protocolv1.docx

Generated text is created using ChatGPT

In [None]:
# Ground truth and generated criteria (combined into one string each for comparison).
# `ground_truth` is the reference text and `generated` is the candidate text; both
# strings are reused by the ROUGE, METEOR, JSD, SequenceMatcher and BERTScore cells below.
ground_truth = """
Inclusion Criteria:
- Adults aged 18 to 75 years
- Diagnosis of SLE as per ACR/EULAR 2019 classification criteria
- SLEDAI-2K score ≥6 at screening
- Positive for ANA (antinuclear antibodies) or anti-dsDNA at screening
- Receiving stable background therapy for SLE, including corticosteroids
  (≤10 mg/day prednisone or equivalent), antimalarials, and/or immunosuppressants
  for ≥12 weeks
- Willing and able to provide informed consent and comply with study procedures

Exclusion Criteria:
- Active severe lupus nephritis or CNS lupus
- History of severe allergic reactions to monoclonal antibodies
- Active or chronic infections, including tuberculosis, hepatitis B or C, HIV
- Use of biologic therapy within 12 weeks of screening
- Pregnancy or breastfeeding
- Any other medical condition that, in the investigator’s opinion, would
  compromise patient safety or data integrity
"""

generated = """
Inclusion Criteria:
- Age between 18 and 75 years
- Confirmed diagnosis of Systemic Lupus Erythematosus (SLE) according to
  standard criteria
- Moderate to severe disease activity, with SLEDAI-2K score of at least 6
- Positive test for antinuclear antibodies (ANA) or anti-dsDNA
- On stable treatment regimen for lupus for at least 12 weeks
- Ability to provide informed consent

Exclusion Criteria:
- Active lupus affecting the kidneys or central nervous system
- History of allergic reactions to antibody-based therapies
- Ongoing infections such as tuberculosis, hepatitis B/C, or HIV
- Recent use of biologic treatments (within last 3 months)
- Pregnant or nursing women
- Any medical issue that could pose risk or affect study validity
"""

### Calculate ROUGE Score

In [None]:
# ROUGE-1 and ROUGE-L with stemming enabled
rouge = rouge_scorer.RougeScorer(
    rouge_types=['rouge1', 'rougeL'],
    use_stemmer=True,
)
# The ground truth is the target; the generated text is the prediction being scored
rouge_scores = rouge.score(target=ground_truth, prediction=generated)

In [None]:
# Show the ROUGE results computed in the previous cell
print(rouge_scores)

### Calculate METEOR Score

In [None]:
# Tokenize the candidate and the (single) reference; the reference tokens are
# wrapped in a list because meteor_score takes a collection of references
hypothesis = word_tokenize(generated)
reference = [word_tokenize(ground_truth)]

# Score the generated criteria against the ground truth
score = meteor_score(references=reference, hypothesis=hypothesis)
print(f"METEOR Score: {score:.4f}")

### Calculate JSD
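0 means the two word-count distributions are identical; larger values mean they diverge more (the maximum is 1 when base-2 logarithms are used).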

In [None]:
# Jensen-Shannon Divergence between the bag-of-words counts of the two texts
vectorizer = CountVectorizer().fit([ground_truth, generated])
X = vectorizer.transform([ground_truth, generated]).toarray()
# scipy's jensenshannon() returns the Jensen-Shannon *distance* (the square root
# of the divergence) and uses the natural logarithm unless `base` is given.
# base=2 bounds the result to [0, 1]; squaring converts the distance into the
# divergence that this section reports. Raw counts are fine as input because
# scipy normalises vectors that do not sum to 1.
jsd = jensenshannon(X[0], X[1], base=2) ** 2

In [None]:
# Show the Jensen-Shannon value computed in the previous cell
print(jsd)

### Calculate Levenshtein Similarity Ratio
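0 to 1, where 0 is completely different and 1 is identical. Note: the value comes from `difflib.SequenceMatcher.ratio()`, which measures matching character blocks; it approximates, but is not, a true Levenshtein edit-distance ratio.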

In [None]:
# Character-level similarity ratio in [0, 1] from difflib.SequenceMatcher.
# NOTE: ratio() is 2*M/T over matching blocks, which is similar in spirit to,
# but not the same as, a Levenshtein edit-distance ratio.
# autojunk=False disables difflib's "popular element" heuristic: for sequences
# of 200 or more items it treats frequently occurring characters as junk, which
# distorts the ratio for texts as long as these.
lev_ratio = SequenceMatcher(None, ground_truth, generated, autojunk=False).ratio()

In [None]:
# Show the SequenceMatcher similarity ratio computed in the previous cell
print(lev_ratio)

### BERT Score

Revisiting inclusion/exclusion example

In [None]:
# One candidate/reference pair: the generated criteria scored against the ground truth
candidates, references = [generated], [ground_truth]

# BERTScore — each returned value holds a single entry here, read out with .item()
P, R, F1 = bert_score(cands=candidates, refs=references, lang="en", verbose=True)
print(f"BERTScore - Precision: {P.item():.4f}, Recall: {R.item():.4f}, F1: {F1.item():.4f}")


Calculating BERTScore between the generated informed consent document (icdv1) and the gold standard (Vanderbilt)

In [None]:
# Helper function to extract text from a DOCX file
def read_docx(file_path):
    """Return the text of a .docx file with one paragraph per line.

    Blank paragraphs are kept (they show up as empty lines); only leading and
    trailing whitespace of the combined text is removed.
    """
    paragraph_texts = []
    for paragraph in Document(file_path).paragraphs:
        paragraph_texts.append(paragraph.text)
    combined = "\n".join(paragraph_texts)
    return combined.strip()

# Load the documents
doc1_path = "VanderbiltICD_SLE_Cognitive.docx"  # gold-standard consent document
doc2_path = "icdv1.docx"                        # generated consent document

doc1_text = read_docx(doc1_path)
doc2_text = read_docx(doc2_path)

# Prepare for BERTScore.
# The generated document is the candidate and the gold standard is the
# reference; with the two swapped, the reported precision and recall trade places.
candidates = [doc2_text]   # generated text (icdv1)
references = [doc1_text]   # ground truth (Vanderbilt)

# NOTE(review): whole documents are probably longer than the encoder's maximum
# sequence length — confirm how bert_score treats over-length inputs before
# relying on these numbers.
# Compute BERTScore
P, R, F1 = bert_score(candidates, references, lang="en", verbose=True)

print(f"\nBERTScore Results:")
print(f"  Precision: {P.item():.4f}")
print(f"  Recall:    {R.item():.4f}")
print(f"  F1 Score:  {F1.item():.4f}")


### ROUGE-L

In [None]:
# Candidate summary text; it is scored with ROUGE against the full protocol later in this cell.
background_summary = """
Systemic Lupus Erythematosus (SLE) is a chronic autoimmune disease marked by systemic inflammation and multi-organ involvement. While current treatments—such as corticosteroids, antimalarials,
and immunosuppressants—offer some benefit, there remains a significant unmet need for safer, more effective therapies. Ilizomab is a novel monoclonal antibody targeting [specific pathway], which
has shown promise in preclinical models by reducing inflammatory cytokines and autoantibody production.

This Phase 2, multicenter, randomized, double-blind, placebo-controlled trial will evaluate the safety, efficacy, and pharmacokinetics of Ilizomab in approximately 150 adult patients with
moderate to severe SLE. Participants will be randomized in a 2:1 ratio to receive Ilizomab or placebo over a 24-week treatment period, followed by 12 weeks of post-treatment follow-up.

The primary endpoint is the proportion of patients achieving an SRI-4 response at Week 24. Secondary endpoints include changes in SLEDAI-2K scores, tapering of corticosteroids, biomarker
trends (e.g., anti-dsDNA, complement levels), and patient-reported outcomes. Safety assessments include adverse event monitoring, immunogenicity evaluations, and lab testing.

Eligibility criteria require adult patients aged 18–75 with confirmed SLE per ACR/EULAR 2019 criteria and SLEDAI-2K ≥6. Exclusion criteria include severe lupus nephritis or CNS involvement,
active infections, recent biologic therapy, or pregnancy.
"""

# Load and clean text from docx
# NOTE: this redefines the read_docx helper from the BERTScore section; unlike
# that version, this one strips every paragraph and drops blank ones.
def read_docx(path):
    """Return the non-blank paragraphs of a .docx file, stripped and joined by newlines."""
    cleaned = []
    for paragraph in Document(path).paragraphs:
        text = paragraph.text.strip()
        if text:
            cleaned.append(text)
    return "\n".join(cleaned)

# The complete protocol document serves as the reference text
source_text = read_docx("protocolv1.docx")

# ROUGE-1, ROUGE-2 and ROUGE-L, with stemming to reduce surface variation impact
scorer = rouge_scorer.RougeScorer(
    rouge_types=['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)
scores = scorer.score(target=source_text, prediction=background_summary)

# Display results, one line per ROUGE variant
print("🔍 ROUGE Evaluation (using protocol as reference)")
for metric in scores:
    score = scores[metric]
    print(f"{metric.upper()} — Precision: {score.precision:.4f}, Recall: {score.recall:.4f}, F1: {score.fmeasure:.4f}")


### Entailment Metrics

Install required libraries

Note: May require restarting notebook after install

In [None]:
!pip install git+https://github.com/tingofurro/summac.git


In [None]:
!pip install sentencepiece nltk transformers


In [None]:
!pip install --upgrade torch torchvision torchaudio


In [None]:
def read_docx(path):
    """Read a .docx file and return its non-empty paragraphs, stripped, one per line.

    NOTE: read_docx is defined more than once in this notebook; this definition
    behaves like the one in the ROUGE-L section (blank paragraphs are dropped).
    """
    stripped = (p.text.strip() for p in Document(path).paragraphs)
    return "\n".join(line for line in stripped if line)

# Read the protocol and the statistical analysis plan (SAP) from their .docx files
protocol_text = read_docx(path="protocolv1.docx")
sap_text = read_docx(path="sapv1.docx")


In [None]:
from summac.model_summac import SummaCConv
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Instantiate the model.
# start_file="default" loads the published convolution weights; without it the
# convolution layer is not loaded from any checkpoint, so the scores it
# produces are not meaningful. The default weights require bins="percentile".
# NOTE(review): with start_file="default" the library fetches the weights file
# into the working directory on first use — confirm this works in your environment.
model = SummaCConv(
    models=["vitc"],
    bins="percentile",
    granularity="sentence",
    nli_labels="e",
    device=device,
    start_file="default",
    agg="mean",
)

# Score the SAP vs protocol (protocol passed first, SAP second)
results = model.score([protocol_text], [sap_text])

# Show the overall entailment score(s); one entry per document pair
print(f"SummaC Consistency Score: {results['scores']}")



In [None]:
# Dump the complete result object returned by model.score(...) for inspection
print(results)

Source = checklist

hypothesis = protocol

generate question from source => is_answer = False

### Install DeepEval

In [None]:
!pip install deepeval

### Set OpenAI API Key (Or other model)

In [None]:
import os
from getpass import getpass

# Keep the key out of the notebook source. An OPENAI_API_KEY that is already
# set in the environment is left untouched (previously it was overwritten with
# an empty string); otherwise the key is requested interactively without echo.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

# Remove Cell

In [None]:
!pip install posthog

In [None]:
from deepeval.metrics import BiasMetric
from deepeval.test_case import LLMTestCase
from deepeval.evaluate import evaluate

# The triple-quoted string below is a disabled template, not executed code: it
# shows the test-case / metric / evaluate pattern that the "Unbiased Example"
# and "Biased Example" cells run for real once `doc_text` has been defined.
"""
# Setup test case
test_case = LLMTestCase(
    input="Bias evaluation",  # Not needed for bias detection
    actual_output=doc_text,
    expected_output="A document free of gender or socioeconomic bias"
)

# Define the metric
bias_metric = BiasMetric()

# Run evaluation
evaluate([test_case], [bias_metric])
"""

### Unbiased Example

In [None]:
# Document to evaluate: informed-consent text used as the unbiased example.
# It is passed to the bias metric as `actual_output` further down in this cell.
doc_text = """
All participants will be required to provide written informed consent prior to
any study-related procedures. The informed consent process will be conducted by
qualified study personnel in a private setting, allowing sufficient time for
participants to review the consent document, ask questions, and discuss
participation with family members or other advisors if desired.

The consent form will describe the purpose of the study, the procedures involved,
the duration of participation, potential risks and benefits, confidentiality of
personal health information, and the voluntary nature of participation.
Participants will be informed that they may withdraw from the study at any time
without penalty or loss of benefits to which they are otherwise entitled.

Special attention will be given to ensuring comprehension among participants
with limited literacy or non-native English speakers. Translated consent forms
will be provided when necessary, and interpreters will be available upon request.
Additional safeguards will be applied for individuals with impaired
decision-making capacity in accordance with institutional and regulatory guidelines.

Documentation of informed consent will be maintained in each participant’s
study file. A copy of the signed consent form will be provided to the participant.
"""

# Bias metric used to judge the document
bias_metric = BiasMetric()

# Wrap the document in a test case; the document text is the output under evaluation
test_case = LLMTestCase(
    actual_output=doc_text,
    expected_output="A document free of gender or socioeconomic bias",
    input="Bias evaluation",  # placeholder prompt; the text being judged is actual_output
)

# Run evaluation of the single test case with the single metric
evaluate([test_case], [bias_metric])

### Biased Example

In [None]:
# Document to evaluate: informed-consent text used as the biased example.
# It is passed to the bias metric as `actual_output` further down in this cell.
doc_text = """
Eligible participants will be provided with a consent form to review and sign
prior to enrollment. The form will explain the purpose of the study, procedures
involved, and potential risks and benefits. As most participants are expected
to be working adults with access to transportation and childcare, the study
visits are scheduled during weekday business hours and cannot be adjusted.

Female participants of childbearing age will be required to undergo pregnancy
testing and agree to use approved forms of contraception throughout the study.
Male participants will not be required to modify their behavior, as their
involvement presents minimal reproductive risk.

Compensation will be provided in the form of direct deposit only. Participants
should have a checking account in order to receive payment. Additional resources
or assistance with financial access will not be offered.

Participants are expected to complete all study visits as outlined. Failure to
do so may result in exclusion from the study or withholding of compensation.
"""

# Build the test case around the document defined above
case_fields = dict(
    input="Bias evaluation",  # placeholder prompt; the text being judged is actual_output
    actual_output=doc_text,
    expected_output="A document free of gender or socioeconomic bias",
)
test_case = LLMTestCase(**case_fields)

# Metric that scores the document for bias
bias_metric = BiasMetric()

# Evaluate the one test case against the one metric
evaluate([test_case], [bias_metric])

### Compliance CheckList

In [None]:
from docx import Document
from deepeval.test_case import LLMTestCaseParams, LLMTestCase
from deepeval.metrics.dag import (
    DeepAcyclicGraph,
    TaskNode,
    BinaryJudgementNode,
    VerdictNode,
)

In [None]:
# Step 1: Load your protocol from .docx
def load_docx(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Paragraph text is kept as-is: nothing is stripped and blank paragraphs
    are retained as empty lines.
    """
    paragraphs = Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

In [None]:
# Read the protocol whose contents will be checked
protocol_text = load_docx(file_path="protocolv1.docx")

# Step 2: Create the test case; the protocol text is the output under evaluation
test_case = LLMTestCase(
    actual_output=protocol_text,
    input="Check if the clinical trial includes the specified sections",
)

def make_binary_check(criteria_text):
    """Build a yes/no judgement node for one compliance question.

    Args:
        criteria_text: The question the judge answers about the protocol.

    Returns:
        A BinaryJudgementNode whose False verdict scores 0 and whose True
        verdict scores 10.
    """
    return BinaryJudgementNode(
        criteria=criteria_text,
        children=[
            VerdictNode(verdict=False, score=0),
            # VerdictNode scores are on a 0-10 scale, so a pass must be 10;
            # score=1 would surface as a final metric score of only 0.1.
            VerdictNode(verdict=True, score=10),
        ]
    )

# One yes/no judgement node per required protocol section
compliance_checks = [
    make_binary_check(question)
    for question in (
        "Does the protocol include the trial phase?",
        "Does the protocol include a study design section?",
        "Does the protocol include a primary objective section?",
        "Does the protocol include a secondary objective section?",
        "Does the protocol include a primary endpoint section?",
        "Does the protocol include a secondary endpoint section?",
        "Does the protocol include an inclusion criteria section?",
        "Does the protocol include an exclusion criteria section?",
        "Does the protocol include a statistical considerations section?",
    )
]

# Root task node: hands the protocol text (the test case's actual output) to every check.
# NOTE(review): all nine checks hang off this single node — confirm how DAGMetric
# combines several leaf verdicts into its one final score.
compliance_task_node = TaskNode(
    instructions="Check if the clinical trial includes the specified sections",
    output_label="Protocol Content",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    children=compliance_checks,
)

# Graph with the task node as its only root
dag = DeepAcyclicGraph(root_nodes=[compliance_task_node])


In [None]:
from deepeval.metrics import DAGMetric

# Wrap the compliance graph in a metric that also reports its reasoning
format_correctness = DAGMetric(
    name="Contains Section",
    dag=dag,
    include_reason=True,
    verbose_mode=True,
)

# Evaluate the protocol test case, then show the resulting score
format_correctness.measure(test_case)
print(format_correctness.score)

### Summarization

In [None]:
# Summary text to be judged; the next cell passes it as `actual_output` of the
# summarization test case, with the protocol text as `input`.
background_summary = """
This Phase 2 clinical trial evaluates Ilizomab, a novel monoclonal antibody,
in adult patients with moderate to severe Systemic Lupus Erythematosus (SLE).
The study is a randomized, double-blind, placebo-controlled trial involving
approximately 150 participants, aiming to assess the safety, efficacy, and
pharmacokinetics of Ilizomab. Ilizomab targets a specific immune pathway
implicated in lupus, showing promise in preclinical studies by modulating
inflammatory cytokines and reducing autoantibody production. The trial spans
24 weeks of treatment followed by 12 weeks of follow-up, with primary outcomes
measured at Week 24 using the SLE Responder Index (SRI-4). Secondary objectives
include evaluating changes in disease activity, biomarker levels, and
patient-reported outcomes. Safety, immunogenicity, and adverse event rates
will be closely monitored throughout the study.
"""


# Load and clean text from docx
def read_docx(path):
    """Return the stripped, non-empty paragraph texts of a .docx file joined by newlines."""
    document = Document(path)
    kept_lines = [text for text in (p.text.strip() for p in document.paragraphs) if text]
    return "\n".join(kept_lines)

# Load the full protocol as source.
# The same file is also read into `source_text` in the ROUGE-L section; it is
# re-read here, so this cell only needs `read_docx` to be defined.
source_text = read_docx("protocolv1.docx")

In [None]:
from deepeval import evaluate
from deepeval.test_case import LLMTestCase
from deepeval.metrics import SummarizationMetric

# Instantiate the summarization metric
summarization_metric = SummarizationMetric(
    n=20,
    truths_extraction_limit=20,
    verbose_mode=True,
)

# Create a DeepEval test case: the protocol is the input, the summary is the output being judged
test_case = LLMTestCase(input=source_text, actual_output=background_summary)

# Run the evaluation on the test case
eval_result = evaluate([test_case], [summarization_metric])