# Scorers Unit Testing

### Imports

In [1]:
import sys
import json
from typing import List, Dict, Callable, Any, Union

### Setup
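Before running, make sure you are authenticated with the Hugging Face Hub (e.g. with an access token) so that any restricted (gated) models used by the scorers can be downloaded.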

In [2]:
# Relative pathing for import: put the parent directory on the module search
# path so the `scorers` module below can be imported.
# NOTE(review): assumes scorers lives one level above this notebook and that
# the kernel's working directory is the notebook's folder — confirm.
sys.path.append('../')
from scorers import (
    calculate_bertscore,
    fluency_score,
    semantic_similarity,
    nli_score,
    explanation_accuracy_score,
    claim_support_score,
    evidence_selection_score,
    coherence_score,
    fact_verification_score,
    explanation_completeness_score,
    evidence_relevance_score,
    claim_evidence_alignment_score
)

  from .autonotebook import tqdm as notebook_tqdm
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [3]:
# Load test data: a mapping from metric name to its list of test cases.
# The encoding is pinned to UTF-8 because open() otherwise falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in the file.
with open('scorers_unit_test.json', 'r', encoding='utf-8') as f:
    test_data: Dict[str, List[Dict[str, Any]]] = json.load(f)

### Run tests

In [4]:
def run_tests(metric_name: str, metric_function: Callable) -> None:
    """
    Print every test case registered for a metric together with its score.

    For each case stored under ``test_data[metric_name]`` the input fields are
    echoed, the metric is evaluated through ``execute_metric_function`` and the
    result is printed next to the case's ``expected_score`` label. Nothing is
    asserted; the comparison is left to the reader.

    Args:
        metric_name (str): Key of the metric in ``test_data``.
        metric_function (Callable): The scorer to evaluate.
    """
    case_separator = "=" * 40

    print(f"Testing {metric_name}")
    print("-" * 40)

    for case_number, case in enumerate(test_data[metric_name], start=1):
        print(f"Test case {case_number}:")

        # Echo the inputs; the expected label is reported after the result.
        for field, content in case.items():
            if field == 'expected_score':
                continue
            print(f"{field}:\n{content}\n")

        score = execute_metric_function(metric_name, metric_function, case)

        print("Result:")
        print(score)
        print(f"Expected score: {case['expected_score']}")
        print(f"\n{case_separator}\n")


def execute_metric_function(metric_name: str, metric_function: Callable, test_case: Dict[str, Any]) -> Union[float, Dict[str, float]]:
    """
    Call ``metric_function`` with the test-case fields that ``metric_name`` needs.

    Each known metric name maps to an ordered list of ``test_case`` keys; the
    corresponding values are passed to ``metric_function`` positionally.

    Args:
        metric_name (str): Name of the metric being tested.
        metric_function (Callable): The metric function to execute.
        test_case (Dict[str, Any]): The test case data.

    Returns:
        Union[float, Dict[str, float]]: Whatever ``metric_function`` returns.

    Raises:
        ValueError: If ``metric_name`` is not a known metric.
        KeyError: If ``test_case`` lacks a field the metric requires.
    """
    # Each entry: (metric names sharing a call shape, test-case keys in argument order).
    argument_layout = (
        (('calculate_bertscore',), ('candidate', 'reference')),
        (('fluency_score', 'coherence_score'), ('generated_explanation',)),
        (('semantic_similarity',), ('text1', 'text2')),
        (('nli_score',), ('premise', 'hypothesis')),
        (('explanation_accuracy_score', 'explanation_completeness_score'),
         ('generated_explanation', 'golden_explanation')),
        (('claim_support_score',), ('claim', 'generated_explanation')),
        (('evidence_selection_score',), ('predicted_evidence_ids', 'golden_evidence_ids')),
        (('fact_verification_score',), ('generated_explanation', 'evidence_texts')),
        (('evidence_relevance_score', 'claim_evidence_alignment_score'),
         ('claim', 'selected_evidence')),
    )

    for names, fields in argument_layout:
        if metric_name in names:
            positional = [test_case[field] for field in fields]
            return metric_function(*positional)

    raise ValueError(f"Unknown metric: {metric_name}")

In [5]:
# Test calculate_bertscore
# Each case pairs a `candidate` with a `reference`; the recorded results are
# dicts with Precision / Recall / F1, expected to rank Highest > Medium > Lowest.
run_tests('calculate_bertscore', calculate_bertscore)

Testing calculate_bertscore
----------------------------------------
Test case 1:
candidate:
EGFR mutations predict response to EGFR TKIs in lung cancer.

reference:
EGFR mutations indicate sensitivity to EGFR tyrosine kinase inhibitors in lung cancer.

Result:
{'Precision': 0.8548259735107422, 'Recall': 0.8211377859115601, 'F1': 0.8376433253288269}
Expected score: Highest


Test case 2:
candidate:
Some genetic changes in lung cancer can guide treatment choices.

reference:
EGFR mutations indicate sensitivity to EGFR tyrosine kinase inhibitors in lung cancer.

Result:
{'Precision': 0.6754329204559326, 'Recall': 0.5660730600357056, 'F1': 0.6159364581108093}
Expected score: Medium


Test case 3:
candidate:
Chemotherapy is a common treatment for many types of cancer.

reference:
EGFR mutations indicate sensitivity to EGFR tyrosine kinase inhibitors in lung cancer.

Result:
{'Precision': 0.570564329624176, 'Recall': 0.49837350845336914, 'F1': 0.532031238079071}
Expected score: Lowest




In [6]:
# Test fluency_score
# Each case supplies only a `generated_explanation`.
# NOTE(review): in the recorded run the "Low" case prints an integer 0 while
# the other cases print floats — presumably a fallback path inside
# fluency_score; confirm that is intended.
run_tests('fluency_score', fluency_score)

Testing fluency_score
----------------------------------------
Test case 1:
generated_explanation:
BRCA mutations increase cancer risk by impairing DNA repair. This makes cells more susceptible to genomic instability and malignant transformation.

Result:
0.7809737014770508
Expected score: High


Test case 2:
generated_explanation:
BRCA mutations make cancer more likely. They affect how cells fix DNA. This can lead to cancer.

Result:
0.6634459686279297
Expected score: Medium


Test case 3:
generated_explanation:
BRCA bad. DNA break. Cancer happen.

Result:
0
Expected score: Low




In [7]:
# Test semantic_similarity
# Each case compares `text1` with `text2`; scores are expected to rank
# Highest > Medium > Lowest as text2 drifts away from text1.
run_tests('semantic_similarity', semantic_similarity)

Testing semantic_similarity
----------------------------------------
Test case 1:
text1:
HER2 amplification is a predictive biomarker for trastuzumab in breast cancer.

text2:
HER2 overexpression indicates likely response to HER2-targeted therapies in breast cancer patients.

Result:
0.8255205154418945
Expected score: Highest


Test case 2:
text1:
HER2 amplification is a predictive biomarker for trastuzumab in breast cancer.

text2:
Some genetic changes in breast cancer can help guide treatment decisions.

Result:
0.5296799540519714
Expected score: Medium


Test case 3:
text1:
HER2 amplification is a predictive biomarker for trastuzumab in breast cancer.

text2:
Regular mammograms are important for early detection of breast cancer.

Result:
0.316358357667923
Expected score: Lowest




In [8]:
# Test nli_score
# Each case scores a `hypothesis` against a fixed `premise`.
# NOTE(review): recorded scores exceed 1 (e.g. ~2.04 for the "Highest" case),
# so the result is presumably not a probability — confirm the scale in scorers.
run_tests('nli_score', nli_score)

Testing nli_score
----------------------------------------
Test case 1:
premise:
KRAS mutations are associated with resistance to EGFR inhibitors in colorectal cancer.

hypothesis:
Patients with KRAS-mutant colorectal cancer are unlikely to benefit from cetuximab treatment.

Result:
2.0350900292396545
Expected score: Highest


Test case 2:
premise:
KRAS mutations are associated with resistance to EGFR inhibitors in colorectal cancer.

hypothesis:
Genetic testing is important in colorectal cancer treatment planning.

Result:
0.8370858430862427
Expected score: Medium


Test case 3:
premise:
KRAS mutations are associated with resistance to EGFR inhibitors in colorectal cancer.

hypothesis:
All colorectal cancer patients should receive EGFR inhibitors as first-line treatment.

Result:
0.5540798902511597
Expected score: Lowest




In [9]:
# Test explanation_accuracy_score
# Each case compares a `generated_explanation` with the `golden_explanation`;
# scores are expected to rank Highest > Medium > Lowest.
run_tests('explanation_accuracy_score', explanation_accuracy_score)

Testing explanation_accuracy_score
----------------------------------------
Test case 1:
generated_explanation:
PD-L1 expression correlates with response to PD-1/PD-L1 inhibitors.

golden_explanation:
PD-L1 expression levels are associated with increased likelihood of response to PD-1/PD-L1 checkpoint inhibitors.

Result:
0.914983868598938
Expected score: Highest


Test case 2:
generated_explanation:
PD-L1 is a protein that affects how some cancer treatments work.

golden_explanation:
PD-L1 expression levels are associated with increased likelihood of response to PD-1/PD-L1 checkpoint inhibitors.

Result:
0.7500367760658264
Expected score: Medium


Test case 3:
generated_explanation:
Chemotherapy kills rapidly dividing cells.

golden_explanation:
PD-L1 expression levels are associated with increased likelihood of response to PD-1/PD-L1 checkpoint inhibitors.

Result:
0.15234696865081787
Expected score: Lowest




In [10]:
# Test claim_support_score
# Each case scores how well a `generated_explanation` supports a fixed `claim`;
# scores are expected to rank Highest > Medium > Lowest.
run_tests('claim_support_score', claim_support_score)

Testing claim_support_score
----------------------------------------
Test case 1:
claim:
ALK inhibitors are effective in ALK-rearranged NSCLC.

generated_explanation:
ALK inhibitors specifically target the aberrant ALK fusion proteins, leading to tumor cell death in ALK-rearranged NSCLC.

Result:
0.8235062956809998
Expected score: Highest


Test case 2:
claim:
ALK inhibitors are effective in ALK-rearranged NSCLC.

generated_explanation:
Targeted therapies can be effective in some types of lung cancer with specific genetic changes.

Result:
0.390979528427124
Expected score: Medium


Test case 3:
claim:
ALK inhibitors are effective in ALK-rearranged NSCLC.

generated_explanation:
Smoking is a major risk factor for developing lung cancer.

Result:
0.14656774699687958
Expected score: Lowest




In [11]:
# Test evidence_selection_score
# Each case compares `predicted_evidence_ids` with `golden_evidence_ids`.
# NOTE(review): in the recorded run, predicted [1, 2, 3, 5] vs golden
# [1, 2, 3, 4] yields Precision 0.75 but Recall 1.0; a set-overlap recall would
# be 0.75. The run also emits a `_warn_prf` warning. Verify how the scorer
# builds its labels before trusting these numbers.
run_tests('evidence_selection_score', evidence_selection_score)

Testing evidence_selection_score
----------------------------------------
Test case 1:
predicted_evidence_ids:
[1, 2, 3, 4]

golden_evidence_ids:
[1, 2, 3, 4]

Result:
{'Precision': 1.0, 'Recall': 1.0, 'F1': 1.0}
Expected score: Highest


Test case 2:
predicted_evidence_ids:
[1, 2, 3, 5]

golden_evidence_ids:
[1, 2, 3, 4]

Result:
{'Precision': 0.75, 'Recall': 1.0, 'F1': 0.8571428571428571}
Expected score: Medium


Test case 3:
predicted_evidence_ids:
[5, 6, 7, 8]

golden_evidence_ids:
[1, 2, 3, 4]

Result:
{'Precision': 0.0, 'Recall': 0.0, 'F1': 0.0}
Expected score: Lowest




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
# Test coherence_score
# Each case supplies only a `generated_explanation`.
# NOTE(review): in the recorded run the "Low" case scores highest (~1.09, above
# the "High" case at ~0.94) and exceeds 1 — the metric does not match the
# expected ordering; investigate coherence_score or the test data.
run_tests('coherence_score', coherence_score)

Testing coherence_score
----------------------------------------
Test case 1:
generated_explanation:
BRAF V600E mutations activate the MAPK pathway. This leads to uncontrolled cell growth. BRAF inhibitors can block this aberrant signaling.

Result:
0.9444719851016998
Expected score: High


Test case 2:
generated_explanation:
BRAF mutations occur in melanoma. They affect cell signaling. Some drugs can target these mutations.

Result:
0.9068260490894318
Expected score: Medium


Test case 3:
generated_explanation:
BRAF important in cancer. Drugs exist. Melanoma treatment changing. Genetic testing helps.

Result:
1.0926899413267772
Expected score: Low




In [13]:
# Test fact_verification_score
# Each case checks a fixed `generated_explanation` against a list of `evidence_texts`.
# NOTE(review): in the recorded run the "Low" case (~0.900) scores slightly
# above the "Medium" case (~0.890), so the expected ordering is not met.
run_tests('fact_verification_score', fact_verification_score)

Testing fact_verification_score
----------------------------------------
Test case 1:
generated_explanation:
BRCA mutations increase risk of breast and ovarian cancers.

evidence_texts:
['BRCA1 and BRCA2 mutations are associated with higher risk of breast and ovarian cancers.', 'Individuals with BRCA mutations often undergo increased cancer screening.']

Result:
1.6268236935138702
Expected score: High


Test case 2:
generated_explanation:
BRCA mutations increase risk of breast and ovarian cancers.

evidence_texts:
['Genetic factors can influence cancer risk.', 'Regular check-ups are important for early cancer detection.']

Result:
0.8902662098407745
Expected score: Medium


Test case 3:
generated_explanation:
BRCA mutations increase risk of breast and ovarian cancers.

evidence_texts:
['Smoking is a major cause of lung cancer.', 'Exercise can help reduce the risk of some cancers.']

Result:
0.8996986448764801
Expected score: Low




In [14]:
# Test explanation_completeness_score
# Each case compares a `generated_explanation` with a longer `golden_explanation`
# to see how much of the golden content is covered.
run_tests('explanation_completeness_score', explanation_completeness_score)

Testing explanation_completeness_score
----------------------------------------
Test case 1:
generated_explanation:
MSI-H status in colorectal cancer predicts response to immunotherapy. It indicates defective DNA mismatch repair. This leads to increased neoantigen load and T-cell infiltration.

golden_explanation:
Microsatellite instability-high (MSI-H) status in colorectal cancer is predictive of response to immune checkpoint inhibitors. MSI-H tumors have defective DNA mismatch repair, resulting in increased mutational burden and neoantigen production. This promotes T-cell infiltration and susceptibility to immunotherapy.

Result:
0.7010677059491476
Expected score: High


Test case 2:
generated_explanation:
MSI-H status in colorectal cancer relates to immunotherapy response. It affects how the immune system interacts with the tumor.

golden_explanation:
Microsatellite instability-high (MSI-H) status in colorectal cancer is predictive of response to immune checkpoint inhibitors. MSI-H 

In [15]:
# Test evidence_relevance_score
# Each case scores a list of `selected_evidence` against a fixed `claim`;
# scores are expected to rank High > Medium > Low.
run_tests('evidence_relevance_score', evidence_relevance_score)

Testing evidence_relevance_score
----------------------------------------
Test case 1:
claim:
PARP inhibitors are effective in BRCA-mutated ovarian cancers.

selected_evidence:
['PARP inhibitors exploit synthetic lethality in BRCA-deficient cells.', 'Clinical trials show improved progression-free survival with PARP inhibitors in BRCA-mutated ovarian cancer.']

Result:
0.8759736120700836
Expected score: High


Test case 2:
claim:
PARP inhibitors are effective in BRCA-mutated ovarian cancers.

selected_evidence:
['Some targeted therapies show promise in ovarian cancer treatment.', 'Genetic testing is becoming more common in cancer diagnosis.']

Result:
0.48335570096969604
Expected score: Medium


Test case 3:
claim:
PARP inhibitors are effective in BRCA-mutated ovarian cancers.

selected_evidence:
["Regular pelvic exams are important for women's health.", 'Chemotherapy remains a standard treatment for many cancers.']

Result:
0.2987018823623657
Expected score: Low




In [16]:
# Test claim_evidence_alignment_score
# Each case scores a list of `selected_evidence` against a fixed `claim`.
# NOTE(review): recorded scores range from about -0.93 to 1.54, so the metric
# is not bounded to [0, 1] — confirm the intended scale. The test data also
# contains typos ("mutantations", "Mutations factors"); fix them in
# scorers_unit_test.json if they are unintentional.
run_tests('claim_evidence_alignment_score', claim_evidence_alignment_score)

Testing claim_evidence_alignment_score
----------------------------------------
Test case 1:
claim:
IDH mutations in glioma lead to 2-HG accumulation.

selected_evidence:
['IDH1/2 mutations result in neomorphic enzyme activity, producing 2-HG.', 'IDH mutantations in gliomas lead to accumulation of 2-HG levels.']

Result:
1.5359254479408264
Expected score: High


Test case 2:
claim:
IDH mutations in glioma lead to 2-HG accumulation.

selected_evidence:
['IDH mutations can be found in specific types of tumors.', 'Genetic alterations sometimes affect metabolism.']

Result:
0.924556702375412
Expected score: Medium


Test case 3:
claim:
IDH mutations in glioma lead to 2-HG accumulation.

selected_evidence:
['Gliomas are a type of brain cancer.', 'Mutations factors rarely contribute to cancer development.']

Result:
-0.9262970387935638
Expected score: Low


