In [1]:
# Log in to the Hugging Face Hub from the shell.
# NOTE(review): the captured output below ends inside getpass() while prompting for
# the token, so the `!` shell apparently cannot read hidden input here -- confirm,
# and consider a notebook-native login flow instead.
!huggingface-cli login

Traceback (most recent call last):
  File "/Users/rateria/anaconda3/envs/cs-5787/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
  File "/Users/rateria/anaconda3/envs/cs-5787/lib/python3.9/site-packages/huggingface_hub/commands/huggingface_cli.py", line 57, in main
    service.run()
  File "/Users/rateria/anaconda3/envs/cs-5787/lib/python3.9/site-packages/huggingface_hub/commands/user.py", line 153, in run
    login(
  File "/Users/rateria/anaconda3/envs/cs-5787/lib/python3.9/site-packages/huggingface_hub/_login.py", line 123, in login
    interpreter_login(new_session=new_session, write_permission=write_permission)
  File "/Users/rateria/anaconda3/envs/cs-5787/lib/python3.9/site-packages/huggingface_hub/_login.py", line 275, in interpreter_login
    token = getpass("Enter your token (input will not be visible): ")
  File "/Users/rateria/anaconda3/envs/cs-5787/lib/python3.9/getpass.py", line 77, in unix_getpass
    passwd = _raw_input(prompt, stream, input=input)
  File 

In [None]:
# %pip installs into the environment of the running kernel, whereas `!pip` runs
# whichever pip is first on the shell PATH and may target another interpreter.
%pip install sacremoses

In [None]:
# Mirror the Me-LLaMA 1.0.0 files from PhysioNet: -r recurses, -N skips files that
# are not newer than the local copy, -c resumes partial downloads, -np never
# ascends to the parent directory.
# NOTE(review): --ask-password prompts interactively and the account name is
# hard-coded -- confirm this works from a notebook shell and is safe to share.
!wget -r -N -c -np --user harshini3003 --ask-password https://physionet.org/files/me-llama/1.0.0

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sentence_transformers.models import Transformer, Pooling
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ndcg_score
from tqdm import tqdm

# Prefer the GPU when torch can see one; fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Claim-triplet dataset (presumably on a mounted Google Drive -- verify the mount).
csv_file_path = "/content/drive/MyDrive/CS 5787 - Deep Learning/Project - Deep Learning/GitHub/generated_claim_triplets_with_topics.csv"
data = pd.read_csv(csv_file_path)

# Stop early unless every column used by the evaluation is present.
required_columns = {"Evidence", "Supports", "Contradicts", "Ambiguous"}
if not required_columns <= set(data.columns):
    raise ValueError(f"CSV file must contain the following columns: {required_columns}")

# Display name -> model identifier loaded by test_model.
models_dict = dict(
    PubMedBERT="pritamdeka/PubMedBERT-mnli-snli-scinli-stsb",
    SciBERT="pritamdeka/S-Scibert-snli-multinli-stsb",
    BioBERT="pritamdeka/S-BioBert-snli-multinli-stsb",
    BioGPT="microsoft/biogpt",
    BlueBERT="pritamdeka/S-Bluebert-snli-multinli-stsb",
)

# Me-LLaMA
def generate_llama_embeddings(text, tokenizer, model):
    """Embed ``text`` by mean-pooling the model's final-layer hidden states.

    Args:
        text: Input handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)`` and
            can be unpacked as keyword arguments for ``model``.
        model: Callable returning an output object exposing either
            ``last_hidden_state`` or ``hidden_states``.

    Returns:
        The final-layer hidden states averaged over dim=1 (the token axis).

    Raises:
        AttributeError: If the model output exposes neither attribute.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        # Ask for every layer's hidden states: outputs of causal-LM heads carry
        # `hidden_states` but no `last_hidden_state`.
        outputs = model(**inputs, output_hidden_states=True)

    hidden = getattr(outputs, "last_hidden_state", None)
    if hidden is None:
        all_states = getattr(outputs, "hidden_states", None)
        if not all_states:
            raise AttributeError(
                "model output has neither 'last_hidden_state' nor 'hidden_states'"
            )
        hidden = all_states[-1]

    # Mean pooling over tokens.
    # NOTE(review): padding positions are averaged in too, which only matters
    # when `text` is a padded batch -- confirm callers pass a single string.
    return hidden.mean(dim=1)

def get_top_k(evidence_embedding, comparison_embeddings, comparison_texts, k=3):
    """Return the ``k`` texts whose embeddings are most similar to the evidence.

    Args:
        evidence_embedding: Embedding of a single evidence sentence.
        comparison_embeddings: Embeddings of the candidate texts, one row each.
        comparison_texts: Candidate texts aligned by position with
            ``comparison_embeddings`` (list or pandas Series).
        k: Maximum number of results.

    Returns:
        A list of ``{"Text": ..., "Score": ...}`` dicts, best match first.
    """
    # reshape(-1) rather than squeeze(): with a single candidate squeeze() yields
    # a 0-d tensor whose tolist() is a bare float, which has no len().
    similarities = util.pytorch_cos_sim(evidence_embedding, comparison_embeddings).reshape(-1).tolist()
    # Materialise to a list so lookups are positional even for a pandas Series
    # whose index is not 0..n-1.
    texts = list(comparison_texts)
    ranked = sorted(range(len(similarities)), key=similarities.__getitem__, reverse=True)[:k]
    return [{"Text": texts[i], "Score": similarities[i]} for i in ranked]

def compute_ndcg(truth, predictions, k=5):
    """Score one ranking with ``ndcg_score`` at cut-off ``k``.

    Inputs with more than one dimension are squeezed first; each is then
    wrapped in a list so ``ndcg_score`` sees a single query.
    """
    def _squeeze_if_nested(values):
        # Inputs that are already at most one-dimensional are passed through untouched.
        as_array = np.array(values)
        return as_array.squeeze() if as_array.ndim > 1 else values

    predictions = _squeeze_if_nested(predictions)
    truth = _squeeze_if_nested(truth)
    return ndcg_score([truth], [predictions], k=k)

# Test Function
def test_model(models, test_cases, top_k=3):
    """Rank the CSV claims against every test case's evidence, once per model.

    For each model the "Supports", "Contradicts" and "Ambiguous" columns of the
    module-level ``data`` frame are embedded once, then every test case's
    "Evidence" sentence is compared with them by cosine similarity.

    Args:
        models: Mapping of display name -> model path or identifier.
        test_cases: Iterable of dicts; only the "Evidence" key is read.
        top_k: Number of best matches kept per category and the NDCG cut-off.

    Returns:
        ``(test_results, top_k_results, ndcg_df)``. ``test_results`` is always an
        empty dict and is kept only so existing callers can still unpack three
        values. ``top_k_results`` maps model name -> list of per-case dicts.
        ``ndcg_df`` holds one row per (model, test case).
    """
    test_results = {}
    top_k_results = {}
    ndcg_results = []

    categories = ("Supports", "Contradicts", "Ambiguous")
    # Relevance label given to every claim of a category.
    # NOTE(review): each NDCG call below receives a constant relevance vector,
    # so the score cannot reflect how well the claims are ranked -- confirm
    # whether a pooled ranking over all three categories was intended.
    relevance = {"Supports": 1, "Contradicts": 0, "Ambiguous": 0.5}

    for model_name, model_path in tqdm(models.items(), desc="Testing Models"):
        print(f"Testing model: {model_name}")

        if model_name == "BioGPT":
            # BioGPT is assembled by hand from a transformer plus mean pooling.
            word_embedding_model = Transformer(model_path, max_seq_length=512)
            pooling_model = Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
            model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
        else:
            model = SentenceTransformer(model_path).to(device)

        # The corpus embeddings do not depend on the test case, so encode each
        # column once per model instead of once per test case.
        corpus_embeddings = {
            category: model.encode(data[category].tolist(), convert_to_tensor=True).to(device)
            for category in categories
        }

        test_case_results = []
        for test_case in tqdm(test_cases, desc=f"Processing Test Cases for {model_name}"):
            evidence_embedding = model.encode([test_case["Evidence"]], convert_to_tensor=True).to(device)

            ndcg_row = {"Model": model_name}
            case_result = {"Evidence": test_case["Evidence"]}
            for category in categories:
                embeddings = corpus_embeddings[category]

                # Compute similarity (kept 1-D even when the column has one row).
                similarities = util.pytorch_cos_sim(evidence_embedding, embeddings).reshape(-1).tolist()

                # NDCG calculation
                truth = [relevance[category]] * len(similarities)
                ndcg_row[f"{category} NDCG"] = compute_ndcg(np.array(truth), np.array(similarities), k=top_k)

                # Store results
                case_result[f"Top K {category}"] = get_top_k(evidence_embedding, embeddings, data[category], top_k)

            ndcg_results.append(ndcg_row)
            test_case_results.append(case_result)

        top_k_results[model_name] = test_case_results

    ndcg_df = pd.DataFrame(ndcg_results)
    print("\nNDCG Results:")
    print(ndcg_df)
    return test_results, top_k_results, ndcg_df

# Hand-written evaluation cases. test_model only reads the "Evidence" entry of each
# dict; the "Supports" / "Contradicts" / "Ambiguous" entries are not read by it.
test_cases = [
   {
        "Evidence": "Aspirin-exacerbated respiratory disease (AERD) is linked to increased leukotriene production due to abnormal arachidonic acid metabolism.",
        "Supports": "AERD management includes avoiding NSAIDs and using leukotriene receptor antagonists.",
        "Contradicts": "AERD is unrelated to leukotriene production and is solely caused by environmental factors.",
        "Ambiguous": "The role of desensitization therapy in managing AERD remains a topic of debate among clinicians."
    },
    {
        "Evidence": "The diagnosis of anaphylaxis relies mainly on clinical evaluation and patient history.",
        "Supports": "Clinical evaluation is critical for timely diagnosis and management of anaphylaxis.",
        "Contradicts": "Anaphylaxis diagnosis cannot be made without laboratory tests like serum tryptase.",
        "Ambiguous": "The importance of laboratory testing in anaphylaxis diagnosis is often debated in the medical community."
    },
    {
        "Evidence": "Asthma is a chronic inflammatory respiratory condition influenced by both genetic and environmental factors.",
        "Supports": "Recent studies have identified genes like IL4 and IL13 as contributors to asthma susceptibility.",
        "Contradicts": "Asthma is purely environmental and does not involve any genetic predisposition.",
        "Ambiguous": "The interplay of genetic and environmental factors in asthma varies across patient populations."
    },
    {
        "Evidence": "Nocturnal asthma is characterized by worsening symptoms at night due to circadian influences and airway inflammation.",
        "Supports": "Management includes inhaled corticosteroids and environmental modifications to improve sleep quality.",
        "Contradicts": "Nocturnal asthma can be treated without pharmacological interventions or environmental changes.",
        "Ambiguous": "The extent to which nocturnal asthma is influenced by circadian rhythms versus external triggers is not fully understood."
    },
    {
        "Evidence": "Exercise-induced anaphylaxis (EIA) is triggered by physical activity, often influenced by co-factors like food intake and environmental conditions.",
        "Supports": "Avoiding known triggers is critical in managing EIA and preventing severe reactions.",
        "Contradicts": "EIA does not require trigger avoidance and is unaffected by dietary or environmental factors.",
        "Ambiguous": "The efficacy of antihistamines in managing EIA varies widely among patients."
    },
    {
        "Evidence": "Precision medicine approaches are essential for tailoring asthma treatments to individual genetic and molecular profiles.",
        "Supports": "Personalized therapies based on genetic factors improve asthma outcomes.",
        "Contradicts": "Standard treatments are sufficient for asthma management, making precision medicine unnecessary.",
        "Ambiguous": "The practicality of implementing precision medicine in asthma care depends on accessibility and cost factors."
    },
    {
        "Evidence": "Management of anaphylaxis includes both immediate treatment with epinephrine and long-term prevention strategies.",
        "Supports": "Patient education on allergen avoidance is a key component of anaphylaxis management.",
        "Contradicts": "Long-term strategies are unnecessary since anaphylaxis management focuses solely on acute treatment.",
        "Ambiguous": "The effectiveness of long-term anaphylaxis prevention strategies varies across different patient populations."
    }
]


# Evaluate every model on the hand-written cases.
test_results_df, top_k_results, ndcg_df = test_model(models_dict, test_cases, top_k=5)

# Dump the best-matching claims per model, test case and category.
for model, results in top_k_results.items():
    print(f"\nModel: {model}")
    for case in results:
        print(f"\nEvidence: {case['Evidence']}")
        for heading in ("Top K Supports", "Top K Contradicts", "Top K Ambiguous"):
            print(f"{heading}:")
            for item in case[heading]:
                print(f"  {item['Text']} (Score: {item['Score']:.4f})")

# Grouped bar chart: one group per model, one bar per NDCG column.
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
ndcg_long = ndcg_df.melt(id_vars="Model")
sns.barplot(data=ndcg_long, x="Model", y="value", hue="variable")
plt.title("NDCG Scores Across Models")
plt.xlabel("Model")
plt.ylabel("NDCG Score")
plt.legend(title="Similarity Type", loc="upper right")
plt.xticks(rotation=45)
plt.show()


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Set device: GPU when torch can see one, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load CSV
csv_file_path = "/content/drive/MyDrive/CS 5787 - Deep Learning/Project - Deep Learning/GitHub/generated_claim_triplets_with_topics.csv"
data = pd.read_csv(csv_file_path)

# Stop early unless every column used below is present.
required_columns = {"Evidence", "Supports", "Contradicts", "Ambiguous"}
if not required_columns <= set(data.columns):
    raise ValueError(f"CSV file must contain the following columns: {required_columns}")

# Define models: display name -> model identifier.
models_dict = dict(
    PubMedBERT="pritamdeka/PubMedBERT-mnli-snli-scinli-stsb",
    SciBERT="pritamdeka/S-Scibert-snli-multinli-stsb",
    BioBERT="pritamdeka/S-BioBert-snli-multinli-stsb",
    BioGPT="microsoft/biogpt",
    BlueBERT="pritamdeka/S-Bluebert-snli-multinli-stsb",
)

# Function to retrieve top-k sentences
def get_top_k(evidence_embedding, comparison_embeddings, comparison_texts, k=3):
    """Return the ``k`` texts whose embeddings are most similar to the evidence.

    Args:
        evidence_embedding: Embedding of a single evidence sentence.
        comparison_embeddings: Embeddings of the candidate texts, one row each.
        comparison_texts: Candidate texts aligned by position with
            ``comparison_embeddings`` (list or pandas Series).
        k: Maximum number of results.

    Returns:
        A list of ``{"Text": ..., "Score": ...}`` dicts, best match first.
    """
    # reshape(-1) rather than squeeze(): with a single candidate squeeze() yields
    # a 0-d tensor whose tolist() is a bare float, which has no len().
    similarities = util.pytorch_cos_sim(evidence_embedding, comparison_embeddings).reshape(-1).tolist()
    # Materialise to a list so lookups are positional even for a pandas Series
    # whose index is not 0..n-1.
    texts = list(comparison_texts)
    ranked = sorted(range(len(similarities)), key=similarities.__getitem__, reverse=True)[:k]
    return [{"Text": texts[i], "Score": similarities[i]} for i in ranked]

# Test Function
def test_model(models, test_cases, top_k=3):
    """Retrieve the top-k claims per category for every model and test case.

    Args:
        models: Mapping of display name -> model path or identifier.
        test_cases: Iterable of dicts; only the "Evidence" key is read.
        top_k: Number of best matches kept per category.

    Returns:
        Dict mapping model name -> list of per-case dicts with the keys
        "Evidence", "Top K Supports", "Top K Contradicts" and "Top K Ambiguous".
    """
    top_k_results = {}
    categories = ("Supports", "Contradicts", "Ambiguous")

    for model_name, model_path in tqdm(models.items(), desc="Testing Models"):
        print(f"Testing model: {model_name}")

        # Load model
        model = SentenceTransformer(model_path).to(device)

        # The corpus embeddings do not depend on the test case, so encode each
        # column once per model instead of once per test case.
        corpus_embeddings = {
            category: model.encode(data[category].tolist(), convert_to_tensor=True).to(device)
            for category in categories
        }

        test_case_results = []
        for test_case in tqdm(test_cases, desc=f"Processing Test Cases for {model_name}"):
            evidence_embedding = model.encode([test_case["Evidence"]], convert_to_tensor=True).to(device)

            # Store results
            case_result = {"Evidence": test_case["Evidence"]}
            for category in categories:
                case_result[f"Top K {category}"] = get_top_k(
                    evidence_embedding, corpus_embeddings[category], data[category], top_k
                )
            test_case_results.append(case_result)

        top_k_results[model_name] = test_case_results

    return top_k_results

# Test cases: each dict carries only the "Evidence" sentence, which is the one key
# that test_model reads.
test_cases = [
    {
        "Evidence": "Aspirin-exacerbated respiratory disease (AERD) is linked to increased leukotriene production due to abnormal arachidonic acid metabolism.",
    },
    {
        "Evidence": "The diagnosis of anaphylaxis relies mainly on clinical evaluation and patient history.",
    },
    {
        "Evidence": "Asthma is a chronic inflammatory respiratory condition influenced by both genetic and environmental factors.",
    },
    {
        "Evidence": "Nocturnal asthma is characterized by worsening symptoms at night due to circadian influences and airway inflammation.",
    },
    {
        "Evidence": "Exercise-induced anaphylaxis (EIA) is triggered by physical activity, often influenced by co-factors like food intake and environmental conditions.",
    },
    {
        "Evidence": "Precision medicine approaches are essential for tailoring asthma treatments to individual genetic and molecular profiles.",
    },
    {
        "Evidence": "Management of anaphylaxis includes both immediate treatment with epinephrine and long-term prevention strategies.",
    }
]

# Run tests
top_k_results = test_model(models_dict, test_cases, top_k=5)

# Display results: numbered matches per model, test case and category.
for model, results in top_k_results.items():
    print(f"\n=== Model: {model} ===")
    for case in results:
        print(f"\nEvidence: {case['Evidence']}")
        for heading in ("Top K Supports", "Top K Contradicts", "Top K Ambiguous"):
            print(f"\n{heading}:")
            for idx, item in enumerate(case[heading], start=1):
                print(f"  {idx}. {item['Text']} (Score: {item['Score']:.4f})")

# Visualization of average similarity scores: for every model, average the
# retrieved top-k scores of each category over all test cases.
avg_scores = []
for model, results in top_k_results.items():
    row = {"Model": model}
    for category in ("Supports", "Contradicts", "Ambiguous"):
        category_scores = [
            item["Score"]
            for case in results
            for item in case[f"Top K {category}"]
        ]
        row[f"Average {category} Score"] = np.mean(category_scores)
    avg_scores.append(row)

avg_scores_df = pd.DataFrame(avg_scores)
melted_avg_scores_df = avg_scores_df.melt(
    id_vars="Model",
    var_name="Category",
    value_name="Average Score",
)

# Grouped bar chart: one group per model, one bar per category.
plt.figure(figsize=(12, 6))
sns.barplot(data=melted_avg_scores_df, x="Model", y="Average Score", hue="Category")
plt.title("Average Similarity Scores Across Models and Categories", fontsize=16)
plt.ylabel("Average Score", fontsize=14)
plt.xlabel("Model", fontsize=14)
plt.xticks(rotation=45, fontsize=12)
plt.legend(title="Category", fontsize=12)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
from tqdm import tqdm

# Set device: GPU when torch can see one, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load CSV
csv_file_path = "/content/drive/MyDrive/CS 5787 - Deep Learning/Project - Deep Learning/GitHub/generated_claim_triplets_with_topics.csv"
data = pd.read_csv(csv_file_path)

# Stop early unless every column used below is present.
required_columns = {"Evidence", "Supports", "Contradicts", "Ambiguous"}
if not required_columns <= set(data.columns):
    raise ValueError(f"CSV file must contain the following columns: {required_columns}")

# Define models: display name -> model identifier.
models_dict = dict(
    PubMedBERT="pritamdeka/PubMedBERT-mnli-snli-scinli-stsb",
    SciBERT="pritamdeka/S-Scibert-snli-multinli-stsb",
    BioBERT="pritamdeka/S-BioBert-snli-multinli-stsb",
    BioGPT="microsoft/biogpt",
    BlueBERT="pritamdeka/S-Bluebert-snli-multinli-stsb",
)

# Function to retrieve top-k sentences
def get_top_k(evidence_embedding, comparison_embeddings, comparison_texts, k=3):
    """Return the ``k`` texts whose embeddings are most similar to the evidence.

    Args:
        evidence_embedding: Embedding of a single evidence sentence.
        comparison_embeddings: Embeddings of the candidate texts, one row each.
        comparison_texts: Candidate texts aligned by position with
            ``comparison_embeddings`` (list or pandas Series).
        k: Maximum number of results.

    Returns:
        A list of ``{"Text": ..., "Score": ...}`` dicts, best match first.
    """
    # reshape(-1) rather than squeeze(): with a single candidate squeeze() yields
    # a 0-d tensor whose tolist() is a bare float, which has no len().
    similarities = util.pytorch_cos_sim(evidence_embedding, comparison_embeddings).reshape(-1).tolist()
    # Materialise to a list so lookups are positional even for a pandas Series
    # whose index is not 0..n-1.
    texts = list(comparison_texts)
    ranked = sorted(range(len(similarities)), key=similarities.__getitem__, reverse=True)[:k]
    return [{"Text": texts[i], "Score": similarities[i]} for i in ranked]

# Classify and retrieve top-k claims
def classify_and_retrieve(model, evidence, top_k=3):
    """Pick the category closest to ``evidence`` on average and return its best claims.

    The "Supports", "Contradicts" and "Ambiguous" columns of the module-level
    ``data`` frame are embedded on every call. The predicted category is the one
    with the highest mean cosine similarity to the evidence; the ``top_k`` most
    similar claims of that category are returned with it.

    Returns:
        ``(predicted_category, top_k_claims)``.
    """
    categories = ("Supports", "Contradicts", "Ambiguous")

    # Embed the query first, then each category column in turn.
    evidence_embedding = model.encode([evidence], convert_to_tensor=True).to(device)
    category_embeddings = {}
    for name in categories:
        category_embeddings[name] = model.encode(data[name].tolist(), convert_to_tensor=True).to(device)

    # Mean similarity between the evidence and every claim of a category.
    category_scores = {}
    for name in categories:
        pairwise = util.pytorch_cos_sim(evidence_embedding, category_embeddings[name])
        category_scores[name] = pairwise.mean().item()

    # On a tie the category listed first wins.
    predicted_category = max(category_scores, key=category_scores.get)

    top_k_claims = get_top_k(
        evidence_embedding,
        category_embeddings[predicted_category],
        data[predicted_category],
        top_k,
    )
    return predicted_category, top_k_claims

# Example claims: each string is passed to classify_and_retrieve as the text to
# classify, once per model.
claims = [
    "Aspirin-exacerbated respiratory disease (AERD) is linked to increased leukotriene production due to abnormal arachidonic acid metabolism.",
    "The diagnosis of anaphylaxis relies mainly on clinical evaluation and patient history.",
    "Asthma is a chronic inflammatory respiratory condition influenced by both genetic and environmental factors.",
    "Nocturnal asthma is characterized by worsening symptoms at night due to circadian influences and airway inflammation.",
    "Exercise-induced anaphylaxis (EIA) is triggered by physical activity, often influenced by co-factors like food intake and environmental conditions.",
    "Precision medicine approaches are essential for tailoring asthma treatments to individual genetic and molecular profiles.",
    "Management of anaphylaxis includes both immediate treatment with epinephrine and long-term prevention strategies.",
    "Diabetes management is crucial for preventing complications like neuropathy and cardiovascular diseases.",
    "Patient exhibits symptoms of chronic hypertension..",
    "Hypertension management requires both lifestyle modifications and pharmacological interventions."
]

# Evaluate all models: classify every claim with each model in turn.
results = {}
for model_name, model_path in tqdm(models_dict.items(), desc="Evaluating Models"):
    print(f"\nProcessing model: {model_name}...")
    model = SentenceTransformer(model_path).to(device)

    model_results = []
    for claim in claims:
        outcome = classify_and_retrieve(model, claim, top_k=5)
        predicted_category, top_k_claims = outcome
        record = {
            "Claim": claim,
            "Predicted Category": predicted_category,
            "Top K Claims": top_k_claims,
        }
        model_results.append(record)

    results[model_name] = model_results

# Display results: predicted category plus numbered matches for every claim.
for model_name, model_results in results.items():
    print(f"\n=== Results for {model_name} ===")
    for result in model_results:
        print(f"\nClaim: {result['Claim']}")
        print(f"Predicted Category: {result['Predicted Category']}")
        print("Top K Claims:")
        numbered = enumerate(result["Top K Claims"], start=1)
        for idx, claim in numbered:
            print(f"  {idx}. {claim['Text']} (Score: {claim['Score']:.4f})")


## **Current Pipeline**

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import torch

# Set device: GPU when torch can see one, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load CSV
csv_file_path = "/content/drive/MyDrive/CS 5787 - Deep Learning/Project - Deep Learning/GitHub/generated_claim_triplets_with_topics.csv"
data = pd.read_csv(csv_file_path)

# Stop early unless every column used below is present.
required_columns = {"Evidence", "Supports", "Contradicts", "Ambiguous"}
if not required_columns <= set(data.columns):
    raise ValueError(f"CSV file must contain the following columns: {required_columns}")

# Models
bi_encoder_model_name = "pritamdeka/PubMedBERT-mnli-snli-scinli-stsb"
cross_encoder_model_path = "/content/drive/MyDrive/fine_tuned_cross_encoder"

# Bi-encoder, moved to the selected device.
bi_encoder = SentenceTransformer(bi_encoder_model_name)
bi_encoder = bi_encoder.to(device)

# Load fine-tuned CrossEncoder (tokenizer and classifier come from the same path).
tokenizer = AutoTokenizer.from_pretrained(cross_encoder_model_path)
cross_encoder = AutoModelForSequenceClassification.from_pretrained(cross_encoder_model_path)
cross_encoder = cross_encoder.to(device)

# Custom Maximal Marginal Relevance (MMR) implementation
def mmr(claim_embedding, evidence_embeddings, similarity_scores, top_k=5, diversity=0.7):
    """Greedy Maximal Marginal Relevance selection over the evidence pool.

    Each round picks the candidate maximising
    ``(1 - diversity) * relevance - diversity * redundancy``, where relevance is
    the candidate's entry in ``similarity_scores`` and redundancy is its highest
    cosine similarity to any already selected item (0 while nothing is selected).

    Args:
        claim_embedding: Not read here; kept so existing callers keep working.
        evidence_embeddings: 2-D tensor, one row per evidence item.
        similarity_scores: Per-item relevance scores, indexable by position and
            yielding elements with ``.item()``.
        top_k: Maximum number of items to select.
        diversity: Weight of the redundancy penalty.

    Returns:
        List of selected indices in selection order; shorter than ``top_k`` when
        the pool is smaller.
    """
    selected = []
    candidates = list(range(len(evidence_embeddings)))

    def mmr_score(i):
        relevance = (1 - diversity) * similarity_scores[i].item()
        if not selected:
            # No diversity penalty if no items are selected yet.
            return relevance
        # One similarity call against all selected rows at once; the previous
        # version repeated this identical computation once per selected item.
        redundancy = util.pytorch_cos_sim(evidence_embeddings[i], evidence_embeddings[selected]).max().item()
        return relevance - diversity * redundancy

    while candidates and len(selected) < top_k:
        # max() keeps the first candidate on ties, like list.index(max(...)) did.
        best_candidate = max(candidates, key=mmr_score)
        selected.append(best_candidate)
        candidates.remove(best_candidate)

    return selected


# Remove duplicate evidence
def remove_duplicate_evidence(evidence_pool):
    """Drop exact duplicates while keeping the first occurrence of each item.

    ``dict.fromkeys`` preserves insertion order, so the result is reproducible
    between runs; ``list(set(...))`` ordered strings differently from one
    interpreter session to the next, which also changed tie-breaking downstream.
    """
    return list(dict.fromkeys(evidence_pool))

# Filter highly similar evidence
def filter_similar_evidence(evidence_pool, bi_encoder, threshold=0.9):
    """Keep one representative of every group of near-identical evidence texts.

    Items are visited in order. An item that has not been marked yet is kept,
    and every item whose cosine similarity to it exceeds ``threshold`` is marked
    as seen and skipped later.

    Args:
        evidence_pool: Sequence of evidence strings.
        bi_encoder: Object whose ``encode(texts, convert_to_tensor=True)``
            returns one embedding row per text.
        threshold: Similarity above which two items count as duplicates.

    Returns:
        The kept evidence strings in their original order.
    """
    if len(evidence_pool) == 0:
        # Nothing to encode or compare.
        return []

    embeddings = bi_encoder.encode(evidence_pool, convert_to_tensor=True).to(device)
    unique_evidences = []
    seen = torch.zeros(len(evidence_pool), dtype=torch.bool, device=embeddings.device)

    for i, emb in enumerate(embeddings):
        if seen[i]:  # Skip if already marked as duplicate
            continue
        unique_evidences.append(evidence_pool[i])
        # reshape(-1) keeps a 1-D vector even when the pool holds a single item.
        similarities = util.pytorch_cos_sim(emb, embeddings).reshape(-1)
        seen |= similarities > threshold  # Mark similar items as seen

    return unique_evidences

# Balance the evidence pool
def balance_evidence_pool(data):
    """Build a pool holding equally many claims from each of the three columns.

    Missing values are dropped per column. Every column is then cut to the
    length of the shortest one, and the three slices are concatenated in the
    order Supports, Contradicts, Ambiguous.
    """
    per_column = [
        data[name].dropna().tolist()
        for name in ("Supports", "Contradicts", "Ambiguous")
    ]
    # Sample equally: the shortest column sets the quota.
    quota = min(len(values) for values in per_column)

    pool = []
    for values in per_column:
        pool.extend(values[:quota])
    return pool

# Retrieve diverse top-k evidences
def retrieve_top_evidences_with_mmr(claim, evidence_pool, bi_encoder, top_k=5, diversity=0.7):
    """Retrieve up to ``top_k`` evidences for ``claim``, diversified with MMR.

    Args:
        claim: Claim text.
        evidence_pool: Sequence of evidence strings.
        bi_encoder: Object whose ``encode(texts, convert_to_tensor=True)``
            returns one embedding row per text.
        top_k: Maximum number of evidences to return.
        diversity: Redundancy weight forwarded to ``mmr``.

    Returns:
        ``(top_k_evidences, top_k_scores)`` in the order chosen by ``mmr``; the
        scores are the claim/evidence cosine similarities. Two empty lists are
        returned for an empty pool.
    """
    if len(evidence_pool) == 0:
        return [], []

    claim_embedding = bi_encoder.encode([claim], convert_to_tensor=True).to(device)
    evidence_embeddings = bi_encoder.encode(evidence_pool, convert_to_tensor=True).to(device)

    # Compute cosine similarities. reshape(-1) keeps the vector 1-D for a
    # single-item pool; squeeze() would produce a 0-d tensor that cannot be indexed.
    similarities = util.pytorch_cos_sim(claim_embedding, evidence_embeddings).reshape(-1)

    # Use MMR for diversity
    selected_indices = mmr(claim_embedding, evidence_embeddings, similarities, top_k=top_k, diversity=diversity)
    top_k_evidences = [evidence_pool[i] for i in selected_indices]
    top_k_scores = [similarities[i].item() for i in selected_indices]

    return top_k_evidences, top_k_scores

# Classify claim based on evidence
def classify_claim(claim, top_evidences, cross_encoder, tokenizer, evidence_dict):
    """Label a claim by the source column of its best-scoring evidence.

    Every (claim, evidence) pair is scored by the cross-encoder. The evidence
    with the highest softmax probability in column 1 is chosen, and the claim is
    labelled with the first key of ``evidence_dict`` whose list contains it.

    Args:
        claim: Claim text.
        top_evidences: List of candidate evidence strings.
        cross_encoder: Sequence-classification model returning ``.logits``.
        tokenizer: Tokenizer matching ``cross_encoder``.
        evidence_dict: Mapping of label -> list of evidence strings.

    Returns:
        ``(label, best_evidence)``. ``label`` is "Unknown" when the best evidence
        is in none of the lists. ``("Unknown", None)`` is returned when
        ``top_evidences`` is empty.
    """
    if len(top_evidences) == 0:
        # Nothing to score; an empty batch cannot be tokenized.
        return "Unknown", None

    inputs = tokenizer(
        [claim] * len(top_evidences),
        list(top_evidences),
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        scores = torch.softmax(cross_encoder(**inputs).logits, dim=1)
        # NOTE(review): column 1 is treated as the "Supports" class; that
        # depends on the label order used when fine-tuning -- confirm.
        max_score_idx = int(scores[:, 1].argmax())
    best_evidence = top_evidences[max_score_idx]

    # Classify based on the column the evidence belongs to
    for label, evidence_list in evidence_dict.items():
        if best_evidence in evidence_list:
            return label, best_evidence
    return "Unknown", best_evidence

# Test function
def test_claims(test_cases, data, bi_encoder, cross_encoder, tokenizer, top_k=5):
    """Run retrieval and classification for every test claim.

    The evidence pool is built from the three claim columns of ``data``:
    balanced, de-duplicated, then stripped of near-duplicates. Each claim gets
    its MMR-diversified top evidences and a label derived from the column its
    best evidence belongs to.

    Returns:
        One dict per test case with the keys "Claim", "Top Evidences", "Scores",
        "Classification" and "Best Evidence".
    """
    # Preprocess evidence pool: balance -> exact de-dup -> near-duplicate filter.
    evidence_pool = filter_similar_evidence(
        remove_duplicate_evidence(balance_evidence_pool(data)),
        bi_encoder,
    )

    # Label -> every non-null entry of that column, used to name the source column.
    evidence_dict = {
        label: data[label].dropna().tolist()
        for label in ("Supports", "Contradicts", "Ambiguous")
    }

    def evaluate(claim):
        # Retrieve first, then let the cross-encoder pick the best evidence.
        top_evidences, scores = retrieve_top_evidences_with_mmr(claim, evidence_pool, bi_encoder, top_k)
        classification, best_evidence = classify_claim(claim, top_evidences, cross_encoder, tokenizer, evidence_dict)
        return {
            "Claim": claim,
            "Top Evidences": top_evidences,
            "Scores": scores,
            "Classification": classification,
            "Best Evidence": best_evidence,
        }

    return [
        evaluate(test_case["Claim"])
        for test_case in tqdm(test_cases, desc="Processing Claims")
    ]

# Test cases: test_claims reads the "Claim" entry of each dict.
test_cases = [
    {"Claim": "Diagnosis of anaphylaxis mainly depends on clinical presentation and the patient's allergy history, as there are no specific tests available."},
    {"Claim": "Careful management of asthma medications is crucial to prevent adverse outcomes for both mothers and infants."},
    {"Claim": "Dietary management strategies for PFAS might be effective, but their success can depend on individual reactions to specific foods."},
    {"Claim": "Prompt administration of epinephrine is essential for managing severe allergic reactions."},
]

# Run test
results = test_claims(test_cases, data, bi_encoder, cross_encoder, tokenizer)

# Display results: numbered evidences with scores, then the verdict.
for result in results:
    print(f"\nClaim: {result['Claim']}")
    print("Top Evidences:")
    ranked = zip(result["Top Evidences"], result["Scores"])
    for idx, (evidence, score) in enumerate(ranked, start=1):
        print(f"  {idx}. {evidence} (Score: {score:.4f})")
    print(f"Classification: {result['Classification']}")
    print(f"Best Evidence: {result['Best Evidence']}")


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import torch

# Set device: GPU when torch can see one, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load CSV
csv_file_path = "/content/drive/MyDrive/CS 5787 - Deep Learning/Project - Deep Learning/GitHub/generated_claim_triplets_with_topics.csv"
data = pd.read_csv(csv_file_path)

# Stop early unless every column used below is present.
required_columns = {"Evidence", "Supports", "Contradicts", "Ambiguous"}
if not required_columns <= set(data.columns):
    raise ValueError(f"CSV file must contain the following columns: {required_columns}")

# Models
bi_encoder_model_name = "pritamdeka/PubMedBERT-mnli-snli-scinli-stsb"
cross_encoder_model_path = "/content/drive/MyDrive/fine_tuned_cross_encoder"

# Load bi-encoder and move it to the selected device.
bi_encoder = SentenceTransformer(bi_encoder_model_name)
bi_encoder = bi_encoder.to(device)

# Load fine-tuned CrossEncoder (tokenizer and classifier come from the same path).
tokenizer = AutoTokenizer.from_pretrained(cross_encoder_model_path)
cross_encoder = AutoModelForSequenceClassification.from_pretrained(cross_encoder_model_path)
cross_encoder = cross_encoder.to(device)

# Remove duplicate evidence
def remove_duplicate_evidence(evidence_pool):
    """Drop exact duplicates while keeping the first occurrence of each item.

    ``dict.fromkeys`` preserves insertion order, so the result is reproducible
    between runs; ``list(set(...))`` ordered strings differently from one
    interpreter session to the next, which also changed tie-breaking downstream.
    """
    return list(dict.fromkeys(evidence_pool))

# Retrieve top evidences
def retrieve_top_evidences(claim, evidence_pool, bi_encoder, top_k=5):
    """Return the ``top_k`` evidences most similar to ``claim``, best first.

    Args:
        claim: Claim text.
        evidence_pool: Sequence of evidence strings.
        bi_encoder: Object whose ``encode(texts, convert_to_tensor=True)``
            returns one embedding row per text.
        top_k: Maximum number of evidences to return.

    Returns:
        ``(top_k_evidences, top_k_scores)``; the scores are cosine similarities.
        Two empty lists are returned for an empty pool.
    """
    if len(evidence_pool) == 0:
        return [], []

    claim_embedding = bi_encoder.encode([claim], convert_to_tensor=True).to(device)
    evidence_embeddings = bi_encoder.encode(evidence_pool, convert_to_tensor=True).to(device)
    # reshape(-1) keeps the scores 1-D for a single-item pool; with squeeze() the
    # top-k indices collapsed to a bare int that cannot be iterated.
    similarities = util.pytorch_cos_sim(claim_embedding, evidence_embeddings).reshape(-1)

    # Rank evidence by similarity
    top = torch.topk(similarities, k=min(top_k, len(evidence_pool)))
    top_k_indices = top.indices.cpu().tolist()
    top_k_evidences = [evidence_pool[i] for i in top_k_indices]
    top_k_scores = top.values.cpu().tolist()

    return top_k_evidences, top_k_scores

# Classify claim based on evidence
def classify_claim(claim, top_evidences, cross_encoder, tokenizer, evidence_dict):
    """Label a claim by the source column of its best-scoring evidence.

    Every (claim, evidence) pair is scored by the cross-encoder. The evidence
    with the highest softmax probability in column 1 is chosen, and the claim is
    labelled with the first key of ``evidence_dict`` whose list contains it.

    Args:
        claim: Claim text.
        top_evidences: List of candidate evidence strings.
        cross_encoder: Sequence-classification model returning ``.logits``.
        tokenizer: Tokenizer matching ``cross_encoder``.
        evidence_dict: Mapping of label -> list of evidence strings.

    Returns:
        ``(label, best_evidence)``. ``label`` is "Unknown" when the best evidence
        is in none of the lists. ``("Unknown", None)`` is returned when
        ``top_evidences`` is empty.
    """
    if len(top_evidences) == 0:
        # Nothing to score; an empty batch cannot be tokenized.
        return "Unknown", None

    inputs = tokenizer(
        [claim] * len(top_evidences),
        list(top_evidences),
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = cross_encoder(**inputs).logits
        scores = torch.softmax(logits, dim=1)
        # NOTE(review): column 1 is treated as the "Supports" class; that
        # depends on the label order used when fine-tuning -- confirm.
        max_score_idx = int(scores[:, 1].argmax())
    best_evidence = top_evidences[max_score_idx]

    # Classify based on the column the evidence belongs to
    for label, evidence_list in evidence_dict.items():
        if best_evidence in evidence_list:
            return label, best_evidence
    return "Unknown", best_evidence

# Test function
def test_claims(test_cases, data, bi_encoder, cross_encoder, tokenizer, top_k=5):
    """Retrieve and classify evidence for each test claim.

    The evidence pool is the de-duplicated, non-null "Evidence" column of
    `data`; the label lookup handed to `classify_claim` holds the non-null
    "Supports", "Contradicts" and "Ambiguous" columns.

    Returns:
        One dict per test case with keys "Claim", "Top Evidences", "Scores",
        "Classification" and "Best Evidence".
    """
    evidence_pool = remove_duplicate_evidence(data["Evidence"].dropna().tolist())
    evidence_dict = {
        column: data[column].dropna().tolist()
        for column in ("Supports", "Contradicts", "Ambiguous")
    }

    def evaluate(claim):
        # Retrieval first, then classification over the retrieved candidates.
        top_evidences, scores = retrieve_top_evidences(claim, evidence_pool, bi_encoder, top_k)
        classification, best_evidence = classify_claim(
            claim, top_evidences, cross_encoder, tokenizer, evidence_dict
        )
        return {
            "Claim": claim,
            "Top Evidences": top_evidences,
            "Scores": scores,
            "Classification": classification,
            "Best Evidence": best_evidence,
        }

    return [
        evaluate(test_case["Claim"])
        for test_case in tqdm(test_cases, desc="Processing Claims")
    ]

# Claims pushed through the retrieval + classification pipeline
test_cases = [
    {"Claim": claim_text}
    for claim_text in (
        "Diagnosis of anaphylaxis mainly depends on clinical presentation and the patient's allergy history, as there are no specific tests available.",
        "Careful management of asthma medications is crucial to prevent adverse outcomes for both mothers and infants.",
        "Dietary management strategies for PFAS might be effective, but their success can depend on individual reactions to specific foods.",
        "Prompt administration of epinephrine is essential for managing severe allergic reactions.",
    )
]

# Execute the pipeline with the default top_k
results = test_claims(test_cases, data, bi_encoder, cross_encoder, tokenizer)

# Per-claim report: ranked evidences first, then the verdict
for result in results:
    print(f"\nClaim: {result['Claim']}")
    print("Top Evidences:")
    idx = 0
    for evidence, score in zip(result["Top Evidences"], result["Scores"]):
        idx += 1
        print(f"  {idx}. {evidence} (Score: {score:.4f})")
    print(f"Classification: {result['Classification']}")
    print(f"Best Evidence: {result['Best Evidence']}")


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import torch

# Prefer the GPU whenever PyTorch can see one
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Claim/evidence table read from the mounted Drive folder
csv_file_path = "/content/drive/MyDrive/CS 5787 - Deep Learning/Project - Deep Learning/GitHub/generated_claim_triplets_with_topics.csv"
data = pd.read_csv(csv_file_path)

# Abort early unless every column used further down is present
required_columns = {"Evidence", "Supports", "Contradicts", "Ambiguous"}
if not required_columns <= set(data.columns):
    raise ValueError(f"CSV file must contain the following columns: {required_columns}")

# Retrieval model name and classifier checkpoint directory
bi_encoder_model_name = "pritamdeka/PubMedBERT-mnli-snli-scinli-stsb"
cross_encoder_model_path = "/content/drive/MyDrive/fine_tuned_cross_encoder"

# Instantiate each model, then place it on the selected device
bi_encoder = SentenceTransformer(bi_encoder_model_name)
bi_encoder = bi_encoder.to(device)
tokenizer = AutoTokenizer.from_pretrained(cross_encoder_model_path)
cross_encoder = AutoModelForSequenceClassification.from_pretrained(cross_encoder_model_path)
cross_encoder = cross_encoder.to(device)

# Remove duplicate evidence
def remove_duplicate_evidence(evidence_pool):
    """Return `evidence_pool` without repeated entries, keeping first-seen order.

    `dict.fromkeys` is used instead of `set` so the resulting order is the same
    on every kernel run; set ordering of strings varies between runs, which
    would make top-k tie-breaking non-reproducible.
    """
    return list(dict.fromkeys(evidence_pool))

# Retrieve top evidences
def retrieve_top_evidences(claim, evidence_pool, bi_encoder, top_k=5):
    """Return the evidences most similar to `claim` together with their scores.

    Args:
        claim: Claim text to embed.
        evidence_pool: List of candidate evidence strings.
        bi_encoder: Model exposing `encode(texts, convert_to_tensor=True)`.
        top_k: Maximum number of evidences to return.

    Returns:
        `(evidences, scores)`: two parallel lists ordered by decreasing cosine
        similarity. Both are empty when the pool is empty or `top_k` <= 0.
    """
    k = min(top_k, len(evidence_pool))
    if k <= 0:
        return [], []

    claim_embedding = bi_encoder.encode([claim], convert_to_tensor=True).to(device)
    evidence_embeddings = bi_encoder.encode(evidence_pool, convert_to_tensor=True).to(device)
    # Take row 0 instead of squeeze(): squeeze() turns a one-item pool into a
    # 0-dim tensor, whose topk indices are no longer a list.
    similarities = util.pytorch_cos_sim(claim_embedding, evidence_embeddings)[0]

    top = torch.topk(similarities, k=k)
    top_k_indices = top.indices.cpu().tolist()
    top_k_evidences = [evidence_pool[i] for i in top_k_indices]
    top_k_scores = top.values.cpu().tolist()

    return top_k_evidences, top_k_scores

# Classify claim based on evidence
def classify_claim(claim, top_evidences, cross_encoder, tokenizer, evidence_dict):
    """Pick the most supporting evidence and report the model's label for it.

    Args:
        claim: Claim text.
        top_evidences: Non-empty list of candidate evidence strings.
        cross_encoder: Sequence-classification model called as
            `cross_encoder(**inputs)`; must expose `.logits` on its output and
            a `.device` attribute.
        tokenizer: Tokenizer used to encode the (claim, evidence) pairs.
        evidence_dict: Unused here; kept so existing callers keep working.

    Returns:
        `(classification, best_evidence)` where `classification` is one of
        "Supports", "Contradicts" or "Ambiguous".
    """
    label_map = {0: "Supports", 1: "Contradicts", 2: "Ambiguous"}
    supports_idx = 0  # must agree with label_map above

    inputs = tokenizer(
        [claim] * len(top_evidences),
        list(top_evidences),
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(cross_encoder.device)  # follow the model instead of a notebook-level global

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = cross_encoder(**inputs).logits
    scores = torch.softmax(logits, dim=1).cpu().numpy()

    # Best evidence = highest "Supports" probability (the previous code indexed
    # column 1, which label_map defines as "Contradicts").
    best_idx = int(scores[:, supports_idx].argmax())
    best_evidence = top_evidences[best_idx]

    # Label of that evidence = its most probable class.
    classification = label_map[int(scores[best_idx].argmax())]

    return classification, best_evidence

# Test function
def test_claims(test_cases, data, bi_encoder, cross_encoder, tokenizer, top_k=5):
    """Retrieve and classify evidence for each test claim.

    The evidence pool is the de-duplicated, non-null "Evidence" column of
    `data`. A lookup of the non-null "Supports", "Contradicts" and "Ambiguous"
    columns is built and forwarded to `classify_claim`.

    Returns:
        One dict per test case with keys "Claim", "Top Evidences", "Scores",
        "Classification" and "Best Evidence".
    """
    evidence_pool = remove_duplicate_evidence(data["Evidence"].dropna().tolist())
    evidence_dict = {
        column: data[column].dropna().tolist()
        for column in ("Supports", "Contradicts", "Ambiguous")
    }

    def evaluate(claim):
        # Retrieval first, then classification over the retrieved candidates.
        top_evidences, scores = retrieve_top_evidences(claim, evidence_pool, bi_encoder, top_k)
        classification, best_evidence = classify_claim(
            claim, top_evidences, cross_encoder, tokenizer, evidence_dict
        )
        return {
            "Claim": claim,
            "Top Evidences": top_evidences,
            "Scores": scores,
            "Classification": classification,
            "Best Evidence": best_evidence,
        }

    return [
        evaluate(test_case["Claim"])
        for test_case in tqdm(test_cases, desc="Processing Claims")
    ]

# Claims pushed through the retrieval + classification pipeline
test_cases = [
    {"Claim": claim_text}
    for claim_text in (
        "Diagnosis of anaphylaxis mainly depends on clinical presentation and the patient's allergy history, as there are no specific tests available.",
        "Careful management of asthma medications is crucial to prevent adverse outcomes for both mothers and infants.",
        "Dietary management strategies for PFAS might be effective, but their success can depend on individual reactions to specific foods.",
        "Prompt administration of epinephrine is essential for managing severe allergic reactions.",
    )
]

# Execute the pipeline with the default top_k
results = test_claims(test_cases, data, bi_encoder, cross_encoder, tokenizer)

# Per-claim report: ranked evidences first, then the verdict
for result in results:
    print(f"\nClaim: {result['Claim']}")
    print("Top Evidences:")
    idx = 0
    for evidence, score in zip(result["Top Evidences"], result["Scores"]):
        idx += 1
        print(f"  {idx}. {evidence} (Score: {score:.4f})")
    print(f"Classification: {result['Classification']}")
    print(f"Best Evidence: {result['Best Evidence']}")


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import torch

# Prefer the GPU whenever PyTorch can see one
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Claim/evidence table read from the mounted Drive folder
csv_file_path = "/content/drive/MyDrive/CS 5787 - Deep Learning/Project - Deep Learning/GitHub/generated_claim_triplets_with_topics.csv"
data = pd.read_csv(csv_file_path)

# Abort early unless every column used further down is present
required_columns = {"Evidence", "Supports", "Contradicts", "Ambiguous"}
if not required_columns <= set(data.columns):
    raise ValueError(f"CSV file must contain the following columns: {required_columns}")

# Retrieval model name and classifier checkpoint directory
bi_encoder_model_name = "pritamdeka/PubMedBERT-mnli-snli-scinli-stsb"
cross_encoder_model_path = "/content/drive/MyDrive/fine_tuned_cross_encoder"

# Instantiate each model, then place it on the selected device
bi_encoder = SentenceTransformer(bi_encoder_model_name)
bi_encoder = bi_encoder.to(device)
tokenizer = AutoTokenizer.from_pretrained(cross_encoder_model_path)
cross_encoder = AutoModelForSequenceClassification.from_pretrained(cross_encoder_model_path)
cross_encoder = cross_encoder.to(device)

# Remove duplicate evidence
def remove_duplicate_evidence(evidence_pool):
    """Return `evidence_pool` without repeated entries, keeping first-seen order.

    `dict.fromkeys` is used instead of `set` so the resulting order is the same
    on every kernel run; set ordering of strings varies between runs, which
    would make top-k tie-breaking non-reproducible.
    """
    return list(dict.fromkeys(evidence_pool))

# Retrieve top evidences
def retrieve_top_evidences(claim, evidence_pool, bi_encoder, top_k=5):
    """Return the evidences most similar to `claim` together with their scores.

    Args:
        claim: Claim text to embed.
        evidence_pool: List of candidate evidence strings.
        bi_encoder: Model exposing `encode(texts, convert_to_tensor=True)`.
        top_k: Maximum number of evidences to return.

    Returns:
        `(evidences, scores)`: two parallel lists ordered by decreasing cosine
        similarity. Both are empty when the pool is empty or `top_k` <= 0.
    """
    k = min(top_k, len(evidence_pool))
    if k <= 0:
        return [], []

    claim_embedding = bi_encoder.encode([claim], convert_to_tensor=True).to(device)
    evidence_embeddings = bi_encoder.encode(evidence_pool, convert_to_tensor=True).to(device)
    # Take row 0 instead of squeeze(): squeeze() turns a one-item pool into a
    # 0-dim tensor, whose topk indices are no longer a list.
    similarities = util.pytorch_cos_sim(claim_embedding, evidence_embeddings)[0]

    top = torch.topk(similarities, k=k)
    top_k_indices = top.indices.cpu().tolist()
    top_k_evidences = [evidence_pool[i] for i in top_k_indices]
    top_k_scores = top.values.cpu().tolist()

    return top_k_evidences, top_k_scores

# Classify claim based on evidence and match with data labels
def classify_claim(claim, top_evidences, cross_encoder, tokenizer, evidence_dict):
    """Pick the most supporting evidence; return its dataset label and model label.

    Args:
        claim: Claim text.
        top_evidences: Non-empty list of candidate evidence strings.
        cross_encoder: Sequence-classification model called as
            `cross_encoder(**inputs)`; must expose `.logits` on its output and
            a `.device` attribute.
        tokenizer: Tokenizer used to encode the (claim, evidence) pairs.
        evidence_dict: Mapping of label name -> list of texts under that label.

    Returns:
        `(matched_label, best_evidence, predicted_label)`. `matched_label` is
        the first key of `evidence_dict` whose list contains `best_evidence`,
        else "Unknown"; `predicted_label` is the model's most probable class.
    """
    label_map = {0: "Supports", 1: "Contradicts", 2: "Ambiguous"}
    supports_idx = 0  # must agree with label_map above

    inputs = tokenizer(
        [claim] * len(top_evidences),
        list(top_evidences),
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(cross_encoder.device)  # follow the model instead of a notebook-level global

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = cross_encoder(**inputs).logits
    scores = torch.softmax(logits, dim=1).cpu().numpy()

    # Best evidence = highest "Supports" probability (the previous code indexed
    # column 1, which label_map defines as "Contradicts").
    best_idx = int(scores[:, supports_idx].argmax())
    best_evidence = top_evidences[best_idx]
    predicted_label = label_map[int(scores[best_idx].argmax())]

    # NOTE(review): the pool comes from the "Evidence" column while these lists
    # come from other columns, so a match may be rare; confirm intent.
    for label, evidence_list in evidence_dict.items():
        if best_evidence in evidence_list:
            return label, best_evidence, predicted_label

    return "Unknown", best_evidence, predicted_label

# Test function
def test_claims(test_cases, data, bi_encoder, cross_encoder, tokenizer, top_k=5):
    """Retrieve and classify evidence for each test claim.

    The evidence pool is the de-duplicated, non-null "Evidence" column of
    `data`; the label lookup handed to `classify_claim` holds the non-null
    "Supports", "Contradicts" and "Ambiguous" columns.

    Returns:
        One dict per test case with keys "Claim", "Top Evidences", "Scores",
        "Matched Label", "Predicted Label" and "Best Evidence".
    """
    evidence_pool = remove_duplicate_evidence(data["Evidence"].dropna().tolist())
    evidence_dict = {
        column: data[column].dropna().tolist()
        for column in ("Supports", "Contradicts", "Ambiguous")
    }

    def evaluate(claim):
        # Retrieval first, then classification over the retrieved candidates.
        top_evidences, scores = retrieve_top_evidences(claim, evidence_pool, bi_encoder, top_k)
        matched_label, best_evidence, predicted_label = classify_claim(
            claim, top_evidences, cross_encoder, tokenizer, evidence_dict
        )
        return {
            "Claim": claim,
            "Top Evidences": top_evidences,
            "Scores": scores,
            "Matched Label": matched_label,
            "Predicted Label": predicted_label,
            "Best Evidence": best_evidence,
        }

    return [
        evaluate(test_case["Claim"])
        for test_case in tqdm(test_cases, desc="Processing Claims")
    ]

# Claims pushed through the retrieval + classification pipeline
test_cases = [
    {"Claim": claim_text}
    for claim_text in (
        "Diagnosis of anaphylaxis mainly depends on clinical presentation and the patient's allergy history, as there are no specific tests available.",
        "Careful management of asthma medications is crucial to prevent adverse outcomes for both mothers and infants.",
        "Dietary management strategies for PFAS might be effective, but their success can depend on individual reactions to specific foods.",
        "Prompt administration of epinephrine is essential for managing severe allergic reactions.",
    )
]

# Execute the pipeline with the default top_k
results = test_claims(test_cases, data, bi_encoder, cross_encoder, tokenizer)

# Per-claim report: ranked evidences first, then both labels
for result in results:
    print(f"\nClaim: {result['Claim']}")
    print("Top Evidences:")
    idx = 0
    for evidence, score in zip(result["Top Evidences"], result["Scores"]):
        idx += 1
        print(f"  {idx}. {evidence} (Score: {score:.4f})")
    print(f"Matched Label: {result['Matched Label']}")
    print(f"Predicted Label: {result['Predicted Label']}")
    print(f"Best Evidence: {result['Best Evidence']}")


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import torch


# Prefer the GPU whenever PyTorch can see one
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Claim/evidence table read from the mounted Drive folder
csv_file_path = "/content/drive/MyDrive/CS 5787 - Deep Learning/Project - Deep Learning/GitHub/generated_claim_triplets_with_topics.csv"
data = pd.read_csv(csv_file_path)

# Abort early unless every column used further down is present
required_columns = {"Evidence", "Supports", "Contradicts", "Ambiguous"}
if not required_columns <= set(data.columns):
    raise ValueError(f"CSV file must contain the following columns: {required_columns}")

# Retrieval model name and classifier checkpoint directory
bi_encoder_model_name = "pritamdeka/PubMedBERT-mnli-snli-scinli-stsb"
cross_encoder_model_path = "/content/drive/MyDrive/fine_tuned_cross_encoder"

# Instantiate each model, then place it on the selected device
bi_encoder = SentenceTransformer(bi_encoder_model_name)
bi_encoder = bi_encoder.to(device)
tokenizer = AutoTokenizer.from_pretrained(cross_encoder_model_path)
cross_encoder = AutoModelForSequenceClassification.from_pretrained(cross_encoder_model_path)
cross_encoder = cross_encoder.to(device)


def remove_duplicate_evidence(evidence_pool):
    """Return `evidence_pool` without repeated entries, keeping first-seen order.

    `dict.fromkeys` is used instead of `set` so the resulting order is the same
    on every kernel run; set ordering of strings varies between runs, which
    would make top-k tie-breaking non-reproducible.
    """
    return list(dict.fromkeys(evidence_pool))

def retrieve_top_evidences(claim, evidence_pool, bi_encoder, top_k=5):
    """Return the evidences most similar to `claim` together with their scores.

    Args:
        claim: Claim text to embed.
        evidence_pool: List of candidate evidence strings.
        bi_encoder: Model exposing `encode(texts, convert_to_tensor=True)`.
        top_k: Maximum number of evidences to return.

    Returns:
        `(evidences, scores)`: two parallel lists ordered by decreasing cosine
        similarity. Both are empty when the pool is empty or `top_k` <= 0.
    """
    k = min(top_k, len(evidence_pool))
    if k <= 0:
        return [], []

    claim_embedding = bi_encoder.encode([claim], convert_to_tensor=True).to(device)
    evidence_embeddings = bi_encoder.encode(evidence_pool, convert_to_tensor=True).to(device)
    # Take row 0 instead of squeeze(): squeeze() turns a one-item pool into a
    # 0-dim tensor, whose topk indices are no longer a list.
    similarities = util.pytorch_cos_sim(claim_embedding, evidence_embeddings)[0]

    top = torch.topk(similarities, k=k)
    top_k_indices = top.indices.cpu().tolist()
    top_k_evidences = [evidence_pool[i] for i in top_k_indices]
    top_k_scores = top.values.cpu().tolist()

    return top_k_evidences, top_k_scores

def classify_claim_semantics(claim, best_evidence, cross_encoder, tokenizer):
    """Classify one (claim, evidence) pair with the cross-encoder.

    Args:
        claim: Claim text.
        best_evidence: Evidence text paired with the claim.
        cross_encoder: Sequence-classification model called as
            `cross_encoder(**inputs)`; must expose `.logits` on its output and
            a `.device` attribute.
        tokenizer: Tokenizer used to encode the pair.

    Returns:
        "Supports", "Contradicts" or "Ambiguous": the most probable class.
    """
    inputs = tokenizer(
        [claim],
        [best_evidence],
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(cross_encoder.device)  # follow the model instead of a notebook-level global

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = cross_encoder(**inputs).logits
    scores = torch.softmax(logits, dim=1).cpu().numpy()

    label_map = {0: "Supports", 1: "Contradicts", 2: "Ambiguous"}
    # A single pair was encoded, so row 0 holds its class probabilities.
    classification = label_map[int(scores[0].argmax())]
    return classification


def test_claims(test_cases, data, bi_encoder, cross_encoder, tokenizer, top_k=5):
    """Retrieve evidence for each claim and classify it against the top hit.

    The evidence pool is the de-duplicated, non-null "Evidence" column of
    `data`. For every test case the first retrieved evidence is treated as the
    best one and passed to `classify_claim_semantics`.

    Returns:
        One dict per test case with keys "Claim", "Top Evidences", "Scores",
        "Best Evidence" and "Classification".
    """
    evidence_pool = remove_duplicate_evidence(data["Evidence"].dropna().tolist())

    def evaluate(claim):
        top_evidences, scores = retrieve_top_evidences(claim, evidence_pool, bi_encoder, top_k)
        # Retrieval returns candidates in ranked order; take the first.
        best_evidence = top_evidences[0]
        classification = classify_claim_semantics(claim, best_evidence, cross_encoder, tokenizer)
        return {
            "Claim": claim,
            "Top Evidences": top_evidences,
            "Scores": scores,
            "Best Evidence": best_evidence,
            "Classification": classification,
        }

    return [
        evaluate(test_case["Claim"])
        for test_case in tqdm(test_cases, desc="Processing Claims")
    ]


# Claims pushed through the retrieval + classification pipeline
test_cases = [
    {"Claim": claim_text}
    for claim_text in (
        "Diagnosis of anaphylaxis mainly depends on clinical presentation and the patient's allergy history, as there are no specific tests available.",
        "Antiretroviral therapies do not need to be monitored closely in patients with liver disease associated with HIV infection.",
        "The overall safety of live attenuated vaccines for immunocompromised patients is still a topic of ongoing research.",
        "Prompt administration of epinephrine is essential for managing severe allergic reactions.",
    )
]

# Execute the pipeline with the default top_k
results = test_claims(test_cases, data, bi_encoder, cross_encoder, tokenizer)

# Per-claim report: ranked evidences first, then the verdict
for result in results:
    print(f"\nClaim: {result['Claim']}")
    print("Top Evidences:")
    idx = 0
    for evidence, score in zip(result["Top Evidences"], result["Scores"]):
        idx += 1
        print(f"  {idx}. {evidence} (Score: {score:.4f})")
    print(f"Best Evidence: {result['Best Evidence']}")
    print(f"Classification: {result['Classification']}")


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import torch
import numpy as np
from sklearn.metrics import classification_report


# Prefer the GPU whenever PyTorch can see one
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Claim/evidence table read from the mounted Drive folder
csv_file_path = "/content/drive/MyDrive/CS 5787 - Deep Learning/Project - Deep Learning/GitHub/generated_claim_triplets_with_topics.csv"
data = pd.read_csv(csv_file_path)

# Abort early unless every column used further down is present
required_columns = {"Evidence", "Supports", "Contradicts", "Ambiguous"}
if not required_columns <= set(data.columns):
    raise ValueError(f"CSV file must contain the following columns: {required_columns}")

# Retrieval model name and classifier checkpoint directory
bi_encoder_model_name = "pritamdeka/PubMedBERT-mnli-snli-scinli-stsb"
cross_encoder_model_path = "/content/drive/MyDrive/fine_tuned_cross_encoder"

# Instantiate each model, then place it on the selected device
bi_encoder = SentenceTransformer(bi_encoder_model_name)
bi_encoder = bi_encoder.to(device)
tokenizer = AutoTokenizer.from_pretrained(cross_encoder_model_path)
cross_encoder = AutoModelForSequenceClassification.from_pretrained(cross_encoder_model_path)
cross_encoder = cross_encoder.to(device)


def remove_duplicate_evidence(evidence_pool):
    """Return `evidence_pool` without repeated entries, keeping first-seen order.

    `dict.fromkeys` is used instead of `set` so the resulting order is the same
    on every kernel run; set ordering of strings varies between runs, which
    would make top-k tie-breaking non-reproducible.
    """
    return list(dict.fromkeys(evidence_pool))


def retrieve_top_evidences(claim, evidence_pool, bi_encoder, top_k=5):
    """Return the evidences most similar to `claim` together with their scores.

    Args:
        claim: Claim text to embed.
        evidence_pool: List of candidate evidence strings.
        bi_encoder: Model exposing `encode(texts, convert_to_tensor=True)`.
        top_k: Maximum number of evidences to return.

    Returns:
        `(evidences, scores)`: two parallel lists ordered by decreasing cosine
        similarity. Both are empty when the pool is empty or `top_k` <= 0.
    """
    k = min(top_k, len(evidence_pool))
    if k <= 0:
        return [], []

    claim_embedding = bi_encoder.encode([claim], convert_to_tensor=True).to(device)
    evidence_embeddings = bi_encoder.encode(evidence_pool, convert_to_tensor=True).to(device)
    # Take row 0 instead of squeeze(): squeeze() turns a one-item pool into a
    # 0-dim tensor, whose topk indices are no longer a list.
    similarities = util.pytorch_cos_sim(claim_embedding, evidence_embeddings)[0]

    top = torch.topk(similarities, k=k)
    top_k_indices = top.indices.cpu().tolist()
    top_k_evidences = [evidence_pool[i] for i in top_k_indices]
    top_k_scores = top.values.cpu().tolist()

    return top_k_evidences, top_k_scores


def classify_claim_semantics(claim, best_evidence, cross_encoder, tokenizer, confidence_threshold=0.5):
    """Classify one (claim, evidence) pair, falling back to "Ambiguous" when unsure.

    Args:
        claim: Claim text.
        best_evidence: Evidence text paired with the claim.
        cross_encoder: Sequence-classification model called as
            `cross_encoder(**inputs)`; must expose `.logits` on its output and
            a `.device` attribute.
        tokenizer: Tokenizer used to encode the pair.
        confidence_threshold: Minimum top-class probability required to keep
            the model's label.

    Returns:
        `(classification, max_score)`: the label (or "Ambiguous" when the top
        probability is below the threshold) and that top probability.
    """
    inputs = tokenizer(
        [claim],
        [best_evidence],
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(cross_encoder.device)  # follow the model instead of a notebook-level global

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = cross_encoder(**inputs).logits
    scores = torch.softmax(logits, dim=1).cpu().numpy()

    label_map = {0: "Supports", 1: "Contradicts", 2: "Ambiguous"}
    # A single pair was encoded, so row 0 holds its class probabilities.
    max_score_idx = int(scores[0].argmax())
    max_score = scores[0][max_score_idx]

    classification = label_map[max_score_idx] if max_score >= confidence_threshold else "Ambiguous"

    return classification, max_score


def test_claims(test_cases, data, bi_encoder, cross_encoder, tokenizer, top_k=5):
    """Run retrieval + classification per claim and print a report when labelled.

    Args:
        test_cases: Iterable of dicts with a "Claim" key and an optional
            "True Label" key.
        data: DataFrame whose "Evidence" column supplies the evidence pool.
        bi_encoder, cross_encoder, tokenizer: Models forwarded to the helpers.
        top_k: Number of evidences retrieved per claim.

    Returns:
        One dict per test case with keys "Claim", "Top Evidences", "Scores",
        "Best Evidence", "True Label", "Predicted Label" and "Confidence".

    Raises:
        ValueError: If the "Evidence" column holds no non-null values.
    """
    results = []
    true_labels = []
    predicted_labels = []

    evidence_pool = remove_duplicate_evidence(data["Evidence"].dropna().tolist())
    if not evidence_pool:
        # Fail with a clear message instead of an opaque error further down.
        raise ValueError("Evidence pool is empty: the 'Evidence' column has no non-null values")

    for test_case in tqdm(test_cases, desc="Processing Claims"):
        claim = test_case["Claim"]
        true_label = test_case.get("True Label", "Unknown")

        top_evidences, scores = retrieve_top_evidences(claim, evidence_pool, bi_encoder, top_k)

        # Retrieval returns candidates in ranked order; take the first.
        best_evidence = top_evidences[0]

        predicted_label, confidence = classify_claim_semantics(claim, best_evidence, cross_encoder, tokenizer)

        results.append({
            "Claim": claim,
            "Top Evidences": top_evidences,
            "Scores": scores,
            "Best Evidence": best_evidence,
            "True Label": true_label,
            "Predicted Label": predicted_label,
            "Confidence": confidence,
        })

        # Only labelled cases take part in the report.
        if true_label != "Unknown":
            true_labels.append(true_label)
            predicted_labels.append(predicted_label)

    if true_labels:
        print("\nClassification Report:")
        print(classification_report(
            true_labels,
            predicted_labels,
            labels=["Supports", "Contradicts", "Ambiguous"],
            target_names=["Supports", "Contradicts", "Ambiguous"],
            # Labels absent from a small test set score 0 without a warning.
            zero_division=0,
        ))

    return results


# Labelled claims pushed through the retrieval + classification pipeline
test_cases = [
    {"Claim": claim_text, "True Label": expected_label}
    for claim_text, expected_label in (
        ("Diagnosis of anaphylaxis mainly depends on clinical presentation and the patient's allergy history, as there are no specific tests available.", "Supports"),
        ("Careful management of asthma medications is crucial to prevent adverse outcomes for both mothers and infants.", "Supports"),
        ("Dietary management strategies for PFAS might be effective, but their success can depend on individual reactions to specific foods.", "Ambiguous"),
        ("Prompt administration of epinephrine is essential for managing severe allergic reactions.", "Supports"),
    )
]

# Execute the pipeline with the default top_k
results = test_claims(test_cases, data, bi_encoder, cross_encoder, tokenizer)

# Per-claim report: ranked evidences first, then prediction vs. ground truth
for result in results:
    print(f"\nClaim: {result['Claim']}")
    print("Top Evidences:")
    idx = 0
    for evidence, score in zip(result["Top Evidences"], result["Scores"]):
        idx += 1
        print(f"  {idx}. {evidence} (Score: {score:.4f})")
    print(f"Best Evidence: {result['Best Evidence']}")
    print(f"Predicted Label: {result['Predicted Label']} (Confidence: {result['Confidence']:.4f})")
    print(f"True Label: {result['True Label']}")


Latest Code (to use)

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import torch
from sklearn.metrics import classification_report

# Prefer the GPU whenever PyTorch can see one
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Claim/evidence table read from the mounted Drive folder
csv_file_path = "/content/drive/MyDrive/CS 5787 - Deep Learning/Project - Deep Learning/GitHub/generated_claim_triplets_with_topics.csv"
data = pd.read_csv(csv_file_path)

# Abort early unless every column used further down is present
required_columns = {"Evidence", "Supports", "Contradicts", "Ambiguous"}
if not required_columns <= set(data.columns):
    raise ValueError(f"CSV file must contain the following columns: {required_columns}")

# Retrieval model name and classifier checkpoint directory
bi_encoder_model_name = "pritamdeka/PubMedBERT-mnli-snli-scinli-stsb"
cross_encoder_model_path = "/content/drive/MyDrive/fine_tuned_cross_encoder"

# Instantiate each model, then place it on the selected device
bi_encoder = SentenceTransformer(bi_encoder_model_name)
bi_encoder = bi_encoder.to(device)
tokenizer = AutoTokenizer.from_pretrained(cross_encoder_model_path)
cross_encoder = AutoModelForSequenceClassification.from_pretrained(cross_encoder_model_path)
cross_encoder = cross_encoder.to(device)


def remove_duplicate_evidence(evidence_pool):
    """Return `evidence_pool` without repeated entries, keeping first-seen order.

    `dict.fromkeys` is used instead of `set` so the resulting order is the same
    on every kernel run; set ordering of strings varies between runs, which
    would make top-k tie-breaking non-reproducible.
    """
    return list(dict.fromkeys(evidence_pool))


def retrieve_top_evidences(claim, evidence_pool, bi_encoder, top_k=5):
    """Return the evidences most similar to `claim` together with their scores.

    Args:
        claim: Claim text to embed.
        evidence_pool: List of candidate evidence strings.
        bi_encoder: Model exposing `encode(texts, convert_to_tensor=True)`.
        top_k: Maximum number of evidences to return.

    Returns:
        `(evidences, scores)`: two parallel lists ordered by decreasing cosine
        similarity. Both are empty when the pool is empty or `top_k` <= 0.
    """
    k = min(top_k, len(evidence_pool))
    if k <= 0:
        return [], []

    claim_embedding = bi_encoder.encode([claim], convert_to_tensor=True).to(device)
    evidence_embeddings = bi_encoder.encode(evidence_pool, convert_to_tensor=True).to(device)
    # Take row 0 instead of squeeze(): squeeze() turns a one-item pool into a
    # 0-dim tensor, whose topk indices are no longer a list.
    similarities = util.pytorch_cos_sim(claim_embedding, evidence_embeddings)[0]

    top = torch.topk(similarities, k=k)
    top_k_indices = top.indices.cpu().tolist()
    top_k_evidences = [evidence_pool[i] for i in top_k_indices]
    top_k_scores = top.values.cpu().tolist()

    return top_k_evidences, top_k_scores

def classify_claim_with_threshold(claim, best_evidence, cross_encoder, tokenizer, similarity,
                                  supports_threshold=0.6, ambiguous_threshold=0.3):
    """Label a claim from retrieval similarity, with a cross-encoder override.

    The similarity decides the label ("Supports" at or above
    `supports_threshold`, "Ambiguous" at or above `ambiguous_threshold`,
    otherwise "Contradicts") unless the cross-encoder's most probable class is
    "Ambiguous", in which case "Ambiguous" is returned.

    Args:
        claim: Claim text.
        best_evidence: Evidence text paired with the claim.
        cross_encoder: Sequence-classification model called as
            `cross_encoder(**inputs)`; must expose `.logits` on its output and
            a `.device` attribute.
        tokenizer: Tokenizer used to encode the pair.
        similarity: Similarity score between claim and evidence.
        supports_threshold: Lower bound of the "Supports" band.
        ambiguous_threshold: Lower bound of the "Ambiguous" band.

    Returns:
        `(label, confidence)` where `confidence` is the cross-encoder's highest
        class probability.
    """
    # NOTE(review): low similarity is mapped to "Contradicts" although it may
    # only mean the evidence is unrelated; confirm this is intended.
    if similarity >= supports_threshold:
        similarity_based_class = "Supports"
    elif similarity >= ambiguous_threshold:
        similarity_based_class = "Ambiguous"
    else:
        similarity_based_class = "Contradicts"

    inputs = tokenizer(
        [claim],
        [best_evidence],
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(cross_encoder.device)  # follow the model instead of a notebook-level global

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = cross_encoder(**inputs).logits
    scores = torch.softmax(logits, dim=1).cpu().numpy()

    label_map = {0: "Supports", 1: "Contradicts", 2: "Ambiguous"}
    # A single pair was encoded, so row 0 holds its class probabilities.
    cross_encoder_class = label_map[int(scores[0].argmax())]

    # Similarity-based label wins unless the cross-encoder says "Ambiguous".
    if cross_encoder_class == "Ambiguous":
        return "Ambiguous", scores.max()
    return similarity_based_class, scores.max()


def test_claims(test_cases, data, bi_encoder, cross_encoder, tokenizer, top_k=5):
    """Run retrieval + threshold classification per claim; report when labelled.

    The evidence pool is the de-duplicated, non-null "Evidence" column of
    `data`. Each claim is classified against its first retrieved evidence and
    that evidence's similarity score. Test cases carrying a "True Label" are
    summarised in a printed classification report.

    Returns:
        One dict per test case with keys "Claim", "Top Evidences", "Scores",
        "Best Evidence", "True Label", "Predicted Label", "Confidence" and
        "Similarity".
    """
    evidence_pool = remove_duplicate_evidence(data["Evidence"].dropna().tolist())

    def evaluate(test_case):
        claim = test_case["Claim"]
        true_label = test_case.get("True Label", "Unknown")
        top_evidences, scores = retrieve_top_evidences(claim, evidence_pool, bi_encoder, top_k)
        # Retrieval returns candidates in ranked order; take the first.
        best_evidence, best_similarity = top_evidences[0], scores[0]
        predicted_label, confidence = classify_claim_with_threshold(
            claim, best_evidence, cross_encoder, tokenizer, best_similarity
        )
        return {
            "Claim": claim,
            "Top Evidences": top_evidences,
            "Scores": scores,
            "Best Evidence": best_evidence,
            "True Label": true_label,
            "Predicted Label": predicted_label,
            "Confidence": confidence,
            "Similarity": best_similarity,
        }

    results = [evaluate(test_case) for test_case in tqdm(test_cases, desc="Processing Claims")]

    # Only labelled cases take part in the report.
    labelled = [entry for entry in results if entry["True Label"] != "Unknown"]
    true_labels = [entry["True Label"] for entry in labelled]
    predicted_labels = [entry["Predicted Label"] for entry in labelled]

    if true_labels:
        class_names = ["Supports", "Contradicts", "Ambiguous"]
        print("\nClassification Report:")
        print(classification_report(
            true_labels,
            predicted_labels,
            labels=class_names,
            target_names=class_names
        ))

    return results


# Claims pushed through the retrieval + classification pipeline
test_cases = [
    {"Claim": claim_text}
    for claim_text in (
        "Diagnosis of anaphylaxis mainly depends on clinical presentation and the patient's allergy history, as there are no specific tests available.",
        "Antiretroviral therapies do not need to be monitored closely in patients with liver disease associated with HIV infection.",
        "The overall safety of live attenuated vaccines for immunocompromised patients is still a topic of ongoing research.",
        "Prompt administration of epinephrine is essential for managing severe allergic reactions.",
    )
]

# Execute the pipeline with the default top_k
results = test_claims(test_cases, data, bi_encoder, cross_encoder, tokenizer)

# Per-claim report: ranked evidences first, then prediction details
for result in results:
    print(f"\nClaim: {result['Claim']}")
    print("Top Evidences:")
    idx = 0
    for evidence, score in zip(result["Top Evidences"], result["Scores"]):
        idx += 1
        print(f"  {idx}. {evidence} (Score: {score:.4f})")
    print(f"Best Evidence: {result['Best Evidence']}")
    print(f"Predicted Label: {result['Predicted Label']} (Confidence: {result['Confidence']:.4f})")
    print(f"Similarity: {result['Similarity']:.4f}")
    print(f"True Label: {result['True Label']}")


Training the cross encoder

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from tqdm import tqdm
import torch

# Prefer the GPU whenever PyTorch can see one
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Claim/evidence table read from the mounted Drive root
csv_file_path = "/content/drive/MyDrive/generated_claim_triplets_with_topics.csv"
data = pd.read_csv(csv_file_path)

# Abort early unless every column used further down is present
required_columns = {"Evidence", "Supports", "Contradicts", "Ambiguous"}
if not required_columns <= set(data.columns):
    raise ValueError(f"CSV file must contain the following columns: {required_columns}")

# The same hub checkpoint serves as retriever and as the fine-tuning start point
bi_encoder_model_name = "pritamdeka/PubMedBERT-mnli-snli-scinli-stsb"
cross_encoder_model_name = "pritamdeka/PubMedBERT-mnli-snli-scinli-stsb"

# Instantiate the retriever, then place it on the selected device
bi_encoder = SentenceTransformer(bi_encoder_model_name)
bi_encoder = bi_encoder.to(device)

def prepare_training_data(data):
    """Expand each row into up to three labelled (claim, evidence) pairs.

    For every row, the text in "Supports", "Contradicts" and "Ambiguous" is the
    claim (labels 0, 1 and 2 respectively) and the text in "Evidence" is the
    evidence. The claim goes under the "claim" key so that `tokenize_data`
    encodes pairs in (claim, evidence) order, the same order the inference
    cells of this notebook pass to the tokenizer. Pairs with a missing claim or
    evidence are skipped because the tokenizer only accepts strings.

    Args:
        data: DataFrame with "Evidence", "Supports", "Contradicts" and
            "Ambiguous" columns.

    Returns:
        List of dicts with keys "claim", "evidence" and "label".
    """
    label_ids = {"Supports": 0, "Contradicts": 1, "Ambiguous": 2}
    train_data = []
    for _, row in data.iterrows():
        evidence = row["Evidence"]
        if pd.isna(evidence):
            continue
        for column, label in label_ids.items():
            claim = row[column]
            if pd.isna(claim):
                continue
            train_data.append({
                "claim": claim,
                "evidence": evidence,
                "label": label
            })
    return train_data


def tokenize_data(train_data, tokenizer):
    """Tokenize training pairs and collect their labels.

    Args:
        train_data: List of dicts with "claim", "evidence" and "label" keys.
        tokenizer: Callable taking two parallel text lists plus tokenizer
            keyword options.

    Returns:
        `(encodings, train_labels)`: the tokenizer output for all pairs and the
        list of labels in the same order.
    """
    first_segments = [example["claim"] for example in train_data]
    second_segments = [example["evidence"] for example in train_data]
    train_labels = [example["label"] for example in train_data]

    # Truncate to 512 tokens, pad the batch, and return PyTorch tensors.
    tokenizer_options = dict(
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
    )
    encodings = tokenizer(first_segments, second_segments, **tokenizer_options)
    return encodings, train_labels

class CustomDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with integer labels."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> indexable batch; labels: sequence
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field at `idx` plus a "labels" tensor."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx]
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Train the CrossEncoder
def train_cross_encoder(data, output_dir="./fine_tuned_cross_encoder", seed=42):
    """Fine-tune a 3-class (Supports / Contradicts / Ambiguous) pair classifier.

    Builds labelled pairs from `data`, tokenizes them, splits 80/20 into
    train/eval sets, trains with the Hugging Face `Trainer`, and saves the
    model and tokenizer.

    Args:
        data: DataFrame accepted by `prepare_training_data`.
        output_dir: Directory the fine-tuned model and tokenizer are saved to.
        seed: Seed for the train/eval split and for `TrainingArguments`, so
            reruns produce the same split.

    Returns:
        The fine-tuned model.
    """
    # Prepare the dataset
    train_data = prepare_training_data(data)
    tokenizer = AutoTokenizer.from_pretrained(cross_encoder_model_name)
    encodings, labels = tokenize_data(train_data, tokenizer)
    dataset = CustomDataset(encodings, labels)

    # 80/20 split; the seeded generator keeps it identical across reruns.
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, eval_dataset = torch.utils.data.random_split(
        dataset,
        [train_size, val_size],
        generator=torch.Generator().manual_seed(seed),
    )

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(
        cross_encoder_model_name,
        num_labels=3  # Supports, Contradicts, Ambiguous
    ).to(device)

    # Training arguments
    # NOTE(review): recent transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy`; confirm against the installed version.
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",  # Evaluate at the end of every epoch
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir="./logs",
        seed=seed,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    trainer.train()

    # Persist model and tokenizer side by side so they can be reloaded together.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    return model

# Train the CrossEncoder, then reload the saved checkpoint through the
# sentence-transformers CrossEncoder wrapper: classify_claim() below calls
# `cross_encoder.predict(pairs)`, which the raw transformers model returned by
# train_cross_encoder() does not provide.
train_cross_encoder(data)
cross_encoder = CrossEncoder("./fine_tuned_cross_encoder", device=device)

# Remove duplicate evidence
def remove_duplicate_evidence(evidence_pool):
    """Return `evidence_pool` without repeated entries, keeping first-seen order.

    `dict.fromkeys` is used instead of `set` so the resulting order is the same
    on every kernel run; set ordering of strings varies between runs, which
    would make order-dependent steps downstream non-reproducible.
    """
    return list(dict.fromkeys(evidence_pool))

# Filter highly similar evidence
def filter_similar_evidence(evidence_pool, bi_encoder, threshold=0.9):
    """Greedily keep evidences that are not near-duplicates of an earlier kept one.

    Walks the pool in order; each evidence not yet marked is kept, and every
    evidence whose cosine similarity to it exceeds `threshold` is marked so it
    is skipped later.

    Args:
        evidence_pool: List of evidence strings.
        bi_encoder: Model exposing `encode(texts, convert_to_tensor=True)`.
        threshold: Cosine similarity above which two evidences count as duplicates.

    Returns:
        The kept evidences, in their original order.
    """
    embeddings = bi_encoder.encode(evidence_pool, convert_to_tensor=True).to(device)
    seen = torch.zeros(len(evidence_pool)).to(device)
    kept = []

    for position in range(len(embeddings)):
        if not seen[position]:
            kept.append(evidence_pool[position])
            # Similarity of this evidence to every evidence (itself included).
            row = util.pytorch_cos_sim(embeddings[position], embeddings).squeeze()
            seen += (row > threshold).int()

    return kept

# Balance the evidence pool
def balance_evidence_pool(data):
    """Concatenate equally sized slices of the three label columns.

    Takes the non-null values of "Supports", "Contradicts" and "Ambiguous",
    truncates each list to the length of the shortest, and returns them joined
    in that column order.
    """
    columns = [
        data[name].dropna().tolist()
        for name in ("Supports", "Contradicts", "Ambiguous")
    ]
    quota = min(len(values) for values in columns)

    balanced = []
    for values in columns:
        # Leading `quota` entries only; no shuffling is applied.
        balanced.extend(values[:quota])
    return balanced

# Retrieve diverse top-k evidences
def retrieve_top_evidences_with_mmr(claim, evidence_pool, bi_encoder, top_k=5, diversity=0.7):
    """Select up to ``top_k`` evidences for a claim using Maximal Marginal Relevance.

    The most claim-similar evidence is picked first. Each further pick
    maximises ``(1 - diversity) * sim(claim, candidate)
    - diversity * max(sim(candidate, already_selected))``, trading relevance
    against redundancy with what has already been chosen.

    Args:
        claim: Claim text to retrieve evidence for.
        evidence_pool: Sequence of candidate evidence texts.
        bi_encoder: Object exposing ``encode(texts, convert_to_tensor=True)``
            that returns one embedding row per text.
        top_k: Maximum number of evidences to return.
        diversity: Weight in [0, 1]; 0 ranks purely by relevance, larger values
            penalise similarity to already-selected evidence more strongly.

    Returns:
        tuple[list, list[float]]: The selected evidences in selection order and
        their cosine similarity to the claim. Both are empty when the pool is
        empty or ``top_k`` is not positive.
    """
    pool_size = len(evidence_pool)
    if pool_size == 0 or top_k <= 0:
        return [], []

    claim_embedding = bi_encoder.encode([claim], convert_to_tensor=True)
    evidence_embeddings = bi_encoder.encode(evidence_pool, convert_to_tensor=True)
    # Both tensors come from the same encoder; align devices defensively anyway.
    evidence_embeddings = evidence_embeddings.to(claim_embedding.device)

    # Cosine similarity = dot product of L2-normalised vectors.
    claim_unit = torch.nn.functional.normalize(claim_embedding, p=2, dim=1)
    evidence_unit = torch.nn.functional.normalize(evidence_embeddings, p=2, dim=1)
    similarities = (claim_unit @ evidence_unit.T).reshape(-1)
    pairwise = evidence_unit @ evidence_unit.T

    # sentence_transformers.util does not expose an `mmr` helper (the original
    # call raised AttributeError), so the selection loop is implemented here.
    selected_indices = [int(similarities.argmax())]
    candidates = [i for i in range(pool_size) if i != selected_indices[0]]

    while candidates and len(selected_indices) < top_k:
        cand = torch.tensor(candidates, device=similarities.device)
        chosen = torch.tensor(selected_indices, device=similarities.device)
        relevance = similarities[cand]
        # Redundancy of a candidate = its closest match among the chosen items.
        redundancy = pairwise[cand][:, chosen].max(dim=1).values
        mmr_scores = (1 - diversity) * relevance - diversity * redundancy
        selected_indices.append(candidates.pop(int(mmr_scores.argmax())))

    top_k_evidences = [evidence_pool[i] for i in selected_indices]
    top_k_scores = [similarities[i].item() for i in selected_indices]

    return top_k_evidences, top_k_scores

# Classify claim based on evidence
def classify_claim(claim, top_evidences, cross_encoder, evidence_dict):
    """Label a claim by the column its best-scoring evidence comes from.

    Every (claim, evidence) pair is scored with ``cross_encoder.predict``. The
    evidence with the highest score is then looked up in ``evidence_dict`` and
    the first label whose collection contains it is returned.

    Args:
        claim: Claim text.
        top_evidences: Candidate evidence texts for the claim.
        cross_encoder: Object exposing ``predict(pairs)``.
            NOTE(review): assumed to return one score per pair as an array
            with ``argmax`` (e.g. a numpy array) — confirm for the trained model.
        evidence_dict: Mapping of label -> collection of evidence texts,
            searched in the mapping's iteration order.

    Returns:
        tuple: ``(label, best_evidence)``. ``label`` is "Unknown" when the best
        evidence is in none of the collections; ``("Unknown", None)`` when
        ``top_evidences`` is empty.
    """
    # With no candidates there is nothing to score; argmax on an empty score
    # array would raise instead of yielding a usable result.
    if len(top_evidences) == 0:
        return "Unknown", None

    pairs = [[claim, ev] for ev in top_evidences]
    scores = cross_encoder.predict(pairs)
    # int() turns the array scalar from argmax into a plain list index.
    best_evidence = top_evidences[int(scores.argmax())]

    # Classify based on the column the evidence belongs to
    for label, evidence_list in evidence_dict.items():
        if best_evidence in evidence_list:
            return label, best_evidence
    return "Unknown", best_evidence

# Test function
def test_claims(test_cases, data, bi_encoder, cross_encoder, top_k=5):
    """Run evidence retrieval and classification for each test claim.

    Args:
        test_cases: Iterable of dicts, each with a "Claim" entry.
        data: Table with "Supports", "Contradicts" and "Ambiguous" columns.
        bi_encoder: Encoder handed to the similarity filter and the retriever.
        cross_encoder: Scorer handed to ``classify_claim``.
        top_k: Number of evidences to retrieve per claim.

    Returns:
        list[dict]: One record per claim with the keys "Claim",
        "Top Evidences", "Scores", "Classification" and "Best Evidence".
    """
    # Pool preparation: balance the labels, drop exact duplicates, then drop
    # near-duplicates.
    evidence_pool = filter_similar_evidence(
        remove_duplicate_evidence(balance_evidence_pool(data)),
        bi_encoder,
    )

    # Label lookup is built from the full (unbalanced, unfiltered) columns.
    evidence_dict = {
        label: data[label].dropna().tolist()
        for label in ("Supports", "Contradicts", "Ambiguous")
    }

    def evaluate(claim):
        # Retrieve candidates, then let the cross-encoder pick and label the best one.
        top_evidences, scores = retrieve_top_evidences_with_mmr(claim, evidence_pool, bi_encoder, top_k)
        classification, best_evidence = classify_claim(claim, top_evidences, cross_encoder, evidence_dict)
        return {
            "Claim": claim,
            "Top Evidences": top_evidences,
            "Scores": scores,
            "Classification": classification,
            "Best Evidence": best_evidence,
        }

    return [
        evaluate(test_case["Claim"])
        for test_case in tqdm(test_cases, desc="Processing Claims")
    ]

# Claims to evaluate, each wrapped in the {"Claim": ...} record that test_claims reads
test_cases = [
    {"Claim": claim_text}
    for claim_text in (
        "Diagnosis of anaphylaxis mainly depends on clinical presentation and the patient's allergy history, as there are no specific tests available.",
        "Careful management of asthma medications is crucial to prevent adverse outcomes for both mothers and infants.",
        "Dietary management strategies for PFAS might be effective, but their success can depend on individual reactions to specific foods.",
        "Prompt administration of epinephrine is essential for managing severe allergic reactions.",
    )
]

# Run the retrieval + classification pipeline over every claim (default top_k)
results = test_claims(test_cases, data, bi_encoder, cross_encoder)

# Print a short report per claim: ranked evidences with scores, then the verdict
for result in results:
    print(f"\nClaim: {result['Claim']}", "Top Evidences:", sep="\n")
    ranked_pairs = zip(result["Top Evidences"], result["Scores"])
    for idx, (evidence, score) in enumerate(ranked_pairs, start=1):
        print(f"  {idx}. {evidence} (Score: {score:.4f})")
    print(
        f"Classification: {result['Classification']}",
        f"Best Evidence: {result['Best Evidence']}",
        sep="\n",
    )
