In [2]:
pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import traceback
from collections import namedtuple
import pandas as pd
import openai
import torch

from transformers import AutoTokenizer, AutoModel

# Choose either BioBERT or SciBERT.
# Exactly one `model_name` assignment should be active; the tokenizer and the
# encoder below are both loaded from the same checkpoint name.
model_name = "dmis-lab/biobert-base-cased-v1.1"  # BioBERT
# model_name = "allenai/scibert_scivocab_uncased"  # SciBERT

# Loaded once at module level and handed explicitly to the helper functions
# (see `main`), so the checkpoint is not reloaded for every comparison.
# NOTE(review): `from_pretrained` presumably fetches the checkpoint from the
# Hugging Face hub when it is not cached locally - confirm for offline runs.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


def encode_texts(subj, rel, obj, tokenizer):
    """Tokenize one (subject, relation, object) triple as a single sequence.

    The three parts are joined with a literal ``" [SEP] "`` separator and the
    resulting text is handed to ``tokenizer`` with max-length padding,
    truncation, a length limit of 176 and PyTorch tensors requested.

    Args:
        subj: Subject of the triple.
        rel: Relation of the triple.
        obj: Object of the triple.
        tokenizer: Callable that accepts the text plus the keyword options
            listed above.

    Returns:
        Whatever ``tokenizer`` returns for the joined text.
    """
    joined_triple = " [SEP] ".join(f"{part}" for part in (subj, rel, obj))

    # Fixed-length output; adjust "max_length" to suit the task.
    tokenizer_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": 176,
        "return_tensors": "pt",
    }
    return tokenizer(joined_triple, **tokenizer_options)
def get_embeddings(encoded_input, model):
    """Return the first-position hidden state of every sequence in the batch.

    Args:
        encoded_input: Mapping of keyword arguments that is unpacked into
            the ``model`` call.
        model: Callable whose result exposes ``last_hidden_state``, indexed
            here as (batch, position, hidden).

    Returns:
        The slice ``last_hidden_state[:, 0, :]``, i.e. one vector per
        sequence taken from position 0 (the [CLS] token for BERT-style
        inputs).
    """
    # Inference only: the forward pass runs without autograd tracking.
    with torch.no_grad():
        hidden_states = model(**encoded_input).last_hidden_state
        first_token_vectors = hidden_states[:, 0, :]
    return first_token_vectors

from torch.nn.functional import cosine_similarity

def compute_similarity(triple1, triple2, tokenizer, model):
    """Return the cosine similarity between the embeddings of two triples.

    Args:
        triple1: Mapping with 'subj', 'rel' and 'obj' entries.
        triple2: Mapping with 'subj', 'rel' and 'obj' entries.
        tokenizer: Tokenizer forwarded to ``encode_texts``.
        model: Encoder forwarded to ``get_embeddings``.

    Returns:
        The similarity as a plain Python number (via ``.item()``).
    """
    # Tokenize both triples first, then embed them, in argument order.
    encoded_pair = [
        encode_texts(triple['subj'], triple['rel'], triple['obj'], tokenizer)
        for triple in (triple1, triple2)
    ]
    first_vec, second_vec = [get_embeddings(tokens, model) for tokens in encoded_pair]

    return cosine_similarity(first_vec, second_vec).item()



def create_row(df, idx):
    """Return the triple at position ``idx`` of ``df`` as normalized strings.

    Args:
        df: DataFrame with 'subj', 'rel' and 'obj' columns.
        idx: Positional row index (looked up with ``df.iloc``).

    Returns:
        A dict with keys 'subj', 'rel' and 'obj'; each value is the cell
        converted with ``str()``, stripped of surrounding whitespace and
        lower-cased.
    """
    record = df.iloc[idx]
    return {
        field: str(record[field]).strip().lower()
        for field in ('subj', 'rel', 'obj')
    }

def semantic_similarity_score_biobert(triple1, triple2, tokenizer, model):
    """Compute similarity using BioBERT/SciBERT embeddings.

    Args:
        triple1: Mapping with 'subj', 'rel' and 'obj' entries.
        triple2: Mapping with 'subj', 'rel' and 'obj' entries.
        tokenizer: Tokenizer forwarded to ``compute_similarity``.
        model: Encoder forwarded to ``compute_similarity``.

    Returns:
        A ``(similarity, reason)`` tuple: the cosine similarity of the two
        triples and a human-readable string quoting it to two decimals.
    """
    # The weighted per-term scoring that used to follow the return was
    # unreachable and referenced undefined names, so it has been removed.
    similarity = compute_similarity(triple1, triple2, tokenizer, model)
    reason = f"Cosine similarity: {similarity:.2f}"
    return similarity, reason

def evaluate_matches_with_accuracy(ground_truth, llm_output, tokenizer, model, similarity_threshold=0.5):
    """Evaluate matches between ground truth and LLM output with accuracy calculation.

    Every ground-truth triple is scored against every LLM triple with
    ``semantic_similarity_score_biobert``; the highest-scoring LLM triple is
    kept as its match. A ground-truth row counts as correct when a best match
    exists and its score is at least ``similarity_threshold``.

    Args:
        ground_truth: DataFrame with 'subj', 'rel' and 'obj' columns.
        llm_output: DataFrame with 'subj', 'rel' and 'obj' columns.
        tokenizer: Tokenizer forwarded to the similarity function.
        model: Encoder forwarded to the similarity function.
        similarity_threshold: Minimum best score for a row to count as a
            correct match.

    Returns:
        A ``(result_df, accuracy)`` tuple. ``result_df`` has one row per
        ground-truth triple with the columns 'ground_truth',
        'best_llm_output', 'best_evaluation_score' and 'reason'.
        ``accuracy`` is the fraction of correct rows, or 0.0 when
        ``ground_truth`` is empty.
    """
    # Normalize the LLM rows once; they are reused for every ground-truth row.
    llm_rows = [create_row(llm_output, j) for j in range(len(llm_output))]

    total_rows = len(ground_truth)
    best_matches = []
    correct_matches = 0  # Count of matches at or above the similarity threshold

    for i in range(total_rows):
        gt_row = create_row(ground_truth, i)
        best_score = 0
        best_reason = 'No matches found'
        best_row = None  # The LLM triple that produced best_score, if any

        for llm_row in llm_rows:
            score, reason = semantic_similarity_score_biobert(gt_row, llm_row, tokenizer, model)
            if score > best_score:
                best_score, best_reason, best_row = score, reason, llm_row

        # A match needs an actual candidate row, not just a passing score
        # (which could otherwise happen with a threshold <= 0).
        is_match = best_row is not None and best_score >= similarity_threshold
        if is_match:
            correct_matches += 1

        # Report the best-scoring LLM triple itself, not whichever row the
        # inner loop happened to visit last.
        gt_string = f"{gt_row['subj']} {gt_row['rel']} {gt_row['obj']}"
        if is_match:
            llm_string = f"{best_row['subj']} {best_row['rel']} {best_row['obj']}"
        else:
            llm_string = "No match found"

        best_matches.append({
            'ground_truth': gt_string,
            'best_llm_output': llm_string,
            'best_evaluation_score': best_score,
            'reason': best_reason,
        })

        # Print progress
        if i % 10 == 0:
            print(f"Processed {i} entries...")

    # Guard against an empty ground-truth table instead of dividing by zero.
    accuracy = correct_matches / total_rows if total_rows else 0.0

    # Display results
    print("\nEvaluation Results:")
    print(f"Total Rows: {total_rows}")
    print(f"Correct Matches (Score >= {similarity_threshold}): {correct_matches}")
    print(f"Accuracy: {accuracy:.3f}")

    # Save results to a DataFrame
    result_df = pd.DataFrame(best_matches)
    return result_df, accuracy

def main():
    """Run the triple-matching evaluation end to end.

    Reads "my_annotations.csv" (ground truth) and "my_temp.csv" (LLM output),
    scores them with a similarity threshold of 0.7 using the module-level
    ``tokenizer`` and ``model``, and writes the per-row results to
    "semantic_accuracy_evaluation_results.csv". Any exception is caught,
    reported on stdout together with its traceback, and not re-raised.
    """
    try:
        annotations = pd.read_csv("my_annotations.csv")
        predictions = pd.read_csv("my_temp.csv")

        # Presumably a leftover index column from an earlier CSV export;
        # only the LLM output is cleaned this way.
        stray_column = 'Unnamed: 0'
        if stray_column in predictions.columns:
            predictions = predictions.drop(columns=[stray_column])

        threshold = 0.7
        report, _accuracy = evaluate_matches_with_accuracy(
            annotations, predictions, tokenizer, model, threshold
        )

        report.to_csv("semantic_accuracy_evaluation_results.csv", index=False)
        print("\nResults saved to semantic_accuracy_evaluation_results.csv")

    except Exception as exc:
        # Top-level boundary: report the failure instead of crashing the run.
        print(f"Error in main execution: {exc}")
        print(traceback.format_exc())

# Run the evaluation only when this module is executed directly, not when it
# is imported.
if __name__ == "__main__":
    main()


Processed 0 entries...
Processed 10 entries...
Processed 20 entries...
Processed 30 entries...
Processed 40 entries...

Evaluation Results:
Total Rows: 50
Correct Matches (Score >= 0.7): 50
Accuracy: 1.000

Results saved to semantic_accuracy_evaluation_results.csv
