In [1]:
import os

import requests

In [2]:
# Read the Hugging Face token from the environment instead of hardcoding it,
# so the secret never ends up in the notebook or its version history.
# NOTE(review): a literal token used to be committed on this line -- treat it
# as leaked and revoke/rotate it.
HUGGINGFACE_API_KEY = os.environ.get('HUGGINGFACE_API_KEY', '')

# Maps a task name to the URL of the inference endpoint that serves it.
HUGGINGFACE_TASKS = {
    'semantic_similarity': 'https://qv0ojt0iijtajfql.us-east-1.aws.endpoints.huggingface.cloud'
}


def protected_hf_endpoint(api_url, payload, timeout=30):
    """POST a JSON payload to a token-protected Hugging Face endpoint.

    Args:
        api_url: Full URL of the endpoint to call.
        payload: JSON-serializable request body.
        timeout: Seconds to wait for the server before giving up. Without an
            explicit timeout, requests can block indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If HUGGINGFACE_API_KEY is empty.
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If the endpoint does not answer within `timeout`.
    """
    # Fail early with a readable message rather than sending "Bearer " and
    # having to decode an authentication error later.
    if not HUGGINGFACE_API_KEY:
        raise RuntimeError("HUGGINGFACE_API_KEY is not set")

    headers = {
        "Authorization": f"Bearer {HUGGINGFACE_API_KEY}",
        "Content-Type": "application/json"
    }

    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    # Surface HTTP failures here instead of returning an error body that
    # callers would mistake for a result.
    response.raise_for_status()
    return response.json()


def run_hf_task(task_type, payload):
    """Send `payload` to the endpoint registered for `task_type`.

    Looks the task name up in HUGGINGFACE_TASKS and forwards the payload to
    protected_hf_endpoint, returning whatever that call returns.

    Raises:
        ValueError: If no (non-empty) URL is registered for `task_type`.
    """
    endpoint = HUGGINGFACE_TASKS.get(task_type)
    if endpoint:
        return protected_hf_endpoint(endpoint, payload)
    raise ValueError(f"Invalid task type: {task_type}")


def semantic_similarity(source_text, texts):
    """Score how semantically close `source_text` is to one or more texts.

    Args:
        source_text: The sentence every candidate is compared against.
        texts: A single string or a list of candidate strings.

    Returns:
        A single score when exactly one score comes back, a list of scores
        when several do, or None when the response carries no (or an empty)
        'similarities' entry.
    """
    # isinstance (rather than an exact type comparison) also wraps str
    # subclasses, which would otherwise be sent as a bare string.
    if isinstance(texts, str):
        texts = [texts]
    payload = {
        "inputs": {
            "source_sentence": source_text,
            "sentences": texts
        },
    }
    similarities = run_hf_task('semantic_similarity', payload).get('similarities')
    if not similarities:
        return None
    # A lone score is unwrapped so single-text calls get a scalar back.
    return similarities if len(similarities) > 1 else similarities[0]


In [3]:
# Demo: one candidate yields a single score, several candidates yield a list.
for candidate_texts in (
    ['I am a happy human'],
    ['I am a happy human', 'That is a happy dog'],
):
    print(semantic_similarity('I am a happy person', candidate_texts))

0.8614885807037354
[0.8614886403083801, 0.5496235489845276]


In [42]:
from nltk.translate.bleu_score import sentence_bleu


def calculate_bleu(reference, candidate):
    """Return the sentence-level BLEU score of `candidate` against `reference`.

    Both arguments are strings; each is split on whitespace before scoring.
    """
    reference_tokens, candidate_tokens = reference.split(), candidate.split()
    # sentence_bleu takes a list of references, so wrap the single reference.
    return sentence_bleu([reference_tokens], candidate_tokens)


def exact_match(text1, text2):
    """Return the result of comparing `text1` and `text2` with `==`."""
    is_same = text1 == text2
    return is_same

def rouge_n(reference, generated, n=1):
    """Naive ROUGE-N recall over unique word n-grams.

    This is a deliberately simple implementation; consider a dedicated
    library (e.g. 'rouge') for comprehensive calculations.

    Args:
        reference: Reference text as a string (split on whitespace) or an
            already-tokenized sequence.
        generated: Generated text, in the same form as `reference`.
        n: Size of the n-grams to compare.

    Returns:
        The fraction of distinct reference n-grams that also occur in the
        generated text, or 0.0 when the reference has no n-grams of size `n`.
        Repeated n-grams are counted once.
    """
    def _unique_ngrams(text):
        # Strings are tokenized into words; slicing the raw string would
        # compare character n-grams instead.
        tokens = text.split() if isinstance(text, str) else list(text)
        # Tuples keep each n-gram hashable, so token lists work too.
        return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

    ref_ngrams = _unique_ngrams(reference)
    if not ref_ngrams:
        # Reference is empty or shorter than n: avoid dividing by zero.
        return 0.0
    gen_ngrams = _unique_ngrams(generated)
    return len(ref_ngrams & gen_ngrams) / len(ref_ngrams)


def word_error_rate(reference, generated):
    """Basic word error rate: word-level edit distance / reference length.

    Args:
        reference: Reference text; split on whitespace into words.
        generated: Generated text; split on whitespace into words.

    Returns:
        The number of word edits divided by the number of reference words.
        For a reference with no words the ratio is undefined, so this returns
        0.0 when the generated text is also empty and float("inf") otherwise.
    """
    ref_words = reference.split()
    gen_words = generated.split()
    if not ref_words:
        # Guard against dividing by zero on an empty/whitespace-only reference.
        return 0.0 if not gen_words else float("inf")
    edits = edit_distance(ref_words, gen_words)
    return edits / len(ref_words)

def edit_distance(s1, s2):
    """Compute the Levenshtein edit distance between two sequences.

    Insertions, deletions and substitutions each cost 1. Elements are
    compared with `==`, so this works for lists of words as well as strings.

    Uses a two-row dynamic programme (time O(len(s1) * len(s2))) rather than
    naive recursion, whose running time grows exponentially with input length.

    Args:
        s1: First sequence.
        s2: Second sequence.

    Returns:
        The minimum number of single-element edits turning `s1` into `s2`.
    """
    # previous[j] is the distance between the processed prefix of s1 and s2[:j];
    # for the empty prefix that is simply j insertions.
    previous = list(range(len(s2) + 1))
    for i, item1 in enumerate(s1, start=1):
        # Turning s1[:i] into an empty sequence takes i deletions.
        current = [i]
        for j, item2 in enumerate(s2, start=1):
            cost = 0 if item1 == item2 else 1
            current.append(min(previous[j] + 1,          # delete item1
                               current[j - 1] + 1,       # insert item2
                               previous[j - 1] + cost))  # match / substitute
        previous = current
    return previous[-1]


In [62]:
# Note that BLEU is not a great metric for short sequences; this is just a demonstration.
# Even for two identical one-word inputs the score shown below is effectively zero
# (presumably because sentence_bleu's default weights expect higher-order n-grams
# that a one-word sentence cannot contain -- confirm against the NLTK docs).
calculate_bleu('hi', 'hi')

1.821831989445342e-231

In [63]:
def test_text_similarity(test_cases):
    
    for text1, text2, expected_exact, expected_similarity, expected_rouge, expected_wer, expected_bleu in test_cases:
        # Check exact match
        assert exact_match(text1, text2) == expected_exact, f"Failed for exact match test: {text1} vs {text2}"

        # Check semantic similarity
        similarity_score = semantic_similarity(text1, text2)
        assert abs(similarity_score - expected_similarity) <= 0.1, f"Failed for semantic similarity test: {text1} vs {text2}. Expected {expected_similarity}, got {similarity_score}"

        # Check ROUGE-N
        rouge_score = rouge_n(text1, text2, 1)
        assert abs(rouge_score - expected_rouge) <= 0.1, f"Failed for ROUGE-N test: {text1} vs {text2}. Expected {expected_rouge}, got {rouge_score}"

        # Check WER
        wer = word_error_rate(text1, text2)
        assert abs(wer - expected_wer) <= 0.1, f"Failed for WER test: {text1} vs {text2}. Expected {expected_wer}, got {wer}"
        
        # Check BLEU score
        bleu_score = calculate_bleu(text1, text2)
        assert abs(bleu_score - expected_bleu) <= 0.1, f"Failed for BLEU test: {text1} vs {text2}. Expected {expected_bleu}, got {bleu_score}"


    print("All tests passed!")


In [None]:
# Each case is (text1, text2, expected_exact, expected_similarity,
# expected_rouge, expected_wer, expected_bleu). Numeric expectations are
# compared with a tolerance of 0.1 inside test_text_similarity.
test_text_similarity(
    test_cases = [
            ("I love dogs", "I love dogs",    True,  1,   0.9, 0,  0),
            ("I love dogs", "I love canines", False, 0.9, 0.7, .34, 0)
        ]
)

In [None]:
# Each case is (text1, text2, expected_exact, expected_similarity,
# expected_rouge, expected_wer, expected_bleu). Numeric expectations are
# compared with a tolerance of 0.1 inside test_text_similarity.
# The third case adds a pair with lower expected similarity.
test_text_similarity(
    test_cases = [
            ("I love dogs", "I love dogs",    True,  1,   0.9, 0,  0),
            ("I love dogs", "I love canines", False, 0.9, 0.7, .34, 0), 
            ("I love dogs", "I like coffee",  False, 0.5, 0.4, .5, 0)
        ]
)