### Info
- Date: 2024-05-01
- Author: Reshama S
- Location: https://github.com/NoLaB-Lab/nlp-project1

### Description
- Evaluate human vs. AI transcripts

In [1]:
import evaluate
import pprint
from Levenshtein import distance
from Levenshtein import ratio

In [2]:
# Directory holding the human (clinician) transcripts, relative to this notebook.
dir_human = "../data/transcripts-clinician/"
# Directory holding the AI transcripts (path suggests Whisper output - confirm).
dir_ai = "../data/transcripts-whisper/"

# Filled by evaltext(): maps each transcript filename to its ROUGE results.
dict_scores = {}

In [3]:
# Metrics already loaded from Hugging Face, keyed by metric name.
_METRIC_CACHE = {}


def _load_metric(name):
    """Return the Hugging Face metric `name`, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def evaltext(filename):
    """Score one AI transcript against its human counterpart and print the results.

    Reads ``<filename>.txt`` from both ``dir_human`` and ``dir_ai``, computes
    ROUGE (AI text as the prediction, human text as the reference) plus the
    Levenshtein distance and ratio between the two texts, prints every value,
    and stores the ROUGE results in ``dict_scores[filename]``.

    Parameters
    ----------
    filename : str
        Transcript base name, without the ``.txt`` extension.

    Raises
    ------
    OSError
        If either transcript file cannot be opened.
    """
    print(filename)
    # Context managers close the files even when read() raises.
    with open(dir_human + filename + ".txt", "r") as file_human:
        content_human = file_human.read()
    with open(dir_ai + filename + ".txt", "r") as file_ai:
        content_ai = file_ai.read()

    # Load the metric (from Hugging Face); cached so a batch of calls loads it once.
    # NOTE: the original author recorded that evaluate.load("accuracy") gives an error.
    score = _load_metric('rouge')

    results = score.compute(predictions=[content_ai],
                            references=[content_human])
    print(results)
    dict_scores[filename] = results

    # Computed once; it is reported under both labels below.
    levenshtein_distance = distance(content_human, content_ai)
    print(f"Levenshtein disagreement: {levenshtein_distance}")

    ratiov = ratio(content_human, content_ai)
    print(f"Levenshtein ratio: {ratiov}")

    # Calculate normalized distance (between 0 and 1)
    print(f"Levenshtein distance: {levenshtein_distance}")
    sentence_length = max(len(content_human), len(content_ai))
    # Two empty transcripts are identical: report 0.0 instead of dividing by zero.
    if sentence_length:
        normalized_distance = levenshtein_distance / sentence_length
    else:
        normalized_distance = 0.0

    print(f"Normalized Levenshtein distance: {normalized_distance}")

    print('-' * 50)
    
# Score every transcript pair, in the same order as the original call list.
for transcript_name in (
    "AJ_IMG_3334",
    "AJ_IMG_3335",
    "AP_IMG_3383",
    "AP_IMG_3384",
    "BM_IMG_3480",
    "BM_IMG_3481",
    "MW_IMG_3200",
    "MW_IMG_3201",
    "PG_IMG_3189",
    "PG_IMG_3190",
    "RF_IMG_3240",
    "RF_IMG_3241",
    "SS_IMG_2862",
    "SS_IMG_2863",
):
    evaltext(transcript_name)

AJ_IMG_3334
{'rouge1': 0.912751677852349, 'rouge2': 0.8224719101123596, 'rougeL': 0.9038031319910516, 'rougeLsum': 0.9038031319910516}
Levenshtein disagreement: 156
Levensshtein ratio: 0.9153952843273232
Levenshtein distance: 156
Normalized Levenshtein distance: 0.14156079854809436
--------------------------------------------------
AJ_IMG_3335
{'rouge1': 0.8080229226361031, 'rouge2': 0.7146974063400576, 'rougeL': 0.8080229226361031, 'rougeLsum': 0.8080229226361031}
Levenshtein disagreement: 303
Levensshtein ratio: 0.8016928657799275
Levenshtein distance: 303
Normalized Levenshtein distance: 0.3159541188738269
--------------------------------------------------
AP_IMG_3383
{'rouge1': 0.8942307692307694, 'rouge2': 0.8019323671497584, 'rougeL': 0.8749999999999999, 'rougeLsum': 0.8894230769230769}
Levenshtein disagreement: 226
Levensshtein ratio: 0.8469493278179938
Levenshtein distance: 226
Normalized Levenshtein distance: 0.22944162436548224
------------------------------------------------

### ROUGE score
- A ROUGE score close to zero indicates poor similarity between candidate and references. 
- A ROUGE score close to one indicates strong similarity between candidate and references. 
- If candidate is identical to one of the reference documents, then score is 1.

### Levenshtein score
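- `distance`: the minimum number of single-character insertions, deletions and substitutions needed to turn one text into the other (0 means identical).
- `ratio`: a normalized similarity between 0 and 1 (1 means identical).
- Reference: https://rapidfuzz.github.io/Levenshtein/levenshtein.html#distance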

In [None]:
#dict_scores

In [4]:
#Ref: https://towardsdatascience.com/side-by-side-comparison-of-strings-in-python-b9491ac858

import difflib
import re

def tokenize(s):
    """Split `s` on runs of whitespace and return the list of pieces.

    Leading or trailing whitespace produces an empty string at that end of
    the list, because the split is done with ``re.split``.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.split(r'\s+', s)
def untokenize(ts):
    """Join the tokens in `ts` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(ts)
        
def equalize(s1, s2):
    """Pad two texts so that their words line up position by position.

    Both texts are tokenized and aligned with ``difflib.SequenceMatcher``.
    Every token that falls between two matching blocks in one text is mirrored
    in the other text by a run of underscores of the same length.

    Returns a tuple ``(padded_s1, padded_s2)`` of space-joined strings.
    """
    words1 = tokenize(s1)
    words2 = tokenize(s2)
    padded1, padded2 = [], []
    # Index just past the previous matching block in each token list.
    end1 = end2 = 0
    matcher = difflib.SequenceMatcher(a=words1, b=words2)
    for block in matcher.get_matching_blocks():
        only1 = words1[end1:block.a]
        only2 = words2[end2:block.b]
        # Tokens present only in s1: keep them on side 1, underscores on side 2.
        padded1.extend(only1)
        padded2.extend('_' * len(word) for word in only1)
        # Tokens present only in s2: underscores on side 1, keep them on side 2.
        padded1.extend('_' * len(word) for word in only2)
        padded2.extend(only2)
        # The matching run itself is copied to both sides.
        end1 = block.a + block.size
        end2 = block.b + block.size
        padded1.extend(words1[block.a:end1])
        padded2.extend(words2[block.b:end2])
    return untokenize(padded1), untokenize(padded2)

def insert_newlines(string, every=64, window=10):
    """Split `string` into consecutive chunks of at most `every` characters.

    Despite the name, no newline is inserted: the chunks are returned as a
    list, and joining them reproduces `string` exactly.

    While more than `every` characters remain, the cut starts at `every` and
    moves left until the chunk ends in a space or the cut reaches
    ``every - window`` (never lower than 1). The final chunk is whatever is left.

    Parameters
    ----------
    string : str
        Text to split.
    every : int
        Maximum chunk length; must be at least 1.
    window : int
        How many characters the cut may move left while looking for a space.

    Returns
    -------
    list of str
        The chunks in order; empty list for an empty `string`.

    Raises
    ------
    ValueError
        If `every` is smaller than 1.
    """
    if every < 1:
        raise ValueError("every must be a positive integer")
    # With window >= every the unclamped bound is <= 0: the cut could reach 0
    # (empty chunk, endless loop) or go negative (indexing from the end).
    lowest_cut = max(every - window, 1)
    result = []
    remaining = string
    while remaining:
        if len(remaining) > every:
            cut_off = every
            while remaining[cut_off - 1] != ' ' and cut_off > lowest_cut:
                cut_off -= 1
        else:
            cut_off = len(remaining)
        result.append(remaining[:cut_off])
        remaining = remaining[cut_off:]
    return result

def show_comparison(s1, s2, width=40, margin=10, sidebyside=True, compact=False):
    """Print two texts aligned word by word.

    The texts are first padded with ``equalize``. With ``sidebyside`` false the
    two padded texts are printed one after the other. Otherwise each is split
    into rows with ``insert_newlines(text, width, margin)`` and the rows are
    printed in two columns separated by `` | ``. With ``compact`` true the
    underscore padding is removed and runs of spaces are collapsed before each
    row is left-justified to ``width``.
    """
    aligned1, aligned2 = equalize(s1, s2)

    if not sidebyside:
        print(aligned1)
        print(aligned2)
        return

    rows1 = insert_newlines(aligned1, width, margin)
    rows2 = insert_newlines(aligned2, width, margin)

    def render(row):
        # Compact mode strips the underscore padding, then squeezes spaces.
        if compact:
            row = re.sub(' +', ' ', row.replace('_', ''))
        return row.ljust(width)

    for idx, row1 in enumerate(rows1):
        left = render(row1)
        right = render(rows2[idx])
        print(left + ' | ' + right + ' | ')

In [8]:
def comptext(filename):
    """Print a side-by-side comparison of the human and AI transcript.

    Reads ``<filename>.txt`` from both ``dir_human`` and ``dir_ai``, prints the
    filename and a separator line, then calls ``show_comparison`` with the
    human text on the left and the AI text on the right (width 50).

    Parameters
    ----------
    filename : str
        Transcript base name, without the ``.txt`` extension.

    Raises
    ------
    OSError
        If either transcript file cannot be opened.
    """
    print(filename)
    # Context managers close the files even when read() raises.
    with open(dir_human + filename + ".txt", "r") as file_human:
        content_human = file_human.read()
    with open(dir_ai + filename + ".txt", "r") as file_ai:
        content_ai = file_ai.read()

    print('-' * 50)
    show_comparison(content_human, content_ai, width=50, sidebyside=True, compact=False)

In [9]:
# Transcript the author flagged as the highest-scoring one: print its metrics,
# then the side-by-side word comparison.
evaltext("RF_IMG_3241")
comptext("RF_IMG_3241")

RF_IMG_3241
{'rouge1': 0.9210526315789473, 'rouge2': 0.847682119205298, 'rougeL': 0.9210526315789473, 'rougeLsum': 0.9144736842105264}
Levenshtein disagreement: 81
Levensshtein ratio: 0.9309878213802436
Levenshtein distance: 81
Normalized Levenshtein distance: 0.10857908847184987
--------------------------------------------------
RF_IMG_3241
--------------------------------------------------
 I'm also going to ask you to tell me a story      |  I'm also going to ask you to tell me a story      | 
about a bad, hopefully not too traumatic, but      | about a bad, hopefully not too traumatic, but      | 
like a bad childhood memory _______ if you have    | like a bad childhood ______ memory, if you have    | 
any ____ or adolescent ____________ something      | ___ any, or __________ adolescence, something      | 
that happened that wasn't too great ______ ___ I   | that happened that wasn't ___ _____ great. Oh, I   | 
remember when I was very young. well ______ _____  | remember when I w

In [10]:
# Transcript the author flagged as the lowest-scoring one: print its metrics,
# then the side-by-side word comparison.
evaltext("SS_IMG_2863")
comptext("SS_IMG_2863")

SS_IMG_2863
{'rouge1': 0.7400881057268723, 'rouge2': 0.5866666666666667, 'rougeL': 0.7312775330396477, 'rougeLsum': 0.7312775330396477}
Levenshtein disagreement: 208
Levensshtein ratio: 0.7803521779425394
Levenshtein distance: 208
Normalized Levenshtein distance: 0.3382113821138211
--------------------------------------------------
SS_IMG_2863
--------------------------------------------------
Now One  ___ ___ more thing I'm going ____ ___     | ___ ___  And one more thing ___ _____ I'll ask     | 
___ to have you tell me _____ is a childhood       | you to ____ ___ tell me about is a childhood       | 
memory that wasn't good, that was sad, ____ _ ___  | memory that wasn't good, ____ ___ ____ like a sad  | 
___ ___ _____ hopefully not too dramatic Yeah      | or, you know, hopefully not too ________ ____      | 
__________ ___ ____ _ ___ ______ ____ __________   | traumatic, but like a bad memory from childhood.   | 
_____ I have ___ _ ______ fairly good, you know,   | Yeah. I ____ ha