### Info
- Date: 2024-06-12
- Author: Reshama S
- Location: https://github.com/NoLaB-Lab/nlp-project1

### Description
- Evaluate human vs. AI transcripts

### ROUGE score
- A ROUGE score close to zero indicates poor similarity between candidate and references. 
- A ROUGE score close to one indicates strong similarity between candidate and references. 
- If candidate is identical to one of the reference documents, then score is 1.

### Levenshtein score
https://rapidfuzz.github.io/Levenshtein/levenshtein.html#distance

In [1]:
import evaluate
import pprint
from Levenshtein import distance
from Levenshtein import ratio

In [2]:
# Directories holding the two transcript sets that are compared.
# Each path keeps its trailing "/" because readtext() builds file paths by
# plain string concatenation (directory + filename + ".txt").
dir_human = "../data/transcripts-clinician/"
dir_ai = "../data/transcripts-whisper/"

# ROUGE results stored by evaltext(), keyed by transcript file name.
dict_scores = {}

In [3]:
#Ref: https://towardsdatascience.com/side-by-side-comparison-of-strings-in-python-b9491ac858

import difflib
import re

def tokenize(s):
    """Split ``s`` into tokens on runs of whitespace.

    Args:
        s: Text to split.

    Returns:
        A list of tokens. Because ``re.split`` is used, leading or trailing
        whitespace produces an empty-string token at that end, and an empty
        input yields ``['']``.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence,
    # which newer Python versions report with a warning.
    return re.split(r'\s+', s)
def untokenize(ts):
    """Glue a sequence of tokens back into one string, single-space separated."""
    separator = ' '
    return separator.join(ts)
        
def equalize(s1, s2):
    """Align two texts token by token so they can be printed side by side.

    Both inputs are tokenized and walked through the matching blocks found by
    ``difflib.SequenceMatcher``. A token that appears on only one side is
    mirrored on the other side by a run of underscores of the same length.

    Args:
        s1: First text.
        s2: Second text.

    Returns:
        A tuple ``(aligned_s1, aligned_s2)`` of space-joined strings.
    """
    tokens_a = tokenize(s1)
    tokens_b = tokenize(s2)
    out_a, out_b = [], []
    # Index just past the previous matching block on each side.
    end_a = end_b = 0
    matcher = difflib.SequenceMatcher(a=tokens_a, b=tokens_b)
    for block in matcher.get_matching_blocks():
        # Tokens only in s1 since the last match: pad s2 with underscores.
        only_a = tokens_a[end_a:block.a]
        out_b.extend('_' * len(tok) for tok in only_a)
        out_a.extend(only_a)
        # Tokens only in s2 since the last match: pad s1 with underscores.
        only_b = tokens_b[end_b:block.b]
        out_a.extend('_' * len(tok) for tok in only_b)
        out_b.extend(only_b)
        # The matching run itself goes to both sides unchanged.
        out_a.extend(tokens_a[block.a:block.a + block.size])
        out_b.extend(tokens_b[block.b:block.b + block.size])
        end_a = block.a + block.size
        end_b = block.b + block.size
    return untokenize(out_a), untokenize(out_b)

def insert_newlines(string, every=64, window=10):
    """Cut ``string`` into consecutive pieces of at most ``every`` characters.

    When more than ``every`` characters remain, the cut point starts at
    ``every`` and is moved left until the piece ends with a space or the cut
    point has reached ``every - window``.

    Args:
        string: Text to cut up.
        every: Maximum length of a piece.
        window: How far left of ``every`` the cut point may move.

    Returns:
        The list of pieces, in order; concatenated they give ``string`` back.
    """
    pieces = []
    remaining = string
    lower_bound = every - window
    while remaining:
        if len(remaining) <= every:
            # What is left fits on one line: take it whole.
            split_at = len(remaining)
        else:
            split_at = every
            while remaining[split_at - 1] != ' ' and split_at > lower_bound:
                split_at -= 1
        pieces.append(remaining[:split_at])
        remaining = remaining[split_at:]
    return pieces

def show_comparison(s1, s2, width=40, margin=10, sidebyside=True, compact=False):
    """Print two texts aligned against each other.

    The texts are first aligned with ``equalize``. With ``sidebyside`` false
    the two aligned strings are printed one after the other. Otherwise each is
    cut into lines with ``insert_newlines(text, width, margin)`` and printed
    as two columns separated by ' | '.

    Args:
        s1: Text for the left column.
        s2: Text for the right column.
        width: Column width; also the line length given to ``insert_newlines``.
        margin: Passed to ``insert_newlines`` as its ``window``.
        sidebyside: Print in two columns when true.
        compact: In column mode, drop underscores and collapse repeated spaces
            in each cell before padding it.
    """
    left_text, right_text = equalize(s1, s2)

    if not sidebyside:
        print(left_text)
        print(right_text)
        return

    left_rows = insert_newlines(left_text, width, margin)
    right_rows = insert_newlines(right_text, width, margin)

    def render(cell):
        # Compact mode strips the underscore padding added during alignment.
        if compact:
            cell = re.sub(' +', ' ', cell.replace('_', ''))
        return cell.ljust(width)

    for idx, left_cell in enumerate(left_rows):
        print(render(left_cell) + ' | ' + render(right_rows[idx]) + ' | ')

In [4]:
def readtext(filename, encoding="utf-8"):
    """Read the human and AI transcripts that share the base name ``filename``.

    The files are looked up as ``dir_human + filename + ".txt"`` and
    ``dir_ai + filename + ".txt"``.

    Args:
        filename: Base name of the transcript, without directory or ".txt".
        encoding: Text encoding used to decode both files. Defaults to
            "utf-8"; pass another codec for transcripts saved in a different
            encoding instead of relying on the platform default.

    Returns:
        A tuple ``(filename, content_human, content_ai)``.

    Raises:
        FileNotFoundError: If either transcript file does not exist.
        UnicodeDecodeError: If a file cannot be decoded with ``encoding``.
    """
    # Context managers close each file even when read() raises
    # (for example on a decoding error).
    with open(dir_human + filename + ".txt", "r", encoding=encoding) as file_human:
        content_human = file_human.read()

    with open(dir_ai + filename + ".txt", "r", encoding=encoding) as file_ai:
        content_ai = file_ai.read()

    return filename, content_human, content_ai

In [5]:
def evaltext(filename, content_human, content_ai):
    """Score an AI transcript against the human transcript and print the results.

    Computes the ROUGE metric (loaded through ``evaluate``) with the AI text
    as prediction and the human text as reference, stores that result in
    ``dict_scores[filename]``, and prints the Levenshtein distance, the
    Levenshtein ratio and the distance normalized by the longer text length.

    Args:
        filename: Key under which the ROUGE result is stored.
        content_human: Reference transcript text.
        content_ai: Candidate transcript text.

    Returns:
        None. Results are printed and the ROUGE result is stored in
        ``dict_scores``.
    """
    # load the metric (from Hugging Face)
    # (loading "accuracy" instead was tried and gives an error)
    score = evaluate.load('rouge')

    results = score.compute(predictions=[content_ai],
                            references=[content_human])
    print(results)
    dict_scores[filename] = results

    # The edit distance is computed once; the "disagreement" and "distance"
    # lines below report this same value.
    levenshtein_distance = distance(content_human, content_ai)
    print(f"Levenshtein disagreement: {levenshtein_distance}")

    ratiov = ratio(content_human, content_ai)
    # Label spelling is kept unchanged so output stays comparable with earlier runs.
    print(f"Levensshtein ratio: {ratiov}")

    print(f"Levenshtein distance: {levenshtein_distance}")

    # Calculate normalized distance (between 0 and 1).
    # Two empty texts are identical, so report 0.0 instead of dividing by zero.
    sentence_length = max(len(content_human), len(content_ai))
    if sentence_length:
        normalized_distance = levenshtein_distance / sentence_length
    else:
        normalized_distance = 0.0

    print(f"Normalized Levenshtein distance: {normalized_distance}")

    print('-' * 52)


In [6]:
def comptext(filename, content_human, content_ai):
    """Print the human and AI transcripts side by side in 50-character columns.

    ``filename`` is accepted but not used by this function.
    """
    show_comparison(
        content_human,
        content_ai,
        width=50,
        sidebyside=True,
        compact=False,
    )

In [7]:
def runanalysis(patientnum, printcomp):
    """Run the full comparison for one transcript pair.

    Prints the identifier, reads both transcripts, scores them with
    ``evaltext`` and, only when ``printcomp`` equals 1, prints the
    side-by-side comparison as well.

    Args:
        patientnum: Base file name of the transcript pair.
        printcomp: Pass 1 to also print the side-by-side comparison.
    """
    print(patientnum)
    name, human_text, ai_text = readtext(patientnum)
    evaltext(name, human_text, ai_text)
    if printcomp != 1:
        return
    comptext(name, human_text, ai_text)


In [8]:
# Transcript pairs to score, in run order. Entries that are commented out are
# skipped for the reason noted directly above them.
recordings = [
    "AJ_IMG_3334",
    "AJ_IMG_3335",

    "AP_IMG_3383",
    "AP_IMG_3384",

    "BM_IMG_3480",
    "BM_IMG_3481",

    # recordings not found in the data-media directory, although clinician
    # transcripts are available
    # "HC1_RF_IMG_3240",
    # "HC1_RF_IMG_3241",

    # typo in file name (fixed in code repo); file name updated, "." replaced by "_"
    # reading fails with a UTF-8 decoding error:
    # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd5 in position 116: invalid continuation byte
    # "HC2_SD_5_7_24_BestTrip",
    # "HC2_SD_5_7_24_FirstJob",

    "MW_IMG_3200",
    "MW_IMG_3201",

    # recording: YES; clinician transcript: NO
    # "PB_2.27.24_BestTrip",

    "PG_IMG_3189",
    "PG_IMG_3190",

    # recording: YES; clinician transcript: NO
    # "RF_IMG_3240",
    # "RF_IMG_3241",

    # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd5 in position 97: invalid continuation byte
    # "SF_2.6.24_BestTrip",

    # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd5 in position 82: invalid continuation byte
    # "SF_2.6.24_Childhood_memory",

    # clinician transcript name changed from 04.02.2024 to 04.03.2024
    # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd5 in position 57: invalid continuation byte
    # "SG_BestTrip_04.03.2024",

    # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd5 in position 3: invalid continuation byte
    # "SG_FirstJob_04.03.2024",

    "SS_IMG_2862",
    "SS_IMG_2863",
]

# Score every pair; the side-by-side comparison is not printed in this batch run.
for recording in recordings:
    runanalysis(recording, printcomp=0)




AJ_IMG_3334
{'rouge1': 0.945054945054945, 'rouge2': 0.8741721854304636, 'rougeL': 0.9362637362637363, 'rougeLsum': 0.9362637362637363}
Levenshtein disagreement: 120
Levensshtein ratio: 0.9311445508435933
Levenshtein distance: 120
Normalized Levenshtein distance: 0.10879419764279238
----------------------------------------------------
AJ_IMG_3335
{'rouge1': 0.8763440860215055, 'rouge2': 0.7891891891891892, 'rougeL': 0.8709677419354839, 'rougeLsum': 0.8709677419354839}
Levenshtein disagreement: 175
Levensshtein ratio: 0.8883903533370724
Levenshtein distance: 175
Normalized Levenshtein distance: 0.18248175182481752
----------------------------------------------------
AP_IMG_3383
{'rouge1': 0.9320388349514563, 'rouge2': 0.848780487804878, 'rougeL': 0.9126213592233011, 'rougeLsum': 0.9126213592233011}
Levenshtein disagreement: 146
Levensshtein ratio: 0.8980021030494216
Levenshtein distance: 146
Normalized Levenshtein distance: 0.15320041972717732
--------------------------------------------

In [9]:
#runanalysis("AJ_IMG_3334", printcomp=1)

In [10]:
# highest score
#runanalysis("RF_IMG_3241", printcomp=1)

In [11]:
# lowest score
#runanalysis("SS_IMG_2863", printcomp=1)

In [12]:
# Baseline: both texts are the same
#runanalysis("test", printcomp=1)

In [13]:
# remove commas
#runanalysis("test1", printcomp=1)

In [14]:
# remove all periods
#runanalysis("test2", printcomp=1)

In [15]:
# texting types of words
#runanalysis("test3", printcomp=1)

## Lowest patient score

In [16]:
# lowest score
#runanalysis("SS_IMG_2863-ptonly", printcomp=1)

In [17]:
# lowest score
# check clinical vs Assembly AI
# Note: manually removed Clinician 
# Conclusion: Assembly AI has LOWER accuracy in transcription than Whisper
# Assembly AI:  
# Whisper AI:  

#runanalysis("assemblyai-1speaker-2863", printcomp=1)

In [18]:
# lowest score
#runanalysis("SS_IMG_2863", printcomp=1)

In [19]:
# patient text only (exclude clinicians)
#runanalysis("SS_IMG_2863-ptonly", printcomp=1)


In [20]:
# fix spelling:
# remeber   ==> remember
# wasnt     ==> wasn't
# andall    ==> and all
# NOTE: readtext() opens "<name>.txt" in both transcript directories. The
# recorded run of this cell stopped with FileNotFoundError because
# ../data/transcripts-clinician/SS_IMG_2863-ptonly-clean.txt did not exist.
runanalysis("SS_IMG_2863-ptonly-clean", printcomp=1)

SS_IMG_2863-ptonly-clean


FileNotFoundError: [Errno 2] No such file or directory: '../data/transcripts-clinician/SS_IMG_2863-ptonly-clean.txt'

In [None]:
# fix
#     ==>
#      ==>
#      ==>
# TODO: the placeholders above were never filled in; list the corrections
# applied to the "clean2" transcript.
# NOTE: readtext() opens "SS_IMG_2863-ptonly-clean2.txt" in both transcript
# directories, so both files must exist before this cell is run.
runanalysis("SS_IMG_2863-ptonly-clean2", printcomp=1)

In [None]:
#dict_scores