In [1]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the exported annotation records. The context manager closes the file
# handle as soon as parsing is done instead of leaving it open.
ANNOTATION_PATH = '/home/yl3427/cylab/AAVE/vllm/aave_llama_final.json'
with open(ANNOTATION_PATH) as f:
    annot_results = json.load(f)

# Feature category names; the trailing comment on each entry is its index
# in this list. They are compared against each span's `paragraphlabels` value.
labels = [
    "Pre-verbal markers",  # 0
    "Verbal tense-number marking", # 1
    "Nouns and pronouns", # 2
    "Negation", # 3
    "Questions", # 4
    "Existential and locative construction", # 5
    "Lexical features", # 6
    "Phonological features", # 7
    "Out of list", # 8
]

# Feature category that the agreement and statistics cells below are run for.
LABEL = labels[8]

$$
P = \frac{2a}{2a + b + c}
$$

where $a$ is the number of identified features on which both annotators agree, and $b$ and $c$ are the numbers of features identified by only one annotator (Lee only and Chang only, respectively).

In [85]:
def get_overlap_ratio(range1, range2):
    """Return the overlap of two spans relative to their average length.

    Args:
        range1: ``(start, end)`` offsets of the first span.
        range2: ``(start, end)`` offsets of the second span.

    Returns:
        ``overlap / average_length``, where ``overlap`` is the length of the
        intersection of the two spans (0 when they are disjoint) and
        ``average_length`` is the mean of the two span lengths. Identical
        non-empty spans give 1.0. When the average length is not positive
        (e.g. both spans are empty) there is nothing to overlap and 0.0 is
        returned instead of dividing by zero.
    """
    start = max(range1[0], range2[0])
    end = min(range1[1], range2[1])
    overlap = max(0, end - start)
    average_length = (range1[1] - range1[0] + range2[1] - range2[0]) / 2
    if average_length <= 0:
        # Degenerate spans: the overlap is necessarily 0 here as well.
        return 0.0
    return overlap / average_length

# Lee-side pass: for every span Lee tagged with LABEL, check whether Chang
# tagged a sufficiently overlapping span with the same label in the same
# answer. Agreed spans are also collected into a synthetic "intersection"
# annotation (completed_by == 99) on each record.
a_s = {'total':0,} # Both(but from Lee)
b_s = {'total':0,} # only Lee
# c_s = {'total':0,} # only Chang
a_ids = []
both_ool = {}

# Two spans count as the same feature only if their overlap ratio is
# strictly greater than this value.
threshold = 0.8

answer_keys = ['answer1', 'answer2', 'answer3', 'answer4', 'answer5', 'answer6']
# The 'start' field of a span is the string index of the answer it belongs to.
answer_by_start = {str(pos): key for pos, key in enumerate(answer_keys)}


def _bucket_by_answer(results):
    """Group one annotator's span results by the answer they were marked in.

    Results whose ``value`` has no ``'start'`` key are skipped, as are spans
    whose ``'start'`` is not one of '0'..'5'. Spans are expected to start and
    end in the same answer (asserted).
    """
    buckets = {key: [] for key in answer_keys}
    for marked_body in results:
        value = marked_body['value']
        if 'start' not in value:
            continue
        assert value['start'] == value['end']
        key = answer_by_start.get(value['start'])
        if key is not None:
            buckets[key].append(marked_body)
    return buckets


for i in range(len(annot_results)):
    annotations = annot_results[i]['annotations']
    assert annotations[0]['completed_by'] == 4 # Lee
    assert annotations[1]['completed_by'] == 2 # Chang

    # Remove any intersection entry left over from a previous run of this
    # cell, so re-running does not stack extra entries or duplicate results.
    # With two annotators the fresh entry lands at index 2, where the
    # statistics cells read it.
    annotations[:] = [ann for ann in annotations if ann.get('completed_by') != 99]
    intersection = {'completed_by':99, 'result':[]} # intersection
    annotations.append(intersection)

    answers_lee = _bucket_by_answer(annotations[0]['result'])    # Lee's positive cases
    answers_chang = _bucket_by_answer(annotations[1]['result'])  # Chang's positive cases

    for answer in answer_keys:
        for lee_item in answers_lee[answer]:
            label1 = lee_item['value']['paragraphlabels'][0]
            if label1 != LABEL:
                continue
            range1 = (lee_item['value']['startOffset'], lee_item['value']['endOffset'])

            # Lee's span is agreed upon if any of Chang's spans in the same
            # answer carries the same label and overlaps it enough.
            agreed = any(
                chang_item['value']['paragraphlabels'][0] == label1
                and get_overlap_ratio(
                    range1,
                    (chang_item['value']['startOffset'], chang_item['value']['endOffset']),
                ) > threshold
                for chang_item in answers_chang[answer]
            )

            if agreed:
                a_s['total'] += 1
                a_s[label1] = a_s.get(label1, 0) + 1
                a_ids.append(lee_item['id'])
                intersection['result'].append(lee_item)
            else:
                b_s['total'] += 1
                b_s[label1] = b_s.get(label1, 0) + 1

In [86]:
# Chang-side pass: for every span Chang tagged with LABEL, look for spans by
# Lee in the same answer that carry the same label and overlap it by more
# than `threshold`. Counts agreed spans in `a_s` and Chang-only spans in `c_s`.
a_s = {'total':0,} # Both(but from Chang)
# b_s = {'total':0,} # only Lee
c_s = {'total':0,} # only Chang


threshold = 0.8

answer_names = ['answer1', 'answer2', 'answer3', 'answer4', 'answer5', 'answer6']
# start_values[k] is the 'start' value of spans that belong to answer_names[k].
start_values = ('0', '1', '2', '3', '4', '5')

for i, record in enumerate(annot_results):
    annotations = record['annotations']
    assert annotations[0]['completed_by'] == 4 # Lee
    assert annotations[1]['completed_by'] == 2 # Chang
    answers_lee = {name: [] for name in answer_names}
    answers_chang = {name: [] for name in answer_names}

    # Group each annotator's positive cases (Lee first, then Chang) by answer.
    for annotation, grouped in ((annotations[0], answers_lee), (annotations[1], answers_chang)):
        for entry in annotation['result']:
            marked_body = entry['value']
            if 'start' not in marked_body:
                continue
            assert marked_body['start'] == marked_body['end']

            start = marked_body['start']
            if start in start_values:
                grouped[answer_names[start_values.index(start)]].append(marked_body)

    for answer in answer_names:
        lee_spans = answers_lee[answer]
        for idx2, chang_span in enumerate(answers_chang[answer]):
            label2 = chang_span['paragraphlabels'][0]
            if label2 != LABEL:
                continue
            text2 = chang_span['text']
            range2 = (chang_span['startOffset'], chang_span['endOffset'])

            # Keep only the trailing run of best-overlap matches: a better
            # match replaces the last kept one, a worse match is dropped,
            # and an equal match is kept alongside.
            agreed_candidates = []
            for idx1, lee_span in enumerate(lee_spans):
                label1 = lee_span['paragraphlabels'][0]
                range1 = (lee_span['startOffset'], lee_span['endOffset'])
                overlap = get_overlap_ratio(range1, range2)
                if label1 != label2 or not overlap > threshold:
                    continue
                if agreed_candidates:
                    last_overlap = agreed_candidates[-1][-1]
                    if last_overlap > overlap:
                        continue
                    if last_overlap < overlap:
                        agreed_candidates.pop()
                agreed_candidates.append((idx2, idx1, overlap))

            # Report spans of Chang's that match more than one span of Lee's.
            if len(agreed_candidates) > 1:
                print(f"file {i}, {answer}, for {label2}, {text2}")
                print(agreed_candidates)

            tally = a_s if agreed_candidates else c_s
            tally['total'] += 1
            tally[label2] = tally.get(label2, 0) + 1

file 16, answer1, for Out of list, Been
[(14, 18, 1.0), (14, 19, 1.0), (14, 20, 1.0), (14, 21, 1.0), (14, 22, 1.0)]
file 16, answer1, for Out of list, Been
[(15, 18, 1.0), (15, 19, 1.0), (15, 20, 1.0), (15, 21, 1.0), (15, 22, 1.0)]
file 16, answer1, for Out of list, Been
[(16, 18, 1.0), (16, 19, 1.0), (16, 20, 1.0), (16, 21, 1.0), (16, 22, 1.0)]
file 16, answer1, for Out of list, Been
[(17, 18, 1.0), (17, 19, 1.0), (17, 20, 1.0), (17, 21, 1.0), (17, 22, 1.0)]
file 16, answer1, for Out of list, Been
[(18, 18, 1.0), (18, 19, 1.0), (18, 20, 1.0), (18, 21, 1.0), (18, 22, 1.0)]


In [87]:
labels = [
    "Pre-verbal markers",
    "Verbal tense-number marking",
    "Nouns and pronouns",
    "Negation",
    "Questions",
    "Existential and locative construction",
    "Lexical features",
    "Phonological features",
    "Out of list",
]

# Per-label agreement P = 2a / (2a + b + c), where a = agreed spans,
# b = spans only in b_s and c = spans only in c_s. A label with no agreed
# spans scores 0; "Support" is the number of agreed spans.
iaa_score = {}
for label in labels:
    a, b, c = (counts.get(label, 0) for counts in (a_s, b_s, c_s))
    support = a
    if a > 0:
        p = 2 * a / (2 * a + b + c)
    else:
        p = 0
    print(f"{a}, {b}, {c}, {p}")
    iaa_score[label] = {"Agreement": p, "Support": support}

0, 0, 0, 0
0, 0, 0, 0
0, 0, 0, 0
0, 0, 0, 0
0, 0, 0, 0
0, 0, 0, 0
0, 0, 0, 0
0, 0, 0, 0
183, 90, 12, 0.782051282051282


In [88]:
# Display the per-label "Agreement" and "Support" values computed above.
iaa_score

{'Pre-verbal markers': {'Agreement': 0, 'Support': 0},
 'Verbal tense-number marking': {'Agreement': 0, 'Support': 0},
 'Nouns and pronouns': {'Agreement': 0, 'Support': 0},
 'Negation': {'Agreement': 0, 'Support': 0},
 'Questions': {'Agreement': 0, 'Support': 0},
 'Existential and locative construction': {'Agreement': 0, 'Support': 0},
 'Lexical features': {'Agreement': 0, 'Support': 0},
 'Phonological features': {'Agreement': 0, 'Support': 0},
 'Out of list': {'Agreement': 0.782051282051282, 'Support': 183}}

agree  

# Stats

### Effect of prompt (total, per feature) → ANOVA, t-test

In [89]:
# total
# For every record, count the spans in the intersection annotation
# (annotations[2]) per answer, and file the six counts under the prompt
# selected by the record's position modulo 4.
prompt = ["CompP", "DemoP", "LingP", "BaseP"]

total_distribution = {name: [] for name in prompt}

answer_names = ['answer1', 'answer2', 'answer3', 'answer4', 'answer5', 'answer6']
# start_values[k] is the 'start' value of spans that belong to answer_names[k].
start_values = ('0', '1', '2', '3', '4', '5')

for i, record in enumerate(annot_results):
    answers = {name: [] for name in answer_names}

    for entry in record['annotations'][2]['result']:
        marked_body = entry['value']
        if 'start' not in marked_body:
            continue

        start = marked_body['start']
        if start in start_values:
            answers[answer_names[start_values.index(start)]].append(marked_body)

    total_distribution[prompt[i % 4]].extend(len(answers[name]) for name in answer_names)
                                

In [90]:
# Build a frame of per-answer counts for the three prompts selected below
# (BaseP is left out) and save it under a file name that includes LABEL.
selected_prompts = ['DemoP', 'LingP', 'CompP']
df = pd.DataFrame.from_dict(total_distribution)[selected_prompts]
df.to_csv(f'distribution_{LABEL}.csv', index=False)

In [91]:
import pandas as pd
pd.set_option('display.float_format', '{:.3f}'.format)
import pingouin as pg

# From Llama3
# Reshape to long format: one row per (subject, condition) pair with its
# score, where "subject" is the row index of `df` and "condition" the prompt.
df_long = (
    df.reset_index()
    .melt(id_vars='index', var_name='condition', value_name='score')
    .rename(columns={'index': 'subject'})
)

# Repeated-measures ANOVA of score across the prompt conditions.
rm_anova = pg.rm_anova(
    data=df_long,
    dv='score',
    within='condition',
    subject='subject',
    detailed=True,
)
print(rm_anova)
rm_anova

      Source     SS  DF    MS     F  p-unc  p-GG-corr   ng2   eps sphericity  \
0  condition 19.389   2 9.694 8.418  0.001      0.001 0.140 0.846      False   
1      Error 80.611  70 1.152   NaN    NaN        NaN   NaN   NaN        NaN   

   W-spher  p-spher  
0    0.819    0.033  
1      NaN      NaN  


Unnamed: 0,Source,SS,DF,MS,F,p-unc,p-GG-corr,ng2,eps,sphericity,W-spher,p-spher
0,condition,19.389,2,9.694,8.418,0.001,0.001,0.14,0.846,False,0.819,0.033
1,Error,80.611,70,1.152,,,,,,,,


In [92]:
# Pairwise parametric tests between prompt conditions, two-sided alternative,
# with descriptive statistics included in the result table.
post_hoc = pg.pairwise_tests(
    data=df_long,
    dv='score',
    within='condition',
    subject='subject',
    parametric=True,
    alternative='two-sided',
    return_desc=True,
)
post_hoc

Unnamed: 0,Contrast,A,B,mean(A),std(A),mean(B),std(B),Paired,Parametric,T,dof,alternative,p-unc,BF10,hedges
0,condition,CompP,DemoP,2.167,1.384,1.778,0.866,True,True,1.405,35.0,two-sided,0.169,0.441,0.333
1,condition,CompP,LingP,2.167,1.384,1.139,0.867,True,True,3.667,35.0,two-sided,0.001,38.069,0.881
2,condition,DemoP,LingP,1.778,0.866,1.139,0.867,True,True,3.333,35.0,two-sided,0.002,16.772,0.73


In [93]:
# Pairwise parametric tests between prompt conditions, one-sided
# ("greater") alternative, with descriptive statistics included.
post_hoc = pg.pairwise_tests(
    data=df_long,
    dv='score',
    within='condition',
    subject='subject',
    parametric=True,
    alternative='greater',
    return_desc=True,
)
post_hoc

Unnamed: 0,Contrast,A,B,mean(A),std(A),mean(B),std(B),Paired,Parametric,T,dof,alternative,p-unc,BF10,hedges
0,condition,CompP,DemoP,2.167,1.384,1.778,0.866,True,True,1.405,35.0,greater,0.084,0.881,0.333
1,condition,CompP,LingP,2.167,1.384,1.139,0.867,True,True,3.667,35.0,greater,0.0,76.139,0.881
2,condition,DemoP,LingP,1.778,0.866,1.139,0.867,True,True,3.333,35.0,greater,0.001,33.543,0.73


In [94]:
# Pairwise parametric tests between prompt conditions, one-sided ("less")
# alternative; descriptive statistics are not requested here.
post_hoc = pg.pairwise_tests(
    data=df_long,
    dv='score',
    within='condition',
    subject='subject',
    parametric=True,
    alternative='less',
)
post_hoc

Unnamed: 0,Contrast,A,B,Paired,Parametric,T,dof,alternative,p-unc,BF10,hedges
0,condition,CompP,DemoP,True,True,1.405,35.0,less,0.916,0.881,0.333
1,condition,CompP,LingP,True,True,3.667,35.0,less,1.0,0.013,0.881
2,condition,DemoP,LingP,True,True,3.333,35.0,less,0.999,0.03,0.73


In [95]:
# Show which feature label the cells above were run for.
LABEL

'Out of list'