### Process labelbox data

In [71]:
import pandas as pd
import json

def process_labelbox(path):
    # Load your JSON data
    with open(path, 'r') as f:
        data = [json.loads(line) for line in f.readlines()]

    res = []
    for d in data:
        labels = list(d['projects'].values())[0]['labels']

        for label in labels:
            annotations = label['annotations']
            classifications = annotations['classifications']
            NERs = annotations['objects']

            value = dict(
                global_key=d['data_row']['global_key'],
                content=d['data_row']['row_data'],
                createdBy=label['label_details']['created_by']
            )

            for classification in classifications:
                if classification['name'] == 'Generic':
                    value['generic'] = classification['radio_answer']['name']
                else:
                    for answer in classification['checklist_answers']:
                        value['granular_' + answer['name']] = 1

            value[f'explanation_span'] = [ner['location'] for ner in NERs]
            

            res.append(value)

    # Convert to pandas DataFrame
    df = pd.DataFrame(res)

    granular_cols = [col for col in df.columns if 'granular' in col]

    df[granular_cols] = df[granular_cols].fillna(0)

    return df

In [72]:
df = process_labelbox('./labelbox/export-result_formal_coding.ndjson')
df

Unnamed: 0,global_key,content,createdBy,generic,granular_N/A,explanation_span,granular_Insult,granular_Toxicity,granular_Sexually explicit,granular_Identity attack,granular_Threat,granular_Profanity
0,1914491,can someone kill me please. i hate life and i ...,jing.lyu@ufl.edu,Safe,1.0,[],0.0,0.0,0.0,0.0,0.0,0.0
1,1853119,sorry ment equation and x is the amount of tim...,jing.lyu@ufl.edu,Safe,1.0,[],0.0,0.0,0.0,0.0,0.0,0.0
2,209082,female monkey y=0\nmale monkey x=0\nyou can te...,jing.lyu@ufl.edu,Safe,1.0,[],0.0,0.0,0.0,0.0,0.0,0.0
3,1216763,I kinda don't understand that very well. You c...,jing.lyu@ufl.edu,Safe,1.0,[],0.0,0.0,0.0,0.0,0.0,0.0
4,1200413,i dont care if you leave no one likes you jit,jing.lyu@ufl.edu,Unsafe,0.0,"[{'start': 25, 'end': 40}]",1.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4494,1385106,Hahahahahahahaha How The Fudge Is Cyber Bullyi...,jing.lyu@ufl.edu,Safe,1.0,[],0.0,0.0,0.0,0.0,0.0,0.0
4495,1320254,Brandon this is for people who need help with ...,jing.lyu@ufl.edu,Unsafe,0.0,"[{'start': 63, 'end': 74}]",1.0,0.0,0.0,0.0,0.0,0.0
4496,710106,"you do have to divide, but not for part c*** s...",jing.lyu@ufl.edu,Unsafe,0.0,"[{'start': 40, 'end': 43}]",0.0,1.0,0.0,0.0,0.0,1.0
4497,1651871,you ugly you yo daddys son aye go crazy aye g...,jing.lyu@ufl.edu,Unsafe,0.0,"[{'start': 5, 'end': 8}, {'start': 17, 'end': ...",1.0,1.0,0.0,0.0,0.0,0.0


In [70]:
df.createdBy.value_counts()

createdBy
zhang.zhen@ufl.edu    1126
jing.lyu@ufl.edu      1125
jia.ma@ufl.edu        1125
oh.h@ufl.edu          1123
Name: count, dtype: int64

In [46]:
df.to_csv('./labelbox_processed.csv', index=False)

### IRR calculation

In [57]:
import pandas as pd
import json

def process_labelbox(path):
    # Load your JSON data
    with open(path, 'r') as f:
        data = [json.loads(line) for line in f.readlines()]

    res = []
    for d in data:
        labels = list(d['projects'].values())[0]['labels']
        for label in labels:
            annotations = label['annotations']
            classifications = annotations['classifications']
            NERs = annotations['objects']

            value = dict(
                global_key=d['data_row']['global_key'],
                content=d['data_row']['row_data'],
                createdBy=list(d['projects'].values())[0]['labels'][0]['label_details']['created_by']
            )

            for classification in classifications:
                if classification['name'] == 'Generic':
                    value['generic'] = classification['radio_answer']['name']
                else:
                    for idx, answer in enumerate(classification['checklist_answers']):
                        value['granular_' + answer['name']] = 1

            value[f'explanation_span'] = [ner['location'] for ner in NERs]
            

            res.append(value)

    # Convert to pandas DataFrame
    df = pd.DataFrame(res)

    granular_cols = [col for col in df.columns if 'granular' in col]

    df[granular_cols] = df[granular_cols].fillna(0)

    return df

In [58]:
df = process_labelbox_irr('./labelbox/export-result_irr.ndjson')
df

Unnamed: 0,global_key,content,createdBy,granular_N/A,generic,explanation_span,granular_Insult,granular_Profanity,granular_Sexually explicit,granular_Threat,granular_Toxicity,granular_Identity attack
0,1531677,it sucks because i missed 4 days of learning f...,oh.h@ufl.edu,1.0,Safe,[],0.0,0.0,0.0,0.0,0.0,0.0
1,335283,"Omg, yes im so stupid, I thought 13 + 9 is 24....",jia.ma@ufl.edu,1.0,Safe,[],0.0,0.0,0.0,0.0,0.0,0.0
2,335283,"Omg, yes im so stupid, I thought 13 + 9 is 24....",jia.ma@ufl.edu,0.0,Unsafe,[],1.0,0.0,0.0,0.0,0.0,0.0
3,335283,"Omg, yes im so stupid, I thought 13 + 9 is 24....",jia.ma@ufl.edu,1.0,Safe,[],0.0,0.0,0.0,0.0,0.0,0.0
4,335283,"Omg, yes im so stupid, I thought 13 + 9 is 24....",jia.ma@ufl.edu,0.0,Unsafe,"[{'start': 15, 'end': 20}, {'start': 71, 'end'...",1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1996,1621342,ok let me sink that into my brain and see if t...,jia.ma@ufl.edu,1.0,Safe,[],0.0,0.0,0.0,0.0,0.0,0.0
1997,526253,"AMANDA , WATCH YOUR LANGUAGE , ABBREVIATION F...",jia.ma@ufl.edu,0.0,Unsafe,"[{'start': 49, 'end': 53}]",0.0,0.0,0.0,0.0,1.0,0.0
1998,526253,"AMANDA , WATCH YOUR LANGUAGE , ABBREVIATION F...",jia.ma@ufl.edu,1.0,Safe,[],0.0,0.0,0.0,0.0,0.0,0.0
1999,526253,"AMANDA , WATCH YOUR LANGUAGE , ABBREVIATION F...",jia.ma@ufl.edu,1.0,Safe,[],0.0,0.0,0.0,0.0,0.0,0.0
