### Process Labelbox data

In [1]:
from google.colab import drive

drive.mount('/content/drive')

%cd drive/MyDrive/Colab Notebooks
#! ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks


In [2]:
import pandas as pd
import json

def process_labelbox(path):
    """Flatten a Labelbox NDJSON export into one DataFrame row per label.

    Every non-blank line of the file is parsed as one JSON data row. Only the
    first project found under ``projects`` is read. For each label in that
    project one output row is produced with:

    * ``global_key`` / ``content`` -- copied from ``data_row``.
    * ``createdBy`` -- ``label_details.created_by`` of that label.
    * ``generic`` -- name of the radio answer of the classification named
      ``'Generic'``.
    * ``granular_<name>`` -- 1 for every checklist answer of any other
      classification; filled with 0 where a label did not select it.
    * ``explanation_span`` -- list of the ``location`` entries of the
      label's annotated objects.

    Args:
        path: Path of the NDJSON export file.

    Returns:
        pandas.DataFrame with one row per label (empty if the file contains
        no labels).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the fields read above.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale; blank lines are skipped instead of crashing
    # the JSON parser.
    with open(path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    res = []
    for d in data:
        # Only the first project of the data row is considered.
        labels = list(d['projects'].values())[0]['labels']

        for label in labels:
            annotations = label['annotations']

            value = dict(
                global_key=d['data_row']['global_key'],
                content=d['data_row']['row_data'],
                createdBy=label['label_details']['created_by'],
            )

            for classification in annotations['classifications']:
                if classification['name'] == 'Generic':
                    value['generic'] = classification['radio_answer']['name']
                else:
                    # Classifications that carry no checklist answers
                    # contribute no granular flags.
                    for answer in classification.get('checklist_answers', []):
                        value['granular_' + answer['name']] = 1

            value['explanation_span'] = [
                ner['location'] for ner in annotations['objects']
            ]

            res.append(value)

    df = pd.DataFrame(res)

    # Flags are only set to 1 where selected; everything else becomes 0.
    granular_cols = [col for col in df.columns if col.startswith('granular_')]
    if granular_cols:
        df[granular_cols] = df[granular_cols].fillna(0)

    return df

In [3]:
# Flatten the formal-coding export into one row per label.
df = process_labelbox(path='export-result_formal_coding.ndjson')

In [4]:
# Display the frame without the annotator column; the result is not assigned,
# so `df` itself still contains `createdBy`.
df.drop(columns=['createdBy'])

Unnamed: 0,global_key,content,generic,granular_N/A,explanation_span,granular_Insult,granular_Toxicity,granular_Sexually explicit,granular_Identity attack,granular_Threat,granular_Profanity
0,1914491,can someone kill me please. i hate life and i ...,Safe,1.0,[],0.0,0.0,0.0,0.0,0.0,0.0
1,1853119,sorry ment equation and x is the amount of tim...,Safe,1.0,[],0.0,0.0,0.0,0.0,0.0,0.0
2,209082,female monkey y=0\nmale monkey x=0\nyou can te...,Safe,1.0,[],0.0,0.0,0.0,0.0,0.0,0.0
3,1216763,I kinda don't understand that very well. You c...,Safe,1.0,[],0.0,0.0,0.0,0.0,0.0,0.0
4,1200413,i dont care if you leave no one likes you jit,Unsafe,0.0,"[{'start': 25, 'end': 40}]",1.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
4494,1385106,Hahahahahahahaha How The Fudge Is Cyber Bullyi...,Safe,1.0,[],0.0,0.0,0.0,0.0,0.0,0.0
4495,1320254,Brandon this is for people who need help with ...,Unsafe,0.0,"[{'start': 63, 'end': 74}]",1.0,0.0,0.0,0.0,0.0,0.0
4496,710106,"you do have to divide, but not for part c*** s...",Unsafe,0.0,"[{'start': 40, 'end': 43}]",0.0,1.0,0.0,0.0,0.0,1.0
4497,1651871,you ugly you yo daddys son aye go crazy aye g...,Unsafe,0.0,"[{'start': 5, 'end': 8}, {'start': 17, 'end': ...",1.0,1.0,0.0,0.0,0.0,0.0


In [5]:
# Persist the processed labels without the DataFrame index.
# NOTE(review): `df` still holds the `createdBy` column at this point -- confirm
# that the exported CSV is meant to include it.
df.to_csv(path_or_buf='labelbox_processed_R.csv', index=False)

### IRR (inter-rater reliability) calculation

In [9]:
import pandas as pd
import json

def process_labelbox_irr(path):
    """Flatten a multi-annotator Labelbox NDJSON export, one row per label.

    Every non-blank line of the file is parsed as one JSON data row. Only the
    first project found under ``projects`` is read. A data row with several
    labels yields several output rows, each carrying:

    * ``global_key`` / ``content`` -- copied from ``data_row``.
    * ``createdBy`` -- ``label_details.created_by`` of that specific label.
    * ``generic`` -- name of the radio answer of the classification named
      ``'Generic'``.
    * ``granular_<name>`` -- 1 for every checklist answer of any other
      classification; filled with 0 where a label did not select it.
    * ``explanation_span`` -- list of the ``location`` entries of the
      label's annotated objects.

    Args:
        path: Path of the NDJSON export file.

    Returns:
        pandas.DataFrame with one row per label (empty if the file contains
        no labels).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the fields read above.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale; blank lines are skipped instead of crashing
    # the JSON parser.
    with open(path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    res = []
    for d in data:
        # Only the first project of the data row is considered.
        labels = list(d['projects'].values())[0]['labels']

        for label in labels:
            annotations = label['annotations']

            value = dict(
                global_key=d['data_row']['global_key'],
                content=d['data_row']['row_data'],
                # Read the creator from the label being processed (not from
                # labels[0]) so rows produced by different annotators for the
                # same data row remain distinguishable.
                createdBy=label['label_details']['created_by'],
            )

            for classification in annotations['classifications']:
                if classification['name'] == 'Generic':
                    value['generic'] = classification['radio_answer']['name']
                else:
                    # Classifications that carry no checklist answers
                    # contribute no granular flags.
                    for answer in classification.get('checklist_answers', []):
                        value['granular_' + answer['name']] = 1

            value['explanation_span'] = [
                ner['location'] for ner in annotations['objects']
            ]

            res.append(value)

    df = pd.DataFrame(res)

    # Flags are only set to 1 where selected; everything else becomes 0.
    granular_cols = [col for col in df.columns if col.startswith('granular_')]
    if granular_cols:
        df[granular_cols] = df[granular_cols].fillna(0)

    return df

In [10]:
# Flatten the IRR export into one row per label.
df_irr = process_labelbox_irr(path='export-result_irr.ndjson')