In [2]:
# Span label names that this notebook tallies per speaker.
LABELS = {
    "Clarification",
    "Generic Disfluency",
    "Incomplete Thought",
    "Misspeak",
    "Overlap",
    "Self Correction",
    "Unclear",
}

In [3]:
import os

# Project folder name; its REVIEW exports are read from ../data/<project>/REVIEW/.
project = "s1046-50_s2012-13_s3026-50"
reviews_dir = f"../data/{project}/REVIEW/"

# Collect every entry of the REVIEW directory whose extension is exactly '.json'.
json_files = []
for entry in os.listdir(reviews_dir):
    if os.path.splitext(entry)[1] == '.json':
        json_files.append(entry)

# The single export inspected by the cells below.
test_file = "2013_211.txt.json"
# 1 when the file name starts with '0', otherwise 0.
hoarder_flag = int(test_file.startswith('0'))

In [4]:
import json

# Load the review export for `test_file`.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (which is what open() uses when none is given).
with open(reviews_dir + test_file, encoding="utf-8") as f:
    raw_data = json.load(f)

# Everything used below lives under the top-level 'data' key.
data = raw_data['data']
data.keys()

dict_keys(['project', 'document', 'kinds', 'rows', 'labelerInfo', 'labelSets', 'spanLabels', 'arrowLabels', 'boundingBoxLabels', 'timeLabels', 'comments', 'url'])

In [5]:
# Quick look at the identifying metadata of the loaded export.
tuple(data[key] for key in ('project', 'document', 'kinds'))

({'id': 'MzQwMWJkM2Y', 'name': 's1046-50_s2012-13_s3026-50'},
 {'id': '3969c434-a2ab-4d4c-8709-4b42f87be297', 'name': '2013_211.txt'},
 ['TOKEN_BASED'])

In [7]:
# Spellings searched for in the text before a ':' to identify who is speaking.
# The first entry of each list is the canonical name stored for that role.
INTERVIEWER_NAMES = [
    "Interviewer",
    "Rebecca",
]
PARTICIPANT_NAMES = [
    "Participant",
    "Interviewee",
]

In [8]:
# Work out which speaker each transcript row belongs to.
#
# A cell whose text contains ':' is treated as a possible speaker turn: the
# text before the first ':' is title-cased and searched for any known
# interviewer or participant name. A match switches the current speaker to
# the canonical name (first entry of the matching list); when nothing
# matches, the speaker carried over from the previous row is kept.
row_speakers = [''] * len(data['rows'])
speaker = ""
for i, row in enumerate(data['rows']):
    for column in row:
        content = column['content']
        if ":" not in content:
            continue
        slice_with_potential_speaker: str = content.split(":")[0].title()
        # Interviewer names take precedence: participant names are only
        # consulted when no interviewer name matched. (Previously the
        # "found" flag was never set, so a participant match could silently
        # override an interviewer match.)
        if any(name in slice_with_potential_speaker for name in INTERVIEWER_NAMES):
            speaker = INTERVIEWER_NAMES[0]
        elif any(name in slice_with_potential_speaker for name in PARTICIPANT_NAMES):
            speaker = PARTICIPANT_NAMES[0]
    # Recorded once per row, so a row without cells also inherits the
    # current speaker instead of being left as ''.
    row_speakers[i] = speaker
row_speakers

['Interviewer',
 'Interviewer',
 'Interviewer',
 'Interviewer',
 'Participant',
 'Participant',
 'Participant',
 'Interviewer',
 'Interviewer',
 'Participant',
 'Participant',
 'Participant',
 'Participant',
 'Participant',
 'Participant',
 'Participant',
 'Participant',
 'Interviewer',
 'Interviewer',
 'Participant',
 'Participant',
 'Participant',
 'Participant',
 'Participant']

In [9]:
# This is where the meat is at: the span annotations of the document.
# Each entry (see the output below) carries the label name under
# 'labelItem' and the annotated range under 'textPosition'.
label_data = data['spanLabels']
label_data

[{'id': '1694748133',
  'labelSetIndex': 0,
  'labeledBy': 'REVIEWER',
  'labeledByUserId': 7650,
  'acceptedByUserId': None,
  'rejectedByUserId': None,
  'layer': 0,
  'counter': 0,
  'status': 'LABELED',
  'hashCode': 'SPAN:-JMh1Z2QiTPafZ-k12D4R:0:4:0:1:0:4:0:3:7:0:undefined:undefined',
  'labelName': '-JMh1Z2QiTPafZ-k12D4R',
  'labelItem': {'id': '-JMh1Z2QiTPafZ-k12D4R',
   'labelName': 'Generic Disfluency'},
  'textPosition': {'start': {'row': 4,
    'column': 0,
    'tokenIndex': 1,
    'charIndex': 0},
   'end': {'row': 4, 'column': 0, 'tokenIndex': 3, 'charIndex': 7}},
  'confidenceScore': None}]

In [10]:
# Pair each span label's name with the speaker of the row where the span starts.
labels_with_speakers = [
    (
        span['labelItem']['labelName'],
        row_speakers[span['textPosition']['start']['row']],
    )
    for span in label_data
]
labels_with_speakers

[('Generic Disfluency', 'Participant')]

In [11]:
from collections import Counter
from itertools import product

# Count occurrences of each (label, speaker) pair, then make sure every
# combination of known label and speaker role is present, with 0 when unseen.
cntDict = Counter(labels_with_speakers)
for pair in product(LABELS, ['Interviewer', 'Participant']):
    cntDict.setdefault(pair, 0)
cntDict

Counter({('Generic Disfluency', 'Participant'): 1,
         ('Clarification', 'Interviewer'): 0,
         ('Unclear', 'Participant'): 0,
         ('Generic Disfluency', 'Interviewer'): 0,
         ('Incomplete Thought', 'Participant'): 0,
         ('Overlap', 'Participant'): 0,
         ('Self Correction', 'Interviewer'): 0,
         ('Misspeak', 'Interviewer'): 0,
         ('Incomplete Thought', 'Interviewer'): 0,
         ('Clarification', 'Participant'): 0,
         ('Unclear', 'Interviewer'): 0,
         ('Overlap', 'Interviewer'): 0,
         ('Misspeak', 'Participant'): 0,
         ('Self Correction', 'Participant'): 0})

In [12]:
# Flatten the (label, speaker) counts into one dict with three entries per
# label (interviewer, participant, per-label total) plus a grand total.
display_dict = {}
# Iterate in sorted order: LABELS is a set, whose iteration order can change
# between kernel runs, which would otherwise shuffle the resulting keys.
for label in sorted(LABELS):
    interviewer_count = cntDict[(label, 'Interviewer')]
    participant_count = cntDict[(label, 'Participant')]
    display_dict[label+'–Interviewer'] = interviewer_count
    display_dict[label+'–Participant'] = participant_count
    display_dict[label+'–Total'] = interviewer_count + participant_count
# Grand total over every counted pair in cntDict.
display_dict['Total'] = sum(cntDict.values())
display_dict

{'Incomplete Thought–Interviewer': 0,
 'Incomplete Thought–Participant': 0,
 'Incomplete Thought–Total': 0,
 'Misspeak–Interviewer': 0,
 'Misspeak–Participant': 0,
 'Misspeak–Total': 0,
 'Clarification–Interviewer': 0,
 'Clarification–Participant': 0,
 'Clarification–Total': 0,
 'Unclear–Interviewer': 0,
 'Unclear–Participant': 0,
 'Unclear–Total': 0,
 'Self Correction–Interviewer': 0,
 'Self Correction–Participant': 0,
 'Self Correction–Total': 0,
 'Generic Disfluency–Interviewer': 0,
 'Generic Disfluency–Participant': 1,
 'Generic Disfluency–Total': 1,
 'Overlap–Interviewer': 0,
 'Overlap–Participant': 0,
 'Overlap–Total': 0,
 'Total': 1}

In [19]:
# Every row must hold exactly one cell; the token extraction below relies on it.
assert all(len(cells) == 1 for cells in data['rows'])

# Token list of each row, then all tokens of the document in a single list.
doc_tokens = [cells[0]['tokens'] for cells in data['rows']]
flat_doc_tokens = [tok for row_tokens in doc_tokens for tok in row_tokens]

# Type-token ratio: distinct tokens divided by total tokens.
len(set(flat_doc_tokens)) / len(flat_doc_tokens)

0.5252747252747253

In [26]:
# Total token count over all rows, divided by the number of rows
# (each row is treated as one sentence here).
words = sum(len(sent_tokens) for sent_tokens in doc_tokens)
words / len(doc_tokens)

18.958333333333332

In [126]:
# One summary record for the inspected document: its name, the hoarder flag
# and all per-label counts from display_dict.
# (A stray '' literal that was being implicitly concatenated onto the
# 'isHoarder' key has been removed; the resulting keys are unchanged.)
row = {
    'filename': data['document']['name'],
    'isHoarder': hoarder_flag,
    **display_dict
}
row

{'filename': '2013_211.txt',
 'isHoarder': 0,
 'Incomplete Thought - Interviewer': 0,
 'Incomplete Thought - Participant': 0,
 'Incomplete Thought - Total': 0,
 'Clarification - Interviewer': 0,
 'Clarification - Participant': 0,
 'Clarification - Total': 0,
 'Generic Disfluency - Interviewer': 0,
 'Generic Disfluency - Participant': 1,
 'Generic Disfluency - Total': 1,
 'Overlap - Interviewer': 0,
 'Overlap - Participant': 0,
 'Overlap - Total': 0,
 'Unclear - Interviewer': 0,
 'Unclear - Participant': 0,
 'Unclear - Total': 0,
 'Misspeak - Interviewer': 0,
 'Misspeak - Participant': 0,
 'Misspeak - Total': 0,
 'Self Correction - Interviewer': 0,
 'Self Correction - Participant': 0,
 'Self Correction - Total': 0,
 'Total': 1}

In [70]:
import pandas as pd

# Wrap the single summary record in a one-row DataFrame for display.
pd.DataFrame([row])

Unnamed: 0,filename,isHoarder,Incomplete Thought - Interviewer,Incomplete Thought - Participant,Incomplete Thought - Total,Clarification - Interviewer,Clarification - Participant,Clarification - Total,Generic Disfluency - Interviewer,Generic Disfluency - Participant,...,Unclear - Interviewer,Unclear - Participant,Unclear - Total,Misspeak - Interviewer,Misspeak - Participant,Misspeak - Total,Self Correction - Interviewer,Self Correction - Participant,Self Correction - Total,Total
0,2013_211.txt,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
