In [1]:
import utils.datasaur as data

  from .autonotebook import tqdm as notebook_tqdm
2025-08-28 22:26:43 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 434kB [00:00, 1.52MB/s]                    
2025-08-28 22:26:43 INFO: Downloaded file to /home/danim/stanza_resources/resources.json
2025-08-28 22:26:44 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| mwt          | combined            |
| pos          | combined_charlm     |
| constituency | ptb3-revised_charlm |

2025-08-28 22:26:45 INFO: Using device: cuda
2025-08-28 22:26:45 INFO: Loading: tokenize
2025-08-28 22:26:48 INFO: Loading: mwt
2025-08-28 22:26:48 INFO: Loading: pos
2025-08-28 22:26:49 INF

In [2]:
%load_ext autoreload
%autoreload 2

# Misspellings/Wrong Labels

First thing we're going to do is deal with the issue of misspellings. 
Inspired by Tava's function in `regexes.py`, we're going to use `get_close_matches` to look for words that are close to 

In [3]:
doc_token_content = [
    (doc, [
        (doc.row_data[i]["tokens"], i) 
        for i in range(len(doc.row_data))
    ])
    for doc in data.by_doc 
]

In [4]:
from difflib import get_close_matches
import utils.regexes as rxs
import string
import re

threshold = 0.85
BLACKLIST = ['Interview', 'Participants', 'Interviewees', 'Interviewer 1', 
             'Interviewer 2', 'Participate', '"Participant',
             'Broparticipant', 'Taparticipant'] # ??? Found these have no idea what happened here
potential_spelling_issues = [
    {
        "broken_label" : token_fixed, # fixed due to processing, broken due to misspelling or other error
        "close_to" : matches,
        "doc" : doc,
        "row_num" : row_num,
        "prev_line" : [doc.lines[row_num-1]],
        "line" : doc.lines[row_num]
    }
    for doc, token_list in doc_token_content
    for tokenized_line, row_num in token_list
    for token in tokenized_line
    if (matches := get_close_matches(token, rxs.SPEAKERS, 
                                     cutoff=threshold))
    and (token_fixed := token.title().rstrip(string.punctuation)) 
        not in BLACKLIST
    and not any(token_fixed == match for match in matches)
]
potential_spelling_issues

[{'broken_label': '1:Interviewer',
  'close_to': ['Interviewer'],
  'doc': Document(3001_076.txt, s1062_s2022-26_s3076-97),
  'row_num': 9,
  'prev_line': ['Speaker: In my mind I almost feel like you can do both in one but it would probably need to be longer than three minutes. I don’t know how your content would go but I feel like a kids one would be a little different than what a parent’s one would be. It would be more a little factual based whereas a kid one would be more experienced based. The role playing, like what it’s going to look like. Again you said you didn’t know if this was just going to be pediatrics right? The patients could also be adults.'],
  'line': '1:Interviewer: No. Sorry I may have misspoke. All the patients will be pediatric. The only thing we haven’t decided on what whether or not it would be one video or two separate ones.'},
 {'broken_label': '1:Interviewer',
  'close_to': ['Interviewer'],
  'doc': Document(3001_076.txt, s1062_s2022-26_s3076-97),
  'row_num'

In [5]:
# There is no issue that is close to more than one speaker label
not any(len(item["close_to"]) != 1 for item in potential_spelling_issues)

True

In [6]:
# Find all instances of this one particular issue
all_doc_content_lol = '\n'.join(doc.full_content for doc in data.by_doc)
weird_number_in_front = re.findall(r'\d:[a-zA-Z]+', all_doc_content_lol)
weird_number_in_front

['1:Interviewer',
 '1:Speaker',
 '1:Speaker',
 '1:Interviewer',
 '1:Speaker',
 '1:Speaker',
 '1:Interviewer',
 '1:Speaker',
 '1:Interviewer',
 '1:Interviewer',
 '1:Interviewer',
 '1:Speaker',
 '1:Interviewer',
 '1:Speaker',
 '1:Interviewer',
 '1:Speaker',
 '1:Interviewer',
 '1:Speaker',
 '1:Interviewer',
 '1:Speaker',
 '1:Interviewer',
 '1:Speaker',
 '1:Interviewer',
 '1:Speaker',
 '1:Interviewer',
 '1:Speaker',
 '1:Interviewer']

Fixing this:

In [7]:
list(map(lambda s : s.replace('1:', ''), weird_number_in_front))

['Interviewer',
 'Speaker',
 'Speaker',
 'Interviewer',
 'Speaker',
 'Speaker',
 'Interviewer',
 'Speaker',
 'Interviewer',
 'Interviewer',
 'Interviewer',
 'Speaker',
 'Interviewer',
 'Speaker',
 'Interviewer',
 'Speaker',
 'Interviewer',
 'Speaker',
 'Interviewer',
 'Speaker',
 'Interviewer',
 'Speaker',
 'Interviewer',
 'Speaker',
 'Interviewer',
 'Speaker',
 'Interviewer']

In [8]:
# And these ones
number_issues = []
prepended = re.findall(r'\d(?:{sl})'.format(sl='|'.join(rxs.SPEAKERS)), 
                       all_doc_content_lol)
appended = re.findall(r'(?:{sl})\d'.format(sl='|'.join(rxs.SPEAKERS)), 
                      all_doc_content_lol)
number_issues = list(set(prepended) | set(appended))
number_issues

['Interviewer1', 'Speaker1', '2Participant']

In [9]:
the_rest = [
    token
    for item in potential_spelling_issues
    if (token := item["broken_label"]) not in 
    weird_number_in_front + number_issues
]
the_rest

['[Participant']

In [10]:
[item
 for item in potential_spelling_issues
 if item["broken_label"] == '[Participant']

[{'broken_label': '[Participant',
  'close_to': ['Participant'],
  'doc': Document(040_487.txt, s1036-42_s2008-9_s3000-15),
  'row_num': 26,
  'prev_line': ['Wanted to put it right handy in a little container.'],
  'line': 'I wanted to put it up with the salt and the oatmeal, and my husband said [inaudible 00:45:33] and he harrumphed and " [Participant 040 00:22:36]?" And I looked at what I had and what I was doing and promptly put it where it was supposed to be.'}]

In [11]:
# don't care about the last one
the_rest = the_rest[:-1]
the_rest # only misspellings left

[]

In [19]:
animals = ["dog", "cat", "hippo"]
animals.insert(1, "rhinocerous")
animals

['dog', 'rhinocerous', 'cat', 'hippo']

All this led to `fix_misspelled_labels.py`.

# Fix spacing issues

### Sub-issue: labels followed improperly by timestamps

In [12]:
label_timestamp = re.compile(r'(?:{ss})(?:{ts})'
                             .format(ss='|'.join(rxs.SPEAKERS), 
                                     ts=rxs.timestamps.pattern))
label_timestamp.findall(all_doc_content_lol)
# Seems like this is the only instance of this issue

['Interviewer19:09']

## Actual Issue

With misspellings fixed, we now assume that all of our speaker labels are accurate. We now tend to the issue of improper label formatting, like `"Label :"` or `"Label."`

In [13]:
[(issues, 
  [line for line in doc.lines if issues["bad_punctuation"][0] in line],
  doc)
 for doc in data.by_doc
 if (issues := rxs.find_speaker_format_issues(doc.full_content))
 and not issues["bad_punctuation"][0].endswith(tuple(string.punctuation + 's'))]

[({'bad_punctuation': ['Speaker1']},
  ['Speaker1 – To me just from being in therapy so much all the time with the children, I think of holding hands or a hand reaching out to grab another hand because it’s like your giving a tool, a way of lie, a way to combat something that can be overwhelming, so somebody’s helping you, helping hand to lift you up and get you to that next step.'],
  Document(3001_039.txt, s1046-50_s2012-13_s3026-50)),
 ({'bad_punctuation': ['Interviewer1']},
  ['Interviewer19:09- Ok sounds good. So, you’ve Mentioned that your partner keeps your behavior in check. But how do your other friends and family feel about your behavior?'],
  Document(059_718.txt, s1059-60_s2020-21)),
 ({'bad_punctuation': ['Speaker1']},
  ['Speaker1: Umm'],
  Document(3001_011.txt, s1036-42_s2008-9_s3000-15)),
 ({'bad_punctuation': ['Speaker1']},
  ['Speaker1: Yeah that’s like you know… like it’s a lifestyle change.'],
  Document(3001_012.txt, s1036-42_s2008-9_s3000-15))]

Don't forget about broken timestamps

In [14]:
# I have no idea what the hell is wrong with this transcript. I think 
# there may be more misnamed documents.
from utils.transcript import Transcript

doc = Transcript("059")["718"]
bad_line_index = [i for i in range(len(doc.lines)) 
            if "Interviewer19:09-" in doc.lines[i]][0]
doc.tokens

[['17:33', 'Interviewer:'],
 ['And',
  'have',
  'you',
  'ever',
  'sought',
  'to',
  'change',
  'this',
  'behavior',
  'like'],
 ['Participant:'],
 ['So',
  'I',
  'was',
  'gonna',
  'say',
  'um',
  'just',
  'he',
  'decreases,',
  'my',
  'opportunity',
  'for',
  'hoarding.'],
 ['17:47', 'Interviewer:'],
 ['Okay,', 'so', 'he', 'decreases', 'your', 'opportunity.'],
 ['Participant:'],
 ['My', 'opportunity.', 'Yeah'],
 ['17:54', 'Interviewer:'],
 ['And', 'have', 'you', 'ever', 'sought', 'to', 'change', 'this', 'behavior?'],
 ['Participant:'],
 ['Yes,',
  'yes',
  'I',
  'do.',
  'I',
  'have',
  'a',
  'like',
  'a',
  'trick',
  'that',
  'I',
  'do,',
  'which',
  'is',
  'To',
  'get',
  'rid',
  'of',
  'five',
  'things.',
  'So,',
  'I',
  'try',
  'to',
  'on',
  'the',
  'weekends',
  'or',
  'on',
  'other',
  'days',
  'when',
  "there's",
  'like',
  'more',
  'time.',
  'I',
  'work',
  'sometimes.',
  'I',
  'will',
  'challenge',
  'myself',
  'to',
  'get',
  'rid

In [15]:
# Wow, there really are no spacing issues? I could have sworn that I saw some.
re.findall(r'[\w:]+\s+:', all_doc_content_lol)

[]

In [16]:
print(all_doc_content_lol)

Interviewer: So that brings us nicely into the next section of the interview which focuses on the question of insight.
And so, as we’ve talked about a lot of the literature surrounding hoarding disorder describes many hoarding patients as having low insight, and so the first question to start out with is how would you as a clinician define insight?
Interviewee: I mean think that I often times define it in impairment as well.
It’s like, are they able to recognize the level of impairment that is coming from behaviors that they’re currently engaging in?
And if they’re not able to—let’s say it’s causing significant conflict with a spouse, they are pretty much sleeping on the floor somewhere randomly in their home and they have to wash their face at work or something like that, brush their teeth, then I would say not understanding or recognizing that these behaviors are to some degree not normal is how I kind of think about the patient’s ability to understand he repercussions of what’s happ