In [None]:
import re
import json 
from difflib import get_close_matches
import sys 
from pathlib import Path

sys.path.append(str(Path.cwd().parent))
from utils.regexes import SPEAKER_PAIRS, SPEAKERS, speaker_labels, speaker_labels_restricted

import stanza
# Shared English stanza pipeline, built once at module level and handed to
# process_transcripts() from the __main__ block. ErrorDetector reads the
# resulting document's `.ents` (entity spans) and each word's `.upos` tag,
# which presumably come from the 'ner' and 'pos' processors respectively.
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,ner')


class ErrorDetector:

    '''
    A class for catching anything not cleaned out in initial data processing.

        Method 1. Proper noun (name) detection + replacement using stanza.

        Method 2. Speaker label format detection + replacement.

        Method 3. Speaker label spelling error detection + replacement.

    Only Method 1 (detect_proper_nouns / replace_names) is implemented in
    this class body.
    '''

    # Honorifics that, when directly followed by a proper noun, are reported
    # as a "Title + Name" pair.
    TITLE_WORDS = frozenset({'Dr.', 'Mr.', 'Ms.', 'Mrs.', 'Prof.', 'Professor'})

    def __init__(self, text, nlp):
        '''
        Parse the text once and keep both the raw text and the parsed document.

        text: the transcript text to inspect.
        nlp:  a callable (e.g. a stanza Pipeline) that takes the text and
              returns a document exposing `.ents` and `.sentences`.
        '''
        self.text = text
        self.doc = nlp(text)

    def detect_proper_nouns(self):
        '''
        Method 1.1 Detects proper Nouns (names) for replacement.

        Prints every named entity with its type, then every adjacent
        (title word, PROPN word) pair found inside a sentence. Nothing is
        modified and nothing is returned; this is a reporting step only.
        '''
        print("Named Entities (NER):")
        for ent in self.doc.ents:
            print(f"{ent.text:<25} → {ent.type}")

        # Title + Name extraction using POS tagging
        print("\nTitle + Name Pairs (POS-based):")
        for sentence in self.doc.sentences:
            words = sentence.words
            # Walk neighbouring word pairs; pairs never cross a sentence boundary.
            for curr_word, next_word in zip(words, words[1:]):
                if curr_word.text in self.TITLE_WORDS and next_word.upos == 'PROPN':
                    print(f"{curr_word.text} {next_word.text}")

    def replace_names(self):
        '''
        Method 1.2 Replaces PERSON entities with NAME

        Returns a copy of self.text in which every PERSON entity span
        (start_char..end_char) is replaced by the literal "NAME". self.text
        itself is left untouched. Prints "ERRORS FOUND" when at least one
        PERSON entity exists, otherwise "NO ERRORS".
        '''
        # Sort by position so the result does not depend on the order in
        # which the pipeline happens to list its entities.
        people = sorted(
            (ent for ent in self.doc.ents if ent.type == 'PERSON'),
            key=lambda ent: ent.start_char,
        )

        pieces = []
        cursor = 0  # Index in self.text of the first character not yet copied
        for ent in people:
            if ent.start_char < cursor:
                # Overlaps a span that was already masked; masking it again
                # would duplicate or drop surrounding text.
                continue
            pieces.append(self.text[cursor:ent.start_char])
            pieces.append("NAME")
            cursor = ent.end_char
        pieces.append(self.text[cursor:])
        masked_text = "".join(pieces)

        if people:
            print("ERRORS FOUND")
        else:
            print("NO ERRORS")

        return masked_text




def _write_back_cleaned_rows(rows, cleaned_text):
    '''
    Copy cleaned_text back into the "content" field of each row, in place.

    cleaned_text is expected to be the newline-joined row contents after
    masking. Each row gets back as many lines as its original content had.
    Returns True on success, or False (rows untouched) when the line count
    no longer matches, e.g. because a masked span swallowed a line break.
    '''
    content_rows = [row for row in rows if "content" in row]
    if not content_rows:
        return True  # Nothing was joined, so there is nothing to restore.

    line_counts = [row["content"].count("\n") + 1 for row in content_rows]
    cleaned_lines = cleaned_text.split("\n")
    if len(cleaned_lines) != sum(line_counts):
        return False

    cursor = 0
    for row, span in zip(content_rows, line_counts):
        row["content"] = "\n".join(cleaned_lines[cursor:cursor + span])
        cursor += span
    return True


def process_transcripts(input_folder, output_folder, nlp):
    '''
    Mask person names in every "*.txt.json" transcript of input_folder.

    Each input file is a JSON object with a "rows" list; the "content" of
    the rows is joined with newlines, scanned with ErrorDetector, and the
    masked text is written back into the rows before the object is saved
    as "<name>_cleaned.json" in output_folder.

    input_folder:  directory containing the "*.txt.json" files.
    output_folder: directory for the cleaned files (created if missing).
    nlp:           pipeline callable forwarded to ErrorDetector.

    Files without rows are skipped with a message. A file whose masked text
    cannot be mapped back onto its rows is also skipped, so that unmasked
    content is never saved under a "_cleaned" name.
    '''
    input_path = Path(input_folder)
    output_path = Path(output_folder)
    # parents=True so a fresh checkout without the data subfolders still works.
    output_path.mkdir(parents=True, exist_ok=True)

    for file in input_path.glob("*.txt.json"):
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Step 1: Build full content from rows
        rows = data.get("rows", [])
        if not rows:
            print(f"Skipping {file.name}: No 'rows' found.")
            continue

        # Join all content fields to make the full text
        text = "\n".join(row["content"] for row in rows if "content" in row)

        detector = ErrorDetector(text, nlp)
        detector.detect_proper_nouns()
        cleaned_text = detector.replace_names()

        # Show preview
        print(f"\n--- {file.name} (First 500 characters) ---")
        print(cleaned_text[:500])
        print("-" * 40)

        # Step 2: Put the masked text back into the rows so the saved JSON
        # actually contains the cleaned content rather than the input.
        if not _write_back_cleaned_rows(rows, cleaned_text):
            print(f"Skipping {file.name}: cleaned text no longer lines up with rows.")
            continue

        # Save cleaned output
        cleaned_file = output_path / file.name.replace(".txt.json", "_cleaned.json")
        with open(cleaned_file, 'w', encoding='utf-8') as out_f:
            json.dump(data, out_f, indent=2, ensure_ascii=False)



if __name__ == "__main__":
    # Script entry point: paths are resolved from the parent of the current
    # working directory, and the module-level pipeline is reused.
    session_dir = Path.cwd().parent / "data" / "s1055-1058"

    process_transcripts(
        input_folder=session_dir / "REVIEW",
        output_folder=session_dir / "CLEANED_TR",
        nlp=nlp,
    )


'''
   def find_speaker_format_issues(text, speaker_set):
        """
       Method 2.1 Detects speaker label formatting issues:
            1. Speaker label followed by space before colon (e.g., 'Participant :')
            2. Speaker label followed by punctuation or character other than ':' (e.g., 'Participant.', 'Participant-')
        Returns a dictionary with issue types and matching instances.
        """
        issues = {}

        # Pattern 1: Space before colon → "Participant :" (colon spacing issue)
        spacing_pattern = re.compile(r'\b(?:' + '|'.join(re.escape(s) for s in speaker_set) + r')\s+:')
        spacing_matches = spacing_pattern.findall(text)
        if spacing_matches:
            issues['spacing_issue'] = spacing_matches

        # Pattern 2: Speaker label followed by something other than colon or space (e.g., 'Participant.' or 'Participant!')
        bad_punct_pattern = re.compile(r'\b(?:' + '|'.join(re.escape(s) for s in speaker_set) + r')[^\s:]')
        punct_matches = bad_punct_pattern.findall(text)
        if punct_matches:
            issues['bad_punctuation'] = punct_matches

        return issues

      def find_spelling_variants(self, threshold=0.8):
        """
        Method 3.1 Finds likely misspellings of speaker labels using fuzzy matching.
        """
        # Matches lines that look like 'Speaker: some text'
        pattern = re.compile(r'^([A-Z][a-zA-Z0-9_ ]{1,30})(?=\s*:\s*)', re.MULTILINE)
        candidates = pattern.findall(self.text)

        fuzzy_hits = {}
        for cand in candidates:
            matches = get_close_matches(cand, self.speaker_set, n=1, cutoff=threshold)
            if matches and matches[0] != cand:
                fuzzy_hits[cand] = matches[0]
        return fuzzy_hits

    def find_multi_speaker_lines(self):
        '''
        #Finds lines that contain more than one speaker label.
'''
        speaker_pattern = r'\b(?:' + '|'.join(re.escape(s) for s in SPEAKERS) + r')\s*:'
        pattern = re.compile(speaker_pattern)
        multi_speaker_lines = []
        for i, line in enumerate(text.splitlines()):
            matches = pattern.findall(line)
            if len(matches) > 1:
                multi_speaker_lines.append((i + 1, line.strip(), matches))
        return multi_speaker_lines


''' #Add fixes and edit regexes''' 

2025-07-24 17:13:06 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 433kB [00:00, 23.3MB/s]                    
2025-07-24 17:13:06 INFO: Downloaded file to /Users/admin/stanza_resources/resources.json
2025-07-24 17:13:06 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| pos       | combined_charlm           |
| lemma     | combined_nocharlm         |
| ner       | ontonotes-ww-multi_charlm |

2025-07-24 17:13:06 INFO: Using device: cpu
2025-07-24 17:13:06 INFO: Loading: tokenize
2025-07-24 17:13:06 INFO: Loading: mwt
2025-07-24 17:13:06 INFO: Loading: pos
2025-07-24 17

Named Entities (NER):
Christmas                 → DATE
about three feet          → QUANTITY
1                         → CARDINAL
Mm                        → PERSON
058                       → CARDINAL
22:41                     → TIME
10                        → CARDINAL
24:04                     → TIME
24:19                     → TIME
TLC                       → ORG
058                       → CARDINAL

Title + Name Pairs (POS-based):
ERRORS FOUND

--- 058_712.txt.json (First 500 characters) ---
20:43 Interviewer:
And  What would happen if you lost your Christmas ornaments? Like, is that something that you collect? Or do you have like a quote normal amount of them?
Participant:
I think it's a normal amount.
20:59 Interviewer:
Okay. Um, so what about, like all of your newspapers and papers. What happened, what would happen if you lost them?
Participant:
the scary part is I wouldn't even remember it.
21:48 Interviewer:
Um, okay. So, you don't think that you would be affected that much?
P

"\n        speaker_pattern = r'\x08(?:' + '|'.join(re.escape(s) for s in SPEAKERS) + r')\\s*:'\n        pattern = re.compile(speaker_pattern)\n        multi_speaker_lines = []\n        for i, line in enumerate(text.splitlines()):\n            matches = pattern.findall(line)\n            if len(matches) > 1:\n                multi_speaker_lines.append((i + 1, line.strip(), matches))\n        return multi_speaker_lines\n\n\n"