In [1]:
#🧠 Concept:
#Build a script that:

#Takes in messy .txt files (e.g. meeting notes, email logs, raw transcripts)
#Cleans them up by:
    #Stripping extra blank lines
    #Removing excess whitespace
    #Capitalizing proper sentences
#Saves the cleaned version into a new .txt file

In [2]:
# Part 1: File Handling in Jupyter
def clean_text_file(input_path, output_path):
    """Clean a messy text file and write the result to a new file.

    Blank (or whitespace-only) lines are dropped. Every remaining line is
    trimmed, has each run of internal whitespace collapsed to one space, and
    is sentence-cased with ``str.capitalize``.

    Args:
        input_path: Path of the text file to read (decoded as UTF-8).
        output_path: Path of the file to write (encoded as UTF-8). An
            existing file is overwritten.

    Raises:
        OSError: If either file cannot be opened.

    Note:
        ``str.capitalize`` lowercases everything after the first character,
        so capitals inside a line (acronyms, names) are lost.
    """
    # Explicit UTF-8 keeps the result independent of the platform default
    # encoding and matches the other cleaners in this notebook.
    with open(input_path, 'r', encoding='utf-8') as file:
        cleaned_lines = [
            # split()/join trims the ends and collapses internal whitespace.
            ' '.join(line.split()).capitalize()
            for line in file
            if line.strip()  # skip empty lines
        ]

    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in cleaned_lines)

# Example use: clean 'input.txt' and write the result to 'cleaned_report.txt'.
# Both paths are relative, so 'input.txt' is looked up in the current working
# directory and the open() call inside the function fails if it is missing.
clean_text_file('input.txt', 'cleaned_report.txt')

In [3]:
# 🧼 Part 2: Full Project – Text Report Cleaner + Regex + Logging

In [4]:
import re
import datetime
import os

In [5]:
# User Options + Setup

# Choose input/output settings
# (relative paths; these are passed to clean_text_file() in a later cell)
input_file = 'input.txt'
output_file = 'cleaned_report.txt'
log_file = 'cleaned_log.txt'

# Options for formatting
# Both flags are read as module-level globals by the three-argument
# clean_text_file() at call time, so changing them here and re-running this
# cell affects the next call.
USE_TITLE_CASE = True  # Change to False for capitalize instead
REMOVE_DUPLICATE_LINES = True

In [6]:
# Main Cleaner Function with Regex and Logging

def clean_text_file(input_path, output_path, log_path,
                    use_title_case=None, remove_duplicates=None):
    """Clean a text file, redact contact details and append a session log.

    For every non-blank line the function trims it, optionally drops it as a
    case-insensitive duplicate of an earlier line, fixes its case, collapses
    whitespace, rewrites ``d-m-y`` style dates with ``/`` separators, and
    replaces email addresses and 10-digit phone numbers with placeholders.

    Defining this function rebinds the name of the earlier two-argument
    ``clean_text_file`` in the same notebook.

    Args:
        input_path: UTF-8 text file to read.
        output_path: File to write the cleaned lines to (overwritten).
        log_path: Log file; one session block is appended per call.
        use_title_case: ``True`` for title case, ``False`` for
            ``str.capitalize``. ``None`` (default) uses the module-level
            ``USE_TITLE_CASE`` option.
        remove_duplicates: Whether repeated lines are skipped. ``None``
            (default) uses the module-level ``REMOVE_DUPLICATE_LINES`` option.

    Raises:
        OSError: If any of the files cannot be opened.
    """
    if use_title_case is None:
        use_title_case = USE_TITLE_CASE
    if remove_duplicates is None:
        remove_duplicates = REMOVE_DUPLICATE_LINES

    def fix_case(text):
        if not use_title_case:
            return text.capitalize()
        # str.title() treats an apostrophe as a word boundary ("Let'S");
        # put common contraction/possessive endings back in lower case.
        return re.sub(r"(?<=[^\W\d_])(['’])(S|T|D|M|Re|Ve|Ll)\b",
                      lambda m: m.group(1) + m.group(2).lower(),
                      text.title())

    def redact(text):
        # An "email:" label, when present, is swallowed together with the
        # address; unlabelled addresses are redacted as well.
        text = re.sub(r'(?:\bemail\b:?\s*)?[\w.-]+@[\w.-]+', '[EMAIL REDACTED]',
                      text, flags=re.IGNORECASE)
        return re.sub(r'\b\d{3}[-\s]?\d{3}[-\s]?\d{4}\b', '[PHONE REDACTED]', text)

    with open(input_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    cleaned_lines = []
    seen_lines = set()
    log = []

    for line in lines:
        # 1. Strip leading/trailing whitespace
        stripped = line.strip()
        if not stripped:
            continue  # Skip empty lines

        # 2. Optional: Remove duplicate lines (compared case-insensitively)
        key = stripped.lower()
        if remove_duplicates and key in seen_lines:
            # Redact before logging so the log never holds raw contact details.
            log.append(f"[DUPLICATE] Skipped line: {redact(stripped)}")
            continue
        seen_lines.add(key)

        # 3. Fix case (before redaction, so the placeholders keep their casing)
        cleaned = fix_case(stripped)

        # 4. Regex cleanup
        cleaned = re.sub(r'\s+', ' ', cleaned)  # Collapse any whitespace run to one space
        cleaned = re.sub(r'\b(\d{1,2})[/-](\d{1,2})[/-](\d{2,4})\b', r'\1/\2/\3', cleaned)  # Fix date formats
        cleaned = redact(cleaned)

        cleaned_lines.append(cleaned)
        log.append(f"[OK] {cleaned}")

    # Write cleaned text
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(entry + '\n' for entry in cleaned_lines)

    # Write log (append mode keeps earlier sessions)
    with open(log_path, 'a', encoding='utf-8') as log_handle:
        log_handle.write(f"--- Cleaning session at {datetime.datetime.now()} ---\n")
        log_handle.writelines(entry + '\n' for entry in log)
        log_handle.write('\n')

    print(f"✅ Done! Cleaned {len(cleaned_lines)} lines. Log saved to {log_path}")


In [7]:
# Run the three-argument cleaner with the module-level paths; it prints a
# one-line summary with the number of cleaned lines and the log path.
clean_text_file(input_file, output_file, log_file)

✅ Done! Cleaned 4 lines. Log saved to cleaned_log.txt


In [8]:
# Example 1: Resume Parser & Formatter

# 🔧 Task:
# Clean and standardize a batch of raw resumes stored in .txt format by:
## Removing extra spacing and blank lines
## Extracting and redacting emails and phone numbers
## Normalizing case and formatting headers (like "Education", "Skills", etc.)

# 💡 Learn:
## Use regex to detect headers like Skills: or EDUCATION
## Group multiple regex patterns (emails, phones, LinkedIn URLs)
## Apply title case to section headings but sentence case to bullet points

In [10]:
# Choose input/output settings
# NOTE: this rebinds the same global names used by the earlier cleaner cells,
# so re-running those cells afterwards will use the resume paths.
input_file = 'input_example1.txt'
output_file = 'cleaned_report_example1.txt'
log_file = 'cleaned_log_example1.txt'

# Options for formatting
# NOTE: clean_resume() takes only an input and an output path and does not
# read these two flags; they only influence clean_text_file().
USE_TITLE_CASE = True  # Change to False for capitalize instead
REMOVE_DUPLICATE_LINES = True

In [14]:
import re

def clean_resume(input_path, output_path):
    """Clean a raw resume text file and redact contact details.

    Blank lines are dropped. Each remaining line has its whitespace
    normalised, a leading ``name``/``email``/``phone``/``education``/``skills``
    label rewritten as ``Label:``, is title-cased (unless it contains ``@``),
    and has email addresses and 10-digit phone numbers replaced with
    placeholders.

    Args:
        input_path: UTF-8 resume text file to read.
        output_path: File to write the cleaned resume to (overwritten).

    Raises:
        OSError: If either file cannot be opened.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    cleaned_lines = []
    for line in lines:
        # Trim the ends and collapse internal runs of whitespace in one step.
        stripped = ' '.join(line.split())
        if not stripped:
            continue

        # Normalize section headers like "skills:", "education:".
        # ^ anchors at the start of the line, the group captures the label in
        # any letter case (IGNORECASE), and \s*: tolerates spaces before the
        # colon; the label is re-emitted as "Label:".
        stripped = re.sub(r'^(name|email|phone|education|skills)\s*:',
                          lambda m: m.group(1).capitalize() + ':',
                          stripped, flags=re.IGNORECASE)

        # Fix capitalization (title case unless the line holds an email,
        # which title-casing would mangle).
        if '@' not in stripped:
            # str.title() treats an apostrophe as a word boundary
            # ("Bachelor'S"); restore common contraction/possessive endings
            # while leaving names such as "O'Neil" untouched.
            stripped = re.sub(r"(?<=[^\W\d_])(['’])(S|T|D|M|Re|Ve|Ll)\b",
                              lambda m: m.group(1) + m.group(2).lower(),
                              stripped.title())

        # Redact emails/phones
        stripped = re.sub(r'[\w\.-]+@[\w\.-]+', '[EMAIL REDACTED]', stripped)
        stripped = re.sub(r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b', '[PHONE REDACTED]', stripped)

        cleaned_lines.append(stripped)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(entry + '\n' for entry in cleaned_lines)

    print(f"✅ Resume cleaned and saved to {output_path}")


In [15]:
# Clean the resume named by the module-level paths; input_file must already
# exist, because clean_resume() opens it for reading.
clean_resume(input_file, output_file)

✅ Resume cleaned and saved to cleaned_report_example1.txt


In [16]:
# Example 2: Meeting Transcript Highlighter

In [17]:
# Sample meeting transcript used to exercise the cleaner: it contains filler
# words, a [CONFIDENTIAL] marker, an email address and a phone number.
sample_transcript = """
alice: um yeah I think we should, you know, postpone the launch.
bob: okay, like let's move forward with the new plan then.
carol: uh I'm not sure, you know, if that's the best idea.
alice: we could also ask [CONFIDENTIAL] for feedback.
bob: yeah, email: bob.k@agency.com has more info.
carol: call me at 555-123-4567 if needed.
"""

# Save to file
# strip() removes the newline that follows the opening and precedes the
# closing triple quotes, so the file starts and ends on a transcript line.
with open('meeting_example.txt', 'w', encoding='utf-8') as f:
    f.write(sample_transcript.strip())

print("✅ meeting_example.txt created.")

✅ meeting_example.txt created.


In [20]:
def clean_transcript(input_path, output_path):
    """Tidy a meeting transcript: drop fillers, bold speakers, redact secrets.

    For every non-blank line the function removes the filler words ``um``,
    ``uh``, ``like`` and ``you know`` together with the space/comma they leave
    behind, rewrites a leading ``name:`` as ``**NAME**``, and replaces any
    line mentioning ``[CONFIDENTIAL]`` (in any letter case) with
    ``[REDACTED]``. Lines that consist only of filler words are dropped.

    Args:
        input_path: UTF-8 transcript file to read.
        output_path: File to write the cleaned transcript to (overwritten).

    Raises:
        OSError: If either file cannot be opened.

    Note:
        Fillers are matched as whole words without looking at grammar, so a
        meaningful "like" ("I like it") is removed as well.
    """
    filler = r'(?:um|you know|like|uh)'
    filler_pattern = re.compile(
        # Filler that closes a sentence or the line: also eat the comma
        # before it, so "it, you know." becomes "it." rather than "it,.".
        rf',?\s*\b{filler}\b(?=\s*(?:[.!?]|$))'
        # Filler anywhere else: eat the space before and the comma after it,
        # so "should, you know, postpone" becomes "should, postpone".
        rf'|\s*\b{filler}\b,?',
        flags=re.IGNORECASE,
    )

    with open(input_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    cleaned_lines = []
    for line in lines:
        stripped = line.strip()
        if not stripped:  # an empty string is falsy: skip blank lines
            continue

        # Remove filler words; strip again in case the line started with one.
        stripped = filler_pattern.sub('', stripped).strip()
        if not stripped:  # the line held nothing but fillers
            continue

        # Bold speaker names (the "name:" prefix becomes "**NAME**")
        stripped = re.sub(r'^([a-z]+):', lambda m: '**' + m.group(1).upper() + '**',
                          stripped, flags=re.IGNORECASE)

        # Redact confidential blocks: the whole line is replaced.
        if '[CONFIDENTIAL]' in stripped.upper():
            cleaned_lines.append('[REDACTED]')
        else:
            cleaned_lines.append(stripped)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(entry + '\n' for entry in cleaned_lines)

    print(f"✅ Transcript cleaned and saved to {output_path}")

In [21]:
# Clean the sample transcript file and write the result to
# 'meeting_cleaned.txt'.
clean_transcript('meeting_example.txt', 'meeting_cleaned.txt')

✅ Transcript cleaned and saved to meeting_cleaned.txt


In [22]:
# Example 3: CAT model risk note cleaner

In [23]:
# Sample report used to exercise the CAT cleaner: it mixes "--- ... ---"
# page/section markers, labelled fields, dates written with '-' and '/'
# separators, and email addresses.
sample_cat_report = """
--- Page 1 of 3 ---

Event: hurricane isabel
Location: Outer Banks, NC
Date: 9-18-2003
Damage Estimate: $3.7B

Remarks: widespread flooding. Thousands displaced. Poor drainage worsened flooding.

Contact: claims-response@reinsurance.net

--- Confidential Notes ---

Backup contact: sandra_lee@agency.org
storm report received on 09/18/03
site visited 09-19-2003
manual override: $3,700,000,000
page: internal_use_only

--- Page 2 of 3 ---

Event: tornado outbreak
Location: Tuscaloosa, AL
Date: 4-27-2011
Damage Estimate: $2.2B

Eyewitness: “it ripped through the whole town...”
Contact: b.tucker@tornadodata.com

raw notes: property losses around 650 million.
raw notes: fatalities reported ~250.
"""

# Save it as a text file
# strip() removes the newline after the opening and before the closing
# triple quotes; UTF-8 is required for the curly quotes in the sample.
with open('cat_report_example.txt', 'w', encoding='utf-8') as f:
    f.write(sample_cat_report.strip())

print("✅ cat_report_example.txt created.")

✅ cat_report_example.txt created.


In [25]:
import datetime

def _is_cat_layout_line(text):
    """Return True for blank lines, '---' marker lines and bare page counters."""
    if not text or text.startswith('---'):
        return True
    # Only a line that is nothing but "Page N" / "Page N of M" is layout;
    # content that merely mentions a page is kept.
    return re.fullmatch(r'page\s+\d+(?:\s+of\s+\d+)?', text, flags=re.IGNORECASE) is not None


def _normalize_cat_line(text):
    """Normalise dates, field labels, event names and emails in one report line."""
    # Dates: unify separators to '/' and zero-pad the first two fields
    # (9-18-2003 -> 09/18/2003). The year is left as written because the
    # century of a two-digit year cannot be known from the text.
    text = re.sub(
        r'\b(\d{1,2})[-/](\d{1,2})[-/](\d{2,4})\b',
        lambda m: f"{m.group(1).zfill(2)}/{m.group(2).zfill(2)}/{m.group(3)}",
        text,
    )

    # Field labels: title case, so "damage estimate" -> "Damage Estimate".
    text = re.sub(r'^(event|location|date|damage estimate|remarks)\s*:',
                  lambda m: m.group(1).title() + ':',
                  text, flags=re.IGNORECASE)

    # Storm names: upper-case the first letter of each word after "Event:".
    # Other letters are left alone so codes such as "EF5" survive, and a
    # letter that follows an apostrophe is not treated as a word start.
    event = re.match(r'(Event:\s*)(.*)', text)
    if event:
        text = event.group(1) + re.sub(r"(?<![\w'’])[^\W\d_]",
                                       lambda m: m.group(0).upper(),
                                       event.group(2))

    # Redact emails
    return re.sub(r'[\w\.-]+@[\w\.-]+', '[EMAIL REDACTED]', text)


def clean_cat_report(input_path, output_path, log_path):
    """Clean a CAT report text file and append a log of the cleaned lines.

    Blank lines, lines starting with ``---`` and bare page counters
    ("Page 2 of 3") are dropped. Every other line gets its dates, known field
    labels and event name normalised and its email addresses redacted.

    Args:
        input_path: UTF-8 report file to read.
        output_path: File to write the cleaned report to (overwritten).
        log_path: Log file; one session block is appended per call.

    Raises:
        OSError: If any of the files cannot be opened.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    cleaned_lines = []
    log = []

    for line in lines:
        original = line.strip()
        if _is_cat_layout_line(original):
            continue  # Skip headers and empty lines

        cleaned = _normalize_cat_line(original)

        # Add to cleaned lines and log (the log only ever sees redacted text)
        cleaned_lines.append(cleaned)
        log.append(f"[OK] {cleaned}")

    # Save cleaned report
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(entry + '\n' for entry in cleaned_lines)

    # Save log (append mode keeps earlier sessions)
    with open(log_path, 'a', encoding='utf-8') as log_handle:
        log_handle.write(f"--- CAT Cleaning Session {datetime.datetime.now()} ---\n")
        log_handle.writelines(entry + '\n' for entry in log)
        log_handle.write('\n')

    print(f"✅ CAT report cleaned and saved to {output_path}. Log saved to {log_path}")


In [26]:
# Clean the sample CAT report. The log file is opened in append mode, so
# every re-run of this cell adds another session block to it.
clean_cat_report('cat_report_example.txt', 'cat_report_cleaned.txt', 'cat_report_log.txt')


✅ CAT report cleaned and saved to cat_report_cleaned.txt. Log saved to cat_report_log.txt
