# Convert HTML / JavaScript files to JSON -- led to a lot of JSON formatting errors.

In [None]:
import os
import json
import re
import csv

def _unescape_json_string(raw):
    """Decode the backslash escapes found inside a JSON/JS string literal.

    The text is decoded as a JSON string first, which handles every standard
    escape (quotes, slashes, control characters, unicode escapes). If the text
    is not valid JSON, only the four escapes the original code handled are
    replaced so the function never fails on unusual input.
    """
    try:
        # strict=False tolerates raw control characters such as real newlines.
        return json.loads('"' + raw + '"', strict=False)
    except ValueError:
        return (
            raw.replace("\\n", "\n")
            .replace("\\t", "\t")
            .replace("\\r", "\r")
            .replace('\\"', '"')
        )


def extract_title_and_body_from_js(file_path):
    """Extract the ``title`` and ``body`` string fields from a .js file.

    The file is read as UTF-8 text and searched for ``"title": "..."`` and
    ``"body": "..."`` entries. Escaped quotes inside a value are treated as
    part of the value, so HTML such as ``<p class=\\"x\\">`` no longer cuts the
    body short at the first attribute.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(title, body)`` tuple of strings with escapes decoded. A missing
        field yields ``"No Title Found"`` / ``"No Body Found"`` respectively.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # A string value is a run of ordinary characters or backslash-escape pairs;
    # matching the pairs explicitly keeps an escaped quote from ending the value.
    string_value = r'"((?:[^"\\]|\\.)+)"'

    title_match = re.search(r'"title"\s*:\s*' + string_value, content, re.DOTALL)
    title = _unescape_json_string(title_match.group(1)) if title_match else "No Title Found"

    body_match = re.search(r'"body"\s*:\s*' + string_value, content, re.DOTALL)
    body = _unescape_json_string(body_match.group(1)) if body_match else "No Body Found"

    return title, body

def process_directory(directory):
    """Collect title/body records for every .js file found under *directory*.

    The tree is walked recursively with ``os.walk``. Each matching file
    contributes one dict with the keys ``file_name`` (base name only),
    ``title`` and ``body``.

    Returns:
        A list of those dicts, in the order the files were visited.
    """
    records = []
    for folder, _subdirs, names in os.walk(directory):
        for name in names:
            if not name.endswith('.js'):
                continue
            title, body = extract_title_and_body_from_js(os.path.join(folder, name))
            records.append({"file_name": name, "title": title, "body": body})
    return records

def save_to_json(data, output_file):
    """Write *data* to *output_file* as UTF-8 JSON indented by four spaces.

    An existing file at *output_file* is overwritten.
    """
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(obj=data, fp=handle, indent=4)

def save_to_csv(data, output_file):
    """Write *data* to *output_file* as a UTF-8 CSV with a header row.

    *data* is an iterable of dicts; the columns are ``file_name``, ``title``
    and ``body``, in that order. An existing file is overwritten.
    """
    columns = ["file_name", "title", "body"]
    # newline='' lets the csv module control line endings itself.
    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        for record in data:
            out.writerow(record)

# Root directory that is searched (recursively) for the .js topic files.
input_directory = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD all 2018\UTDall\d\topics'

# Build one {"file_name", "title", "body"} record per .js file.
extracted_data = process_directory(input_directory)

# Persist the records twice; both paths are relative, so the files land in the
# current working directory.
save_to_json(extracted_data, 'extracted_data.json')
save_to_csv(extracted_data, 'extracted_data.csv')

print("Data extraction complete. Saved to 'extracted_data.json' and 'extracted_data.csv'.")


# Troubleshooting using an html parser:

In [6]:
import re
from bs4 import BeautifulSoup

def _decode_js_string_literal(raw):
    """Decode the backslash escapes inside a JSON/JS string literal.

    The previous approach, ``raw.encode().decode('unicode_escape')``, reads the
    UTF-8 bytes back as Latin-1 and therefore corrupts every non-ASCII
    character (bullets, accented letters, ...). Decoding the text as a JSON
    string keeps those characters intact and also decodes escaped slashes.
    """
    import json  # local import: this cell only imports re and bs4

    try:
        # strict=False tolerates raw control characters such as real newlines.
        return json.loads('"' + raw + '"', strict=False)
    except ValueError:
        # Not valid JSON (e.g. \' or \x41 escapes, or a bare quote). Fall back
        # to Python's escape decoder, but protect non-ASCII text first: code
        # points above U+00FF become backslash-u escapes and the rest map 1:1
        # onto Latin-1 bytes, so nothing is mis-decoded.
        return raw.encode('latin-1', 'backslashreplace').decode('unicode_escape')


def extract_and_preserve_formatting(file_path):
    """Extract the ``body`` field of a topic .js file and return it as plain text.

    The body is located with a regex (everything between ``"body": "`` and the
    following ``","outline"`` key), its escapes are decoded, the HTML is parsed
    with BeautifulSoup and the text is collected element by element. Finally
    the span from the Lexicomp copyright line up to "ALERT: US Boxed Warning"
    is removed.

    Args:
        file_path: Path of the .js file, read as UTF-8.

    Returns:
        The cleaned text, or the string ``"Body content not found."`` when the
        file has no matching ``body`` field.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Extract the raw (still escaped) body content.
    body_match = re.search(r'"body"\s*:\s*"(.*?)",\s*"outline"', content, re.DOTALL)
    if not body_match:
        return "Body content not found."

    body_content = _decode_js_string_literal(body_match.group(1))

    soup = BeautifulSoup(body_content, 'html.parser')

    # Walk every descendant node. `.descendants` is the supported spelling of
    # the deprecated recursiveChildGenerator() alias.
    # NOTE(review): a tag's get_text() and its descendant text nodes both appear
    # to be appended, so text is likely emitted more than once -- confirm.
    formatted_text = []
    for element in soup.descendants:
        if element.name == 'p':
            formatted_text.append("\n" + element.get_text().strip() + "\n")
        elif element.name in ['li', 'span', 'div']:
            formatted_text.append(element.get_text().strip())
        elif element.name == 'br':
            # Whitespace-only entries are dropped by the join below, so this
            # marker does not reach the output.
            formatted_text.append("\n")
        elif element.string:
            text = element.string.strip()
            if text:
                formatted_text.append(text)

    # One entry per line; whitespace-only entries are discarded.
    formatted_output = '\n'.join(line for line in formatted_text if line.strip())

    formatted_output = re.sub(r'\n\s*\n', '\n\n', formatted_output)  # Collapse blank-line runs to one blank line
    formatted_output = re.sub(r'[ \t]+', ' ', formatted_output)  # Collapse runs of spaces/tabs
    formatted_output = formatted_output.strip()

    # Remove everything from the copyright notice through the boxed-warning header.
    exclusion_start = "Copyright 1978-2018 Lexicomp, Inc. All rights reserved."
    exclusion_end = "ALERT: US Boxed Warning"
    exclusion_pattern = re.escape(exclusion_start) + r".*?" + re.escape(exclusion_end)
    formatted_output = re.sub(exclusion_pattern, '', formatted_output, flags=re.DOTALL).strip()

    return formatted_output

# Single topic file used to spot-check the extraction.
test_file_path = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD all 2018\UTDall\d\topics\101.js'

# Run the extraction (including the Lexicomp header exclusion) on that file.
formatted_body_text = extract_and_preserve_formatting(test_file_path)

# Show only the first 1000 characters for a quick visual check.
print(formatted_body_text[:1000])  # Display a portion for quick verification


Acetaminophen (paracetamol), aspirin, and caffeine: Drug informationAccess Lexicomp Online here.Copyright 1978-2018 Lexicomp, Inc. All rights reserved.(For additional information see "Acetaminophen (paracetamol), aspirin, and caffeine: Patient drug information")For abbreviations and symbols that may be used in Lexicomp (show table)Brand Names: US

Anacin Advanced Headache Formula [OTC];
Excedrin Extra Strength [OTC];
Excedrin Migraine [OTC];
Fem-Prin [OTC];
Goody's Extra Strength Headache Powder [OTC];
Goody's Extra Strength Pain Relief [OTC] [DSC];
Pain-Off [OTC];
Vanquish Extra Strength Pain Reliever [OTC]
Pharmacologic Category

 Analgesic, Nonopioid
Dosing: Adult

Minor aches and pain: Oral: 
Acetaminophen 194 mg/aspirin 227 mg/caffeine 33 mg: Two tablets every 6 hours as needed (maximum: 8 tablets per 24 hours) 
Acetaminophen 250 mg/aspirin 250 mg/caffeine 65 mg: Two tablets every 6 hours as needed (maximum: 8 tablets per 24 hours)
Acetaminophen 260 mg/aspirin 520 mg/caffeine 32.5

In [13]:
import re
from bs4 import BeautifulSoup

def _decode_js_string_literal(raw):
    """Decode the backslash escapes inside a JSON/JS string literal.

    The previous approach, ``raw.encode().decode('unicode_escape')``, reads the
    UTF-8 bytes back as Latin-1 and therefore corrupts every non-ASCII
    character (bullets, accented letters, ...). Decoding the text as a JSON
    string keeps those characters intact and also decodes escaped slashes.
    """
    import json  # local import: this cell only imports re and bs4

    try:
        # strict=False tolerates raw control characters such as real newlines.
        return json.loads('"' + raw + '"', strict=False)
    except ValueError:
        # Not valid JSON (e.g. \' or \x41 escapes, or a bare quote). Fall back
        # to Python's escape decoder, but protect non-ASCII text first: code
        # points above U+00FF become backslash-u escapes and the rest map 1:1
        # onto Latin-1 bytes, so nothing is mis-decoded.
        return raw.encode('latin-1', 'backslashreplace').decode('unicode_escape')


def extract_and_preserve_formatting(file_path):
    """Extract the ``body`` field of a topic .js file and return it as plain text.

    The body is located with a regex (everything between ``"body": "`` and the
    following ``","outline"`` key), its escapes are decoded, the HTML is parsed
    with BeautifulSoup and the text is collected element by element. The result
    is then cut off at the first "Disclaimer:" and at the first "REFERENCES:".

    Args:
        file_path: Path of the .js file, read as UTF-8.

    Returns:
        The cleaned text, or the string ``"Body content not found."`` when the
        file has no matching ``body`` field.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Extract the raw (still escaped) body content.
    body_match = re.search(r'"body"\s*:\s*"(.*?)",\s*"outline"', content, re.DOTALL)
    if not body_match:
        return "Body content not found."

    body_content = _decode_js_string_literal(body_match.group(1))

    soup = BeautifulSoup(body_content, 'html.parser')

    # Walk every descendant node. `.descendants` is the supported spelling of
    # the deprecated recursiveChildGenerator() alias.
    # NOTE(review): a tag's get_text() and its descendant text nodes both appear
    # to be appended, so text is likely emitted more than once -- confirm.
    formatted_text = []
    for element in soup.descendants:
        if element.name == 'p':
            formatted_text.append("\n" + element.get_text().strip() + "\n")
        elif element.name in ['li', 'span', 'div']:
            formatted_text.append(element.get_text().strip())
        elif element.name == 'br':
            # Whitespace-only entries are dropped by the join below, so this
            # marker does not reach the output.
            formatted_text.append("\n")
        elif element.string:
            text = element.string.strip()
            if text:
                formatted_text.append(text)

    # One entry per line; whitespace-only entries are discarded.
    formatted_output = '\n'.join(line for line in formatted_text if line.strip())

    formatted_output = re.sub(r'\n\s*\n', '\n\n', formatted_output)  # Collapse blank-line runs to one blank line
    formatted_output = re.sub(r'[ \t]+', ' ', formatted_output)  # Collapse runs of spaces/tabs
    formatted_output = formatted_output.strip()

    # Drop the first 'Disclaimer:' / 'REFERENCES:' marker and everything after it.
    disclaimer_pattern = re.compile(r'Disclaimer:.*', re.DOTALL)
    references_pattern = re.compile(r'REFERENCES:.*', re.DOTALL)
    formatted_output = re.sub(disclaimer_pattern, '', formatted_output).strip()
    formatted_output = re.sub(references_pattern, '', formatted_output).strip()

    return formatted_output

# Single topic file used to spot-check the extraction.
test_file_path = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD all 2018\UTDall\d\topics\101.js'

# Run the extraction (cut off at 'Disclaimer:' / 'REFERENCES:') on that file.
formatted_body_text = extract_and_preserve_formatting(test_file_path)

# Print the result for manual inspection.
print(formatted_body_text[:1000000])  # Up to 1,000,000 characters, i.e. effectively the whole text


Acetaminophen (paracetamol), aspirin, and caffeine: Drug informationAccess Lexicomp Online here.Copyright 1978-2018 Lexicomp, Inc. All rights reserved.(For additional information see "Acetaminophen (paracetamol), aspirin, and caffeine: Patient drug information")For abbreviations and symbols that may be used in Lexicomp (show table)Brand Names: US

Anacin Advanced Headache Formula [OTC];
Excedrin Extra Strength [OTC];
Excedrin Migraine [OTC];
Fem-Prin [OTC];
Goody's Extra Strength Headache Powder [OTC];
Goody's Extra Strength Pain Relief [OTC] [DSC];
Pain-Off [OTC];
Vanquish Extra Strength Pain Reliever [OTC]
Pharmacologic Category

 Analgesic, Nonopioid
Dosing: Adult

Minor aches and pain: Oral: 
Acetaminophen 194 mg/aspirin 227 mg/caffeine 33 mg: Two tablets every 6 hours as needed (maximum: 8 tablets per 24 hours) 
Acetaminophen 250 mg/aspirin 250 mg/caffeine 65 mg: Two tablets every 6 hours as needed (maximum: 8 tablets per 24 hours)
Acetaminophen 260 mg/aspirin 520 mg/caffeine 32.5

# HTML / JavaScript files to TXT files: best approach so far. All code from here on converts the HTML into cleaned .txt files.

In [3]:
import os
import re
from bs4 import BeautifulSoup

def _decode_js_string_literal(raw):
    """Decode the backslash escapes inside a JSON/JS string literal.

    The previous approach, ``raw.encode().decode('unicode_escape')``, reads the
    UTF-8 bytes back as Latin-1 and therefore corrupts every non-ASCII
    character (bullets, accented letters, ...). Decoding the text as a JSON
    string keeps those characters intact and also decodes escaped slashes.
    """
    import json  # local import: this cell only imports os, re and bs4

    try:
        # strict=False tolerates raw control characters such as real newlines.
        return json.loads('"' + raw + '"', strict=False)
    except ValueError:
        # Not valid JSON (e.g. \' or \x41 escapes, or a bare quote). Fall back
        # to Python's escape decoder, but protect non-ASCII text first: code
        # points above U+00FF become backslash-u escapes and the rest map 1:1
        # onto Latin-1 bytes, so nothing is mis-decoded.
        return raw.encode('latin-1', 'backslashreplace').decode('unicode_escape')


def extract_and_preserve_formatting(file_path):
    """Extract the ``body`` field of a topic .js file and return it as plain text.

    The body is located with a regex (everything between ``"body": "`` and the
    following ``","outline"`` key), its escapes are decoded, the HTML is parsed
    with BeautifulSoup and the text is collected element by element. The result
    is then cut off at the first "Disclaimer:" and at the first "REFERENCES".

    Args:
        file_path: Path of the .js file, read as UTF-8.

    Returns:
        The cleaned text, or the string ``"Body content not found."`` when the
        file has no matching ``body`` field.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Extract the raw (still escaped) body content.
    body_match = re.search(r'"body"\s*:\s*"(.*?)",\s*"outline"', content, re.DOTALL)
    if not body_match:
        return "Body content not found."

    body_content = _decode_js_string_literal(body_match.group(1))

    soup = BeautifulSoup(body_content, 'html.parser')

    # Walk every descendant node. `.descendants` is the supported spelling of
    # the deprecated recursiveChildGenerator() alias.
    # NOTE(review): a tag's get_text() and its descendant text nodes both appear
    # to be appended, so text is likely emitted more than once -- confirm.
    formatted_text = []
    for element in soup.descendants:
        if element.name == 'p':
            formatted_text.append("\n" + element.get_text().strip() + "\n")
        elif element.name in ['li', 'span', 'div']:
            formatted_text.append(element.get_text().strip())
        elif element.name == 'br':
            # Whitespace-only entries are dropped by the join below, so this
            # marker does not reach the output.
            formatted_text.append("\n")
        elif element.string:
            text = element.string.strip()
            if text:
                formatted_text.append(text)

    # One entry per line; whitespace-only entries are discarded.
    formatted_output = '\n'.join(line for line in formatted_text if line.strip())

    formatted_output = re.sub(r'\n\s*\n', '\n\n', formatted_output)  # Collapse blank-line runs to one blank line
    formatted_output = re.sub(r'[ \t]+', ' ', formatted_output)  # Collapse runs of spaces/tabs
    formatted_output = formatted_output.strip()

    # Drop the first 'Disclaimer:' / 'REFERENCES' marker and everything after it.
    disclaimer_pattern = re.compile(r'Disclaimer:.*', re.DOTALL)
    references_pattern = re.compile(r'REFERENCES.*', re.DOTALL)
    formatted_output = re.sub(disclaimer_pattern, '', formatted_output).strip()
    formatted_output = re.sub(references_pattern, '', formatted_output).strip()

    return formatted_output

def process_all_js_files(input_directory, output_directory):
    """Convert every .js file under *input_directory* into a .txt file.

    *output_directory* is created when it does not exist yet. The input tree is
    walked recursively; each ``name.js`` is written flat into the output
    directory as ``name.txt`` (UTF-8), holding whatever
    ``extract_and_preserve_formatting`` returns for it.
    """
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    for folder, _subdirs, names in os.walk(input_directory):
        for js_name in (n for n in names if n.endswith('.js')):
            text = extract_and_preserve_formatting(os.path.join(folder, js_name))

            # Same base name, .txt extension, always directly in the output directory.
            stem, _extension = os.path.splitext(js_name)
            target_path = os.path.join(output_directory, stem + '.txt')

            with open(target_path, 'w', encoding='utf-8') as target:
                target.write(text)

# Source tree holding the .js topic files and the folder that receives the .txt output.
input_directory = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD all 2018\UTDall\d\topics'
output_directory = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt'

# Convert every .js file; one .txt file is written per input file.
process_all_js_files(input_directory, output_directory)

print("Processing complete. Formatted files are saved in the output directory.")


Processing complete. Formatted files are saved in the output directory.


In [4]:
import os
import re

def clean_text_file(file_path):
    """Clean one extracted .txt file in place, or delete it if it is too short.

    Steps, in order: strip UpToDate licence boilerplate, drop the disclosures
    block before "INTRODUCTION", remove ``urn:lims:`` identifiers, re-join
    bullets with their text, remove numbered figure/table references, drop
    repeated lines, and remove the remaining copyright/version blocks.

    Args:
        file_path: Path of the UTF-8 text file to rewrite.

    Returns:
        ``"<file_path> cleaned."`` after rewriting the file, or
        ``"<file_path> deleted (less than 1500 words)."`` when the cleaned text
        has fewer than 1500 whitespace-separated words and the file was removed.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Without DOTALL, this removes the licence sentence plus anything before it on the same line.
    content = re.sub(r'.*?Use of UpToDate is subject to the Subscription and License Agreement\.', '', content)

    # Remove text between "Disclosures" and "." before "INTRODUCTION" while keeping "INTRODUCTION"
    content = re.sub(r'Disclosures.*?\.\s*(INTRODUCTION)', r'\1', content, flags=re.DOTALL)

    # Remove "urn:lims:" identifiers
    content = re.sub(r'urn:lims:[a-zA-Z0-9:]*', '', content)

    # Pull the text that follows a bullet ("•", "●", "—") back onto the bullet's line
    content = re.sub(r'([•●—])\s*\n\s*', r'\1 ', content)

    # Remove references such as "figure 1" or "Table 2A". The reference must
    # start with a digit so that ordinary prose ("figure out", "table salt")
    # is left alone.
    content = re.sub(r'\b(?:figure|table) [0-9][0-9a-zA-Z]*\b', '', content, flags=re.IGNORECASE)

    # Keep only the first occurrence of each line, preserving order.
    # NOTE(review): blank lines count as repeats too, so only the first blank
    # line survives -- confirm that flattening paragraphs is intended.
    content = '\n'.join(dict.fromkeys(content.splitlines()))

    # Remove remaining multi-line boilerplate blocks
    content = re.sub(r'All topics are updated as new evidence.*?©2018 UpToDate, Inc\. All rights reserved\.', '', content, flags=re.DOTALL)
    content = re.sub(r'Use of UpToDate is subject to the.*?Version \d+\.\d+', '', content, flags=re.DOTALL)

    # Files that end up with fewer than 1500 words are deleted rather than kept.
    word_count = len(content.split())
    if word_count < 1500:
        os.remove(file_path)
        return f"{file_path} deleted (less than 1500 words)."

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)

    return f"{file_path} cleaned."

def process_all_txt_files(directory):
    """Run ``clean_text_file`` on every .txt file under *directory* and print each status message."""
    for folder, _subdirs, names in os.walk(directory):
        for txt_name in names:
            if not txt_name.endswith('.txt'):
                continue
            print(clean_text_file(os.path.join(folder, txt_name)))

# Folder holding the .txt files produced by the conversion step.
txt_directory = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt'

# Clean every .txt file in place; files under 1500 words are deleted.
process_all_txt_files(txt_directory)


C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\0.txt deleted (less than 1500 words).
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1.txt deleted (less than 1500 words).
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\100.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1000.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10000.txt deleted (less than 1500 words).
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10001.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10002.txt deleted (less than 1500 words).
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10003.txt deleted (less than 1500 words).
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10004.txt deleted (less than 1500 words).
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10005.txt deleted (less than 1500 words).
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2

# Fine-grained, specific cleaning

In [5]:
import os
import re

def normalize_line_for_comparison(line):
    """Return a canonical form of *line* for duplicate detection.

    Leading bullets ("•", "●") and whitespace are removed, every whitespace
    run becomes a single space, and the result is lower-cased and trimmed.
    """
    text = line
    for pattern, replacement in ((r'^[•●\s]*', ''), (r'\s+', ' ')):
        text = re.sub(pattern, replacement, text)
    # Newlines were already folded into spaces above; the replace is kept for parity.
    return text.replace('\n', '').lower().strip()

def remove_sequential_duplicates(content):
    """Drop lines that add nothing to, or merely repeat, their neighbours.

    Two rules are applied while scanning the lines top to bottom:

    * a line is dropped when prepending it to the next line does not change
      the next line's normalized form (i.e. the line normalizes to nothing,
      such as a blank or bullet-only line);
    * a line is dropped when its normalized form equals that of the line
      examined just before it.

    Returns:
        The surviving lines joined with newline characters.
    """
    lines = content.splitlines()
    kept = []
    last_normalized = ""

    for index, line in enumerate(lines):
        current_normalized = normalize_line_for_comparison(line)

        if index + 1 < len(lines):
            following = lines[index + 1]
            merged = normalize_line_for_comparison(line + following)
            # The current line contributes nothing visible: skip it without
            # touching the "previous line" marker.
            if merged == normalize_line_for_comparison(following):
                continue

        if current_normalized != last_normalized:
            kept.append(line)
        last_normalized = current_normalized

    return "\n".join(kept)

def clean_text_file(file_path):
    """Strip UpToDate/Lexicomp boilerplate from a text file and rewrite it in place.

    The boilerplate patterns are applied unconditionally. The earlier
    substring pre-check only looked for a capitalised "Use of UpToDate", so a
    file whose only boilerplate was "The use of UpToDate content is governed
    by ..." skipped every removal. ``re.sub`` leaves text untouched when a
    pattern does not match, so no pre-check is needed.

    After the removals, runs of two or more bullet/dash characters collapse to
    a single "• " and ``remove_sequential_duplicates`` is applied.

    Args:
        file_path: Path of the UTF-8 text file to rewrite.

    Returns:
        None. A "<file_path> cleaned." line is printed.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Applied in this order, each with DOTALL so "." also spans line breaks.
    # NOTE(review): with DOTALL the first pattern removes everything from the
    # start of the text (or the previous match) up to the licence sentence --
    # confirm that this is intended.
    boilerplate_patterns = (
        r'.*?Use of UpToDate is subject to the Subscription and License Agreement\.',
        r'All topics are updated as new evidence.*?©2018 UpToDate, Inc\. All rights reserved\.',
        r'Use of UpToDate is subject to the.*?Version \d+\.\d+',
        r'[^ ]?ACKNOWLEDGMENT\s*—\s*.*?\.',
        r'The use of UpToDate content is governed by the\s*UpToDate Terms of Use\s*\.?\s*©\d{4} UpToDate, Inc\. All rights reserved\.\s*Topic \d+ Version \d+\.\d+',
        r'• • US\s*• • • If you think there has been an overdose, call your poison control center or get medical care right away\.\s*Be ready to tell or show what was taken, how much, and when it happened\.',
        r'If you think there has been an overdose, call your poison control center or get medical care right away\.\s*Be ready to tell or show what was taken, how much, and when it happened\.',
    )
    for pattern in boilerplate_patterns:
        content = re.sub(pattern, '', content, flags=re.DOTALL)

    # Replace multiple bullets (including variations with whitespace) with a single bullet
    content = re.sub(r'(?:[•●\-–—]\s*){2,}', '• ', content)

    # Remove empty/bullet-only lines and consecutive repeats
    content = remove_sequential_duplicates(content)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)

    print(f"{file_path} cleaned.")

def process_all_txt_files(directory):
    """Apply ``clean_text_file`` to every .txt file found anywhere under *directory*."""
    for folder, _subdirs, names in os.walk(directory):
        for txt_name in names:
            if not txt_name.endswith('.txt'):
                continue
            clean_text_file(os.path.join(folder, txt_name))

# Folder holding the .txt files to clean.
txt_directory = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt'

# Rewrite every .txt file in place with boilerplate and repeated lines removed.
process_all_txt_files(txt_directory)


C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\100.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1000.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10001.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10008.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1001.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10014.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10015.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1002.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1003.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10030.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10036.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10037.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10038.

In [6]:
import os
import re

def normalize_line_for_comparison(line):
    """Return a canonical form of *line* for duplicate detection.

    Leading bullets and dashes ("•", "●", "-", "–", "—") and whitespace are
    removed, every whitespace run becomes a single space, and the result is
    lower-cased and trimmed.
    """
    text = line
    for pattern, replacement in ((r'^[•●\-–—\s]*', ''), (r'\s+', ' ')):
        text = re.sub(pattern, replacement, text)
    # Newlines were already folded into spaces above; the replace is kept for parity.
    return text.replace('\n', '').lower().strip()

def remove_sequential_duplicates(content):
    """Drop lines that add nothing to, or merely repeat, their neighbours.

    Two rules are applied while scanning the lines top to bottom:

    * a line is dropped when prepending it to the next line does not change
      the next line's normalized form (i.e. the line normalizes to nothing,
      such as a blank or bullet-only line);
    * a line is dropped when its normalized form equals that of the line
      examined just before it.

    Returns:
        The surviving lines joined with newline characters.
    """
    lines = content.splitlines()
    kept = []
    last_normalized = ""

    for index, line in enumerate(lines):
        current_normalized = normalize_line_for_comparison(line)

        if index + 1 < len(lines):
            following = lines[index + 1]
            merged = normalize_line_for_comparison(line + following)
            # The current line contributes nothing visible: skip it without
            # touching the "previous line" marker.
            if merged == normalize_line_for_comparison(following):
                continue

        if current_normalized != last_normalized:
            kept.append(line)
        last_normalized = current_normalized

    return "\n".join(kept)

def clean_text_file(file_path):
    """Collapse bullet runs and remove repeated lines in a text file, in place.

    Runs of two or more bullet/dash characters become a single "• ", then
    ``remove_sequential_duplicates`` is applied and the file is overwritten.
    Prints "<file_path> cleaned." and returns None.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    single_bullets = re.sub(r'(?:[•●\-–—]\s*){2,}', '• ', raw_text)
    deduplicated = remove_sequential_duplicates(single_bullets)

    with open(file_path, 'w', encoding='utf-8') as target:
        target.write(deduplicated)

    print(f"{file_path} cleaned.")

def process_all_txt_files(directory):
    """Apply ``clean_text_file`` to every .txt file found anywhere under *directory*."""
    for folder, _subdirs, names in os.walk(directory):
        for txt_name in names:
            if not txt_name.endswith('.txt'):
                continue
            clean_text_file(os.path.join(folder, txt_name))

# Folder holding the .txt files to clean.
txt_directory = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt'

# Rewrite every .txt file in place with bullet runs collapsed and repeats removed.
process_all_txt_files(txt_directory)


C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\100.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1000.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10001.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10008.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1001.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10014.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10015.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1002.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1003.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10030.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10036.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10037.txt cleaned.
C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10038.

In [None]:
import os
import re
from difflib import SequenceMatcher

def normalize_line(line):
    """Return *line* without surrounding whitespace and without any leading run of bullets, dashes or whitespace."""
    trimmed = line.strip()
    leading_markers = re.compile(r'^[•●\-\–—\s]*')
    return leading_markers.sub('', trimmed)

def detect_and_remove_redundant_blocks(lines):
    """Drop two-line blocks that repeat the line immediately before them.

    For each position, the normalized current line is compared with the two
    following normalized lines joined by a space. When their
    ``SequenceMatcher`` ratio exceeds 0.8, the current line is kept and the two
    following lines are skipped. Every kept line is returned stripped of
    surrounding whitespace.

    Returns:
        A new list of the kept (stripped) lines.
    """
    kept = []
    total = len(lines)
    position = 0
    while position < total:
        stripped = lines[position].strip()
        step = 1

        # A two-line block can only follow when at least two more lines exist.
        if position + 2 < total:
            head = normalize_line(stripped)
            second = normalize_line(lines[position + 1].strip())
            third = normalize_line(lines[position + 2].strip())
            if SequenceMatcher(None, head, f"{second} {third}").ratio() > 0.8:
                step = 3  # jump past the redundant two-line block

        kept.append(stripped)
        position += step

    return kept

def clean_text_file(content):
    """Return *content* with redundant two-line blocks removed and bullet runs collapsed.

    The text is split into lines, passed through
    ``detect_and_remove_redundant_blocks``, re-joined with newlines, and every
    run of two or more bullet/dash characters is replaced by a single "• ".
    """
    deduplicated_lines = detect_and_remove_redundant_blocks(content.splitlines())
    rejoined = "\n".join(deduplicated_lines)
    return re.sub(r'(?:[•●\-–—]\s*){2,}', '• ', rejoined)

def process_txt_files_in_directory(directory):
    """Clean every .txt file under *directory* in place, printing progress per file.

    Each file is read as UTF-8, passed through ``clean_text_file`` and
    overwritten with the result.
    """
    for folder, _subdirs, names in os.walk(directory):
        for txt_name in names:
            if not txt_name.endswith('.txt'):
                continue

            file_path = os.path.join(folder, txt_name)
            print(f"Processing file: {file_path}")

            with open(file_path, 'r', encoding='utf-8') as source:
                original_text = source.read()

            cleaned_text = clean_text_file(original_text)

            with open(file_path, 'w', encoding='utf-8') as target:
                target.write(cleaned_text)
            print(f"File processed and cleaned: {file_path}")

# Folder holding the .txt files to clean.
txt_directory = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt'

# Rewrite every .txt file in place with redundant two-line blocks removed.
process_txt_files_in_directory(txt_directory)


Processing file: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10.txt
Processing file: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\100.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\100.txt
Processing file: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1000.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1000.txt
Processing file: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10001.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10001.txt
Processing file: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10008.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10008.txt
Processing file: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1001.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\P

In [9]:
import os
import re

def clean_text_file(content):
    """Strip UpToDate legal boilerplate and "picture N" markers from *content*.

    The boilerplate patterns are applied one after another in the order
    listed, each working on whatever the previous one left behind. The
    cleaned text is returned; the input string is not modified.
    """
    boilerplate_patterns = (
        # Terms-of-use notice followed by the "Topic N Version N.N" footer.
        r'The use of UpToDate content is governed by the\s*UpToDate Terms of Use\s*\.?\s*©\d{4} UpToDate, Inc\. All rights reserved\.\s*Topic \d+ Version \d+\.\d+',
        # Subscription / license sentence.
        r'Use of UpToDate is subject to the\s*Subscription and License Agreement\s*\.',
        # "Topic N Version N.N" followed by the terms-of-use notice twice.
        r'Topic \d+ Version \d+\.\d+\s*The use of UpToDate content is governed by the UpToDate Terms of Use\.\s*©\d{4} UpToDate, Inc\. All rights reserved\.\s*The use of UpToDate content is governed by the\s*UpToDate Terms of Use\s*\.?\s*©\d{4} UpToDate, Inc\. All rights reserved\.',
        # Medical-advice disclaimer followed by the terms-of-use notice.
        r'The content on the UpToDate website is not intended nor recommended as a substitute for medical advice, diagnosis, or treatment\.\s*Always seek the advice of your own physician or other qualified health care professional regarding any medical questions or conditions\.\s*The use of UpToDate content is governed by the UpToDate Terms of Use\. ©\d{4} UpToDate, Inc\. All rights reserved\.',
    )
    for pattern in boilerplate_patterns:
        content = re.sub(pattern, '', content, flags=re.DOTALL)

    # "picture 12", "Picture 3", ... in any letter case.
    return re.sub(r'picture \d+', '', content, flags=re.IGNORECASE)

def process_txt_files_in_directory(directory):
    """Clean every ``.txt`` file found under *directory*, rewriting it in place.

    The tree is walked with ``os.walk``. A file whose second line contains
    'Society guideline links:' is deleted instead of cleaned. All other
    files are passed through ``clean_text_file`` and overwritten with the
    result. Progress is reported on stdout.
    """
    for folder, _subdirs, names in os.walk(directory):
        for name in names:
            if not name.endswith('.txt'):
                continue

            path = os.path.join(folder, name)
            with open(path, 'r', encoding='utf-8') as handle:
                text = handle.read()

            # Link-only topics are recognised by their second line and removed outright.
            rows = text.splitlines()
            if len(rows) > 1 and 'Society guideline links:' in rows[1]:
                os.remove(path)
                print(f"File deleted due to 'Society guideline links:' on second line: {path}")
                continue

            # Clean before reopening so a failure in the cleaner cannot truncate the file.
            cleaned = clean_text_file(text)
            with open(path, 'w', encoding='utf-8') as handle:
                handle.write(cleaned)
            print(f"File processed and cleaned: {path}")

# Root folder scanned for .txt files (hard-coded absolute path on the author's Windows machine).
txt_directory = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt'

# Run the cleanup over that folder; files are overwritten in place and some may be deleted.
process_txt_files_in_directory(txt_directory)


File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\100.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1000.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10001.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10008.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1001.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10014.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10015.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1002.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1003.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10030.t

In [1]:
import os
import re

def clean_text_file(content):
    """Repair the title line and drop the Lexicomp copyright notice.

    On the first line only, a space is inserted wherever "Access" or
    "Author" directly follows a word character (e.g. "TitleAuthor" becomes
    "Title Author"). The Lexicomp notice is then removed from the whole text.

    Line terminators are kept exactly as they were: the text is split with
    ``keepends=True`` and re-joined verbatim, so a trailing newline and the
    remaining lines are not altered by the first-line fix.

    Returns the cleaned string; an empty input yields an empty string.
    """
    lines = content.splitlines(keepends=True)
    if lines:
        # Only the title line is known to have the glued "Access"/"Author" problem.
        lines[0] = re.sub(r'(\w)(Access|Author)', r'\1 \2', lines[0])
    content = "".join(lines)

    # Remove the Lexicomp access/copyright sentence wherever it appears.
    content = re.sub(
        r'Access Lexicomp Online here\.Copyright 1978-2018 Lexicomp, Inc\. All rights reserved\.',
        '', content, flags=re.DOTALL
    )

    return content

def process_txt_files_in_directory(directory):
    """Run ``clean_text_file`` over every ``.txt`` file below *directory*.

    Each matching file found by ``os.walk`` is read as UTF-8, cleaned, and
    overwritten with the cleaned text. One progress line per file is
    printed to stdout.
    """
    for parent, _dirs, entries in os.walk(directory):
        txt_paths = [os.path.join(parent, entry) for entry in entries if entry.endswith('.txt')]
        for txt_path in txt_paths:
            with open(txt_path, 'r', encoding='utf-8') as reader:
                raw_text = reader.read()

            # Clean before reopening so a failure in the cleaner cannot truncate the file.
            result = clean_text_file(raw_text)

            with open(txt_path, 'w', encoding='utf-8') as writer:
                writer.write(result)
            print(f"File processed and cleaned: {txt_path}")

# Root folder scanned for .txt files (hard-coded absolute path on the author's Windows machine).
txt_directory = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt'

# Run the cleanup over that folder; every matching file is overwritten in place.
process_txt_files_in_directory(txt_directory)


File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\100.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1000.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10001.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10008.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1001.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10014.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10015.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1002.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1003.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10030.t

In [3]:
import os
import re

def _drop_unmatched_delimiters(text, opener, closer):
    """Return *text* without *opener*/*closer* characters that have no partner."""
    unmatched = set()
    open_positions = []
    for index, char in enumerate(text):
        if char == opener:
            open_positions.append(index)
        elif char == closer:
            if open_positions:
                open_positions.pop()
            else:
                # A closer with nothing open before it.
                unmatched.add(index)
    # Openers still on the stack were never closed.
    unmatched.update(open_positions)
    if not unmatched:
        return text
    return ''.join(char for index, char in enumerate(text) if index not in unmatched)


def clean_text_file(content):
    """Tidy leftover brackets and parentheses in *content*.

    Steps, in order:
      1. remove empty ``[]`` and ``()`` pairs;
      2. remove square brackets and parentheses that have no matching
         partner (the two kinds are balanced independently);
      3. insert a space between ``)`` and a word character that follows it
         directly.

    Only the stray delimiter characters themselves are removed; the
    surrounding text is kept. Returns the cleaned string.
    """
    # Remove empty brackets [] or parentheses ()
    content = re.sub(r'\[\]', '', content)
    content = re.sub(r'\(\)', '', content)

    # Remove unmatched delimiters without touching the text around them.
    content = _drop_unmatched_delimiters(content, '[', ']')
    content = _drop_unmatched_delimiters(content, '(', ')')

    # Add space between a closing parenthesis and a following letter if there's no space
    content = re.sub(r'\)(\w)', r') \1', content)

    return content

def process_txt_files_in_directory(directory):
    """Apply ``clean_text_file`` to each ``.txt`` file under *directory*.

    Files are discovered with ``os.walk``, read as UTF-8, and overwritten
    with the cleaned text. A status line is printed for every file.
    """
    for base, _ignored, filenames in os.walk(directory):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue

            target = os.path.join(base, filename)
            with open(target, 'r', encoding='utf-8') as src:
                before = src.read()

            # Clean before reopening so a failure in the cleaner cannot truncate the file.
            after = clean_text_file(before)

            with open(target, 'w', encoding='utf-8') as dst:
                dst.write(after)
            print(f"File processed and cleaned: {target}")

# Root folder scanned for .txt files (hard-coded absolute path on the author's Windows machine).
txt_directory = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt'

# Run the cleanup over that folder; every matching file is overwritten in place.
process_txt_files_in_directory(txt_directory)


File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\100.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1000.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10001.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10008.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1001.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10014.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10015.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1002.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1003.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10030.t

In [25]:
import os
import re

def clean_text_file_single(file_path):
    """Re-join broken lines in one file and strip unmatched parentheses.

    The file at *file_path* is read as UTF-8, processed, and overwritten.

    Line joining: each line is stripped, then appended to the previous
    logical line when it
      * starts with ``.``, ``,`` or ``)`` (appended with no separator), or
      * starts with a lowercase letter, or with ``(`` directly followed by
        an alphanumeric character (appended after a single space).
    Any other line starts a new logical line. Blank lines are dropped
    because an empty logical line is never emitted.

    Parentheses: within each logical line, every ``(`` or ``)`` that has no
    partner is removed; matched pairs and all other text are kept.

    A progress message is printed once the file has been written.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Combine physical lines into logical lines.
    combined_lines = []
    buffer = ""
    for line in content.splitlines():
        line = line.strip()

        # Leading punctuation belongs to the end of the previous line.
        if line.startswith((".", ",", ")")):
            buffer += line
            continue

        continues_sentence = bool(line) and line[0].islower()
        opens_aside = line.startswith("(") and len(line) > 1 and line[1].isalnum()
        if continues_sentence or opens_aside:
            buffer += " " + line
            continue

        # Anything else starts a new logical line: flush the previous one.
        if buffer:
            combined_lines.append(buffer)
        buffer = line

    if buffer:
        combined_lines.append(buffer)

    def balance_parentheses(line):
        """Return *line* without parentheses that lack a matching partner."""
        unmatched = set()
        open_positions = []
        for index, char in enumerate(line):
            if char == '(':
                open_positions.append(index)
            elif char == ')':
                if open_positions:
                    open_positions.pop()
                else:
                    unmatched.add(index)
        # Track positions so the *unmatched* openers are removed, not merely
        # the first ones that occur in the line.
        unmatched.update(open_positions)
        return ''.join(char for index, char in enumerate(line) if index not in unmatched)

    cleaned_lines = [balance_parentheses(line) for line in combined_lines]

    # Write cleaned content back to the file
    cleaned_content = "\n".join(cleaned_lines)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)

    print(f"File processed and cleaned: {file_path}")

def process_all_txt_files(directory):
    """Call ``clean_text_file_single`` on every ``.txt`` file under *directory*.

    The directory tree is traversed with ``os.walk``; files whose name does
    not end in ``.txt`` are ignored.
    """
    for parent, _subdirs, entries in os.walk(directory):
        for entry in entries:
            if not entry.endswith('.txt'):
                continue
            clean_text_file_single(os.path.join(parent, entry))

# Root folder scanned for .txt files (hard-coded absolute path on the author's Windows machine).
txt_directory = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt'

# Run the per-file cleanup over that folder; every matching file is overwritten in place.
process_all_txt_files(txt_directory)


File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\100.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1000.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10001.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10008.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1001.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10014.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10015.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1002.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1003.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10030.t

In [26]:
import os
import re
from difflib import SequenceMatcher

def normalize_line(line):
    """Return *line* stripped of surrounding whitespace and leading list markers.

    After trimming, any run of bullets (``•``, ``●``), dashes (``-``, ``–``,
    ``—``) and whitespace at the start of the line is removed.
    """
    trimmed = line.strip()
    leading_markers = re.compile(r'^[•●\-\–—\s]*')
    return leading_markers.sub('', trimmed)

def detect_and_remove_redundant_blocks(lines):
    """Drop near-duplicate repeats of a line, plus the lines leading up to them.

    Every kept line is emitted stripped of surrounding whitespace. For each
    line, the following lines are scanned for the first one whose
    normalized text (see ``normalize_line``) has a ``SequenceMatcher``
    ratio above 0.85 against it. When such a repeat is found, the repeat
    and all lines between the two are skipped, and processing resumes on
    the line directly after the repeat.

    Lines that normalize to an empty string (blank or bullet-only lines)
    are never used as the reference for a search: two empty strings compare
    as identical, so treating them as repeats would discard everything
    between consecutive blank lines.

    NOTE(review): the forward search is unbounded, so a repeat far below
    still removes everything in between, and the scan is quadratic in the
    number of lines. Confirm this is acceptable for the input files.

    Args:
        lines: list of text lines.

    Returns:
        A new list with the surviving, stripped lines.
    """
    cleaned_lines = []
    total = len(lines)
    i = 0
    while i < total:
        current_line = lines[i].strip()
        cleaned_lines.append(current_line)
        next_index = i + 1

        reference = normalize_line(current_line)
        if reference:
            for j in range(i + 1, total):
                candidate = normalize_line(lines[j].strip())
                if SequenceMatcher(None, reference, candidate).ratio() > 0.85:
                    # Resume on the line right after the repeat, not one further.
                    next_index = j + 1
                    break

        i = next_index

    return cleaned_lines

def merge_lines_with_period(content):
    """Fold a line into the previous one when a period appears early in it.

    A line is appended to the preceding line (separated by one space) when
    it has at least three whitespace-separated words and one of the first
    three contains a ``.``. The first line is never merged. Lines are
    re-joined with ``\\n``.
    """
    result = []
    for line in content.splitlines():
        tokens = line.split()
        early_period = len(tokens) >= 3 and any('.' in token for token in tokens[:3])
        # `result` is empty only for the very first line, which always starts the output.
        if result and early_period:
            result[-1] = result[-1].rstrip() + ' ' + line.lstrip()
        else:
            result.append(line)

    return "\n".join(result)

def ensure_bullets_start_new_line(content):
    """Insert a newline before every '•' or '●' that is not at the start of a line.

    A bullet counts as already starting a line when it is the first
    character of *content* or directly follows a newline; those bullets are
    left untouched. Returns the adjusted string.
    """
    # Require a preceding non-newline character, so a bullet at position 0
    # does not get a spurious leading blank line.
    return re.sub(r'(?<=[^\n])([•●])', r'\n\1', content)

def add_space_after_period(content):
    """Return *content* with a space inserted between any '.' and an ASCII letter right after it."""
    period_then_letter = re.compile(r'(\.)([A-Za-z])')
    return period_then_letter.sub(r'\1 \2', content)

def clean_text_file(content):
    """Run the full cleanup pipeline on *content* and return the result.

    Steps, in order:
      1. remove two UpToDate boilerplate phrases;
      2. drop repeated lines via ``detect_and_remove_redundant_blocks``;
      3. collapse runs of two or more bullet/dash characters into "• ";
      4. remove a "See" left dangling at the end of a line;
      5. merge lines with ``merge_lines_with_period``;
      6. force bullets onto their own line;
      7. add a space after a period that is directly followed by a letter.
    """
    boilerplate = (
        r'Use of UpToDate is subject to the\s*Subscription and License Agreement',
        r'Written by the doctors and editors at UpToDate\s*',
    )
    for phrase in boilerplate:
        content = re.sub(phrase, '', content)

    # Deduplication works on a list of lines; everything after works on one string.
    deduplicated = detect_and_remove_redundant_blocks(content.splitlines())
    text = "\n".join(deduplicated)

    text = re.sub(r'(?:[•●\-–—]\s*){2,}', '• ', text)
    text = re.sub(r'\bSee\b\s*$', '', text, flags=re.MULTILINE)

    text = merge_lines_with_period(text)
    text = ensure_bullets_start_new_line(text)
    return add_space_after_period(text)

def process_txt_files_in_directory(directory):
    """Clean and deduplicate every ``.txt`` file beneath *directory* in place.

    Files are located with ``os.walk``. For each one a "Processing" line is
    printed, the file is read as UTF-8, passed through ``clean_text_file``,
    overwritten with the result, and a completion line is printed.
    """
    for folder, _children, names in os.walk(directory):
        for name in names:
            if not name.endswith('.txt'):
                continue

            location = os.path.join(folder, name)
            print(f"Processing file: {location}")

            with open(location, 'r', encoding='utf-8') as infile:
                source_text = infile.read()

            # Clean before reopening so a failure in the cleaner cannot truncate the file.
            output_text = clean_text_file(source_text)

            with open(location, 'w', encoding='utf-8') as outfile:
                outfile.write(output_text)

            print(f"File processed and cleaned: {location}")

# Root folder scanned for .txt files (hard-coded absolute path on the author's Windows machine).
txt_directory = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt'

# Run the cleanup over that folder; every matching file is overwritten in place.
process_txt_files_in_directory(txt_directory)


Processing file: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10.txt
Processing file: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\100.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\100.txt
Processing file: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1000.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1000.txt
Processing file: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10001.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10001.txt
Processing file: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10008.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10008.txt
Processing file: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1001.txt
File processed and cleaned: C:\Users\Windows\Desktop\DeepLearning\P

In [27]:
import os
import re

def add_space_after_bullets(file_path):
    """Rewrite *file_path* so each '●' or '•' is separated from the next non-space character.

    The file is read as UTF-8, a space is inserted between a bullet and any
    non-whitespace character directly after it, and the file is
    overwritten. A status line is printed when done.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        original = source.read()

    bullet_then_text = re.compile(r'([●•])(\S)')
    spaced = bullet_then_text.sub(r'\1 \2', original)

    with open(file_path, 'w', encoding='utf-8') as target:
        target.write(spaced)

    print(f"File processed and modified for bullet spacing: {file_path}")

def process_all_txt_files(directory):
    """Call ``add_space_after_bullets`` on every ``.txt`` file under *directory*.

    The directory tree is traversed with ``os.walk``; other files are ignored.
    """
    for parent, _subdirs, entries in os.walk(directory):
        for entry in entries:
            if not entry.endswith('.txt'):
                continue
            add_space_after_bullets(os.path.join(parent, entry))

# Root folder scanned for .txt files (hard-coded absolute path on the author's Windows machine).
txt_directory = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt'

# Fix bullet spacing in every .txt file under that folder; files are overwritten in place.
process_all_txt_files(txt_directory)

File processed and modified for bullet spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10.txt
File processed and modified for bullet spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\100.txt
File processed and modified for bullet spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1000.txt
File processed and modified for bullet spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10001.txt
File processed and modified for bullet spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10008.txt
File processed and modified for bullet spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1001.txt
File processed and modified for bullet spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10014.txt
File processed and modified for bullet spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10015.txt
File processed and modified for bullet spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1002.txt


In [28]:
import os
import re

def remove_space_in_urls(file_path):
    """Rewrite *file_path* with whitespace removed between a '.' and a lowercase letter.

    The file is read as UTF-8, every run of whitespace (newlines included)
    that sits between a period and an ASCII lowercase letter is deleted,
    and the file is overwritten. A status line is printed when done.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        original = source.read()

    period_gap_lowercase = re.compile(r'\.\s+([a-z])')
    rejoined = period_gap_lowercase.sub(r'.\1', original)

    with open(file_path, 'w', encoding='utf-8') as target:
        target.write(rejoined)

    print(f"File processed and modified for URL-like spacing: {file_path}")

def process_all_txt_files(directory):
    """Call ``remove_space_in_urls`` on every ``.txt`` file under *directory*.

    The directory tree is traversed with ``os.walk``; other files are ignored.
    """
    for parent, _subdirs, entries in os.walk(directory):
        for entry in entries:
            if not entry.endswith('.txt'):
                continue
            remove_space_in_urls(os.path.join(parent, entry))

# Root folder scanned for .txt files (hard-coded absolute path on the author's Windows machine).
txt_directory = r'C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt'

# Remove period/lowercase gaps in every .txt file under that folder; files are overwritten in place.
process_all_txt_files(txt_directory)


File processed and modified for URL-like spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10.txt
File processed and modified for URL-like spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\100.txt
File processed and modified for URL-like spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1000.txt
File processed and modified for URL-like spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10001.txt
File processed and modified for URL-like spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10008.txt
File processed and modified for URL-like spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\1001.txt
File processed and modified for URL-like spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10014.txt
File processed and modified for URL-like spacing: C:\Users\Windows\Desktop\DeepLearning\Project\UTD2txt\10015.txt
File processed and modified for URL-like spacing: C:\Users\Windows\Desktop\DeepLearning\Project