In [2]:
import re

# Step 1: Read the raw text file
with open("C:/Users/LENOVE/Desktop/Data Engineers/raw_messy_text.txt", "r", encoding="utf-8") as f:
    raw_text = f.readlines()

print(f"Total lines before cleaning: {len(raw_text)}")


Total lines before cleaning: 58


In [3]:
# Step 2: Remove empty lines and strip extra spaces
cleaned = [line.strip() for line in raw_text if line.strip() != ""]

In [4]:
# Step 3: Remove HTML tags
cleaned = [re.sub(r"<.*?>", "", line) for line in cleaned]

In [5]:
# Step 4: Normalize multiple spaces and tabs
cleaned = [re.sub(r"\s+", " ", line) for line in cleaned]

In [7]:
# Step 5: Remove duplicate lines
cleaned = list(dict.fromkeys(cleaned))

# Step 6: Remove random punctuation clutter (like ...., !!!, --- etc.)
cleaned = [re.sub(r"[.]{2,}|[!?]{2,}|[-=]{2,}", ".", line) for line in cleaned]

# Step 7: (Optional) Remove lines with unwanted symbols or noise
# e.g. remove lines starting with '---', '>>>', or containing broken JSON
cleaned = [line for line in cleaned if not re.match(r"^[-=]{2,}|^>>>", line)]
cleaned = [line for line in cleaned if not re.search(r"Broken JSON", line)]

# Step 8: Save the cleaned output to a new text file
with open("C:/Users/LENOVE/Desktop/Data Engineers/cleaned_text.txt", "w", encoding="utf-8") as f:
    for line in cleaned:
        f.write(line + "\n")

print("✅ Cleaning complete!")
print(f"Total lines after cleaning: {len(cleaned)}")
print("Cleaned file saved as: cleaned_text.txt")

✅ Cleaning complete!
Total lines after cleaning: 36
Cleaned file saved as: cleaned_text.txt


In [8]:
import re

# Load cleaned text
with open("C:/Users/LENOVE/Desktop/Data Engineers/cleaned_text.txt", "r", encoding="utf-8") as f:
    text = f.read()


In [9]:
emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
print("Emails found:", emails)


Emails found: ['john.doe@example.com', 'jane_smith@domain.co.uk']


In [10]:
phones = re.findall(r"\+?\d[\d().\-\s]{7,}\d", text)
print("Phone numbers found:", phones)


Phone numbers found: ['2025-10-01 08', '2025-10-02 09', '10\n2025-10-02 09', '+1 (555) 123-4567\n555.123.4568']


In [11]:
urls = re.findall(r"https?://[^\s]+", text)
print("URLs found:", urls)


URLs found: ['https://example.com/page?ref=abc', 'http://localhost:8000/test']


In [12]:
dates = re.findall(r"\d{4}-\d{2}-\d{2}", text)
times = re.findall(r"\d{2}:\d{2}:\d{2}", text)

print("Dates found:", dates)
print("Times found:", times)


Dates found: ['2025-10-01', '2025-10-02', '2025-10-02']
Times found: ['08:15:23', '09:00:00', '09:00:01']


In [13]:
with open("C:/Users/LENOVE/Desktop/Data Engineers/extracted_data.txt", "w", encoding="utf-8") as f:
    f.write("Emails:\n" + "\n".join(emails) + "\n\n")
    f.write("Phone Numbers:\n" + "\n".join(phones) + "\n\n")
    f.write("URLs:\n" + "\n".join(urls) + "\n\n")
    f.write("Dates:\n" + "\n".join(dates) + "\n\n")
    f.write("Times:\n" + "\n".join(times) + "\n\n")

print("✅ Extraction complete! File saved as extracted_data.txt")

✅ Extraction complete! File saved as extracted_data.txt


In [14]:
import re
import csv

# Step 1: Load the text file
with open("cleaned_text.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Step 2: Define regex patterns for extraction
emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
phones = re.findall(r"\+?\d[\d().\-\s]{7,}\d", text)
urls = re.findall(r"https?://[^\s]+", text)
dates = re.findall(r"\d{4}-\d{2}-\d{2}", text)
times = re.findall(r"\d{2}:\d{2}:\d{2}", text)

# Step 3: Prepare to save extracted data
# (We'll pad lists so all columns have same length)
max_len = max(len(emails), len(phones), len(urls), len(dates), len(times))
def pad(lst):
    return lst + [""] * (max_len - len(lst))

emails, phones, urls, dates, times = map(pad, [emails, phones, urls, dates, times])

# Step 4: Save into CSV
with open("C:/Users/LENOVE/Desktop/Data Engineers/extracted_summary.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Email", "Phone", "URL", "Date", "Time"])
    for i in range(max_len):
        writer.writerow([
            emails[i],
            phones[i],
            urls[i],
            dates[i],
            times[i]
        ])

print("✅ Data extracted and saved as extracted_summary.csv!")


✅ Data extracted and saved as extracted_summary.csv!


In [4]:
import re
import csv
import os

# === CONFIGURATION ===
input_folder = r"C:/Users/LENOVE/Desktop/Data Engineers/Python/Text clean file"
output_folder = r"C:/Users/LENOVE/Desktop/Data Engineers/Python/Text clean file"

# Create folders if not exist
os.makedirs(input_folder, exist_ok=True)
os.makedirs(output_folder, exist_ok=True)

# === FUNCTION TO EXTRACT DATA FROM TEXT ===
def extract_from_text(text):
    emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    phones = re.findall(r"\+?\d[\d().\-\s]{7,}\d", text)
    urls = re.findall(r"https?://[^\s]+", text)
    dates = re.findall(r"\d{4}-\d{2}-\d{2}", text)
    times = re.findall(r"\d{2}:\d{2}:\d{2}", text)

    print("Emails found:", emails)
    print("Phones found:", phones)
    print("URLs found:", urls)
    print("Dates found:", dates)
    print("Times found:", times)

    max_len = max(len(emails), len(phones), len(urls), len(dates), len(times), 1)
    def pad(lst): return lst + [""] * (max_len - len(lst))
    return pad(emails), pad(phones), pad(urls), pad(dates), pad(times)

# === MAIN PIPELINE ===
print("Full path being used:", os.path.abspath(input_folder))
print("Files detected:", os.listdir(input_folder))
print("\n🔍 Starting extraction process...\n")

for filename in os.listdir(input_folder):
    if filename.endswith(".txt") and "cleaned" not in filename.lower() and "extract" not in filename.lower():
        file_path = os.path.join(input_folder, filename)
        print(f"\nProcessing: {file_path}")

        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

        print("File size:", len(text))
        print("File preview:", text[:200])

        emails, phones, urls, dates, times = extract_from_text(text)

        csv_name = filename.replace(".txt", "_extracted.csv")
        csv_path = os.path.join(output_folder, csv_name)

        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["Email", "Phone", "URL", "Date", "Time"])
            for i in range(len(emails)):
                writer.writerow([emails[i], phones[i], urls[i], dates[i], times[i]])

        if any([emails, phones, urls, dates, times]):
            print(f"✅ Data extracted and saved to: {csv_path}")
        else:
            print(f"⚠️ No matches found in: {filename}")

print("\n🎯 All eligible files processed successfully!")


Full path being used: C:\Users\LENOVE\Desktop\Data Engineers\Python\Text clean file
Files detected: ['.ipynb_checkpoints', 'Cleaned,extract file.ipynb', 'cleaned_text.txt', 'extracted_data.txt', 'extracted_summary.csv', 'raw_messy_text.txt']

🔍 Starting extraction process...


Processing: C:/Users/LENOVE/Desktop/Data Engineers/Python/Text clean file\raw_messy_text.txt
File size: 1172
File preview: ERROR: 2025-10-01 08:15:23 - Failed to connect to DB (timeout)
User:  john.doe@example.com   Action: login
----
<html><body><h1>Report</h1><p>Sales increased by 5%.</p></body></html>

Todo:   fix,norm
Emails found: ['john.doe@example.com', 'jane_smith@domain.co.uk', 'jane_smith@domain.co.uk']
Phones found: ['2025-10-01 08', '2025-10-02 09', '10\n2025-10-02 09', '+1 (555) 123-4567\n555.123.4568']
URLs found: ['https://example.com/page?ref=abc', 'http://localhost:8000/test']
Dates found: ['2025-10-01', '2025-10-02', '2025-10-02']
Times found: ['08:15:23', '09:00:00', '09:00:01']
✅ Data extracte

In [1]:
pip install schedule

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
import re
import csv
import os
import time
import schedule

# === CONFIGURATION ===
input_folder = r"C:/Users/LENOVE/Desktop/Data Engineers/Python/Text clean file"
output_folder = r"C:/Users/LENOVE/Desktop/Data Engineers/Python/Text clean file"

# Create folders if not exist
os.makedirs(input_folder, exist_ok=True)
os.makedirs(output_folder, exist_ok=True)

# === FUNCTION TO EXTRACT DATA FROM TEXT ===
def extract_from_text(text):
    emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    phones = re.findall(r"\+?\d[\d().\-\s]{7,}\d", text)
    urls = re.findall(r"https?://[^\s]+", text)
    dates = re.findall(r"\d{4}-\d{2}-\d{2}", text)
    times = re.findall(r"\d{2}:\d{2}:\d{2}", text)

    max_len = max(len(emails), len(phones), len(urls), len(dates), len(times), 1)
    def pad(lst): return lst + [""] * (max_len - len(lst))
    return pad(emails), pad(phones), pad(urls), pad(dates), pad(times)

# === MAIN PIPELINE FUNCTION ===
def run_pipeline():
    print("\n===============================")
    print("🚀 Running automated extraction job...")
    print("===============================\n")

    files_processed = 0

    for filename in os.listdir(input_folder):
        # ✅ Process only .txt files that aren’t cleaned/extracted
        if filename.endswith(".txt") and "cleaned" not in filename.lower() and "extract" not in filename.lower():
            file_path = os.path.join(input_folder, filename)
            print(f"Processing: {file_path}")

            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()

            emails, phones, urls, dates, times = extract_from_text(text)

            # ✅ Generate fixed output file name (overwrite if exists)
            csv_name = filename.replace(".txt", "_extracted.csv")
            csv_path = os.path.join(output_folder, csv_name)

            with open(csv_path, "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(["Email", "Phone", "URL", "Date", "Time"])
                for i in range(len(emails)):
                    writer.writerow([emails[i], phones[i], urls[i], dates[i], times[i]])

            print(f"✅ File saved/updated: {csv_path}")
            files_processed += 1

    if files_processed == 0:
        print("⚠️ No eligible text files found to process.")
    else:
        print(f"🎯 {files_processed} file(s) processed successfully!\n")

# === SCHEDULING ===
schedule.every(5).minutes.do(run_pipeline)  # Run every 5 minutes

print("⏰ Automation started! The extraction will run every 5 minutes.\n")

# Run once immediately at start
run_pipeline()

# Keep running indefinitely
while True:
    schedule.run_pending()
    time.sleep(1)


⏰ Automation started! The extraction will run every 5 minutes.


🚀 Running automated extraction job...

Processing: C:/Users/LENOVE/Desktop/Data Engineers/Python/Text clean file\raw_messy_text.txt
✅ File saved/updated: C:/Users/LENOVE/Desktop/Data Engineers/Python/Text clean file\raw_messy_text_extracted.csv
🎯 1 file(s) processed successfully!


🚀 Running automated extraction job...

Processing: C:/Users/LENOVE/Desktop/Data Engineers/Python/Text clean file\raw_messy_text.txt
✅ File saved/updated: C:/Users/LENOVE/Desktop/Data Engineers/Python/Text clean file\raw_messy_text_extracted.csv
🎯 1 file(s) processed successfully!


🚀 Running automated extraction job...

Processing: C:/Users/LENOVE/Desktop/Data Engineers/Python/Text clean file\raw_messy_text.txt
✅ File saved/updated: C:/Users/LENOVE/Desktop/Data Engineers/Python/Text clean file\raw_messy_text_extracted.csv
🎯 1 file(s) processed successfully!


🚀 Running automated extraction job...

Processing: C:/Users/LENOVE/Desktop/Data Engine