### Handling Unstructured Data with Python
**Description**: Extract structured data from unstructured text using Python.

**Steps**:
1. Load and analyze an unstructured text document.
2. Extract information using regex.

In [None]:
# write your code from here

In [2]:
import re
import os

def read_file(file_path):
    """Reads content from a file with error handling."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File '{file_path}' does not exist.")
    
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        if not content.strip():
            raise ValueError("File is empty.")
        return content

def extract_emails(text):
    return re.findall(r'\b[\w.-]+?@\w+?\.\w+?\b', text)

def extract_phone_numbers(text):
    return re.findall(r'\b(?:\+?\d{1,2}\s?)?(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b', text)

def extract_dates(text):
    return re.findall(r'\b(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s\d{1,2},\s\d{4})\b', text)

def extract_names(text):
    return re.findall(r'\b[A-Z][a-z]+\s[A-Z][a-z]+\b', text)

def main(file_path):
    try:
        text = read_file(file_path)
        print("Emails:", extract_emails(text))
        print("Phone Numbers:", extract_phone_numbers(text))
        print("Dates:", extract_dates(text))
        print("Names:", extract_names(text))
    except Exception as e:
        print("Error:", e)

# Example usage
if __name__ == "__main__":
    main('sample_unstructured_text.txt')  # Replace with your file


Error: File 'sample_unstructured_text.txt' does not exist.


In [3]:
import unittest

class TestRegexExtractors(unittest.TestCase):
    def setUp(self):
        self.text = """
        John Doe, born on 12/05/1990, contacted via john.doe@example.com or (123) 456-7890.
        He submitted the application on March 10, 2024. Reach out to jane_smith@work.net or +91 987-654-3210.
        """

    def test_email_extraction(self):
        self.assertEqual(
            extract_emails(self.text),
            ['john.doe@example.com', 'jane_smith@work.net']
        )

    def test_phone_number_extraction(self):
        self.assertIn('(123) 456-7890', extract_phone_numbers(self.text))
        self.assertIn('+91 987-654-3210', extract_phone_numbers(self.text))

    def test_date_extraction(self):
        self.assertIn('12/05/1990', extract_dates(self.text))
        self.assertIn('March 10, 2024', extract_dates(self.text))

    def test_name_extraction(self):
        self.assertIn('John Doe', extract_names(self.text))
        self.assertIn('Jane Smith', extract_names(self.text))  # Edge: try to test lowercase/variants

if __name__ == '__main__':
    unittest.main()


usage: ipykernel_launcher.py [-h] [-v] [-q] [--locals] [-f] [-c] [-b]
                             [-k TESTNAMEPATTERNS]
                             [tests ...]
ipykernel_launcher.py: error: argument -f/--failfast: ignored explicit argument '/home/vscode/.local/share/jupyter/runtime/kernel-v35cc5f0b4f308cf5185c4048ae41d822ff181db2d.json'


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
