# Data Preprocessing Pipeline
This notebook contains the complete data preprocessing code, split into cells so that each stage can be run and checked step by step.

### 1. Document Loader Code
Run this cell to define the code that loads `.txt` or `.pdf` files.

In [None]:
import os
import PyPDF2

def load_text_from_file(file_path: str) -> str:
    """
    Loads the textual content of a ``.txt`` or ``.pdf`` document.

    Plain-text files are decoded as UTF-8, silently dropping any bytes that
    cannot be decoded. PDF files are read page by page with PyPDF2; the text
    of every page that yields any is kept, followed by a newline.

    Args:
        file_path (str): Location of the document on disk.
    Returns:
        str: The document text. If reading a PDF fails, the error is printed
            and whatever text was collected before the failure is returned
            (possibly an empty string).
    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
        ValueError: If the extension is neither ``.txt`` nor ``.pdf``
            (compared case-insensitively).
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} was not found.")

    suffix = os.path.splitext(file_path)[1].lower()

    # Reject anything we cannot parse before touching the file contents.
    if suffix not in ('.txt', '.pdf'):
        raise ValueError(f"Unsupported file format: {suffix}. Only .txt and .pdf are supported.")

    if suffix == '.txt':
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            return handle.read()

    # PDF: gather per-page text; pages with no extractable text are skipped.
    page_texts = []
    try:
        with open(file_path, 'rb') as handle:
            for page in PyPDF2.PdfReader(handle).pages:
                content = page.extract_text()
                if content:
                    page_texts.append(content + "\n")
    except Exception as err:
        # Best effort: report the problem and fall through with the pages
        # that were read successfully.
        print(f"Error reading PDF {file_path}: {err}")
    return "".join(page_texts)


### 2. Text Cleaner Code
Run this cell to define the text cleaning functions (NLTK downloads, lowercasing, etc.).

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Make sure the NLTK data used by the text cleaner is available locally,
# downloading a package only when its lookup fails. 'punkt_tab' is included
# because newer NLTK versions may need it for tokenization.
for _resource_path, _package in (
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)

def clean_text(text: str) -> str:
    """
    Normalizes raw text into a lowercase, stopword-free string.

    Processing steps, in order:
    - Lowercase the input.
    - Delete every character that is not an ASCII letter, a digit, or
      whitespace (the character is removed, not replaced by a space).
    - Tokenize with NLTK's ``word_tokenize``.
    - Drop tokens found in NLTK's English stopword list.
    - Join the remaining tokens with single spaces.

    Args:
        text (str): The raw text to clean.

    Returns:
        str: The cleaned text, or an empty string when ``text`` is empty
            or otherwise falsy.
    """
    if not text:
        return ""

    # Lowercase, then strip everything except alphanumerics and whitespace.
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())

    words = word_tokenize(stripped)

    # A set gives constant-time membership checks while filtering.
    english_stopwords = set(stopwords.words('english'))
    kept_words = filter(lambda word: word not in english_stopwords, words)

    # Collapse any leftover whitespace runs and trim the ends.
    return re.sub(r'\s+', ' ', ' '.join(kept_words)).strip()


### 3. Segmenter Code
Run this cell to define the clause segmentation logic.

In [None]:
import re
from typing import List

def segment_into_clauses(text: str) -> List[str]:
    """
    Segments a full contract text into individual clauses.

    This uses a basic heuristic approach:
    1. Split the text into paragraphs on blank lines.
    2. Inside each paragraph, start a new clause at every line that begins
       with a numbering marker such as "1.", "1.1", "a)" or "iv.".
    3. Discard fragments of three words or fewer, which are unlikely to be
       real clauses.

    The marker that introduces a clause is kept at the start of that clause.
    Complex legal documents might need more robust parsing.

    Args:
        text (str): The full raw text of the contract.

    Returns:
        List[str]: A list of segmented clauses, each stripped of surrounding
            whitespace. Empty when ``text`` is empty.
    """
    if not text:
        return []

    # Blank lines (optionally containing whitespace) separate paragraphs.
    paragraph_break = re.compile(r'\n\s*\n')

    # Line-leading numbering markers. The single capturing group makes
    # re.split() return the markers too, interleaved with the text.
    delimiter = re.compile(r'(?m)(^\s*\d+\.\d*\s*|^\s*[a-z]\)\s*|^\s*[ivx]+\.\s*)')

    clauses = []

    for para in paragraph_break.split(text):
        para = para.strip()
        if not para:
            continue

        # With exactly one capturing group the result always alternates
        # [text, marker, text, marker, ..., text]: markers sit at odd indices.
        parts = delimiter.split(para)

        current_clause = ""
        for index, part in enumerate(parts):
            # Classify by position rather than by re-matching the pattern:
            # clause text may itself begin with marker-like content
            # (e.g. "2.5 percent" or "a) ...") and must not be mistaken for
            # a marker, which would detach the real marker from its clause.
            if index % 2 == 1:
                # A marker closes the clause in progress and opens a new one.
                if current_clause:
                    clauses.append(current_clause.strip())
                current_clause = part
            else:
                current_clause += part

        if current_clause:
            clauses.append(current_clause.strip())

    # Filter out very short strings that are likely not real clauses.
    return [clause for clause in clauses if len(clause.split()) > 3]


### 4. Demo Execution
This cell executes the pipeline on the sample contract data.

In [None]:
# End-to-end demo: load a document, split it into clauses, then show the raw
# and cleaned form of the first few clauses.

# Relative path; it is resolved against the current working directory.
filepath = 'data/sample_contract.txt'

# Step 1: read the document's raw text. load_text_from_file raises if the
# file is missing or its extension is not .txt/.pdf.
print(f"Loading document from: {filepath}...")
raw_text = load_text_from_file(filepath)
print(f"\nDocument loaded successfully. Extracted {len(raw_text)} characters.")
print("-" * 50)

# Step 2: segment the raw (uncleaned) text, since the segmenter relies on
# line breaks and numbering markers.
print("Segmenting into clauses...")
clauses = segment_into_clauses(raw_text)
print(f"Found {len(clauses)} potential clauses.")
print("-" * 50)

# Step 3: clean at most the first five clauses and print each one before and
# after cleaning. Both previews are truncated to 150 characters.
print("Preprocessing top 5 clauses:")
for i, clause in enumerate(clauses[:5], 1):
    cleaned = clean_text(clause)
    print(f"\nClause {i} (Raw):")
    print(f"  {clause[:150]}")
    print(f"Clause {i} (Cleaned for ML):")
    print(f"  {cleaned[:150]}")
