# 01 - Data Ingestion & Metadata Extraction

This notebook ingests the lifecycle catalog, resolves file locations, extracts metadata, and materializes a curated feature table for downstream modeling.

## How to use
1. Update `EXCEL_PATH` and `RAW_BASE_DIR` in the configuration cell if needed.
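2. Run the notebook to extract features and write `artifacts/metadata_features.csv` (an optional `artifacts/metadata_features.parquet` is also written when a Parquet engine is installed).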
3. Inspect the preview cells to validate coverage, missing files, and extracted text.

In [None]:
# Optional: install dependencies if they are not already available in your environment.
# %pip install pandas openpyxl python-magic-bin python-docx pdfminer.six python-pptx tqdm

In [None]:
from __future__ import annotations

import re
import sys
from pathlib import Path
from typing import Iterable

import numpy as np
import pandas as pd

try:
    import magic  # type: ignore
except ImportError:
    magic = None  # optional dependency, used for MIME detection when available

try:
    from docx import Document  # type: ignore
except ImportError:
    Document = None

try:
    from pptx import Presentation  # type: ignore
except ImportError:
    Presentation = None

try:
    from pdfminer.high_level import extract_text as pdf_extract_text  # type: ignore
except ImportError:
    pdf_extract_text = None

try:
    from tqdm.auto import tqdm  # type: ignore
    TQDM_AVAILABLE = True
except ImportError:
    TQDM_AVAILABLE = False

# Notebook-wide pandas display settings: cap printed rows, never hide columns,
# and allow wider console output.
pd.options.display.max_rows = 20
pd.options.display.max_columns = None
pd.options.display.width = 120

In [None]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Inputs: the project root is the parent of the notebook's working directory.
PROJECT_ROOT = Path('..').resolve()
EXCEL_PATH = PROJECT_ROOT.joinpath('assets', 'data_lifecycle.xlsx')  # TODO: update if the catalog lives elsewhere
RAW_BASE_DIR = PROJECT_ROOT  # Base directory to resolve entries from `File Path`

# Catalog headers that must be present in the workbook.
CATALOG_COLUMNS = ['Original File Path', 'File Path', 'Business Capability']

# Outputs: created next to the notebook on first run.
OUTPUT_DIR = Path.cwd().joinpath('artifacts')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DATA_PATH = OUTPUT_DIR.joinpath('metadata_features.csv')

print(f'Excel catalog path: {EXCEL_PATH}')
print(f'File base directory: {RAW_BASE_DIR}')

In [None]:
def load_catalog(path: Path, expected_columns: Iterable[str]) -> pd.DataFrame:
    """Load the lifecycle catalog workbook and return its normalized core columns.

    Parameters
    ----------
    path
        Location of the Excel catalog.
    expected_columns
        Column headers that must be present in the workbook. Any iterable is
        accepted, including a one-shot generator.

    Returns
    -------
    pd.DataFrame
        Only the expected columns, in the order given, with the known headers
        renamed to ``original_file_path``, ``file_path`` and
        ``business_capability`` and their values converted to stripped strings.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    ValueError
        If any expected column is missing from the workbook.
    """
    if not path.exists():
        raise FileNotFoundError(f'Catalog not found: {path}')

    # Materialize once: the names are needed for both validation and selection,
    # and a generator would already be exhausted by the second pass.
    required_columns = list(expected_columns)

    df = pd.read_excel(path)
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise ValueError(f'Missing columns in catalog: {missing}')

    df = df[required_columns].copy()
    df = df.rename(columns={
        'Original File Path': 'original_file_path',
        'File Path': 'file_path',
        'Business Capability': 'business_capability',
    })
    # NOTE(review): astype(str) renders empty cells as the literal text 'nan';
    # downstream cells currently rely on every value being a string.
    for column in ('original_file_path', 'file_path', 'business_capability'):
        df[column] = df[column].astype(str).str.strip()
    return df

# Load the catalog once and preview the first rows to sanity-check the parse.
catalog_df = load_catalog(path=EXCEL_PATH, expected_columns=CATALOG_COLUMNS)
catalog_df.head()

In [None]:
def resolve_path(raw_value: str, base_dir: Path) -> Path:
    """Turn a catalog path entry into a resolved ``Path``.

    Absolute entries are used as given; relative entries are joined onto
    ``base_dir``. The result is passed through ``Path.resolve()``.
    """
    entry = Path(raw_value)
    anchored = entry if entry.is_absolute() else base_dir / entry
    return anchored.resolve()


def to_keywords(path: Path) -> str:
    """Flatten a path into a lower-case, space-separated keyword string.

    Each path component has every run of characters outside ``A-Z``, ``a-z``
    and ``0-9`` replaced by a single space, is stripped and lower-cased;
    components that end up empty are dropped.
    """
    normalized_parts = (
        re.sub(r'[^A-Za-z0-9]+', ' ', segment).strip().lower()
        for segment in path.parts
    )
    return ' '.join(piece for piece in normalized_parts if piece)


def path_depth(path: Path) -> int:
    """Return how many components ``path.parts`` contains."""
    components = path.parts
    return len(components)

# Resolve both catalog path columns against the configured base directory.
catalog_df['resolved_path'] = catalog_df['file_path'].apply(lambda value: resolve_path(value, RAW_BASE_DIR))
catalog_df['original_resolved_path'] = catalog_df['original_file_path'].apply(lambda value: resolve_path(value, RAW_BASE_DIR))

# Only regular files count as present. `exists()` is also true for directories
# (e.g. a blank entry that resolves to the base directory), which the text
# extractor treats as missing.
catalog_df['file_exists'] = catalog_df['resolved_path'].apply(lambda p: p.is_file())

# Name-based features of the resolved location.
catalog_df['extension'] = catalog_df['resolved_path'].apply(lambda p: p.suffix.lower())
catalog_df['file_name'] = catalog_df['resolved_path'].apply(lambda p: p.name)
catalog_df['file_stem'] = catalog_df['resolved_path'].apply(lambda p: p.stem)

# Depth and keyword features for both the current and the original location.
catalog_df['path_depth'] = catalog_df['resolved_path'].apply(path_depth)
catalog_df['original_path_depth'] = catalog_df['original_resolved_path'].apply(path_depth)
catalog_df['path_keywords'] = catalog_df['resolved_path'].apply(to_keywords)
catalog_df['original_path_keywords'] = catalog_df['original_resolved_path'].apply(to_keywords)
catalog_df['path_token_count'] = catalog_df['path_keywords'].str.split().map(len)

catalog_df.head()

In [None]:
# Suffixes (lower-case, with leading dot) that are read directly as text.
TEXT_EXTENSIONS = {
    # plain text and tabular data
    '.txt', '.csv', '.tsv', '.log',
    # structured data and configuration
    '.json', '.xml', '.yaml', '.yml', '.ini', '.cfg', '.conf',
    # documentation markup
    '.md', '.rst',
    # source code and web assets
    '.sql', '.py', '.java', '.js', '.html', '.htm', '.css',
}

# Upper bound on the number of characters kept per extracted document.
MAX_CHAR_LENGTH = 50_000


def read_text_file(path: Path) -> str:
    """Read ``path`` as text, preferring UTF-8 and falling back to latin-1.

    The UTF-8 attempt is strict so that a file in another encoding actually
    triggers the latin-1 fallback instead of silently losing its undecodable
    bytes. latin-1 maps every byte to a character, so the fallback cannot fail
    on decoding.

    Returns
    -------
    str
        The decoded file content, or ``''`` when the file cannot be opened or
        read.
    """
    try:
        return path.read_text(encoding='utf-8')
    except UnicodeDecodeError:
        # Not valid UTF-8: retry below with an encoding that accepts any byte.
        pass
    except (OSError, ValueError):
        # Unreadable file or unusable path: best-effort callers expect ''.
        return ''

    try:
        return path.read_text(encoding='latin-1')
    except (OSError, ValueError):
        return ''


def extract_text_from_file(path: Path) -> str:
    """Best-effort plain-text extraction for a single catalogued file.

    Dispatches on the lower-cased file suffix:

    * suffixes in ``TEXT_EXTENSIONS`` are read directly as text;
    * ``.xls``/``.xlsx`` workbooks contribute a ``__sheet__: <name>`` marker per
      sheet followed by the non-null cell values of each parsed row
      (``nrows=200`` per sheet);
    * ``.pdf``, ``.docx`` and ``.pptx`` use ``pdf_extract_text``, ``Document``
      and ``Presentation`` when those optional imports succeeded;
    * anything else is read naively as text, unless the optional ``magic``
      module is available and reports a MIME type that is not text-like.

    Any error raised during extraction yields an empty string instead of
    propagating.

    Returns
    -------
    str
        Extracted text with whitespace runs collapsed to single spaces and
        truncated to ``MAX_CHAR_LENGTH`` characters; ``''`` when ``path`` is
        not an existing regular file or nothing could be extracted.
    """
    # is_file() is False for missing paths as well as for directories.
    if not path.is_file():
        return ''

    suffix = path.suffix.lower()
    text_content = ''

    try:
        if suffix in TEXT_EXTENSIONS:
            text_content = read_text_file(path)

        elif suffix in {'.xls', '.xlsx'}:
            sheets = pd.read_excel(path, sheet_name=None, dtype=str, nrows=200)
            text_chunks = []
            for sheet_name, sheet_df in sheets.items():
                text_chunks.append(f'__sheet__: {sheet_name}')
                for row in sheet_df.itertuples(index=False, name=None):
                    row_text = ' '.join(str(value) for value in row if pd.notna(value))
                    if row_text:
                        text_chunks.append(row_text)
            text_content = '\n'.join(text_chunks)

        elif suffix == '.pdf' and pdf_extract_text is not None:
            text_content = pdf_extract_text(str(path))

        elif suffix == '.docx' and Document is not None:
            document = Document(str(path))
            text_content = '\n'.join(
                paragraph.text for paragraph in document.paragraphs if paragraph.text
            )

        elif suffix == '.pptx' and Presentation is not None:
            presentation = Presentation(str(path))
            text_content = '\n'.join(
                shape.text
                for slide in presentation.slides
                for shape in slide.shapes
                if hasattr(shape, 'text') and shape.text
            )

        else:
            # Unknown suffix, or a known one whose optional parser is missing.
            # When MIME detection is available and positively reports a
            # non-text type, skip the naive read so binary payloads are not
            # decoded into noise. If detection is unavailable or fails, keep
            # the naive read as the best-effort fallback.
            skip_naive_read = False
            if magic is not None:
                try:
                    mime_type = (magic.from_file(str(path), mime=True) or '').lower()
                except Exception:
                    mime_type = ''
                text_like = 'text' in mime_type or mime_type.endswith(
                    ('/json', '/xml', '+json', '+xml')
                )
                skip_naive_read = bool(mime_type) and not text_like
            if not skip_naive_read:
                text_content = read_text_file(path)

    except Exception:
        # Deliberate best-effort: one unreadable file must not abort the run.
        text_content = ''

    # Guard against an extractor handing back None before normalizing.
    text_content = re.sub(r'\s+', ' ', text_content or '').strip()
    return text_content[:MAX_CHAR_LENGTH]


# Extract text for every resolved path, with a progress bar when tqdm is installed.
# `tqdm` is already bound by the import cell whenever TQDM_AVAILABLE is True, so
# it is not re-imported here.
if TQDM_AVAILABLE:
    tqdm.pandas(desc='Extracting file content')
    catalog_df['content_text'] = catalog_df['resolved_path'].progress_apply(extract_text_from_file)
else:
    catalog_df['content_text'] = catalog_df['resolved_path'].apply(extract_text_from_file)

# Size features derived from the extracted text.
catalog_df['content_char_len'] = catalog_df['content_text'].str.len().fillna(0)
catalog_df['content_word_count'] = catalog_df['content_text'].str.split().map(len).fillna(0)

catalog_df.head()

In [None]:
# Build the export frame: Path objects become plain strings and the existence
# flag becomes 0/1 so the table serializes cleanly.
feature_df = catalog_df.assign(
    resolved_path=catalog_df['resolved_path'].astype(str),
    original_resolved_path=catalog_df['original_resolved_path'].astype(str),
    file_exists=catalog_df['file_exists'].astype(int),
)

# CSV is the primary artifact.
feature_df.to_csv(OUTPUT_DATA_PATH, index=False)
print(f'Saved curated dataset to {OUTPUT_DATA_PATH}')

# Parquet is optional: any failure is reported and the notebook carries on.
try:
    parquet_path = OUTPUT_DIR / 'metadata_features.parquet'
    feature_df.to_parquet(parquet_path, index=False)
except Exception as exc:
    print('Parquet export skipped:', exc)
else:
    print(f'(Optional) Saved Parquet dataset to {parquet_path}')

feature_df.loc[:, ['resolved_path', 'file_exists', 'extension', 'path_depth', 'content_char_len', 'business_capability']].head()

In [None]:
# Class balance of the target label, most frequent first.
feature_df['business_capability'].value_counts(sort=True, ascending=False)