# 02 - Feature Engineering & Vectorization

This notebook loads the curated feature table, normalizes column types and missing values, creates a consistent stratified train/test split, and persists the (not yet fitted) preprocessing pipeline definition used by downstream models.

## How to use
1. Ensure `01_data_ingestion.ipynb` has been executed successfully.
2. Adjust the configuration cell if artifact paths differ.
3. Run the notebook to create train/test splits and persist `preprocessor.joblib`.

In [None]:
# Optional: install the dependencies required for this notebook into the kernel's environment.
# %pip install scikit-learn scipy joblib

In [None]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# All artifacts are resolved relative to the kernel's current working
# directory, so the notebook must be launched from the folder that holds
# the `artifacts/` directory (or the paths below must be adjusted).
NOTEBOOK_DIR = Path.cwd()
ARTIFACT_DIR = NOTEBOOK_DIR / "artifacts"
# parents=True keeps this working when ARTIFACT_DIR is changed to a nested path.
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# Input read by this notebook.
METADATA_FEATURES_PATH = ARTIFACT_DIR / "metadata_features.csv"
# Outputs written by this notebook.
TRAIN_DATA_PATH = ARTIFACT_DIR / "train_dataset.csv"
TEST_DATA_PATH = ARTIFACT_DIR / "test_dataset.csv"
PREPROCESSOR_PATH = ARTIFACT_DIR / "preprocessor.joblib"

# Label column; also used to stratify the train/test split.
TARGET_COLUMN = "business_capability"

# Free-text columns; each one gets its own TF-IDF vectorizer.
TEXT_FEATURES = [
    "content_text",
    "path_keywords",
    "original_path_keywords",
    "file_name_keywords",
    "notes_text",
]

# Columns that are one-hot encoded.
CATEGORICAL_FEATURES = [
    "extension",
    "extension_family",
    "record_type",
    "retention_code",
]

# Columns coerced to numbers and scaled.
NUMERIC_FEATURES = [
    "path_depth",
    "original_path_depth",
    "path_token_count",
    "original_path_token_count",
    "path_char_len",
    "original_path_char_len",
    "file_name_char_len",
    "file_name_digit_count",
    "file_size_bytes",
    "modified_time_epoch",
    "content_char_len",
    "content_word_count",
    "notes_word_count",
]

# Flag columns coerced to integers; scaled together with NUMERIC_FEATURES.
BINARY_FEATURES = [
    "file_exists",
    "has_content",
]

FEATURE_COLUMNS = TEXT_FEATURES + CATEGORICAL_FEATURES + NUMERIC_FEATURES + BINARY_FEATURES
# A column listed in two groups would be selected (and transformed) twice.
assert len(set(FEATURE_COLUMNS)) == len(FEATURE_COLUMNS), "FEATURE_COLUMNS contains duplicate column names"

RANDOM_STATE = 42
TEST_SIZE = 0.2

if not METADATA_FEATURES_PATH.exists():
    # Show the full path: the usual cause is a kernel started in the wrong directory.
    raise FileNotFoundError(
        f"{METADATA_FEATURES_PATH} not found. Run 01_data_ingestion.ipynb first."
    )

In [None]:
# Load the feature table written by the ingestion notebook.
df = pd.read_csv(METADATA_FEATURES_PATH)

# Fail fast with a readable message instead of silently skipping absent
# columns and hitting an opaque KeyError later (preview below / split cell).
missing_columns = [
    column for column in FEATURE_COLUMNS + [TARGET_COLUMN] if column not in df.columns
]
if missing_columns:
    raise KeyError(f"metadata_features.csv is missing required columns: {missing_columns}")

# Text, categorical and target columns: missing values become "" and
# everything is cast to str.
# NOTE(review): a missing target therefore becomes the label "" and is kept as
# its own class in the stratified split — confirm this is intended.
for column in TEXT_FEATURES + CATEGORICAL_FEATURES + [TARGET_COLUMN]:
    df[column] = df[column].fillna("").astype(str)

# Numeric columns: unparseable values are coerced to NaN, then NaN -> 0.
for column in NUMERIC_FEATURES:
    df[column] = pd.to_numeric(df[column], errors="coerce").fillna(0)

# Binary flags: same coercion as numeric columns, then cast to int.
for column in BINARY_FEATURES:
    df[column] = pd.to_numeric(df[column], errors="coerce").fillna(0).astype(int)

df[FEATURE_COLUMNS].head()

In [None]:
# Stratified hold-out split: the target column drives the stratification and
# RANDOM_STATE makes the partition repeatable.
split_frames = train_test_split(
    df,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=df[TARGET_COLUMN],
)
train_df, test_df = split_frames

# Persist both partitions (without the index) for the downstream notebooks.
for split_frame, destination in ((train_df, TRAIN_DATA_PATH), (test_df, TEST_DATA_PATH)):
    split_frame.to_csv(destination, index=False)

print(f"Train rows: {len(train_df)}")
print(f"Test rows: {len(test_df)}")

# Relative class frequencies of each partition, for a visual check.
for heading, split_frame in (
    ("Class distribution (train):", train_df),
    ("Class distribution (test):", test_df),
):
    print(heading)
    print(split_frame[TARGET_COLUMN].value_counts(normalize=True))

In [None]:
def _build_tfidf(max_features, ngram_range, min_df=1):
    """Return an unfitted TF-IDF vectorizer using the settings shared by all text columns.

    Args:
        max_features: Upper bound on the vocabulary size.
        ngram_range: (min_n, max_n) word n-gram sizes to extract.
        min_df: Minimum document frequency for a term to be kept.

    Returns:
        A new `TfidfVectorizer` with `strip_accents="unicode"`.
    """
    return TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        min_df=min_df,
        strip_accents="unicode",
    )


# One vectorizer per text column so each column learns its own vocabulary.
# Only the content vectorizer raises min_df above 1.
content_vectorizer = _build_tfidf(35000, (1, 2), min_df=2)
path_vectorizer = _build_tfidf(20000, (1, 3))
original_path_vectorizer = _build_tfidf(20000, (1, 3))
file_name_vectorizer = _build_tfidf(15000, (1, 2))
notes_vectorizer = _build_tfidf(15000, (1, 2))

# (name, transformer, column selection) triples; their order fixes the column
# layout of the resulting feature matrix, so it must not be rearranged.
column_transformers = [
    ("content_tfidf", content_vectorizer, "content_text"),
    ("path_tfidf", path_vectorizer, "path_keywords"),
    ("original_path_tfidf", original_path_vectorizer, "original_path_keywords"),
    ("file_name_tfidf", file_name_vectorizer, "file_name_keywords"),
    ("notes_tfidf", notes_vectorizer, "notes_text"),
    # Categories unseen during fitting are ignored rather than raising.
    ("categorical", OneHotEncoder(handle_unknown="ignore"), CATEGORICAL_FEATURES),
    # with_mean=False: scale without centering.
    ("numeric", StandardScaler(with_mean=False), NUMERIC_FEATURES + BINARY_FEATURES),
]

# Columns not listed above are dropped from the output.
preprocessor = ColumnTransformer(
    transformers=column_transformers,
    remainder="drop",
    sparse_threshold=0.1,
)

In [None]:
# Persist the preprocessor definition. Nothing has fitted `preprocessor` at
# this point, so the saved object is the unfitted configuration.
joblib.dump(value=preprocessor, filename=PREPROCESSOR_PATH)
print(f"Saved preprocessing configuration to {PREPROCESSOR_PATH}")

In [None]:
# Smoke test: fit a throwaway clone on the training split so that
# `preprocessor` itself stays unfitted, then report the matrix size.
# (`clone` is imported in the notebook's import cell.)
preprocessor_check = clone(preprocessor)
feature_matrix_sample = preprocessor_check.fit_transform(train_df[FEATURE_COLUMNS], train_df[TARGET_COLUMN])

# The transformers only add/replace columns; the row count must be preserved.
assert feature_matrix_sample.shape[0] == len(train_df), "preprocessor changed the number of rows"
print("Sample feature matrix shape (train split):", feature_matrix_sample.shape)