# 02 - Feature Engineering & Vectorization

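This notebook loads the curated feature table produced by `01_data_ingestion.ipynb`, prepares the feature columns (type coercion, missing-value filling, and skipping text features that contain no content), creates a reproducible stratified train/test split, and saves the unfitted preprocessing pipeline used by downstream models.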

## How to use
1. Ensure `01_data_ingestion.ipynb` has been executed successfully.
2. Adjust the configuration cell if artifact paths differ.
3. Run all cells to write `train_dataset.csv`, `test_dataset.csv`, and `preprocessor.joblib` (the unfitted preprocessing pipeline) to the `artifacts/` directory.

In [None]:
# Optional: install the dependencies this notebook imports.
# !pip install numpy pandas scikit-learn scipy joblib

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import joblib

In [None]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Artifacts live in ./artifacts relative to the current working directory.
NOTEBOOK_DIR = Path.cwd()
ARTIFACT_DIR = NOTEBOOK_DIR.joinpath("artifacts")
ARTIFACT_DIR.mkdir(exist_ok=True)

# Input table read by this notebook, followed by the files it writes.
METADATA_FEATURES_PATH = ARTIFACT_DIR.joinpath("metadata_features.csv")
TRAIN_DATA_PATH = ARTIFACT_DIR.joinpath("train_dataset.csv")
TEST_DATA_PATH = ARTIFACT_DIR.joinpath("test_dataset.csv")
PREPROCESSOR_PATH = ARTIFACT_DIR.joinpath("preprocessor.joblib")

# Split settings: fixed seed and a 20% hold-out fraction.
RANDOM_STATE = 42
TEST_SIZE = 0.2

# Label column, then the feature columns grouped by how they are preprocessed.
TARGET_COLUMN = "business_capability"

TEXT_FEATURES = ["original_path_keywords"]
CATEGORICAL_FEATURES = ["extension", "extension_family"]
NUMERIC_FEATURES = ["original_path_depth", "file_size_bytes", "content_word_count"]
BINARY_FEATURES: list[str] = []

FEATURE_COLUMNS = [
    *TEXT_FEATURES,
    *CATEGORICAL_FEATURES,
    *NUMERIC_FEATURES,
    *BINARY_FEATURES,
]

# Fail early when the ingestion notebook has not produced its output yet.
if not METADATA_FEATURES_PATH.exists():
    raise FileNotFoundError("metadata_features.csv not found. Run 01_data_ingestion.ipynb first.")


In [None]:
# Every configured feature column plus the target must exist in the loaded table.
REQUIRED_COLUMNS = [*FEATURE_COLUMNS, TARGET_COLUMN]
# Encodings are tried in order. "latin-1" maps every possible byte to a
# character and therefore never raises UnicodeDecodeError, so it has to be the
# last resort: listed ahead of "cp1252" it made the cp1252 attempt unreachable.
ENCODING_CANDIDATES = ("utf-8", "utf-8-sig", "cp1252", "latin-1")


def load_metadata_features(path: Path) -> pd.DataFrame:
    """Read the metadata feature CSV, tolerating encoding and parsing problems.

    Each encoding in ``ENCODING_CANDIDATES`` is tried in turn. When the default
    parser reports malformed rows, the file is re-read with the python engine
    and the offending lines are skipped (a message is printed when that
    happens, since rows are dropped).

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed table.

    Raises:
        RuntimeError: If no candidate encoding yields a parseable table; the
            last underlying error is attached as the cause.
    """
    last_error = None
    for encoding in ENCODING_CANDIDATES:
        try:
            df = pd.read_csv(path, encoding=encoding)
        except UnicodeDecodeError as unicode_error:
            # Wrong encoding: move on to the next candidate.
            last_error = unicode_error
            continue
        except pd.errors.ParserError as parser_error:
            # Decoding worked but the rows are malformed: retry leniently below.
            last_error = parser_error
        else:
            if encoding != "utf-8":
                print(f"Loaded {path.name} using fallback encoding '{encoding}'.")
            return df

        try:
            df = pd.read_csv(
                path,
                encoding=encoding,
                engine="python",
                on_bad_lines="skip",
            )
        except Exception as inner_error:
            # Deliberately broad: whatever the lenient parser raises, the next
            # encoding is still worth a try. The error is kept for reporting.
            last_error = inner_error
            continue
        print(
            f"Loaded {path.name} with python engine using encoding '{encoding}' after handling malformed lines."
        )
        return df

    message = f"Failed to decode or parse {path} with encodings {ENCODING_CANDIDATES}."
    raise RuntimeError(message) from last_error


# Load the curated feature table and make sure every configured column exists.
df = load_metadata_features(METADATA_FEATURES_PATH)

missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(f"Missing required columns in metadata dataset: {missing_columns}")

# From here on every configured column is known to exist, so no per-column
# presence checks are needed.

# String-like columns (text, categorical, target): missing values become "".
# The target is addressed through TARGET_COLUMN so that changing the
# configuration cannot leave the real target column un-normalised.
# NOTE(review): rows with a missing target are kept and end up with the label
# "" as their own class - confirm this is intended.
for column in TEXT_FEATURES + CATEGORICAL_FEATURES + [TARGET_COLUMN]:
    df[column] = df[column].fillna("").astype(str)

# Numeric columns: unparseable or missing values become 0.
for column in NUMERIC_FEATURES:
    df[column] = pd.to_numeric(df[column], errors="coerce").fillna(0)

# Binary columns: same coercion, then stored as integers.
for column in BINARY_FEATURES:
    df[column] = pd.to_numeric(df[column], errors="coerce").fillna(0).astype(int)

# Count rows with non-blank content per text feature; features without any
# content are excluded from vectorization.
text_non_empty_counts = {
    column: int((df[column].str.strip() != "").sum()) for column in TEXT_FEATURES
}
ACTIVE_TEXT_FEATURES = [
    column for column, count in text_non_empty_counts.items() if count > 0
]

print("Text feature non-empty counts:", text_non_empty_counts)

inactive_text_features = [col for col, count in text_non_empty_counts.items() if count == 0]
if inactive_text_features:
    print("Skipping empty text features:", inactive_text_features)
else:
    print("All text features contain content.")

# Rebuild the feature list so it only contains text features that have content.
FEATURE_COLUMNS = ACTIVE_TEXT_FEATURES + CATEGORICAL_FEATURES + NUMERIC_FEATURES + BINARY_FEATURES

if not ACTIVE_TEXT_FEATURES:
    print("Warning: no text features contain content; TF-IDF vectorizers will be skipped.")

df[FEATURE_COLUMNS].head()

In [None]:
# Hold out TEST_SIZE of the rows, stratifying on the target column and using a
# fixed seed so the split can be reproduced.
target_labels = df[TARGET_COLUMN]
train_df, test_df = train_test_split(
    df,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=target_labels,
)

# Persist both splits (without the index) for the downstream notebooks.
for split_frame, destination in ((train_df, TRAIN_DATA_PATH), (test_df, TEST_DATA_PATH)):
    split_frame.to_csv(destination, index=False)

# Report split sizes and the relative class frequencies of each split.
train_distribution = train_df[TARGET_COLUMN].value_counts(normalize=True)
test_distribution = test_df[TARGET_COLUMN].value_counts(normalize=True)

print(f"Train rows: {len(train_df)}")
print(f"Test rows: {len(test_df)}")
print("Class distribution (train):")
print(train_distribution)
print("Class distribution (test):")
print(test_distribution)

In [None]:
def _make_tfidf(max_features, ngram_range):
    """Return an unfitted TF-IDF vectorizer with the notebook's shared settings."""
    return TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        min_df=1,
        strip_accents="unicode",
    )


# One vectorizer per known text feature; only vocabulary size and n-gram range differ.
content_vectorizer = _make_tfidf(35000, (1, 2))
path_vectorizer = _make_tfidf(20000, (1, 3))
original_path_vectorizer = _make_tfidf(20000, (1, 3))
file_name_vectorizer = _make_tfidf(15000, (1, 2))
notes_vectorizer = _make_tfidf(15000, (1, 2))

# Text column -> (transformer name, vectorizer). The insertion order fixes the
# order of the text transformers in the ColumnTransformer.
text_vectorizers = {
    "content_text": ("content_tfidf", content_vectorizer),
    "path_keywords": ("path_tfidf", path_vectorizer),
    "original_path_keywords": ("original_path_tfidf", original_path_vectorizer),
    "file_name_keywords": ("file_name_tfidf", file_name_vectorizer),
    "notes_text": ("notes_tfidf", notes_vectorizer),
}

# Each text transformer receives a single column name (not a list) so the
# vectorizer is handed a one-dimensional series of documents.
transformers = [
    (name, vectorizer, column)
    for column, (name, vectorizer) in text_vectorizers.items()
    if column in ACTIVE_TEXT_FEATURES
]

# An active text feature without a dedicated entry used to be dropped silently
# by remainder="drop"; give it a default vectorizer instead so that listing a
# column in TEXT_FEATURES is enough for it to be used.
for column in ACTIVE_TEXT_FEATURES:
    if column not in text_vectorizers:
        print(f"No dedicated TF-IDF settings for text feature '{column}'; using defaults.")
        transformers.append((f"text_{column}_tfidf", _make_tfidf(15000, (1, 2)), column))

if CATEGORICAL_FEATURES:
    # Categories unseen during fitting are ignored at transform time rather than raising.
    transformers.append(("categorical", OneHotEncoder(handle_unknown="ignore"), CATEGORICAL_FEATURES))

numeric_like_features = NUMERIC_FEATURES + BINARY_FEATURES
if numeric_like_features:
    # with_mean=False: values are scaled but not centred.
    transformers.append(("numeric", StandardScaler(with_mean=False), numeric_like_features))

if not transformers:
    raise ValueError("No transformers configured. Verify feature availability.")

# Columns not claimed by any transformer are discarded.
preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder="drop",
    sparse_threshold=0.1,
)

In [None]:
# The transformer is written to disk without having been fitted in this
# notebook; only its configuration is persisted here.
joblib.dump(value=preprocessor, filename=PREPROCESSOR_PATH)
print(f"Saved preprocessing configuration to {PREPROCESSOR_PATH}")

In [None]:
from sklearn.base import clone

# Fit a throwaway copy on the training split to confirm the configuration
# works end to end; the `preprocessor` object itself is left unfitted.
sample_inputs = train_df[FEATURE_COLUMNS]
sample_labels = train_df[TARGET_COLUMN]

preprocessor_check = clone(preprocessor)
feature_matrix_sample = preprocessor_check.fit_transform(sample_inputs, sample_labels)
print("Sample feature matrix shape (train split):", feature_matrix_sample.shape)