# 02 - Feature Engineering & Vectorization

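This notebook loads the curated feature table produced by `01_data_ingestion.ipynb`, fills missing values in its text and numeric feature columns, creates a reproducible stratified train/test split, and defines and persists the preprocessing pipeline (TF-IDF, one-hot encoding, and scaling) used by downstream models. The pipeline is saved as an unfitted definition; it is only fitted on a clone in the final sanity-check cell.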

## How to use
1. Ensure `01_data_ingestion.ipynb` has been executed successfully.
2. Update the configuration cell if the artifact paths differ.
3. Run all cells from top to bottom to write `train_dataset.csv`, `test_dataset.csv`, and `preprocessor.joblib` to the `artifacts/` directory.

In [None]:
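# Optional: install the dependencies this notebook imports.
# Use %pip (not !pip) so the packages are installed into the running kernel's environment.
# %pip install numpy pandas scikit-learn scipy joblib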

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import joblib

In [None]:
# ---------------------------------------------------------------------------
# Notebook settings: split parameters, dataset schema, artifact locations
# ---------------------------------------------------------------------------

# Split parameters passed to train_test_split.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Dataset schema: the label column and the columns fed to the preprocessor.
TARGET_COLUMN = 'business_capability'
FEATURE_COLUMNS = [
    # free-text and categorical inputs
    'content_text', 'path_keywords', 'extension',
    # numeric inputs
    'path_depth', 'original_path_depth', 'path_token_count',
    'content_char_len', 'content_word_count', 'file_exists',
]

# Every artifact lives in an `artifacts` folder under the current working directory.
NOTEBOOK_DIR = Path.cwd()
ARTIFACT_DIR = NOTEBOOK_DIR.joinpath('artifacts')
ARTIFACT_DIR.mkdir(exist_ok=True)

# Input table, followed by the three outputs this notebook writes.
METADATA_FEATURES_PATH = ARTIFACT_DIR.joinpath('metadata_features.csv')
TRAIN_DATA_PATH = ARTIFACT_DIR.joinpath('train_dataset.csv')
TEST_DATA_PATH = ARTIFACT_DIR.joinpath('test_dataset.csv')
PREPROCESSOR_PATH = ARTIFACT_DIR.joinpath('preprocessor.joblib')

# Stop immediately when the input table is absent rather than failing later in read_csv.
if not METADATA_FEATURES_PATH.exists():
    raise FileNotFoundError('metadata_features.csv not found. Run 01_data_ingestion.ipynb first.')


In [None]:
# Load the feature table written by the ingestion notebook.
df = pd.read_csv(METADATA_FEATURES_PATH)

# Validate the schema up front. Without this, a missing feature column is silently
# skipped by the cleaning loops below and only surfaces later as an opaque KeyError.
if TARGET_COLUMN not in df.columns:
    raise ValueError(f'Missing target column `{TARGET_COLUMN}` in metadata dataset.')

missing_feature_columns = [column for column in FEATURE_COLUMNS if column not in df.columns]
if missing_feature_columns:
    raise ValueError(f'Missing feature columns in metadata dataset: {missing_feature_columns}')

# Text-like columns: replace missing values with an empty string and force str dtype.
# The membership guard is kept so a customized FEATURE_COLUMNS that omits one of
# these columns does not break this cell.
for column in ['content_text', 'path_keywords', 'extension']:
    if column in df:
        df[column] = df[column].fillna('').astype(str)

# Numeric columns: replace missing values with 0.
numeric_columns = ['path_depth', 'original_path_depth', 'path_token_count', 'content_char_len', 'content_word_count', 'file_exists']
for column in numeric_columns:
    if column in df:
        df[column] = df[column].fillna(0)

# Preview the model inputs.
df[FEATURE_COLUMNS].head()

In [None]:
# Stratified split: class proportions of the target are preserved in both partitions,
# and the fixed random_state makes the partition repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=df[TARGET_COLUMN],
)

# Persist both partitions without the pandas index.
for split_frame, split_path in ((train_df, TRAIN_DATA_PATH), (test_df, TEST_DATA_PATH)):
    split_frame.to_csv(split_path, index=False)

# Report partition sizes, then the normalized class frequencies of each partition.
print(f'Train rows: {len(train_df)}')
print(f'Test rows: {len(test_df)}')
for heading, split_frame in (
    ('Class distribution (train):', train_df),
    ('Class distribution (test):', test_df),
):
    print(heading)
    print(split_frame[TARGET_COLUMN].value_counts(normalize=True))

In [None]:
# Column groups routed to the one-hot encoder and the scaler.
categorical_features = ['extension']
numeric_features = [
    'path_depth',
    'original_path_depth',
    'path_token_count',
    'content_char_len',
    'content_word_count',
    'file_exists',
]

# TF-IDF over file content: unigrams and bigrams, terms must occur in at least 2 documents.
text_vectorizer_content = TfidfVectorizer(
    strip_accents='unicode',
    ngram_range=(1, 2),
    min_df=2,
    max_features=25000,
)

# TF-IDF over path keywords: up to trigrams, no minimum document frequency.
text_vectorizer_path = TfidfVectorizer(
    strip_accents='unicode',
    ngram_range=(1, 3),
    min_df=1,
    max_features=15000,
)

# (name, transformer, columns) triples. The text columns are given as bare strings,
# so each vectorizer receives a single column rather than a one-column frame.
column_transformers = [
    ('content_tfidf', text_vectorizer_content, 'content_text'),
    ('path_tfidf', text_vectorizer_path, 'path_keywords'),
    ('extension_ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    # with_mean=False: scale only, no centering.
    ('numeric', StandardScaler(with_mean=False), numeric_features),
]

# Columns not listed above are dropped from the output.
preprocessor = ColumnTransformer(
    transformers=column_transformers,
    remainder='drop',
    sparse_threshold=0.3,
)

In [None]:
# Persist the preprocessor definition. Nothing has been fitted on it at this point,
# so the saved object carries configuration only and must be fitted by its consumer.
joblib.dump(value=preprocessor, filename=PREPROCESSOR_PATH)
print(f'Saved preprocessing configuration to {PREPROCESSOR_PATH}')

In [None]:
# Optional sanity check: fit a clone of the preprocessor on the training split to confirm
# it runs end-to-end. Fitting a clone leaves `preprocessor` itself unfitted.
# `clone` is already imported in the imports cell at the top of the notebook.
preprocessor_check = clone(preprocessor)
feature_matrix_sample = preprocessor_check.fit_transform(train_df[FEATURE_COLUMNS], train_df[TARGET_COLUMN])

# Each training row must produce exactly one row of features.
assert feature_matrix_sample.shape[0] == len(train_df), (
    'Preprocessor output row count does not match the training split.'
)
print('Sample feature matrix shape (train split):', feature_matrix_sample.shape)