# Feature Engineering
Transform the cleaned dataset into model-ready features and persist train/test splits.


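This notebook expects the output of the Data Preparation step. Data is read and written through the configured backend (pandas, Modin, or Dask), but the dataset is converted to a pandas DataFrame before the deterministic feature transformations are applied, so the same transformation code runs whichever backend is selected.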


In [None]:
# Notebook parameters: execution backend, input/output locations, the label
# column, and the settings for the train/test split.
params = dict(
    engine='pandas',
    modin_engine='ray',
    input_path='data/processed.csv',
    train_output_path='data/train_features.csv',
    test_output_path='data/test_features.csv',
    feature_metadata_path='data/feature_metadata.json',
    target_column='Survived',
    test_size=0.2,
    random_state=42,
)


In [None]:
from pathlib import Path
import json

import pandas as pd
from sklearn.model_selection import train_test_split

from notebookml import BackendManager

# Backend selection; both keys are optional and fall back to pandas / ray.
engine = params.get('engine', 'pandas')
modin_engine = params.get('modin_engine', 'ray')

# The four file locations are required parameters (a missing key raises
# KeyError) and are wrapped in Path objects.
input_path, train_output_path, test_output_path, metadata_path = (
    Path(params[key])
    for key in (
        'input_path',
        'train_output_path',
        'test_output_path',
        'feature_metadata_path',
    )
)

# Label column (required) and split settings (optional, coerced to float/int).
target_column = params['target_column']
test_size = float(params.get('test_size', 0.2))
random_state = int(params.get('random_state', 42))

# Create the parent directory of every output file up front.
for output_file in (train_output_path, test_output_path, metadata_path):
    output_file.parent.mkdir(parents=True, exist_ok=True)

# Read the prepared dataset through the configured backend, then convert it
# with backend.to_pandas so the derivations below operate on `pdf`.
backend = BackendManager(engine=engine, modin_engine=modin_engine)
df = backend.read_csv(str(input_path))
pdf = backend.to_pandas(df)

# Title: the text between the first comma and the following period in Name
# (e.g. "Braund, Mr. Owen" -> "Mr"); names that do not match get 'Unknown'.
title_pattern = r',\s*([^\.]+)\.'
extracted_titles = pdf['Name'].str.extract(title_pattern, expand=False)
pdf['Title'] = extracted_titles.fillna('Unknown')

# IsAlone is 1 when FamilySize equals 1, otherwise 0.
# FarePerPerson divides Fare by FamilySize, with the divisor floored at 1 so
# a FamilySize below 1 cannot cause a division by zero.
family_size = pdf['FamilySize']
pdf['IsAlone'] = family_size.eq(1).astype(int)
pdf['FarePerPerson'] = pdf['Fare'].div(family_size.clip(lower=1))

# Turn the engineered frame `pdf` into an encoded feature matrix.
# Produces: `feature_df` (encoded features plus the target column), `X`
# (features only), `y` (target only) and `categorical_cols` (the columns
# that were one-hot encoded).

# Name, Ticket and Cabin are not used as features; each is dropped only if
# it is present in the input.
drop_columns = ['Name', 'Ticket', 'Cabin']
feature_df = pdf.drop(columns=[col for col in drop_columns if col in pdf.columns])
categorical_cols = [col for col in ['Sex', 'Embarked', 'Title'] if col in feature_df.columns]

# Set the target aside so imputation and encoding never touch it.
target_series = feature_df[target_column]
feature_df = feature_df.drop(columns=[target_column])

# Missing labels in text-valued categorical columns get an explicit
# 'Unknown' level (the label already used for unparseable titles). Filling
# them with 0 would mix an integer into a string column and have it encoded
# as a spurious category. Numeric categorical columns are left to the
# generic zero fill below.
text_categorical_cols = [
    col for col in categorical_cols
    if not pd.api.types.is_numeric_dtype(feature_df[col])
]
if text_categorical_cols:
    feature_df = feature_df.fillna({col: 'Unknown' for col in text_categorical_cols})

# Every remaining missing value is imputed with 0.
feature_df = feature_df.fillna(0)

# One-hot encode; drop_first removes one level per column to avoid a
# redundant indicator.
feature_df = pd.get_dummies(feature_df, columns=categorical_cols, drop_first=True)
feature_df[target_column] = target_series

X = feature_df.drop(columns=[target_column])
y = feature_df[target_column]

# Stratified hold-out split: stratify=y keeps the target's class proportions
# the same in both partitions; random_state makes the split reproducible.
split_options = {
    'test_size': test_size,
    'random_state': random_state,
    'stratify': y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Re-attach the target to copies of each partition so that every output file
# carries both the features and the label.
train_df, test_df = X_train.copy(), X_test.copy()
train_df[target_column], test_df[target_column] = y_train, y_test

# Convert the pandas partitions back into the frame type of the selected
# engine before writing. Any engine other than dask/modin uses the pandas
# frames unchanged.
if engine not in ('dask', 'modin'):
    train_backend, test_backend = train_df, test_df
elif engine == 'dask':
    train_backend = backend.frame_namespace.from_pandas(train_df, npartitions=1)
    test_backend = backend.frame_namespace.from_pandas(test_df, npartitions=1)
else:
    train_backend = backend.frame_namespace.DataFrame(train_df)
    test_backend = backend.frame_namespace.DataFrame(test_df)

# Persist the train split first, then the test split, without the index.
for split_frame, destination in (
    (train_backend, train_output_path),
    (test_backend, test_output_path),
):
    backend.to_csv(split_frame, str(destination), index=False)

# Record how the feature files were produced: the engine, the label column,
# the final feature column order, which columns were one-hot encoded, and
# the split settings.
metadata = dict(
    engine=engine,
    target_column=target_column,
    feature_columns=X_train.columns.tolist(),
    categorical_columns=categorical_cols,
    test_size=test_size,
    random_state=random_state,
)

# Write the metadata as indented JSON next to the feature files.
with metadata_path.open('w') as metadata_file:
    metadata_file.write(json.dumps(metadata, indent=2))

backend.close()

# Last expression of the cell: shows the metadata as the cell output.
metadata
