# Data Preparation
Clean and stage the Titanic dataset for downstream modeling.


This notebook is parameterised by Papermill and supports pandas, Modin on Ray, and Dask dataframe engines through the shared [`BackendManager`](../notebookml/backends.py).
The default dataset is the classic Titanic passenger data from the Kaggle competition, loaded from the public GitHub-hosted CSV given by the `dataset_url` parameter.


In [None]:
# Run parameters for this notebook (the notebook is parameterised by Papermill).
#   engine       - dataframe engine handed to BackendManager; the save step
#                  special-cases 'dask' and 'modin', anything else is treated
#                  as a plain pandas frame.
#   modin_engine - forwarded to BackendManager alongside `engine`.
#   dataset_url  - location passed to backend.read_csv().
#   output_path  - destination of the cleaned CSV; parent directory is created.
#   summary_path - destination of the JSON run summary; parent directory is created.
params = {
    'engine': 'pandas',
    'modin_engine': 'ray',
    'dataset_url': 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv',
    'output_path': 'data/processed.csv',
    'summary_path': 'data/data_prep_summary.json'
}


In [None]:
from pathlib import Path
import json

from notebookml import BackendManager

# Resolve the run parameters. 'engine' and 'modin_engine' fall back to
# defaults; the remaining three keys are required and raise KeyError if absent.
engine = params.get('engine', 'pandas')
modin_engine = params.get('modin_engine', 'ray')
dataset_url = params['dataset_url']
output_path, summary_path = (
    Path(params[key]) for key in ('output_path', 'summary_path')
)

# Create both destination directories up front so the writes at the end of
# the notebook cannot fail on a missing folder.
for destination in (output_path, summary_path):
    destination.parent.mkdir(parents=True, exist_ok=True)

# Read the CSV through the selected backend, then convert it with
# backend.to_pandas() so the cleaning steps below operate on `pdf`.
backend = BackendManager(engine=engine, modin_engine=modin_engine)
df = backend.read_csv(dataset_url)
pdf = backend.to_pandas(df)

# ------------------------------------------------------------------
# Basic cleaning
pdf = pdf.drop_duplicates()

# Capture the cabin indicator BEFORE any imputation. 'Cabin' is an object
# column and is therefore mode-filled by the categorical step below; deriving
# the flag afterwards would mark every row as having a known cabin.
cabin_known = pdf['Cabin'].notnull().astype(int)

# Numeric features: fill gaps with the column median. The 'Survived' column
# is excluded so it is never imputed.
numeric_cols = [col for col in pdf.select_dtypes(include=['number']).columns if col != 'Survived']
if numeric_cols:
    medians = pdf[numeric_cols].median()
    pdf[numeric_cols] = pdf[numeric_cols].fillna(medians)

# Categorical features: fill gaps with the most frequent value. 'Ticket' and
# 'Name' are skipped. NOTE(review): 'Cabin' is still imputed here, as in the
# original pipeline - confirm that downstream consumers actually want that.
categorical_cols = [
    col
    for col in pdf.select_dtypes(include=['object', 'category']).columns
    if col not in {'Ticket', 'Name'}
]
if categorical_cols:
    mode_rows = pdf[categorical_cols].mode(dropna=True)
    # mode() yields an empty frame when there are no rows or every value is
    # missing; in that case there is nothing to fill with, so skip the step
    # instead of failing on .iloc[0].
    if not mode_rows.empty:
        modes = mode_rows.iloc[0]
        pdf[categorical_cols] = pdf[categorical_cols].fillna(modes)

# Engineered features.
pdf['CabinKnown'] = cabin_known
pdf['FamilySize'] = pdf['SibSp'] + pdf['Parch'] + 1

# Hand the cleaned pandas frame back to the engine that was requested, so the
# CSV is written through that engine's own frame type.
if engine == 'modin':
    df_clean = backend.frame_namespace.DataFrame(pdf)
elif engine == 'dask':
    df_clean = backend.frame_namespace.from_pandas(pdf, npartitions=1)
else:
    df_clean = pdf

backend.to_csv(df_clean, str(output_path), index=False)

# Small run summary: which engine ran, how many rows survived cleaning, and
# the final column list.
summary = {
    'engine': engine,
    'rows': int(pdf.shape[0]),
    'columns': pdf.columns.tolist(),
}

with summary_path.open('w') as fp:
    fp.write(json.dumps(summary, indent=2))

backend.close()
# Bare expression: displayed as the cell's output when run as a notebook.
summary
