In [18]:
import sys
from pathlib import Path

# Project root = parent of the directory the notebook is running in.
# `.parent` is used instead of `.parents[0]` so this does not raise at the
# filesystem root.
proj_root = Path().resolve().parent

# Guard the append: re-running this cell must not add the same entry again
# (unconditional appends leave duplicate project-root lines in sys.path).
if str(proj_root) not in sys.path:
    sys.path.append(str(proj_root))
print(*sys.path, sep="\n")

/usr/lib/python312.zip
/usr/lib/python3.12
/usr/lib/python3.12/lib-dynload

/home/ryanwtsai/repos/ml_ai_portfolio/notes/example_code/venv/lib/python3.12/site-packages
/home/ryanwtsai/repos/ml_ai_portfolio/titanic
/home/ryanwtsai/repos/ml_ai_portfolio/titanic


In [78]:
from importlib import import_module, reload
import src

# `import src` by itself does not guarantee that the submodules are bound as
# attributes of the package, so `src.utils` can fail on a fresh kernel.
# Resolve each module by its dotted name first, then reload it so edits made
# on disk are picked up. Order is kept: utils, transformers, model selection.
for _module_name in (
    "src.utils",
    "src.custom_transformers",
    "src.custom_model_selection",
):
    _reloaded_module = reload(import_module(_module_name))

# Show the last reloaded module as the cell output.
_reloaded_module

<module 'src.custom_model_selection' from '/home/ryanwtsai/repos/ml_ai_portfolio/titanic/src/custom_model_selection.py'>

In [61]:
from src.utils import data_paths, find_project_root, load_titanic_data

# Show which directory the helper resolves as the project root.
print(find_project_root())

# Load the dataset with the loader's default arguments; later cells index the
# result by "train" and "test".
data = load_titanic_data()

/home/ryanwtsai/repos/ml_ai_portfolio/titanic


In [64]:
from sklearn.pipeline import Pipeline
from src.custom_transformers import FamilySizeExtractor, TitleExtractor, DeckExtractor, SexPclassAgeExtractor

def feature_extraction():
    """Run the feature extractors over both splits and save the results as CSV.

    Loads the dataset with ``load_titanic_data()``, passes the "train" and
    "test" frames through FamilySizeExtractor, TitleExtractor, DeckExtractor
    and SexPclassAgeExtractor (in that order), and writes each result, without
    the index column, to ``data_paths["train_extracted"]`` and
    ``data_paths["test_extracted"]`` under ``find_project_root()``.

    Returns:
        None. The only effect is the two CSV files written to disk.
    """
    pipe = Pipeline([
        ("fam", FamilySizeExtractor()),
        ("title", TitleExtractor()),
        ("deck", DeckExtractor()),
        ("sexpclassage", SexPclassAgeExtractor()),
    ])

    data = load_titanic_data()

    # Fit on the training split before transforming anything: calling
    # `transform` on a never-fitted Pipeline is deprecated in scikit-learn.
    # The test split is only transformed, so nothing is learned from it.
    # NOTE(review): assumes every extractor implements fit(X, y=None) - confirm.
    extracted = {
        "train": pipe.fit_transform(data["train"]),
        "test": pipe.transform(data["test"]),
    }

    root_dir = Path(find_project_root())
    for split_name, frame in extracted.items():
        frame.to_csv(root_dir / data_paths[f"{split_name}_extracted"], index=False)


feature_extraction()

In [69]:
# NOTE(review): this import rebinds the name `feature_extraction`, so from here
# on the notebook-local function of the same name defined in an earlier cell is
# shadowed by the version from src.custom_transformers.
from src.custom_transformers import feature_extraction

# Calls the imported version; an earlier cell already invokes a function with
# this name, so feature extraction is run a second time here.
feature_extraction()

In [80]:
from src.custom_model_selection import make_stratified_k_fold_with_custom_strata, custom_cross_validate
from src.utils import load_titanic_data

# Reload the dataset, this time asking the loader for the extracted variant.
data = load_titanic_data(extracted=True)

# Build the cross-validation iterator from the training frame, passing the
# "Title" and "Survived" columns as the custom strata.
cv_iter = make_stratified_k_fold_with_custom_strata(
    data["train"],
    ["Title", "Survived"],
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from src.custom_transformers import DynamicDataPrepPipeline

# Two-step estimator: data preparation (with title extraction enabled)
# followed by a small random forest with a fixed seed.
ml_pipe = Pipeline(
    steps=[
        ("data_prep", DynamicDataPrepPipeline(extract_title=True)),
        ("model", RandomForestClassifier(n_estimators=20, random_state=0)),
    ]
)

# Cross-validate on the training frame: every column except "Survived" is the
# feature input, "Survived" is the target, and the folds come from `cv_iter`.
df_results, metrics = custom_cross_validate(
    ml_pipe,
    data["train"].drop(columns="Survived"),
    data["train"]["Survived"],
    cv=cv_iter,
)
metrics



In [104]:
import yaml

config_path = proj_root / "config"

# glob() yields files in arbitrary order, so sort to make the chosen config
# deterministic, and fail with a clear message when the directory has none.
yaml_files = sorted(config_path.glob("*.yaml"))
if not yaml_files:
    raise FileNotFoundError(f"No *.yaml config file found in {config_path}")

# Bound to `config` rather than `data`, so the dataset dict that other cells
# index as data["train"] is not overwritten by the parsed YAML.
with open(yaml_files[0], "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

config

{'experiment_name': 'rf_gridsearch_001',
 'task_type': 'grid_search',
 'pipeline_kwargs': {'use_title': True, 'use_fare': True, 'model': 'rf'},
 'param_grid': {'clf__n_estimators': [100, 200], 'clf__max_depth': [3, 5, 10]},
 'cv': {'n_splits': 5, 'shuffle': True, 'random_state': 42},
 'scoring': 'accuracy'}

In [111]:
from src.custom_transformers import DynamicDataPrepPipeline
from sklearn.ensemble import RandomForestRegressor

# Configure the data-prep pipeline: all four extractors switched on, "Sex" as
# the ordinal column, three numeric columns, and a seeded random-forest
# regressor passed as the age imputer model.
data_prep_pipe = DynamicDataPrepPipeline(
    extract_fam=True,
    extract_title=True,
    extract_deck=True,
    extract_sexpclassage=True,
    ordinal_columns={"Sex"},
    numeric_columns={"Age", "Pclass", "Fare"},
    age_imputer_model=RandomForestRegressor(max_depth=10, random_state=0),
    impute_age_kwargs={"add_indicator": True},
)

# Shallow parameters only (deep=False): nested estimators are not expanded.
params = data_prep_pipe.get_params(deep=False)

In [112]:
# Display the shallow parameter dict captured from data_prep_pipe.get_params(deep=False).
params

{'age_imputer_model': RandomForestRegressor(max_depth=10, random_state=0),
 'deck_kwargs': None,
 'extract_deck': True,
 'extract_fam': True,
 'extract_sexpclassage': True,
 'extract_title': True,
 'fam_kwargs': None,
 'impute_age_kwargs': {'add_indicator': True},
 'numeric_columns': {'Age', 'Fare', 'Pclass'},
 'numeric_transformations': None,
 'onehot_columns': None,
 'onehot_transformations': None,
 'ordinal_columns': {'Sex'},
 'ordinal_transformations': None,
 'sexpclassage_kwargs': None,
 'title_kwargs': None}

In [113]:
# Rebuild a pipeline from the shallow parameter dict, checking that the values
# returned by get_params(deep=False) are accepted back by the constructor.
DynamicDataPrepPipeline(**params)