# Split digits and labels into separate training and testing data sets

### Determine run parameters

In [1]:
# ----------------- Parameters for interactive development --------------
# Default parameters used when this notebook is run interactively.
P = {}
P["pipeline.data_lake_root"] = "/pipeline-outputs/data-lake"
P["task.train_test_ratio"] = 0.7

In [2]:
# - During automated runs, parameters are injected into the cell below -

In [3]:
# Parameters
# Rebinds P to the parameter set of this run. Keys are added group by group,
# in the same order as they were listed originally.
P = {}

# Pipeline-level settings and identifiers for this run
P.update(
    {
        "pipeline.data_lake_root": "/pipeline-outputs/data-lake",
        "pipeline.run_environment": "ci",
        "pipeline.pipeline_run_id": "a1ba953f-9537-457d-9a41-ae0713378d2d",
    }
)

# Values under the pipeline.github.* prefix
P.update(
    {
        "pipeline.github.repository": "pynb-dag-runner/mnist-digits-demo-pipeline",
        "pipeline.github.workflow": "Run automated tests, pipeline and deploy results to static reporting site",
        "pipeline.github.runner_name": "Hosted Agent",
        "pipeline.github.run_id": "2614272362",
        "pipeline.github.actor": "matiasdahl",
        "pipeline.github.job": "run-tests-pipeline-and-persist-pipeline-outputs",
        "pipeline.github.base_ref": "",
        "pipeline.github.head_ref": "",
        "pipeline.github.sha": "f934c58f68028b7bb026059d4e87e6d32a1bbd20",
        "pipeline.github.ref": "refs/heads/development",
        "pipeline.github.ref_type": "branch",
        "pipeline.github.ref_name": "development",
        "pipeline.github.event_name": "schedule",
    }
)

# Task-level settings and retry counter.
# NOTE: max_nr_retries, retry_nr and timeout_s are strings here ("1", "0",
# "None"), whereas train_test_ratio and num_cpus are numeric.
P.update(
    {
        "task.train_test_ratio": 0.7,
        "task.notebook": "notebooks/split-train-test.py",
        "task.max_nr_retries": "1",
        "run.retry_nr": "0",
        "task.timeout_s": "None",
        "task.num_cpus": 1,
    }
)

# Trace context passed along with the parameters
P["_opentelemetry_traceparent"] = "00-9a9aa49fd83112cc34c27f1dd20c1827-5c092644295e556d-01"


In [4]:
# -----------------------------------------------------------------------

---

### Notebook code

In [5]:
from common.io import datalake_root, read_numpy, write_numpy
from pynb_dag_runner.tasks.task_opentelemetry_logging import PydarLogger

# Logger built from the run parameters P; used further down to record the
# sizes of the train and test splits via logger.log_int.
logger = PydarLogger(P)

2022-07-05 06:47:28,714	INFO worker.py:842 -- Connecting to existing Ray cluster at address: 172.17.0.2:6379


## Load and split digits data

In [6]:
# Read the digit vectors (X) and their labels (y) from the "raw" directory
# under the data lake root, in that order.
X, y = (
    read_numpy(datalake_root(P) / "raw" / filename)
    for filename in ("digits.numpy", "labels.numpy")
)

In [7]:
from sklearn.model_selection import train_test_split

# Fraction of all samples that goes into the training set.
# train_test_split interprets an int train_size as an absolute number of
# samples, so require a float strictly between 0 and 1 to make sure the
# parameter is really treated as a ratio.
train_ratio = P["task.train_test_ratio"]
if not (isinstance(train_ratio, float) and 0.0 < train_ratio < 1.0):
    raise ValueError(
        f"task.train_test_ratio must be a float in (0, 1), got {train_ratio!r}"
    )

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=train_ratio,
    test_size=None,  # test set is the complement of the training set
    stratify=y,  # split is stratified using the labels as classes
    shuffle=True,
    random_state=1,  # fixed seed: the same input gives the same split
)

# assert nr of pixels per image is the same for all image vectors
assert X.shape[1] == X_train.shape[1] == X_test.shape[1], (
    f"column count differs after split: all={X.shape[1]}, "
    f"train={X_train.shape[1]}, test={X_test.shape[1]}"
)

# assert that the (X, y)-pairs have compatible sizes (for both train and test)
assert X_train.shape[0] == len(y_train), (
    f"train size mismatch: {X_train.shape[0]} rows vs {len(y_train)} labels"
)
assert X_test.shape[0] == len(y_test), (
    f"test size mismatch: {X_test.shape[0]} rows vs {len(y_test)} labels"
)

# assert that all data is used
assert len(y) == len(y_train) + len(y_test), (
    f"split dropped or duplicated samples: {len(y)} != "
    f"{len(y_train)} + {len(y_test)}"
)

In [8]:
# Record how many samples ended up in each split.
nr_digits_train, nr_digits_test = len(y_train), len(y_test)
logger.log_int("nr_digits_train", nr_digits_train)
logger.log_int("nr_digits_test", nr_digits_test)

 - Logging nr_digits_train (int) : 1257
 - Logging nr_digits_test (int) : 540


### Persist training and test data sets to separate files

In [9]:
# Write each split to its own directory under the data lake root:
# first the training data, then the test data; digits before labels.
for split_dir, split_X, split_y in (
    ("train-data", X_train, y_train),
    ("test-data", X_test, y_test),
):
    write_numpy(datalake_root(P) / split_dir / "digits.numpy", split_X)
    write_numpy(datalake_root(P) / split_dir / "labels.numpy", split_y)