# Demonstration of loading configuration and accessing datasets

This demo shows how to access both local and Databricks datasets.


In [1]:
import os
from pathlib import Path

import pandas as pd

# Move to the project root so that the relative paths used by the later cells
# (e.g. "tests/data/...") resolve correctly.
# Step up one level only when the kernel is actually sitting in notebooks/:
# an unconditional `.parent` would climb one more directory every time this
# cell is re-run, or when the kernel was started from the project root.
current_dir = Path.cwd()
project_dir = current_dir.parent if current_dir.name == "notebooks" else current_dir
os.chdir(project_dir)
print(f"Working directory set to: {os.getcwd()}")

# Kept after the chdir, matching the original cell order.
# NOTE(review): presumably the package is resolved relative to the project root — confirm.
import ml_workbench

Working directory set to: /Users/staskh/Pheno/ml_workbench


# Demo for local Datasets

Let's use the local configuration from `tests/data/datasets_combined.yaml`.

It also demonstrates supplying a value for the `base_dir` placeholder, which has no default value.

In [2]:
# Load the combined YAML configuration, passing a value for "base_dir".
local_cfg = ml_workbench.YamlConfig("tests/data/datasets_combined.yaml", base_dir="tests/data/")

# List the names of the datasets available in this configuration.
dataset_names = local_cfg.get_datasets_list()
print(f" We have following datasets: {dataset_names}")

 We have following datasets: ['csv_one', 'csv_two', 'csv_user_id', 'local_combined_dataset', 'test_users', 'test_users_metadata', 'databricks_combined_left_join', 'databricks_combined_inner_join']


Now, let's look inside a primitive dataset, such as `csv_one`.

In [3]:
# Read the "csv_one" dataset into a DataFrame and preview its first rows.
csv_one_df = (
    ml_workbench.Dataset("csv_one", local_cfg)
    .read_pandas()
)
csv_one_df.head()

Unnamed: 0,id,value
0,1,10
1,2,20
2,3,30


In [4]:
# Preview the "csv_user_id" dataset. The frame gets its own name so it is not
# confused with the "csv_two" dataset that the following cell loads.
csv_user_id_df = ml_workbench.Dataset("csv_user_id", local_cfg).read_pandas()
csv_user_id_df.head()

Unnamed: 0,id,user_id_tmp
0,1,u1
1,2,u2
2,3,u3
3,4,u4
4,5,u5


In [5]:
# Read the "csv_two" dataset into a DataFrame and preview its first rows.
csv_two_df = (
    ml_workbench.Dataset("csv_two", local_cfg)
    .read_pandas()
)
csv_two_df.head()

Unnamed: 0,user_id,score,flag
0,u1,0.5,True
1,u2,0.0,False


Now, let's look inside a combined dataset, such as `local_combined_dataset`.


In [6]:
# Read the "local_combined_dataset" entry into a DataFrame and preview its first rows.
combined_df = (
    ml_workbench.Dataset("local_combined_dataset", local_cfg)
    .read_pandas()
)
combined_df.head()

Unnamed: 0,id,value,user_id_tmp,user_id,score,flag
0,1,10,u1,u1,0.5,True
1,2,20,u2,u2,0.0,False
2,3,30,u3,,,


# Demo for Databricks Datasets
**NOTE:** if you run locally, you MUST have Databricks Connect enabled and the cluster initialized.

In [7]:
# Build the configuration object from the same YAML file as the local demo.
cfg = ml_workbench.YamlConfig("tests/data/datasets_combined.yaml", base_dir="tests/data/")

# Show the names of the datasets available in this configuration.
dataset_names = cfg.get_datasets_list()
print(dataset_names)

['csv_one', 'csv_two', 'csv_user_id', 'local_combined_dataset', 'test_users', 'test_users_metadata', 'databricks_combined_left_join', 'databricks_combined_inner_join']


Read user profiles (stored in a Databricks table).

In [8]:
# Create the dataset handle for the "test_users" entry of the configuration.
ds = ml_workbench.Dataset("test_users", cfg)

# Load it as a DataFrame and preview up to ten rows.
df = ds.read_pandas()
df.head(n=10)

Unnamed: 0,participant_id,participant_uuid
0,3035,2bb6ab78-6556-4532-b2a9-fdf7c2ba1fb4
1,4249,e0ce56df-7828-4875-988e-1a8c4989e555
2,9031,43096f90-5933-40be-95c6-319c9377719d
3,1436,04c44d0e-e5f9-4916-a7fe-24cce39b5b47
4,4252,d56c2fa7-b8cb-4989-8d6f-879485e52fbf
5,3919,f1d3e1dd-3c2f-4121-98eb-5b544b569c5b
6,9001,a2ef181b-592e-48d8-9f92-0f17164145d4
7,3628,1e94efd4-7d41-4e32-8628-b1f3fcd1d726
8,9702,e05a3b88-d9de-4843-9a72-3d4163775a26
9,9243,23f52b61-a548-4edf-a7d7-2383d697015c


Read user metadata (stored in a Databricks table).

In [9]:
# Create the dataset handle for the "test_users_metadata" entry of the configuration.
ds = ml_workbench.Dataset("test_users_metadata", cfg)

# Load it as a DataFrame and preview up to ten rows.
df = ds.read_pandas()
df.head(n=10)

Unnamed: 0,participant_uuid,A,B,C
0,2bb6ab78-6556-4532-b2a9-fdf7c2ba1fb4,45,49.71,UgmusAjb
1,e0ce56df-7828-4875-988e-1a8c4989e555,47,3.56,mNbVWlFh
2,43096f90-5933-40be-95c6-319c9377719d,85,70.66,jQXflVxs
3,04c44d0e-e5f9-4916-a7fe-24cce39b5b47,47,33.52,DnhdloaQ
4,d56c2fa7-b8cb-4989-8d6f-879485e52fbf,46,60.37,WyMMEpZG


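Now let's look at the LEFT join results. Users without a matching metadata row are kept, with missing values in columns `A`, `B`, and `C`.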

In [10]:
# Create the dataset handle for the "databricks_combined_left_join" entry.
ds = ml_workbench.Dataset("databricks_combined_left_join", cfg)

# Load the joined result as a DataFrame and preview up to ten rows.
df = ds.read_pandas()
df.head(n=10)

Unnamed: 0,participant_id,participant_uuid,A,B,C
0,3035,2bb6ab78-6556-4532-b2a9-fdf7c2ba1fb4,45.0,49.71,UgmusAjb
1,4249,e0ce56df-7828-4875-988e-1a8c4989e555,47.0,3.56,mNbVWlFh
2,9031,43096f90-5933-40be-95c6-319c9377719d,85.0,70.66,jQXflVxs
3,1436,04c44d0e-e5f9-4916-a7fe-24cce39b5b47,47.0,33.52,DnhdloaQ
4,4252,d56c2fa7-b8cb-4989-8d6f-879485e52fbf,46.0,60.37,WyMMEpZG
5,3919,f1d3e1dd-3c2f-4121-98eb-5b544b569c5b,,,
6,9001,a2ef181b-592e-48d8-9f92-0f17164145d4,,,
7,3628,1e94efd4-7d41-4e32-8628-b1f3fcd1d726,,,
8,9702,e05a3b88-d9de-4843-9a72-3d4163775a26,,,
9,9243,23f52b61-a548-4edf-a7d7-2383d697015c,,,


And now let's try the INNER join, which keeps only the users that have a matching metadata row.

In [11]:
# Create the dataset handle for the "databricks_combined_inner_join" entry.
ds = ml_workbench.Dataset("databricks_combined_inner_join", cfg)

# Load the joined result as a DataFrame and preview up to ten rows.
df = ds.read_pandas()
df.head(n=10)

Unnamed: 0,participant_id,participant_uuid,A,B,C
0,3035,2bb6ab78-6556-4532-b2a9-fdf7c2ba1fb4,45,49.71,UgmusAjb
1,4249,e0ce56df-7828-4875-988e-1a8c4989e555,47,3.56,mNbVWlFh
2,9031,43096f90-5933-40be-95c6-319c9377719d,85,70.66,jQXflVxs
3,1436,04c44d0e-e5f9-4916-a7fe-24cce39b5b47,47,33.52,DnhdloaQ
4,4252,d56c2fa7-b8cb-4989-8d6f-879485e52fbf,46,60.37,WyMMEpZG
