# Simple structured data demo

This is a very basic demo of how to log structured datasets for classification tasks with XGBoost.

In [1]:
import os
from getpass import getpass

import dataquality as dq

# Credentials must never be committed in a notebook. They are read from the
# environment and only prompted for when missing, so nothing secret ends up in
# the saved file.
# To target a different console (e.g. a local one at http://localhost:8088),
# set GALILEO_CONSOLE_URL before starting the kernel.
os.environ.setdefault("GALILEO_CONSOLE_URL", "https://console.dev.rungalileo.io")
if not os.environ.get("GALILEO_USERNAME"):
    os.environ["GALILEO_USERNAME"] = input("Galileo username: ")
if not os.environ.get("GALILEO_PASSWORD"):
    os.environ["GALILEO_PASSWORD"] = getpass("Galileo password: ")

dq.configure()

# Use "fine-wine" to run the demo on the wine dataset; any other name uses iris.
run_name = "iris-uris-weallris"

dq.init("structured_classification", "structured-elliott", run_name)



📡 https://console.dev.rungalileo.io
🔭 Logging you into Galileo

🚀 You're logged in to Galileo as galileo@rungalileo.io!
✨ Initializing existing public project 'structured-elliott'
🏃‍♂️ Fetching existing run 'iris-uris-weallris'
🛰 Connected to existing project 'structured-elliott', and existing run 'iris-uris-weallris'.




## 1. Load data

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.datasets import load_iris, load_wine

# Pick the loader that matches the run: wine for "fine-wine", iris otherwise.
load_dataset = load_wine if run_name == "fine-wine" else load_iris
X, y = load_dataset(as_frame=True, return_X_y=True)

# When exporting to hdf5 you can't have col names containing forward slash,
# so "/" becomes "-". Parentheses are stripped from the names as well.
X = X.rename(
    columns=lambda name: name.replace("/", "-").replace("(", "").replace(")", "")
)


def understand_dataset(dataset) -> None:
    """Print a quick overview of ``dataset``.

    Emits one line per attribute reported by ``dir(dataset)`` (name followed
    by the attribute's type), then a final line with ``dataset.shape`` and the
    shape of the notebook-level target ``y``.

    Note: ``y`` is read from the enclosing (global) scope, so the data-loading
    statements must have run before this function is called.
    """
    for attr_name in dir(dataset):
        attr_value = getattr(dataset, attr_name)
        print(attr_name, type(attr_value))

    print(dataset.shape, y.shape)

## 2. Create and fit model on training data

In [3]:
# Build the multi-class classifier with a fixed seed and fit it on the full
# dataset; `fit` returns the estimator itself, so the fitted model is bound
# directly and then displayed as the cell output.
xgb_model = xgb.XGBClassifier(
    objective="multi:softprob",
    random_state=42,
).fit(X, y)
xgb_model

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='multi:softprob', predictor=None, ...)

## 3. Set labels for run

In [4]:
# The label names must come from the same dataset the model was trained on:
# wine for the "fine-wine" run, iris for every other run. Always using the wine
# names would label iris predictions as class_0/class_1/class_2.
label_source = load_wine() if run_name == "fine-wine" else load_iris()
target_names = label_source.target_names
dq.set_labels_for_run(target_names)

## 4. Log data!

In [5]:
# Log the fitted model together with the features and targets as the
# "training" split.
dq.log_xgboost(model=xgb_model, X=X, y=y, split="training")

In [6]:
# Log the "test" split. NOTE: this demo has no held-out data, so the same X / y
# used for training are logged again here.
dq.log_xgboost(model=xgb_model, X=X, y=y, split="test")

## 5. Call finish to start processing

In [7]:
# Upload the logged splits and start server-side processing. The call's return
# value (the console URL for the run) is displayed as the cell output.
dq.finish()

☁️ Uploading Data


Uploading data to Galileo:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/30.0k [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/30.4k [00:00<?, ?B/s]

Job default successfully submitted. Results will be available soon at https://console.dev.rungalileo.io/insights?projectId=ee785245-7d39-47f8-9f31-4cbf5023fb1a&runId=56175074-567e-47db-b578-3f59e0b72e90&split=training&metric=f1&depHigh=1&depLow=0&taskType=4
Waiting for job...
	Saving processed test data
Done! Job finished with status completed
Click here to see your run! https://console.dev.rungalileo.io/insights?projectId=ee785245-7d39-47f8-9f31-4cbf5023fb1a&runId=56175074-567e-47db-b578-3f59e0b72e90&split=training&metric=f1&depHigh=1&depLow=0&taskType=4
🧹 Cleaning up
🧹 Cleaning up


'https://console.dev.rungalileo.io/insights?projectId=ee785245-7d39-47f8-9f31-4cbf5023fb1a&runId=56175074-567e-47db-b578-3f59e0b72e90&split=training&metric=f1&depHigh=1&depLow=0&taskType=4'

## Feature Pair Accuracy Matrix

In [None]:
# TODO: compute and display the feature pair accuracy matrix

## Feature Importance

In [None]:
# Pair each importance score with its feature name (the columns the model was
# fitted on, in the same order) and show the most important features first.
# A bare array of numbers is hard to interpret without the names.
pd.Series(
    xgb_model.feature_importances_,
    index=X.columns,
    name="importance",
).sort_values(ascending=False)

## Feature Correlation Matrix

In [None]:
import vaex

# Convert the pandas feature frame to a vaex DataFrame, then request the
# correlation across every feature column.
df = vaex.from_pandas(X)
column_names = df.get_column_names()
df.correlation(x=column_names)