# Simple tabular data demo

This is a very basic demo of how to log tabular datasets for classification tasks with XGBoost.

In [None]:
import os
from getpass import getpass

# Target console. A value already exported in the environment wins, so a local
# console can be used without editing this cell, e.g.
#   export GALILEO_CONSOLE_URL="http://localhost:8088"
os.environ.setdefault("GALILEO_CONSOLE_URL", "https://console.dev.rungalileo.io")

# Credentials are deliberately NOT stored in the notebook: anything written
# here ends up in version control and in shared copies of the file. They are
# taken from the environment when present and prompted for otherwise.
if not os.environ.get("GALILEO_USERNAME"):
    os.environ["GALILEO_USERNAME"] = input("Galileo username: ")
if not os.environ.get("GALILEO_PASSWORD"):
    # getpass keeps the password out of the cell output.
    os.environ["GALILEO_PASSWORD"] = getpass("Galileo password: ")

# Imported after the environment is populated, matching the original cell order.
import dataquality as dq
dq.configure()

# The run name also selects the dataset in the next section:
# "fine-wine" loads the wine dataset, any other name loads iris.
# run_name = "fine-wine"
run_name = "iris-uris-weallris"

dq.init("tabular_classification", "tabular-project", run_name)

## 1. Load data

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.datasets import load_iris, load_wine

# The run name decides which toy dataset is used: wine for the "fine-wine"
# run, iris for every other run name.
load_dataset = load_wine if run_name == "fine-wine" else load_iris
X, y = load_dataset(as_frame=True, return_X_y=True)

# When exporting to hdf5 you can't have col names containing forward slash,
# so "/" is replaced by "-"; parentheses are stripped from the names as well.
# A single translation table applies all three substitutions in one rename.
column_char_map = str.maketrans({"/": "-", "(": None, ")": None})
X.rename(columns=lambda name: name.translate(column_char_map), inplace=True)


def understand_dataset(dataset, target=None) -> None:
    """Print a quick overview of a dataset for interactive exploration.

    Every name returned by ``dir(dataset)`` is printed together with the type
    of the corresponding attribute, followed by one line showing the shape of
    the dataset and the shape of its target.

    Args:
        dataset: Object to inspect; it must expose a ``shape`` attribute.
        target: Object whose ``shape`` is printed next to the dataset's.
            When omitted, the notebook-level ``y`` is used, which preserves
            the behaviour of the original one-argument call.
    """
    for attr_name in dir(dataset):
        print(attr_name, type(getattr(dataset, attr_name)))

    if target is None:
        # Backward-compatible fallback to the global target loaded earlier in
        # the notebook; passing ``target`` explicitly avoids this hidden
        # dependency on notebook state.
        target = y

    print(dataset.shape, target.shape)

## 2. Create and fit model on training data

In [None]:
# Multi-class classifier that outputs per-class probabilities; the seed is
# fixed so repeated runs of the notebook build the same model.
xgb_params = {"objective": "multi:softprob", "random_state": 42}
xgb_model = xgb.XGBClassifier(**xgb_params)

# The demo fits on the full dataset loaded above.
xgb_model.fit(X, y)

## 3. Set labels for run

In [None]:
# The label names must come from the same dataset the model was trained on.
# Always reading them from the wine dataset registers "class_0/1/2" for an
# iris run, so the loader is chosen with the same rule used when loading data.
label_loader = load_wine if run_name == "fine-wine" else load_iris
target_names = label_loader().target_names
dq.set_labels_for_run(target_names)

## 4. Log data!

In [None]:
# Log the fitted model together with its features and targets as the
# "training" split of the run.
dq.log_xgboost(model=xgb_model, X=X, y=y, split="training")

In [None]:
# Log a "test" split for the run.
# NOTE(review): this demo has no held-out data, so the same X and y that the
# model was fitted on are logged again under the "test" split.
dq.log_xgboost(model=xgb_model, X=X, y=y, split="test")

## 5. Call finish to start processing

In [None]:
# All splits have been logged above; per this section's heading, calling
# finish is what starts processing of the logged data.
dq.finish()

## Feature Pair Accuracy Matrix

In [None]:
# TODO: compute and display the feature pair accuracy matrix for the fitted model.

## Feature Importance

In [None]:
# Importance scores exposed by the fitted classifier. As the last expression
# in the cell, the notebook renders the value.
xgb_model.feature_importances_

## Feature Correlation Matrix

In [None]:
import vaex

# Convert the pandas feature frame to a vaex DataFrame, then request the
# correlation across all of its columns. The call stays the last expression
# of the cell so the notebook displays its result.
df = vaex.from_pandas(X)
feature_columns = df.get_column_names()
df.correlation(x=feature_columns)