## Environment Preparation

* Load the SDK code from the local package directory
* Load the API key and secret from the .env file

In [1]:
# Quick hack to load local SDK code: work from the parent directory, where the
# repo-relative paths used later in this notebook (e.g. "notebooks/datasets/...")
# resolve.
import os

# Only step up when the "notebooks" directory is not already visible from the
# working directory. An unconditional chdir("..") climbs one level further on
# every re-run of this cell and breaks every relative path after it.
if not os.path.isdir("notebooks"):
    os.chdir(os.path.join(os.getcwd(), ".."))

# Load the API key and secret from the .env file into the process environment
from dotenv import load_dotenv
load_dotenv()

True

## ValidMind SDK Introduction

In [2]:
import pandas as pd
import xgboost as xgb

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [4]:
# Bring the ValidMind SDK into scope under its short alias
import validmind as vm

# Initialize the SDK for the project below. To target the test environment
# instead, also pass api_host="https://api.test.vm.validmind.ai/api/v1/tracking"
vm.init(
    project="cl2r3k1ri000009jweny7ba1g",
)

True

In [5]:
# Read the cleaned loans data from its pickle.
# NOTE(review): unpickling can execute arbitrary code, so only open trusted files.
df = pd.read_pickle("notebooks/datasets/_temp/df_loans_cleaned.pickle")

# Describe the target column; each class label maps to itself.
targets = vm.DatasetTargets(
    target_column="loan_status",
    class_labels={name: name for name in ("Fully Paid", "Charged Off")},
)

# Log the frame as the "training" dataset and keep the returned handle for the
# data quality tests that follow.
vm_dataset = vm.log_dataset(
    df,
    "training",
    analyze=True,
    targets=targets,
)

True

In [8]:
# Run the data quality tests on the training dataset and send the results to
# ValidMind (send=True).
results = vm.run_dataset_tests(
    df,
    target_column="loan_status",
    dataset_type="training",
    vm_dataset=vm_dataset,
    send=True,
)

Running data quality tests for "training" dataset...



100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 74.72it/s]


Test suite has completed.
Sending results to ValidMind...





Successfully logged test results for test: class_imbalance
Successfully logged test results for test: duplicates
Successfully logged test results for test: cardinality
Successfully logged test results for test: missing
Successfully logged test results for test: pearson_correlation
Successfully logged test results for test: skewness
Successfully logged test results for test: zeros

Summary of results:

Test                 Passed      # Passed    # Errors    % Passed
-------------------  --------  ----------  ----------  ----------
class_imbalance      True               1           0         100
duplicates           False              0           1           0
cardinality          False             14           7     66.6667
missing              False             25          53     32.0513
pearson_correlation  False              0          10           0
skewness             False              3           6     33.3333
zeros                False              1           3          25



In [7]:
# Fixed seed so the 80/20 train/validation partition is identical on every run.
train_ds, val_ds = train_test_split(df, test_size=0.20, random_state=42)

# Features: every column except the target.
x_train = train_ds.drop("loan_status", axis=1)
x_val = val_ds.drop("loan_status", axis=1)

# XGBClassifier expects class labels encoded as integers 0..n_classes-1, so the
# string labels are mapped to integer codes. The mapping is built once from the
# full column (sorted, hence deterministic) so both splits share the same codes.
class_names = sorted(df["loan_status"].astype(str).unique())
label_codes = {name: code for code, name in enumerate(class_names)}
y_train = train_ds["loan_status"].astype(str).map(label_codes)
y_val = val_ds["loan_status"].astype(str).map(label_codes)

In [None]:
# Classifier configured with early stopping (10 rounds)
xgb_model = xgb.XGBClassifier(early_stopping_rounds=10)

# The validation split is supplied as the evaluation set used by early stopping;
# verbose=False silences the per-round evaluation log.
xgb_model.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=False)

In [None]:
# Probability assigned to the last class in xgb_model.classes_ (binary problem).
y_pred = xgb_model.predict_proba(x_val)[:, -1]

# Threshold at 0.5, then translate the resulting 0/1 index into the model's own
# class labels. Rounding alone yields bare ints, which do not match y_val when
# the targets are strings; going through classes_ keeps predictions in the same
# label space the model was fitted on.
predictions = xgb_model.classes_[(y_pred > 0.5).astype(int)]
accuracy = accuracy_score(y_val, predictions)

print(f"Accuracy: {accuracy}")

In [None]:
# Hand the fitted classifier to the ValidMind SDK.
# NOTE(review): what log_model records or uploads is not visible here — confirm against the SDK.
vm.log_model(xgb_model)