In [None]:
!pip install dataquality sklearn

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sentence_transformers import SentenceTransformer
import dataquality as dq


In [None]:
# Load dataset
## Loading some samples from public csv imdb sentiment data (Link: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews )
df_train = pd.read_csv('train.csv')
# Sequential integer ids; these are the ids later passed to dq.log_model_outputs
df_train["id"] = list(range(len(df_train)))

df_test = pd.read_csv('test.csv')
df_test["id"] = list(range(len(df_test)))

# Train data
train_data = df_train['review']
train_labels  = df_train['sentiment']

# Test data
test_data = df_test['review']
test_labels  = df_test['sentiment']

# Load pre-trained sentence transformer model
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Encode train and test data.
# encode() is given plain Python lists rather than pandas Series: a Series resolves
# integer keys by index label, so positional lookups on a frame whose index is not
# the default 0..n-1 (e.g. after filtering or shuffling) could fail or return rows
# in a different order than the ids above.
train_embeddings = model.encode(train_data.tolist(), show_progress_bar=True)
test_embeddings = model.encode(test_data.tolist(), show_progress_bar=True)


In [None]:
# Set the Galileo console URL before logging in.
# "my_console_url" is a placeholder: replace it with the URL of your own console.
dq.set_console_url("my_console_url")
# Log in to the console configured above (presumably interactive; see the dataquality docs).
dq.login()

# When integrating with Galileo, you have 3 options

## Option 1 (Simple and fast)
In option 1, you can simply train your classifier, log the final probabilities, and upload. This is quick and easy, and will provide many valuable insights (as long as your model was well trained).

We fit the model, log the model's probabilities, and log the embeddings from sentence_transformers to Galileo.

In [None]:
# Fit an MLP classifier on the sentence embeddings computed earlier
NUM_ITERS = 100
classifier = MLPClassifier(max_iter=NUM_ITERS, hidden_layer_sizes=(64, 32))
classifier.fit(train_embeddings, train_labels)

dq.init("text_classification", project_name="sklearn-example", run_name="option_1")

# Names of the dataframe columns holding the text and the label; adjust to your data
text_col = "review"
label_col = "sentiment"
# Log the input data, one call per split
for split_name, split_df in (("training", df_train), ("test", df_test)):
    dq.log_dataset(split_df, split=split_name, text=text_col, label=label_col)

# Class probabilities from the trained classifier
train_probs = classifier.predict_proba(train_embeddings)
test_probs = classifier.predict_proba(test_embeddings)

# Log the model outputs per split: (split, embeddings, probabilities, id column)
outputs_by_split = (
    ("train", train_embeddings, train_probs, df_train["id"]),
    ("test", test_embeddings, test_probs, df_test["id"]),
)
for split_name, split_embs, split_probs, split_ids in outputs_by_split:
    dq.log_model_outputs(
        embs=split_embs,
        probs=split_probs,
        ids=split_ids.tolist(),
        split=split_name,
        epoch=0,
    )

# Register the label names (sorted unique training labels) and finish the run
run_labels = sorted(df_train[label_col].unique())
dq.set_labels_for_run(run_labels)
dq.finish(create_data_embs=False)


## Option 2 (Better embeddings)

In this option, we extract out the final hidden layer before the classification and use those embeddings.

Because you are freezing the BERT embeddings layer, these won't be as tuned to your data as, for example, the embeddings from HuggingFace (which fine-tunes BERT's layers), but they will adapt slightly to your data. So the embeddings in the UI will be slightly more tuned to your inputs.

We add a simple function to extract those layers, and everything else is the same as above.

In [None]:
from sklearn.neural_network._base import ACTIVATIONS

def get_deepest_embeddings(classifier: MLPClassifier, inp: np.ndarray) -> np.ndarray:
    """Returns the deepest embeddings for an MLP classifier

    Runs ``inp`` through the hidden layers of a fitted classifier and returns
    the output of the last hidden layer. The activation is applied after every
    hidden layer except the last one, so the returned values are the raw
    (pre-activation) outputs of the deepest hidden layer.

    Args:
        classifier: A fitted MLPClassifier (``coefs_`` and ``intercepts_`` must exist).
        inp: The classifier's input features, one row per sample.

    Returns:
        A new array with one row per sample; ``inp`` itself is not modified.
    """
    # Copy so the caller's array is never modified; np.array also accepts other array-likes
    data = np.array(inp, copy=True)
    # coefs_ has one weight matrix per hidden layer plus one for the output layer.
    # Counting from the fitted weights (instead of len(hidden_layer_sizes)) also
    # works when hidden_layer_sizes was passed as a single int.
    num_hidden_layers = len(classifier.coefs_) - 1
    activation = ACTIVATIONS[classifier.activation]
    # Pass through the hidden layers
    for layer in range(num_hidden_layers):
        data = np.matmul(data, classifier.coefs_[layer]) + classifier.intercepts_[layer]
        # We don't want to run the activation over the final layer of embeddings for logging
        if layer != num_hidden_layers - 1:
            # sklearn's activation helpers modify the array in place, so the
            # return value is intentionally ignored
            activation(data)
    return data

In [None]:
# Fit an MLP classifier on the sentence embeddings computed earlier
NUM_ITERS = 100
classifier = MLPClassifier(max_iter=NUM_ITERS, hidden_layer_sizes=(64, 32))
classifier.fit(train_embeddings, train_labels)

dq.init("text_classification", project_name="sklearn-example", run_name="option_2")

# Names of the dataframe columns holding the text and the label; adjust to your data
text_col = "review"
label_col = "sentiment"
# Log the input data, one call per split
for split_name, split_df in (("training", df_train), ("test", df_test)):
    dq.log_dataset(split_df, split=split_name, text=text_col, label=label_col)

# Class probabilities from the trained classifier
train_probs = classifier.predict_proba(train_embeddings)
test_probs = classifier.predict_proba(test_embeddings)

# The embeddings logged here come from the classifier's deepest hidden layer,
# not directly from the sentence transformer
model_train_embs = get_deepest_embeddings(classifier, train_embeddings)
model_test_embs = get_deepest_embeddings(classifier, test_embeddings)

# Log the model outputs per split: (split, embeddings, probabilities, id column)
outputs_by_split = (
    ("train", model_train_embs, train_probs, df_train["id"]),
    ("test", model_test_embs, test_probs, df_test["id"]),
)
for split_name, split_embs, split_probs, split_ids in outputs_by_split:
    dq.log_model_outputs(
        embs=split_embs,
        probs=split_probs,
        ids=split_ids.tolist(),
        split=split_name,
        epoch=0,
    )

# Register the label names (sorted unique training labels) and finish the run
run_labels = sorted(df_train[label_col].unique())
dq.set_labels_for_run(run_labels)
dq.finish(create_data_embs=False)


## Option 3 (Better probability analysis)

In this final option, we don't simply train the classifier for `NUM_ITERS`. Instead, we train it manually in a for-loop, and log the probabilities at every set interval. This will allow Galileo to build a deeper analysis of your model's understanding, and provide better insights into Data Error Potential (DEP) and other algorithms.

This is the most complex, but can potentially provide the "last mile" results to find hidden data issues.

Much of the code remains similar. Comments are available to explain the changes.

In [None]:
from tqdm.auto import tqdm

NUM_ITERS = 100
# Logging cadence for Galileo; tune it together with NUM_ITERS for maximum control
LOG_ITER_SIZE = 25
# max_iter=1 because the training loop below is driven manually, one pass at a time;
# warm_start=True so each pass continues learning from the previous state
classifier = MLPClassifier(warm_start=True, max_iter=1, hidden_layer_sizes=(64, 32))
classifier.fit(train_embeddings, train_labels)

dq.init("text_classification", project_name="sklearn-example", run_name="option_3")

# Input data is logged exactly as in the other options
text_col = "review"
label_col = "sentiment"
for split_name, split_df in (("training", df_train), ("test", df_test)):
    dq.log_dataset(split_df, split=split_name, text=text_col, label=label_col)

epoch = 0
last_iter = NUM_ITERS - 1
for it in tqdm(range(NUM_ITERS)):
    classifier.partial_fit(train_embeddings, train_labels)
    # Only log on every LOG_ITER_SIZE-th iteration and on the final one
    if it % LOG_ITER_SIZE != 0 and it != last_iter:
        continue

    print("Logging for epoch", epoch)
    dq.set_epoch(epoch)
    train_probs = classifier.predict_proba(train_embeddings)
    test_probs = classifier.predict_proba(test_embeddings)

    # As before, log the classifier's own deepest-layer embeddings
    model_train_embs = get_deepest_embeddings(classifier, train_embeddings)
    model_test_embs = get_deepest_embeddings(classifier, test_embeddings)

    # The epoch is taken from dq.set_epoch above, so it is not passed here
    outputs_by_split = (
        ("train", model_train_embs, train_probs, df_train["id"]),
        ("test", model_test_embs, test_probs, df_test["id"]),
    )
    for split_name, split_embs, split_probs, split_ids in outputs_by_split:
        dq.log_model_outputs(
            embs=split_embs,
            probs=split_probs,
            ids=split_ids.tolist(),
            split=split_name,
        )
    epoch += 1

# Register the label names (sorted unique training labels) and finish the run
run_labels = sorted(df_train[label_col].unique())
dq.set_labels_for_run(run_labels)
dq.finish(create_data_embs=False)
