# Text Classification using Keras and 🔭 Galileo

In this tutorial, we'll train a text classification model with TensorFlow (Keras) and explore the results in Galileo.

**Make sure to select GPU in your Runtime! (Runtime -> Change Runtime type)**

In [None]:
#@title Install `dataquality`
try:
    import dataquality as dq
except ImportError:
    # Upgrade pip
    !pip install -U pip &> /dev/null

    # Install HF datasets for downloading the example datasets
    !pip install -U dataquality datasets transformers &> /dev/null
    
    print('👋 Installed necessary libraries and restarting runtime! This should only need to happen once.')
    print('🙏 Continue with the rest of the notebook or hit "Run All" again!')

    # Restart the runtime
    import os, time
    time.sleep(1) # gives the print statements time to flush
    os._exit(0) # exits without allowing the next cell to run

# 1. Login to Galileo

In [None]:
import dataquality as dq

# Log in to Galileo. This runs before the `dq.init(...)` call later in the notebook.
dq.login()

# 2. Load Data

In [None]:
#@title 🤗 HuggingFace Dataset
#@markdown You can find more datasets [here](https://huggingface.co/datasets?language=language:en&task_categories=task_categories:text-classification&task_ids=task_ids:multi-class-classification&sort=downloads).

dataset_name = 'banking77' #@param ["banking77", "emotion", "tweet_eval"] {allow-input: true}
print(f"You selected the {dataset_name} dataset")

from IPython.utils import io
from datasets import load_dataset, get_dataset_config_names

# First attempt: load the dataset without naming a config (subset).
# The loader's own output is captured so it does not clutter the cell.
try:
  with io.capture_output() as captured:
    data = load_dataset(dataset_name)
except ValueError as e:
  # Only the "missing config" failure is handled; anything else is re-raised.
  if "Config name is missing" not in repr(e):
    raise e

  # Ask which subset to use; an empty answer falls back to the first one listed.
  configs = get_dataset_config_names(dataset_name)
  print(f"The dataset {dataset_name} has multiple subsets {configs}.")
  config = input(f"🖖 Enter the name of the subset to pick (or leave blank for any): ")
  if not config:
    config = configs[0]
  else:
    assert config in configs, f"{config} is not a valid subset"
  with io.capture_output() as captured:
    data = load_dataset(dataset_name, name=config)

# The rest of the notebook needs a train split plus a validation or test split
assert "train" in data and any(split in data for split in ("validation", "test")), \
f"💾 The dataset {dataset_name} has either no train, or no validation or test splits, select another one."

print(f"\n🏆 Dataset {dataset_name} loaded succesfully")

In [None]:
#@markdown Convert HF dataset to Pandas dataframes 
import pandas as pd

def load_pandas_df(data):
  """Turn the train split and one evaluation split of `data` into DataFrames.

  The ground-truth column is the single column whose name contains "label";
  when there is not exactly one such column, the user is prompted for it.
  In both returned frames the `label` column holds the label *names* (mapped
  from the values of the ground-truth column) and `id` holds the frame index.

  Args:
    data: Mapping of split name to split. `data["train"].features` must map
      column names to feature objects, and the label feature must expose
      `.names`.

  Returns:
    `(train_df, test_df)`, where `test_df` comes from the "validation" split
    if present, otherwise from "test".
  """
  train_features = data['train'].features
  col_names = list(train_features)

  # Locate the ground-truth column, asking the user when it is ambiguous
  candidates = [name for name in col_names if "label" in name]
  if len(candidates) != 1:
    print(f"The name of the columns are {col_names}.")
    label_col = input(f"🏅 Please enter the name of the column containing the labels: ")
    assert label_col in col_names, f"{label_col} is not an existing column"
  else:
    label_col = candidates[0]

  # Lookup table: position in the feature's `names` list -> label name
  index_to_name = dict(enumerate(train_features[label_col].names))

  def split_to_frame(split_name):
    # One split -> DataFrame with label names and a row id column
    frame = pd.DataFrame.from_dict(data[split_name])
    frame['label'] = frame[label_col].map(index_to_name)
    frame['id'] = frame.index
    return frame

  train_df = split_to_frame("train")

  # Prefer the validation split for evaluation, fall back to test
  eval_split_name = "validation" if "validation" in data else "test"
  test_df = split_to_frame(eval_split_name)

  return train_df, test_df

# Build the train / evaluation frames from the loaded dataset
train_df, test_df = load_pandas_df(data)
# Distinct label names found in the training frame
labels = train_df["label"].unique().tolist()

# 3. Initialize Galileo

In [None]:
# 🔭🌕 Galileo logging
# Start a text classification run; any "/" in the dataset name is replaced
# with "-" when building the run name.
dq.init(
    task_type="text_classification",
    project_name="text_classification_keras",
    run_name=f"example_run_{dataset_name.replace('/','-')}",
)

# 4. Log input data with Galileo
Input data can be logged via `log_data_samples` (or `log_dataset` for logging iterables). This step will log input samples, gold labels, data split, and list of all labels. With the Pandas DataFrames used here, this takes one `dq.log_dataset` call per split plus one `dq.set_labels_for_run` call.

In [None]:
from transformers import AutoTokenizer
import tensorflow as tf


# Drop incomplete rows *before* logging, so the samples logged to Galileo are
# exactly the samples the model is trained / evaluated on.
train_df, test_df = train_df.dropna(), test_df.dropna()

# 🔭🌕 Galileo logging
dq.log_dataset(train_df, split="training")
dq.log_dataset(test_df, split="test")
dq.set_labels_for_run(labels)

# Tokenize inputs and get attention mask
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
BATCH_SIZE = 32

# Train and test datasets
datasets = []
for df in [train_df, test_df]:
  # NOTE(review): padding=True pads to the longest sequence of *this* split, so
  # the two splits can end up with different sequence lengths. `build_model`
  # declares fixed-length inputs (`max_len`) - confirm the lengths agree.
  inputs = dict(tokenizer(df.text.to_list(), truncation=True, padding=True))
  # Keep the sample ids next to the features of each example.
  # NOTE(review): "uuid" is not one of the model's named inputs - confirm how
  # the Galileo "ids" logging layer expects to receive the ids.
  inputs["uuid"] = df["id"].to_list()
  # Label index used as the training target
  targets = [labels.index(label) for label in df.label]
  # Keras `fit` reads targets from the second element of each dataset item, so
  # yield (features, targets) instead of one flat dict.
  dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).batch(BATCH_SIZE)
  datasets.append(dataset)

train_ds, test_ds = datasets

# 5. Log Keras model data with Galileo

Model data can be logged via a `DataQualityLoggingLayer`. This step will log the model probabilities and embeddings. You can achieve this by adding the following lines of code to your Keras model.

In [None]:
from dataquality.integrations.keras import DataQualityLoggingLayer
from transformers import TFBertModel
from tensorflow import keras

def build_model(max_len=512):
    """Build and compile the DistilBERT text classifier with Galileo logging layers.

    The embedding of the first token (CLS position) of the encoder output is
    fed to a softmax `Dense` layer with one unit per entry of the notebook-level
    `labels` list. Galileo logging layers are applied to the embedding and to
    the probabilities so both can be logged during training.

    Args:
        max_len: Sequence length of the `input_ids` / `attention_mask` inputs.

    Returns:
        The compiled `tf.keras` model (compiled with `run_eagerly=True`).
    """
    # Local import: the checkpoint below is a DistilBERT checkpoint, so let the
    # auto class pick the matching TF architecture instead of forcing BERT.
    from transformers import TFAutoModel

    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    attn_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    # 🌕🔭 Galileo - right after the input layer, separates ids from your regular inputs
    # NOTE(review): as in the original code, this layer is only constructed and
    # not connected to the graph; connecting it presumably requires the sample
    # ids to be part of the input tensor - TODO confirm against the Galileo docs.
    DataQualityLoggingLayer("ids")

    encoder = TFAutoModel.from_pretrained('distilbert-base-uncased')
    encoder_outputs = encoder([input_word_ids, attn_mask])
    last_hidden_states = encoder_outputs.last_hidden_state
    clf_output = last_hidden_states[:, 0, :] # CLS token

    # 🌕🔭 Galileo - place this after the layer you want embeddings to be logged from
    clf_output = DataQualityLoggingLayer("embs")(clf_output)

    outputs = tf.keras.layers.Dense(len(labels), activation='softmax')(clf_output)

    # 🌕🔭 Galileo - place this after the layer you want probabilities to be logged from
    outputs = DataQualityLoggingLayer("probs")(outputs)

    model = tf.keras.models.Model(inputs=[input_word_ids, attn_mask], outputs=outputs)
    # Targets are integer label indices (not one-hot vectors), hence the sparse loss
    model.compile(loss="sparse_categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"],
              run_eagerly=True) # 🌕🔭 Galileo - set run_eagerly to True!!
    return model

# 6. Putting into Action: Training a Model

We complete the training pipeline by using a standard Keras training setup (`model.fit`). While training, the current `epoch` and `split` are logged; here this is handled by passing a `DataQualityCallback` to `model.fit`. To complete logging, we call `dq.finish()` after training.

In [None]:
from dataquality.integrations.keras import DataQualityCallback

epochs = 1

model = build_model()

# `train_ds` and `test_ds` are already batched `tf.data` datasets, so no
# `batch_size` is passed to `fit` (Keras expects it to be left unset for
# dataset inputs), and the validation dataset is passed directly.
model.fit(train_ds,
          epochs=epochs,
          validation_data=test_ds,
          callbacks=[DataQualityCallback()]) # 🌕🔭 Galileo

dq.finish() # 🌕🔭 Galileo

# General Help and Docs
- To get help with your task's requirements, call `dq.get_data_logger().doc()`
- To see more general data and model logging docs, run `dq.docs()`

In [None]:
# Show the requirements of the data logger for the current task
dq.get_data_logger().doc()
# Show the docstring of `dq.log_dataset` through Python's built-in `help`
help(dq.log_dataset)