## Sequence Classification for anonymization

This notebook is an experiment to classify key-value fields from JSON as being anonymized or not for PII data.

First, import the functions used to generate training and test data

In [None]:
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path;
# presumably this is what lets the `tab_exp` imports below resolve.
sys.path.append(str(Path.cwd().parent))
from tab_exp.tab import generate_synth_data, PIIData
from tab_exp.viz import model_choice, dataset_choice, get_hf_model

# Collect the run options.  Each returned object is read later through its
# `.value` attribute (presumably notebook widgets -- TODO confirm).
do_train, new_data, model_name = model_choice()

## Get Tokenizer 

The first step is to resolve the selected model name to a model ID and load its tokenizer, which we use to tokenize our training data

In [None]:
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForSequenceClassification
import torch
import polars as pl
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict

# Read the user's model selection and note whether it names a fine-tuned checkpoint.
selected_model = str(model_name.value)
use_checkpoint = "finetuned" in selected_model
print(f"Using checkpoint = {use_checkpoint}")

# The tokenizer is loaded from the identifier that get_hf_model resolves for the selection.
model_id = get_hf_model(selected_model)
print(f"Using model {model_id}")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The llama3 tokenizer has no padding token of its own, so reuse End of Sequence for padding.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

## Create dataset

We will use the generate_synth_data to create datasets for training, testing, and validation

In [None]:
def data_p(p: str) -> str:
    """Return the relative path of the combined JSONL file for split ``p``."""
    split_dir = f"samples_{p}"
    return f"../notebooks/{split_dir}/combined/combined.jsonl"

# Either synthesize fresh train/test/validate files or reuse the ones at the
# conventional `data_p` locations.
if new_data.value:
    # Each call's result is indexed by "combined_path" to find the file to load.
    sd_train = generate_synth_data(samples=1000, output="samples_train", clean=True)
    train_path = sd_train["combined_path"]
    sd_test = generate_synth_data(samples=300, output="samples_test", clean=True)
    test_path = sd_test["combined_path"]
    sd_validate = generate_synth_data(samples=200, output="samples_validate", clean=True)
    validate_path = sd_validate["combined_path"]
else:
    train_path = data_p("train")
    test_path = data_p("test")
    validate_path = data_p("validate")

# Load each JSON-lines file as its own dataset object.
ds_train = load_dataset("json", data_files=train_path)
ds_test = load_dataset("json", data_files=test_path)
ds_validate = load_dataset("json", data_files=validate_path)

In [None]:
from typing import cast

# `load_dataset("json", data_files=...)` without a `split` argument returns a
# DatasetDict whose only split is named "train", so narrow to DatasetDict
# (not Dataset) before indexing by split name.  `cast` is a no-op at runtime.
ds_train = cast(DatasetDict, ds_train)
ds_validate = cast(DatasetDict, ds_validate)
ds_test = cast(DatasetDict, ds_test)

# Re-assemble the three single-split objects into one DatasetDict with
# meaningful split names.
dataset = DatasetDict({
    "train": ds_train["train"],
    "validate": ds_validate["train"],
    "test": ds_test["train"],
})
dataset

## Create dataframe of test data

Create the dataframe to make it more convenient

In [None]:
# Ask which split to evaluate; the choice is read in the next cell via `.value`.
ds_to_use = dataset_choice()

In [None]:
# Load the chosen split straight from its JSONL file.
use_dataset = str(ds_to_use.value)
print(f"Using dataset {use_dataset}")
pl_df = pl.read_ndjson(data_p(use_dataset))

# Keep at most the first 8000 rows and hand an independent copy to pandas.
df = pl_df.head(8000).clone().to_pandas()
sentences = df["text"].tolist()
df

## Quantize for efficiency

For experimentation on a small computer, we need to quantize the weights to make them smaller, trading some accuracy for
a lower memory footprint and faster computation.  Without this, the model weights either will not fit into memory at all,
or training will take far too long to finish

In [7]:
# Fail fast when no CUDA device is visible.
if not torch.cuda.is_available():
    raise Exception("GPU must be available for training")

# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# computing in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

## Create an optimized model for finetuning

Create an optimized model that we can use for fine tuning on small hardware

In [None]:
print(f"Using model {selected_model}")

# Create a model for text classification.  Normally llama3 is used as a causal LM (text generation),
# so here it is loaded with a sequence-classification head that has 4 output labels.
# NOTE(review): the weights are loaded from `selected_model`, whereas the tokenizer was loaded from
# `model_id` -- presumably so a fine-tuned checkpoint can reuse the base tokenizer; confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    selected_model,
    quantization_config=quantization_config,
    num_labels=4,
    device_map='auto'
)

## Use LoRA to train only a subset of the weights

Fine tuning all the weights of the checkpoint would be prohibitively expensive.  So we will use LoRA to train only a small subset of the weights

In [9]:
# I have no idea why the jupyter notebook lsp says it can't find the symbol for these exports.  This will work
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model # type: ignore

# LoRA adapter settings for sequence classification: rank 16, alpha 8,
# 5% dropout, no bias training, applied to the q/k/v/o projection modules.
lora_config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS",
)

In [10]:
# Configure the model either for LoRA fine-tuning or for plain inference.
def init_model_for_training(model, train: bool):
    """Prepare ``model`` for k-bit LoRA training or for prediction only.

    Args:
        model: The sequence-classification model to configure.
        train: When true, the model is passed through
            ``prepare_model_for_kbit_training`` and wrapped by
            ``get_peft_model`` using the notebook-level ``lora_config``.
            When false, the model is left as loaded.

    Returns:
        The configured model.  When ``train`` is true this is the object
        returned by ``get_peft_model``, which callers must use in place of
        the one they passed in; otherwise it is the same object.
    """
    if train:
        print("Configuring model for training")
        model = prepare_model_for_kbit_training(model)
        model = get_peft_model(model, lora_config)
        # set some llama3 specific settings
        model.config.use_cache = False  # type: ignore
        model.config.pretraining_tp = 1 # type: ignore
    else:
        print("Using checkpointed model to get predictions")

    # Tell the model which token id the tokenizer pads with.
    model.config.pad_token_id = tokenizer.pad_token_id  # type: ignore
    # Return the (possibly re-wrapped) model: rebinding the local name alone
    # would leave the caller holding the unwrapped object.
    return model

In [None]:
# Keep whatever the helper hands back (e.g. a PEFT-wrapped model) instead of
# discarding it; fall back to the existing object if it returns nothing.
_prepared_model = init_model_for_training(model, do_train.value)
if _prepared_model is not None:
    model = _prepared_model

## Make initial prediction

Do a sample run to get tensors

We load the raw text data into a list of `str` and then split it into batches (sublists).  Each batch is then passed to the 
tokenizer so that it is in the tensor format that PyTorch can use.

This `inputs` value is then passed to the model to calculate the logits for each sentence and stored in `all_outputs`.

In [12]:
from peft.peft_model import PeftModel
from peft.mixed_model import PeftMixedModel
import tqdm

def generate_predictions(model: PeftModel | PeftMixedModel, df_test: pd.DataFrame, batch_sz: int = 32):
    """Run ``model`` over ``df_test.text`` in batches and return the logits.

    Args:
        model: The classification model to query.
        df_test: Frame whose ``text`` column holds the sentences to classify.
        batch_sz: Number of sentences tokenized and scored per forward pass.

    Returns:
        The per-batch ``logits`` concatenated along dimension 0, in the same
        order as the rows of ``df_test``.
    """
    sentences = df_test.text.tolist()
    all_outputs = []
    accel = "cuda" if torch.cuda.is_available() else "cpu"

    # Score in eval mode so dropout (e.g. the LoRA dropout left active after
    # training) cannot make predictions non-deterministic; the previous mode
    # is restored afterwards so a later training run is unaffected.
    was_training = model.training
    model.eval()
    try:
        with tqdm.tqdm(total=len(sentences)) as pbar:
            for i in range(0, len(sentences), batch_sz):
                batch_sentences = sentences[i:i + batch_sz]
                # Encode the sentences with the tokenizer.  Each LLM has its own tokenizer and config
                inputs = tokenizer(batch_sentences,
                                   return_tensors="pt",
                                   padding=True, truncation=True,
                                   max_length=512)
                inputs = {k: v.to(accel) for k, v in inputs.items()}
                # Don't actually do backprop with grad descent.  We just want a prediction
                with torch.no_grad():
                    outputs = model(**inputs)
                    all_outputs.append(outputs['logits'])
                # Advance by the real batch length so the final, shorter batch
                # does not push the bar past its total.
                pbar.update(len(batch_sentences))
    finally:
        model.train(was_training)
    return torch.cat(all_outputs, dim=0)

## Create predictions column

Generate predictions and add a predictions column to the test data dataframe

In [None]:
# Score every row, then keep the index of the highest logit per row as the
# predicted class (moved to the CPU and converted to a NumPy array).
outputs = generate_predictions(model, df)
df["predictions"] = outputs.argmax(dim=1).cpu().numpy()
df

## Evaluate performance

We need a way to evaluate the performance.  By default the Hugging Face `Trainer` only reports the evaluation loss, so we define our own metric functions below

In [14]:
import numpy as np
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report

def get_metrics_result(test_df: pd.DataFrame) -> dict:
    """Print and return classification metrics for a frame of predictions.

    Args:
        test_df: Frame with a ``label`` column (ground truth) and a
            ``predictions`` column (predicted class).

    Returns:
        A dict with the keys ``classification_report``,
        ``balanced_accuracy_score`` and ``accuracy_score``.
    """
    y_test = test_df.label
    y_pred = test_df.predictions

    # Each metric is computed once and reused for both the printout and the
    # returned dict.
    print("Classification Report:")
    report = classification_report(y_test, y_pred)
    print(report)

    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    print("Balanced Accuracy Score:", balanced_accuracy)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy Score:", accuracy)
    return {"classification_report": report,
            "balanced_accuracy_score": balanced_accuracy,
            "accuracy_score": accuracy}

def compute_metrics(evaluations):
    """Compute evaluation metrics from a ``(logits, labels)`` pair.

    Args:
        evaluations: Two-item sequence unpacked as the raw prediction scores
            and the true labels.

    Returns:
        A dict with ``balanced_accuracy`` and ``accuracy``.
    """
    logits, labels = evaluations
    predictions = np.argmax(logits, axis=1)
    # scikit-learn metrics take (y_true, y_pred).  Balanced accuracy averages
    # recall over the *true* classes, so swapping the arguments changes the
    # result.
    return {'balanced_accuracy': balanced_accuracy_score(labels, predictions),
            'accuracy': accuracy_score(labels, predictions)}

## Calculate the metrics

Calculate metrics with the stock LLM before fine tuning

In [None]:
# Report metrics for the predictions currently stored in df["predictions"].
get_metrics_result(df)

## Tokenize the dataset

We need to convert the natural language in the dataset to the token IDs needed by the LLM

In [None]:
from transformers import DataCollatorWithPadding

# Callback handed to `dataset.map`, so it is applied to every split.
def tokenize_fn(data: PIIData):
    """Tokenize the ``text`` entries of ``data``, truncating to 512 tokens."""
    texts = data["text"]
    return tokenizer(texts, truncation=True, max_length=512)

# Tokenize all splits in batches and drop the raw "text" column from the
# result, then have the dataset return torch tensors.
tokenized_ds = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
tokenized_ds.set_format("torch")

# pad the batch of inputs to a length equal to the maximum input length in that batch
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

## Create a custom trainer

We will create a custom trainer.  Importantly, we define a way to calculate loss for back propagation

In [None]:
from transformers import Trainer, TrainingArguments
import torch.nn.functional as F

class CustomTrainer(Trainer):
    """Trainer that computes an (optionally class-weighted) cross-entropy loss.

    Args:
        class_weights: Optional sequence of per-class weights passed to
            ``F.cross_entropy``.  ``None`` means an unweighted loss.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            self.class_weights = (torch.tensor(class_weights, dtype=torch.float32)
                                  .to(self.args.device))
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass on.
            inputs: Batch mapping; its ``labels`` entry is removed and used
                as the loss target, the rest is passed to ``model``.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted because newer ``transformers``
                releases pass it to ``compute_loss``; it is not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels").long()
        outputs = model(**inputs)
        logits = outputs.get('logits')

        weight = self.class_weights
        if weight is not None:
            # The weights were created as float32 on the trainer device; the
            # logits may be in another dtype (e.g. bfloat16) or on another
            # device, and cross_entropy requires them to match.
            weight = weight.to(device=logits.device, dtype=logits.dtype)
        loss = F.cross_entropy(logits, labels, weight=weight)
        return (loss, outputs) if return_outputs else loss

## Create training args



In [None]:
# Hyper-parameters: one epoch, batches of 8, evaluation and checkpointing at
# the end of each epoch, and the best checkpoint reloaded when training ends.
training_args = TrainingArguments(
    output_dir="sentiment_classification",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    logging_steps=1,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
)

# Train on the "train" split, evaluate on "validate"; no class weighting.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validate"],
    tokenizer=tokenizer,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    class_weights=None,
)

## Perform training

Actually fine tune the model by training it on the training data

In [None]:
# Run the fine-tuning loop and keep the object it returns for inspection.
train_result = trainer.train()

In [None]:
from tab_exp.viz import get_finetuned_name
# Derive the checkpoint name from the selected model and save the model under it.
# NOTE(review): only the model is saved here, not the tokenizer.
finetuned_name = get_finetuned_name(selected_model)
print(f"Saving checkpoint as {finetuned_name}")
model.save_pretrained(finetuned_name)

## Generate new predictions

Now that we have a newly trained model, run new predictions

In [None]:
# Re-score the same rows and overwrite the earlier "predictions" column with
# the new highest-logit class per row.
outputs2 = generate_predictions(model, df)
df["predictions"] = outputs2.argmax(dim=1).cpu().numpy()

In [None]:
# Widen the polars table rendering and allow longer strings before display.
pl.Config.set_tbl_width_chars(200)
pl.Config.set_fmt_str_lengths(80)
# Convert the pandas frame (now including predictions) back to polars for display.
pl_df = pl.from_pandas(df)
pl_df

## Calculate new metrics with fine tuned model

Now that we have the trained weight checkpoint, run the  
get_metrics_result again and see what the performance is like

In [None]:
# Report metrics again, now for the predictions written after training.
get_metrics_result(df)