## Sequence Classification for anonymization

This notebook is an experiment to classify key-value fields from JSON as being anonymized or not for PII data.

The first step is to choose the model checkpoint and load its tokenizer, which we will use to tokenize our training data.

In [None]:
from transformers import AutoTokenizer

# Unfortunately, it appears Hugging Face doesn't fully support Llama 3.1 for text classification in
# AutoModelForSequenceClassification yet, so we use the Llama 3 (3.0) instruct checkpoint for now.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The Llama 3 tokenizer doesn't do padding like other models, so reuse the end-of-sequence
# token (both its id and its string form) as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

## Import tab_exp module

We need the `tab_exp` module to generate the synthetic training, test, and validation data.

In [None]:
from datasets import load_dataset

import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))
from tab_exp.tab import generate_synth_data, PIIData

## Create dataset

We will use `generate_synth_data` to create the datasets for training, testing, and validation.

In [None]:
from datasets import DatasetDict

def _make_split(samples: int, output: str):
    """Generate one synthetic split and load its combined JSON file.

    Returns the value produced by ``generate_synth_data`` together with the
    dataset loaded from that value's ``"combined_path"`` entry.
    """
    synth = generate_synth_data(samples=samples, output=output, clean=True)
    loaded = load_dataset("json", data_files=synth["combined_path"])
    return synth, loaded


# Each split is generated and loaded in turn: train, then test, then validate.
sd_train, ds_train = _make_split(1000, "samples_train")
sd_test, ds_test = _make_split(300, "samples_test")
sd_validate, ds_validate = _make_split(200, "samples_validate")

In [None]:
# Gather the three separately loaded datasets under the split names used for
# training.  Each loaded object keeps its rows under the "train" key.
dataset = DatasetDict(
    {
        split_name: loaded["train"]
        for split_name, loaded in (
            ("train", ds_train),
            ("validate", ds_validate),
            ("test", ds_test),
        )
    }
)
dataset

## Quantize for efficiency

For experimentation on a small computer, we need to quantize the weights, trading some accuracy for a smaller
memory footprint and faster computation.  Without this, either the model weights will not fit into memory at all,
or training will take forever to finish.

In [None]:
from transformers import BitsAndBytesConfig, AutoModelForSequenceClassification
import torch

# This notebook requires a CUDA GPU; stop early with a clear error if none is visible.
if not torch.cuda.is_available():
    raise RuntimeError("GPU must be available for training")

# Load the weights in 4-bit NF4 with double quantization; compute is done in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Create a model for text classification.  Normally llama3 is used as a causal LM
# (question/answer); here it is loaded with a sequence-classification head instead.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    # NOTE(review): presumably the number of distinct labels in the synthetic data - confirm.
    num_labels=4,
    device_map='auto',
)

## Use LoRA to train only a subset of the weights

Fine-tuning all the weights of the checkpoint would be prohibitively expensive, so we will use LoRA to train only a small set of adapter weights.

In [None]:
# I have no idea why the jupyter notebook lsp says it can't find the symbol for these exports.  This will work
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model # type: ignore

# LoRA settings: rank-16 adapters on the attention projection layers, configured
# for a sequence-classification task.
lora_config = LoraConfig(
    r = 16, 
    lora_alpha = 8,
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, 
    bias = 'none',
    task_type = 'SEQ_CLS'
)

# Prepare the quantized model for k-bit training, then wrap it with the LoRA
# adapters described by lora_config.
# NOTE(review): an earlier comment described this step as making training "faster, if a
# little less accurate"; check the PEFT docs for what prepare_model_for_kbit_training does.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
# set some llama3 specific settings: pad with the tokenizer's pad token and disable the cache
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
# NOTE(review): presumably disables pretraining-time tensor parallelism - confirm
model.config.pretraining_tp = 1

In [None]:
import polars as pl 

# Read the combined training file with polars, convert it to pandas, and keep
# at most the first 8000 rows.
df = pl.read_ndjson(sd_train["combined_path"]).to_pandas().iloc[:8000]

# Raw text of every retained row, in row order.
sentences = df["text"].to_list()
len(df)

## Do a sample run to get tensors

We load the raw text data into a list of `str` and then split it into batches (sublists).  Each batch is passed to
the tokenizer so that it is converted into tensors that PyTorch can use.

This `inputs` value is then passed to the model to calculate the logits for each sentence, which are stored in `all_outputs`.

In [None]:
import json
from typing import Any

# Run on the GPU when one is available, otherwise on the CPU.
accel = "cuda" if torch.cuda.is_available() else "cpu"

batch_size = 32

all_outputs = []

# Inference only: put the model in eval mode so dropout (including the LoRA
# dropout) is disabled and the baseline predictions are deterministic.
# (The Trainer is expected to switch back to train mode when training starts.)
model.eval()

for i in range(0, len(sentences), batch_size):
    batched_inputs = sentences[i:i + batch_size]

    # Pad to the longest sentence in the batch and truncate at 512 tokens.
    inputs = tokenizer(batched_inputs, truncation=True, padding=True, return_tensors="pt", max_length=512)
    inputs = {k: v.to(accel) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
    # Move each batch of logits to the CPU right away so accelerator memory is
    # not held for every batch processed so far.
    all_outputs.append(outputs['logits'].cpu())
    print(i)


# One row of logits per sentence, in the original sentence order.
final_output = torch.cat(all_outputs, dim=0)


In [None]:
# The index of the largest logit in each row is the predicted class.
predicted_classes = torch.argmax(final_output, dim=1)
df["predictions"] = predicted_classes.cpu().numpy()
df

## Evaluate performance

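We need a way to evaluate the performance.  By default the HF `Trainer` only reports the evaluation loss, so we compute a classification report, the balanced accuracy, and the plain accuracy with scikit-learn.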

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import balanced_accuracy_score, classification_report
import pandas as pd

def get_metrics_result(test_df: pd.DataFrame):
    """Print a classification report plus balanced and plain accuracy.

    Args:
        test_df: Frame holding the true classes in its ``label`` column and the
            predicted classes in its ``predictions`` column.
    """
    truth = test_df["label"]
    predicted = test_df["predictions"]

    print("Classification Report:")
    print(classification_report(truth, predicted))

    # Both scorers take (y_true, y_pred) and are printed in a fixed order.
    for title, scorer in (
        ("Balanced Accuracy Score:", balanced_accuracy_score),
        ("Accuracy Score:", accuracy_score),
    ):
        print(title, scorer(truth, predicted))


get_metrics_result(df)

## Tokenize the dataset

We need to convert the natural language text in the dataset into the token IDs that the LLM takes as input.

In [None]:
from transformers import DataCollatorWithPadding

# Function applied to every split of the dataset (train, validate and test).
# The text must be tokenized before it can be used for training or evaluation.
def tokenize_fn(data: PIIData):
    """Tokenize the ``text`` field of ``data``, truncating to at most 512 tokens."""
    return tokenizer(data['text'], truncation=True, max_length=512)

# Tokenize in batches and drop the raw "text" column once it has been tokenized.
tokenized_ds = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
# Have the dataset return torch tensors.
tokenized_ds.set_format("torch")

# pad the batch of inputs to a length equal to the maximum input length in that batch
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

## Create a custom trainer

We will create a custom trainer that overrides `compute_loss` so that the cross-entropy loss can optionally be weighted per class.

In [None]:
from transformers import Trainer, TrainingArguments
import torch.nn.functional as F

class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy, optionally weighted per class.

    Args:
        *args: Positional arguments forwarded unchanged to ``Trainer``.
        class_weights: Optional sequence with one weight per class.  When
            ``None`` the loss is unweighted.
        **kwargs: Keyword arguments forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass on.
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                remaining entries are passed to the model.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted so that Trainer versions which pass
                this keyword do not fail with a ``TypeError``; it is not used.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple.
        """
        labels = inputs.pop("labels").long()
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # With device_map='auto' and reduced-precision compute the logits are
        # not guaranteed to share the device/dtype chosen in __init__, so align
        # the labels and the weights with the logits before computing the loss.
        labels = labels.to(logits.device)
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(device=logits.device, dtype=logits.dtype)

        loss = F.cross_entropy(logits, labels, weight=weight)

        return (loss, outputs) if return_outputs else loss

## Create training args



In [None]:
def compute_metrics(evaluations):
    """Compute balanced and plain accuracy for an evaluation pass.

    Args:
        evaluations: Pair ``(logits, labels)``; the predicted class of each row
            is the index of its largest logit.

    Returns:
        Dict with the keys ``'balanced_accuracy'`` and ``'accuracy'``.
    """
    logits, labels = evaluations
    predictions = np.argmax(logits, axis=1)
    # scikit-learn metrics take (y_true, y_pred).  Balanced accuracy is the mean
    # per-class recall, so swapping the arguments changes the result.
    return {
        'balanced_accuracy': balanced_accuracy_score(labels, predictions),
        'accuracy': accuracy_score(labels, predictions),
    }

# Hyper-parameters for a single fine-tuning epoch, evaluating and checkpointing
# once at the end of the epoch.
# NOTE(review): the output directory is named 'sentiment_classification' although this
# notebook classifies PII anonymization - presumably a leftover name; confirm before renaming.
# NOTE(review): newer transformers releases may expect `eval_strategy` instead of
# `evaluation_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir = 'sentiment_classification',
    learning_rate = 1e-4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 1,
    logging_steps=1,
    weight_decay = 0.01,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    report_to="none"
)

# Train on the "train" split and evaluate on the "validate" split; the "test"
# split is not passed to the trainer.  class_weights=None means an unweighted loss.
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_ds['train'],
    eval_dataset = tokenized_ds['validate'],
    tokenizer = tokenizer,
    data_collator = collate_fn,
    compute_metrics = compute_metrics,
    class_weights=None
)

In [None]:
import numpy
import numpy as np
# Run the fine-tuning loop; whatever trainer.train() returns is kept in train_result.
train_result = trainer.train()

## Generate new predictions



In [None]:
# Save the fine-tuned model under the relative path "finetuned".
# NOTE(review): `model` is the PEFT-wrapped model returned by get_peft_model, so this
# presumably writes only the adapter weights rather than the full base model - confirm.
model.save_pretrained("finetuned")

In [None]:
from peft.peft_model import PeftModel
from peft.mixed_model import PeftMixedModel

def generate_predictions(
    model: PeftModel | PeftMixedModel, 
    df_test: pd.DataFrame,
    batch_sz: int = 32
):
    """Classify every row of ``df_test`` and store the result in place.

    Args:
        model: Model used for the forward passes.
        df_test: Frame with a ``text`` column.  A ``predictions`` column holding
            the index of the largest logit per row is added, or overwritten if
            it already exists.
        batch_sz: Number of sentences tokenized and classified per forward pass.

    Returns:
        None.  ``df_test`` is modified in place.
    """
    sentences = df_test.text.tolist()
    if not sentences:
        # Nothing to classify: torch.cat would fail on an empty list, so write
        # an empty integer column instead.
        df_test['predictions'] = pd.Series(dtype="int64")
        return

    accel = "cuda" if torch.cuda.is_available() else "cpu"
    batch_logits = []

    # Disable dropout for inference and restore the previous mode afterwards so
    # a later training run is unaffected.
    was_training = model.training
    model.eval()
    try:
        for i in range(0, len(sentences), batch_sz):
            batch_sentences = sentences[i:i + batch_sz]

            # Pad to the longest sentence in the batch and truncate at 512 tokens.
            inputs = tokenizer(batch_sentences,
                               return_tensors="pt",
                               padding=True,
                               truncation=True,
                               max_length=512)
            inputs = {k: v.to(accel) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
            # Move logits off the accelerator right away so its memory is not
            # held by every batch processed so far.
            batch_logits.append(outputs['logits'].cpu())
            print(i)
    finally:
        model.train(was_training)

    final_outputs = torch.cat(batch_logits, dim=0)
    df_test['predictions'] = final_outputs.argmax(dim=1).numpy()

# Re-run predictions with the fine-tuned model.  generate_predictions writes into
# df in place, so the `predictions` column assigned earlier in the notebook is overwritten.
generate_predictions(model, df)

In [None]:
# Widen the polars table rendering (200 characters) and show up to 80 characters
# per string cell so the text column is readable.
pl.Config.set_tbl_width_chars(200)
pl.Config.set_fmt_str_lengths(80)
# Convert the pandas frame back to polars so it is displayed with the settings above.
pl.from_pandas(df)