In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 guardrail-ml==0.0.12 tensorboard
!apt-get -qq install poppler-utils tesseract-ocr
!pip install -q unstructured["local-inference"]==0.7.4 pillow


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m107.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m77.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m115.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [None]:

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
from guardrail.client import (
    run_metrics,
    run_simple_metrics,
    create_dataset)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:

from transformers import TrainingArguments


# Training hyperparameters, grouped by concern. This object is later handed to
# SFTTrainer through its `args` parameter.
train_args = TrainingArguments(
    # --- output, checkpointing and logging ---
    output_dir="./results",
    save_steps=100,
    logging_steps=10,
    # --- batching ---
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    group_by_length=True,
    # --- run length ---
    # Both an epoch count and a step cap are supplied; the recorded run
    # finished at global_step=100.
    num_train_epochs=2,
    max_steps=100,
    # --- optimiser and schedule ---
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # --- numeric precision ---
    fp16=False,  # Disable mixed-precision training
)


In [None]:

# Quantization config: load the base model in 4-bit using the NF4 quantization
# type, with float32 as the compute dtype.
#
# The former `quant_batch_axis` / `quant_reduce_range` arguments were removed:
# they are not BitsAndBytesConfig options (they look like settings from a
# different quantization API) and only passed through as unused extra kwargs.
bb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float32,
)

In [None]:
from datasets import load_dataset

# Fetch the "train" split of the synthetic fraud-detection dataset from the
# Hugging Face Hub.
dataset = load_dataset(
    path="vitaliy-sharandin/synthetic-fraud-detection",
    split="train",
)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/494M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Number of rows in the loaded "train" split.
len(dataset)

6362620

In [None]:

from sklearn.model_selection import train_test_split

**The reason we have to perform all the computations below is simply because of the use of `load_dataset`. `load_dataset` often represents a dataset as a list of dictionaries, where each dictionary represents an individual example. When we load our dataset from Hugging Face, the entire dataset is loaded with 6,362,620 examples, each represented as a dictionary whose keys correspond to the columns of the dataset. Hence using `train_test_split` directly is difficult, as the function splits the list of dictionaries and not the individual examples themselves. Therefore, it divides the list into training and test sets, but each set is still a list of dictionaries.**

In [None]:
# Extract each field of the dataset as its own list so the columns can be
# split with train_test_split further down.
#
# Indexing the dataset by column name returns the whole column in one call,
# which replaces a Python-level loop over every row. `list(...)` guarantees a
# plain Python list regardless of what container the library returns.
steps = list(dataset["step"])
types = list(dataset["type"])
amounts = list(dataset["amount"])
name_origs = list(dataset["nameOrig"])
old_balance_orgs = list(dataset["oldbalanceOrg"])
new_balance_orgs = list(dataset["newbalanceOrig"])
name_dests = list(dataset["nameDest"])
old_balance_dests = list(dataset["oldbalanceDest"])
new_balance_dests = list(dataset["newbalanceDest"])
is_frauds = list(dataset["isFraud"])
is_flagged_frauds = list(dataset["isFlaggedFraud"])


In [None]:

# Split the column lists 70% train / 15% validation / 15% test.
#
# All columns go through a single train_test_split call per stage.
# train_test_split applies one shared shuffle to every array it is given, so
# the columns stay row-aligned by construction instead of relying on the same
# random_state being repeated across many separate calls.

# Stage 1: 70% train, 30% held out ("temp").
(
    steps_train, steps_temp,
    types_train, types_temp,
    amounts_train, amounts_temp,
    name_origs_train, name_origs_temp,
    old_balance_orgs_train, old_balance_orgs_temp,
    new_balance_orgs_train, new_balance_orgs_temp,
    name_dests_train, name_dests_temp,
    old_balance_dests_train, old_balance_dests_temp,
    new_balance_dests_train, new_balance_dests_temp,
    is_frauds_train, is_frauds_temp,
    is_flagged_frauds_train, is_flagged_frauds_temp,
) = train_test_split(
    steps,
    types,
    amounts,
    name_origs,
    old_balance_orgs,
    new_balance_orgs,
    name_dests,
    old_balance_dests,
    new_balance_dests,
    is_frauds,
    is_flagged_frauds,
    test_size=0.3,
    random_state=42,
)

# Stage 2: halve the held-out part into validation and test.
(
    steps_validation, steps_test,
    types_validation, types_test,
    amounts_validation, amounts_test,
    name_origs_validation, name_origs_test,
    old_balance_orgs_validation, old_balance_orgs_test,
    new_balance_orgs_validation, new_balance_orgs_test,
    name_dests_validation, name_dests_test,
    old_balance_dests_validation, old_balance_dests_test,
    new_balance_dests_validation, new_balance_dests_test,
    is_frauds_validation, is_frauds_test,
    is_flagged_frauds_validation, is_flagged_frauds_test,
) = train_test_split(
    steps_temp,
    types_temp,
    amounts_temp,
    name_origs_temp,
    old_balance_orgs_temp,
    new_balance_orgs_temp,
    name_dests_temp,
    old_balance_dests_temp,
    new_balance_dests_temp,
    is_frauds_temp,
    is_flagged_frauds_temp,
    test_size=0.5,
    random_state=42,
)

**Below, the separated column lists are recombined into complete examples and converted back into `datasets.Dataset` objects, producing a test set and a validation set in which each example contains all required fields and can be used for training and testing the model.**

In [None]:
# Create test and validation sets as lists of per-row dicts.
def _rows_from_columns(**columns):
    """Zip equal-length column lists into a list of per-row dicts.

    Each keyword is a field name and its value is the list holding that
    field for every row. Row ``i`` of the result maps each field name to
    element ``i`` of its list, with keys in the order the keywords were given.
    """
    field_names = list(columns)
    return [dict(zip(field_names, values)) for values in zip(*columns.values())]


test_set = _rows_from_columns(
    step=steps_test,
    type=types_test,
    amount=amounts_test,
    nameOrig=name_origs_test,
    oldbalanceOrg=old_balance_orgs_test,
    newbalanceOrig=new_balance_orgs_test,
    nameDest=name_dests_test,
    oldbalanceDest=old_balance_dests_test,
    newbalanceDest=new_balance_dests_test,
    isFraud=is_frauds_test,
    isFlaggedFraud=is_flagged_frauds_test,
)

validation_set = _rows_from_columns(
    step=steps_validation,
    type=types_validation,
    amount=amounts_validation,
    nameOrig=name_origs_validation,
    oldbalanceOrg=old_balance_orgs_validation,
    newbalanceOrig=new_balance_orgs_validation,
    nameDest=name_dests_validation,
    oldbalanceDest=old_balance_dests_validation,
    newbalanceDest=new_balance_dests_validation,
    isFraud=is_frauds_validation,
    isFlaggedFraud=is_flagged_frauds_validation,
)

In [None]:

# Convert to datasets
from datasets import Dataset

# Dataset.from_list accepts the list of row dicts directly, so there is no
# need to transpose the rows back into columns by hand (which also failed on
# an empty split because it indexed element 0).
test_dataset = Dataset.from_list(test_set)
validation_dataset = Dataset.from_list(validation_set)

In [None]:
# Number of rows in the test split.
len(test_dataset)

954393

In [None]:
# Number of rows in the validation split.
len(validation_dataset)

954393

In [None]:

# Load the base causal language model, applying the quantization settings in
# bb_config as the weights are loaded.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="TinyPixel/Llama-2-7B-bf16-sharded",
    quantization_config=bb_config,
)


Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

In [None]:
!pip install trl



In [None]:
from trl import SFTTrainer

In [None]:

# LoRA adapter settings for causal-language-model fine-tuning.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)


In [None]:

# Hub id of the base checkpoint; used below to load the matching tokenizer.
model_name = "TinyPixel/Llama-2-7B-bf16-sharded"

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load the tokenizer published with the base checkpoint.
# `trust_remote_code=True` was dropped: the download log for this checkpoint
# lists only tokenizer data files and the tokenizer resolves to the library's
# own LlamaTokenizerFast, so there is no reason to permit execution of code
# from the Hub repository.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

(…)arded/resolve/main/tokenizer_config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

(…)bf16-sharded/resolve/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

(…)ded/resolve/main/special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [None]:

from datasets import Dataset
from sklearn.model_selection import train_test_split

In [None]:
# Show the column names; these are the keys the prompt-formatting function reads.
print(dataset.column_names)

['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']


In [None]:
def formatting_fraud_prediction(example):
    """Render a batch of transactions as prompt/answer training texts.

    Args:
        example: Mapping from column name to a list of values, one entry per
            transaction in the batch. The batch size is taken from the
            length of ``example['step']``.

    Returns:
        A list with one string per transaction: a "Transaction Details:"
        header, one "Label: value" line for each transaction field, a blank
        line, and a final "IsFraud: <value>" line.
    """
    # (label shown in the prompt, column it is read from), in prompt order.
    labelled_columns = (
        ("Step", "step"),
        ("Type", "type"),
        ("Amount", "amount"),
        ("NameOrig", "nameOrig"),
        ("OldbalanceOrg", "oldbalanceOrg"),
        ("NewbalanceOrig", "newbalanceOrig"),
        ("NameDest", "nameDest"),
        ("OldbalanceDest", "oldbalanceDest"),
        ("NewbalanceDest", "newbalanceDest"),
    )

    batch_size = len(example['step'])
    prompts = []
    for row in range(batch_size):
        lines = ["Transaction Details:"]
        lines.extend(
            f"{label}: {example[column][row]}" for label, column in labelled_columns
        )
        # An empty entry yields the blank line separating question from answer.
        lines.append("")
        lines.append(f"IsFraud: {example['isFraud'][row]}")
        prompts.append("\n".join(lines))

    return prompts


In [None]:
max_seq_length = 512

# Train on the 70% training split only. Passing the full `dataset` would
# include every row that also appears in validation_dataset and test_dataset,
# so any evaluation on those splits would be measured on data the model has
# already been trained on.
train_dataset = Dataset.from_dict(
    {
        "step": steps_train,
        "type": types_train,
        "amount": amounts_train,
        "nameOrig": name_origs_train,
        "oldbalanceOrg": old_balance_orgs_train,
        "newbalanceOrig": new_balance_orgs_train,
        "nameDest": name_dests_train,
        "oldbalanceDest": old_balance_dests_train,
        "newbalanceDest": new_balance_dests_train,
        "isFraud": is_frauds_train,
        "isFlaggedFraud": is_flagged_frauds_train,
    }
)

# Supervised fine-tuning trainer: it applies the LoRA settings in peft_config
# to the model and builds the training texts with formatting_fraud_prediction.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    peft_config=peft_config,
    formatting_func=formatting_fraud_prediction,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=train_args,
)

In [None]:

# Run fine-tuning with the settings in train_args.
# NOTE(review): in the recorded run the logged training loss reaches 0.0 by
# step 50 and the run ends at epoch 0.0 after 100 steps; confirm this loss
# curve is expected before relying on the resulting adapter.
trainer.train() #train the model

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,2.9756
20,1.0103
30,0.0169
40,0.0001
50,0.0
60,0.0
70,0.0
80,0.0288
90,0.0002
100,0.0


TrainOutput(global_step=100, training_loss=0.40320664795901395, metrics={'train_runtime': 211.5047, 'train_samples_per_second': 1.891, 'train_steps_per_second': 0.473, 'total_flos': 98003386368000.0, 'train_loss': 0.40320664795901395, 'epoch': 0.0})