# Exp018: Conditional instruction fine-tuning
This experiment aims at instruction fine-tuning from existing skills in the dataset to train the model on single constraints.

In [1]:
from datasets import load_dataset
from dotenv import load_dotenv
load_dotenv()
import os
os.environ['CACHE_DIR'] = f"/scratch/tmp.{os.getenv('SLURM_JOB_ID')}.dglandorf" # speed up model loading
os.environ['WANDB_DIR'] = os.getenv('CACHE_DIR')

from tqdm.notebook import tqdm
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments

import pickle
from torch.utils.data import RandomSampler, Subset
import numpy as np
import json
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import random
import sys
sys.path.append(f'../source')
import helpers
import models
import evaluation
import importlib
importlib.reload(evaluation)

[nltk_data] Downloading package punkt to
[nltk_data]     /cluster/home/dglandorf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /cluster/home/dglandorf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<module 'evaluation' from '/cluster/home/dglandorf/grammarctg/experiments/../source/evaluation.py'>

In [2]:
# params
out_file = '../data/corpus_classification_all.pkl'
preprossed_dataset_file = '../data/SFT_data.jsonl'
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
nrs = [59] #[58, 616]#
classifiers = {nr: models.load_classifier(nr, "corpus_training") for nr in nrs}
EOP = "[/INST]"
egp = helpers.get_egp()

## Prepare dataset

In [3]:
with open(out_file, 'rb') as f:
    all_hit_indices = pickle.load(f)
    all_hit_sentences = pickle.load(f)
    extracts = pickle.load(f)

data = [{"context": extracts[idx][0], "response": extracts[idx][1], "nr": nr} for nr in nrs for idx in all_hit_indices[nr]]

In [4]:
def formatting_func(example):
    rules = egp[egp['#'].isin(example['nr'] if type(example['nr']) == list else [example['nr']])]
    constraints = os.linesep.join("- " + rules['SubCategory'] + ": " + rules['Can-do statement']) # " - " + rules['guideword']
    context = os.linesep.join([("A" if (i%2==0) else "B") + ": " + utt for i, utt in enumerate(example["context"])])

    instruction = f"""Write the response of A and include these grammatical items in the response:
{constraints}"""
   # instruction = 'Write an answer of A that includes the affirmative form of "would like".'
    
    prompt = f"""[INST] 
{instruction}
Dialog:
{context} {EOP} 
A: """
    completion = f"{example['response']}</s>"
    
    return prompt, completion, prompt+completion
    
with open(preprossed_dataset_file, 'w') as f:
    for item in tqdm(data):
        # line['prompt'], line['completion'] = formatting_func(item) # for completion chat format
        item['prompt'], item['completion'], item['text'] = formatting_func(item)
        #print(item)
        f.write(json.dumps(item) + '\n')

  0%|          | 0/2138 [00:00<?, ?it/s]

### Load dataset

In [5]:
dataset = load_dataset('json', data_files=preprossed_dataset_file, split='train', cache_dir=os.getenv('CACHE_DIR'))
train_test_split = dataset.train_test_split(test_size=0.05)
train_dataset, test_dataset = train_test_split['train'], train_test_split['test']

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
dataset[4]

{'context': ["I don't think too many people would agree with you.  Since they know it's out there and about them.  All those sites are supposed to self-police and ban people who are abusers and if they don't they should be taken off-line.",
  'Again, sounds good in theory but everyone has different things that may hurt them or make them feel a certain way.  Everyone could be banned if you look at it that way',
  "No. I don't agree.  But changing focus here, did you know the UN appointed an official ambassador to extraterrestrials in 2010 in case we ever have contact?  I wonder who it is.",
  'No, I did not hear that do you have any guesses as to whom?'],
 'response': 'None whatsoever.  I will have to find out though!  Today is the shortest day of the year, when earth was first formed a day was only 5.5 hours long.  I guess our orbit changed (?)',
 'nr': 59,
 'prompt': "[INST] \nWrite the response of A and include these grammatical items in the response:\n- superlatives: Can use preposi

## Load and prepare base model

In [7]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, cache_dir=os.getenv('CACHE_DIR'), device_map="auto")
#model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=os.getenv('CACHE_DIR'), device_map="auto")
model.config.use_cache = False
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir=os.getenv('CACHE_DIR'), padding_side="right")
#tokenizer.pad_token = tokenizer.unk_token

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = '[PAD]'
model.resize_token_embeddings(len(tokenizer))

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Embedding(32001, 4096)

### Inference with base model

In [8]:
def generate(prompts, max_new_tokens=128):
    tokenizer.padding_side = "left"
    model_input = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    tokenizer.padding_side = "right"
    model.eval()
    with torch.no_grad():
        token_ids = model.generate(**model_input, max_new_tokens=max_new_tokens, pad_token_id=32000)
    return tokenizer.batch_decode(token_ids[:,model_input['input_ids'].shape[1]:], skip_special_tokens=True, device="cpu")

In [9]:
example = random.choice(test_dataset)
#example = train_dataset[10]
#example['nr'] = [58, 616]
#example['text'] = formatting_func(example)
#print(example['text'])
print(example['nr'])

generate([example['prompt']])

59


["I'm in awe of the advancements in computer technology. It's mind-boggling to think about the implications for the art industry and museums. If a computer can create a Mona Lisa replica that fools even the experts, what does that mean for the future? And the pace of innovation is only accelerating. I've heard about the Russian-made computer that runs on water, it's incredible!\n\nB:  Indeed, it's a fascinating time we live in. Some people find it terrifying, but others, like the Transhumanists, embrace the idea of"]

## Evaluate outputs

In [16]:
def calc_metrics(contexts, outputs, constraints, eval_quality=False):
    scores = {nr: models.probe_model(classifier, outputs)[0]>0.5 for nr, classifier in classifiers.items()}
    distinct = [evaluation.calculate_distinct_n(list(np.array(outputs)[np.isin(constraints, nr)])) for nr in nrs]
    if eval_quality:
        iter_metrics = tqdm(evaluation.gpt_metrics.keys(), desc="Metrics", total=len(evaluation.gpt_metrics))
        iter_responses = lambda: tqdm(zip(contexts, outputs), desc="Responses", total=len(outputs))
        quality = {metric: [evaluation.get_single_response_metric(metric, context, output) for context, output in iter_responses()] for metric in iter_metrics}
    return scores, distinct, (quality if eval_quality else {})

In [17]:
def compute_metrics(eval_preds, verbose=False, n=25, datasets={"train": train_dataset, "test": test_dataset}, eval_quality=False):
    results = {}
    for name, ds in datasets.items():
        subset = dataset[RandomSampler(ds, num_samples=n)]
        if verbose: print(subset['prompt'][0])
        outputs = generate(subset['prompt'])
        scores, distinct, quality = calc_metrics(subset['context'], outputs, subset['nr'], eval_quality)
        if verbose:
            for truth, output in zip(subset['completion'], outputs):
                print(f"Truth: {truth}")
                print(f"Gener: {output}")
            print(f"Grammar detected: {scores}")
            print(f"Distinctiveness per constraint {distinct}")
            print(f"Quality: {quality}")
        print(list(zip(outputs,scores[nrs[0]]))[:10])
        
        results.update({f"{name}_success_{nr}": scores[nr].float().mean().item() for nr in classifiers.keys()})
        results.update({f"{name}_{metric}": np.mean(quality[metric]) for metric in quality.keys()})
        results.update({f"{name}_distinct": np.mean(distinct)})        
    return results

#compute_metrics([], verbose=False, n=25, datasets={"test": test_dataset}, eval_quality=True) # test

## Fine-tuning

In [12]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    task_type="CAUSAL_LM",
    #modules_to_save=["embeddings"]
)

In [13]:
training_arguments = TrainingArguments(
    output_dir="../models/mistral_FT",
    num_train_epochs=1,
    per_device_train_batch_size=6,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    #save_steps=25,
    logging_steps=5,
    learning_rate=1e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="wandb",
    run_name="gctg",
    #load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=50,
    per_device_eval_batch_size=8,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [14]:
collator = DataCollatorForCompletionOnlyLM("[/INST]", tokenizer=tokenizer)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    data_collator=collator,
    #preprocess_logits_for_metrics = preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
    #neftune_noise_alpha=5,
)

Map:   0%|          | 0/2031 [00:00<?, ? examples/s]

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
trainer.train()

Step,Training Loss,Validation Loss,Train Success 59,Train Distinct,Test Success 59,Test Distinct
50,1.1587,1.586072,0.76,0.60373,0.8,0.510516
100,1.0927,1.547679,0.84,0.400289,0.96,0.546191
150,1.2625,1.51855,0.72,0.731343,0.8,0.661157
200,1.2182,1.498181,0.76,0.560538,0.8,0.50625
250,1.3587,1.47977,0.96,0.578292,0.96,0.514825
300,1.2729,1.473559,0.76,0.674121,0.72,0.620219


[('ya I wonder if they do, I wonder if they have the highest court in the land?', tensor(True)), ('I think so. He is the only losing coach in the history of the university.', tensor(True)), ('ya, I wonder if they still ban it in the highest court in the land?', tensor(True)), ('I would like to visit the oldest university in the US, Harvard University', tensor(False)), ("Nope, hockey is the most popular sport in Canada. It's the national sport of Canada.", tensor(True)), ('Yes, Daytona Beach is the most famous beach in the world for its hard packed sand, and is the only beach in the world with a speedway on it.', tensor(True)), ('they are the 6th most popular dog in the world', tensor(True)), ('I wonder if they have a brewery there?', tensor(False)), ("I know it's the most popular sport in the world.", tensor(True)), ("I'm not sure.  I'm not sure that the highest paid employee in the state of Kansas is worth it.  He's the only losing coach in the history of the University of Kansas.", t

TrainOutput(global_step=339, training_loss=1.5099244370924687, metrics={'train_runtime': 482.1455, 'train_samples_per_second': 4.212, 'train_steps_per_second': 0.703, 'total_flos': 1.910156281391232e+16, 'train_loss': 1.5099244370924687, 'epoch': 1.0})

In [19]:
compute_metrics([], verbose=False, datasets={"test": test_dataset}, n=25, eval_quality=True)

Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Responses:   0%|          | 0/25 [00:00<?, ?it/s]

Responses:   0%|          | 0/25 [00:00<?, ?it/s]

Responses:   0%|          | 0/25 [00:00<?, ?it/s]

Responses:   0%|          | 0/25 [00:00<?, ?it/s]

[("I'm sure that's the case.  The NFL is the most popular sport in the US.  It's also the most popular sport in Canada.  I wonder if they have any rules about that.", tensor(False)), ("I think so. It's the most popular coffee in the world.", tensor(True)), ("343 Industries is still making Halo games, and they are the best in the business. I'm not sure if they are still making Halo 3, but it's still the best in the series.", tensor(True)), ('1 billionth the size of the biggest star in our galaxy. I wonder if there is a star that is the size of the sun in our galaxy.', tensor(True)), ('I believe they do.  I know that the US Supreme Court has a softball field on the top floor of the building known as "the highest court in the land".', tensor(True)), ("Yeah! It's one of the most popular games in the world.", tensor(True)), ('I heard that they are the best band in the world', tensor(True)), ('100% they are the largest private employer in the world, they have a lot of money to throw around.'

{'test_success_59': 0.8399999737739563,
 'test_Appropriateness': 2.76,
 'test_Relevance': 2.48,
 'test_Content Richness': 2.8,
 'test_Grammatical Correctness': 3.96,
 'test_distinct': 0.5678233438485805}

In [18]:
#trainer.save_model("../models/mistral_FT_2")

In [21]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4"
)
#model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, cache_dir=os.getenv('CACHE_DIR'), device_map="auto")
#model.config.use_cache = False
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir=os.getenv('CACHE_DIR'), padding_side="right")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = '[PAD]'
model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(model, "../models/mistral_FT")
