# Exp018: Conditional instruction fine-tuning
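This experiment instruction-fine-tunes the model on dialog responses from the corpus that already exhibit a given grammar skill, so that it learns to write a response satisfying a single grammatical constraint stated in the prompt.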

In [1]:
from datasets import load_dataset
from dotenv import load_dotenv
load_dotenv()
import os

from tqdm.notebook import tqdm
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments

import pickle
from torch.utils.data import RandomSampler, Subset
import numpy as np
import json
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import random
import sys
sys.path.append(f'../source')
import helpers
import models
import importlib
#importlib.reload(models)



In [2]:
# params
# Pickle read in the "Prepare dataset" cell; presumably written by an earlier corpus-classification step -- TODO confirm.
out_file = '../data/corpus_classification_all.pkl'
# JSONL file the formatted examples are written to and that `load_dataset` reads back.
preprossed_dataset_file = '../data/SFT_data.jsonl'
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# Rule numbers to build examples for; matched against the '#' column of `egp`.
nrs = [616] #[58, 616]#
# NOTE(review): the id 616 is hard-coded here and is not derived from `nrs` -- keep the two in sync.
classifier = models.load_classifier(616, "corpus_training")
# End-of-prompt marker: the text after it in a formatted example is the completion.
EOP = "[/INST]"
# Table of grammar rules; the columns '#', 'SubCategory' and 'Can-do statement' are read in `formatting_func`.
egp = helpers.get_egp()

## Prepare dataset

In [5]:
# The file holds three pickled objects written back-to-back, so they have to be
# read in exactly this order.
# NOTE(review): pickle.load can execute arbitrary code -- only use it on trusted files.
with open(out_file, 'rb') as pickle_file:
    all_hit_indices, all_hit_sentences, extracts = (pickle.load(pickle_file) for _ in range(3))

# One record per (rule number, hit index): the dialog context, the response and the rule number.
data = []
for nr in nrs:
    for idx in all_hit_indices[nr]:
        data.append({"context": extracts[idx][0], "response": extracts[idx][1], "nr": nr})

In [6]:
def formatting_func(example):
    """Build one supervised fine-tuning example in the ``[INST] ... [/INST]`` format.

    Args:
        example: Mapping with the keys
            - ``'nr'``: a rule number or a list of rule numbers, matched against
              the ``'#'`` column of ``egp``;
            - ``'context'``: list of preceding utterances, labelled alternately
              "A" and "B" starting with "A";
            - ``'response'``: the target utterance, emitted as speaker A.

    Returns:
        Tuple ``(text, prompt_len)``. ``text`` is the prompt followed by the
        completion and ``</s>``; ``prompt_len`` is the number of characters up to
        and including the end-of-prompt marker ``EOP``.
    """
    rule_numbers = example['nr'] if isinstance(example['nr'], list) else [example['nr']]
    rules = egp[egp['#'].isin(rule_numbers)]

    # Join with "\n" rather than os.linesep so the training text does not depend
    # on the platform the notebook runs on.
    constraints = "\n".join("- " + rules['SubCategory'] + ": " + rules['Can-do statement'])
    context = "\n".join(
        ("A" if i % 2 == 0 else "B") + ": " + utt
        for i, utt in enumerate(example["context"])
    )

    instruction = "Write the response of A and include this grammatical items in the response.\n" + constraints

    # The space after "[INST]" and the space after EOP are part of the established
    # format; they are written with explicit escapes so they cannot be stripped
    # silently by an editor.
    prompt = f"[INST] \n{instruction}\nDialog:\n{context} {EOP}"
    completion = f" \nA: {example['response']}</s>"

    # Measure the prompt directly instead of searching the full text for EOP:
    # a search returns the wrong offset when an utterance itself contains EOP.
    return prompt + completion, len(prompt)
    
# Write the examples as JSON Lines (one object per line). Every record in `data`
# is extended in place with its formatted text and the prompt length.
with open(preprossed_dataset_file, 'w') as jsonl_file:
    for record in tqdm(data):
        text, prompt_len = formatting_func(record)
        record.update(text=text, prompt_len=prompt_len)
        jsonl_file.write(f"{json.dumps(record)}\n")

  0%|          | 0/3515 [00:00<?, ?it/s]

### Load dataset

In [7]:
# Load the JSONL file written above as a single split, then hold out 5% of it.
dataset = load_dataset('json', data_files=preprossed_dataset_file, split='train', cache_dir=os.getenv('CACHE_DIR'))
# A fixed seed keeps the held-out set identical across kernel restarts, so
# evaluation numbers from different runs stay comparable.
train_test_split = dataset.train_test_split(test_size=0.05, seed=42)
train_dataset, test_dataset = train_test_split['train'], train_test_split['test']

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# Show the first formatted record to sanity-check the prompt layout and `prompt_len`.
dataset[0]

{'context': ['I need to transfer money.',
  'Do you know which account you want to take the money from?',
  'From my savings account.',
  'Where are you transferring the money to?'],
 'response': 'I would like it transferred to my checking account.',
 'nr': 616,
 'text': "[INST] \nWrite the response of A and include this grammatical items in the response.\n- would: Can use the affirmative form with 'like'. \nDialog:\nA: I need to transfer money.\nB: Do you know which account you want to take the money from?\nA: From my savings account.\nB: Where are you transferring the money to? [/INST] \nA: I would like it transferred to my checking account.</s>",
 'prompt_len': 314}

## Load and prepare base model

In [9]:
# Quantisation settings handed to `from_pretrained` in the next cell:
# 4-bit loading with the "nf4" quant type, float16 compute dtype, double quantisation off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4"
)

In [10]:
# Load the base model with the quantisation config defined above.
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, cache_dir=os.getenv('CACHE_DIR'), device_map="auto")
#model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=os.getenv('CACHE_DIR'), device_map="auto")
# NOTE(review): presumably disabled for training; the `generate` calls later in the
# notebook read the same config unless they override `use_cache` -- confirm.
model.config.use_cache = False
# NOTE(review): confirm that `trust_remote_code=True` is actually required for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir=os.getenv('CACHE_DIR'), padding_side="right")
#tokenizer.pad_token = tokenizer.unk_token

# Register a dedicated '[PAD]' token, then resize the embedding matrix to the new
# vocabulary size (the last expression displays the resized embedding module).
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = '[PAD]'
model.resize_token_embeddings(len(tokenizer))

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Embedding(32001, 4096)

### Inference with base model

In [11]:
# Pick one held-out example and let the model continue its prompt.
example = random.choice(test_dataset)

# `prompt_len` ends right after the end-of-prompt marker. The completion starts
# with " \nA:", which is kept in the prompt so the model continues as speaker A.
response_prefix = " \nA:"
eval_prompt = example['text'][:example['prompt_len'] + len(response_prefix)]
model_input = tokenizer(eval_prompt, return_tensors="pt").to(device)

model.eval()
with torch.no_grad():
    # Take the pad id from the tokenizer instead of hard-coding 32000, so the call
    # stays correct if the vocabulary or the padding token changes.
    token_ids = model.generate(**model_input, max_new_tokens=1024, pad_token_id=tokenizer.pad_token_id)[0]
    output_text = tokenizer.decode(token_ids, skip_special_tokens=True)
print(output_text)

[INST] 
Write the response of A and include this grammatical items in the response.
- would: Can use the affirmative form with 'like'. 
Dialog:
A: I do too! Do you enjoy fiction?  It uses imagination!
B: Yes I love reading but cannot do speed reading like Anne Jones - 4700 words a minute!
A: Yes, I know she is unreal! Do you know how they measure that? They don't have to understand what they're reading?
B: I think they do understand it!  But at least there is no moral panic these days about reading like there was in the 18th century! [/INST] 
A: I agree! I also enjoy fiction and appreciate the use of imagination it entails. I was just wondering how they measure someone's reading speed like Anne Jones'. I guess they don't necessarily have to understand what they're reading to achieve such high numbers.

B: Yes, I love reading as well, but I can't read as fast as Anne Jones. I believe they do understand what they're reading, even if they're skimming through it quickly. And you're right, 

In [12]:
def compute_metrics(eval_preds, verbose=False, num_samples=25, datasets=None):
    """Measure how often generated responses are flagged by the grammar classifier.

    The predictions passed in by the trainer are ignored. For every dataset a
    random sample of prompts is completed with ``model.generate``; the generated
    continuations are scored through ``models.probe_model(classifier, ...)`` and a
    score above 0.5 counts as a success.

    Args:
        eval_preds: Unused; accepted because the trainer passes it.
        verbose: If True, additionally print the prompts, the reference responses,
            the generations and the per-sample flags.
        num_samples: Number of examples drawn per dataset.
        datasets: Mapping from a name to a dataset exposing 'text' and
            'prompt_len' columns. Defaults to the current ``train_dataset`` and
            ``test_dataset``, looked up at call time so that a re-split is picked
            up without redefining this function.

    Returns:
        Dict with one ``success_<name>`` entry per dataset holding the fraction of
        sampled generations flagged by the classifier.
    """
    if datasets is None:
        datasets = {"train": train_dataset, "test": test_dataset}

    # The completion part of every example is " \nA: <response></s>".
    response_prefix_len = len(" \nA:")
    eos_len = len("</s>")

    if verbose: print("EPOCH", "___" * 20)
    all_scores = {}
    for name, ds in datasets.items():
        subset = ds[RandomSampler(ds, num_samples=num_samples)]
        prompts = [
            text[:prompt_len + response_prefix_len]
            for text, prompt_len in zip(subset['text'], subset['prompt_len'])
        ]
        if verbose: print(prompts)

        # Batched generation needs left padding so every prompt ends at the same
        # position; restore the previous setting even if tokenization fails.
        previous_padding_side = tokenizer.padding_side
        tokenizer.padding_side = "left"
        try:
            model_input = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        finally:
            tokenizer.padding_side = previous_padding_side

        model.eval()
        with torch.no_grad():
            token_ids = model.generate(**model_input, max_new_tokens=128, pad_token_id=tokenizer.pad_token_id)
        # Drop the (padded) prompt columns and decode only the newly generated tokens.
        prompt_width = model_input['input_ids'].shape[1]
        outputs = tokenizer.batch_decode(token_ids[:, prompt_width:], skip_special_tokens=True)

        scores = models.probe_model(classifier, outputs)[0] > 0.5
        all_scores[name] = scores

        if verbose:
            truths = [
                text[prompt_len + response_prefix_len:-eos_len]
                for text, prompt_len in zip(subset['text'], subset['prompt_len'])
            ]
            for truth, output in zip(truths, outputs):
                print(f"Truth: {truth}")
                print(f"Gener: {output}")
            print(f"Grammar detected: {scores}")
        # Printed on every call, regardless of `verbose`.
        print(list(zip(outputs, scores))[:10])

    return {f"success_{name}": scores.float().mean().item() for name, scores in all_scores.items()}

#compute_metrics([], verbose=False, datasets={"test": test_dataset}) # test

## Fine-tuning

In [13]:
# LoRA adapter configuration: rank 64, alpha 16, dropout 0.1, no bias terms,
# applied to the modules named in `target_modules` (which include `lm_head`).
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    task_type="CAUSAL_LM",
    #modules_to_save=["embeddings"]
)

In [20]:
# Trainer hyper-parameters: one epoch, batch size 8 without gradient accumulation,
# cosine schedule, evaluation every 50 steps, logging to wandb under run name "gctg".
# NOTE(review): `metric_for_best_model` / `greater_is_better` presumably only matter
# once `load_best_model_at_end` (commented out below) is enabled -- confirm.
training_arguments = TrainingArguments(
    output_dir="../models/mistral_FT",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    #save_steps=25,
    logging_steps=5,
    learning_rate=1e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="wandb",
    run_name="gctg",
    #load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=50,
    per_device_eval_batch_size=8,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [21]:
#eval_dataset = dataset.train_test_split(test_size=0.001)
#eval_dataset = eval_dataset["test"]
# Use the shared end-of-prompt marker as the response template so the collator and
# the prompt formatting cannot drift apart; the loss is then restricted to the
# text that follows the marker.
collator = DataCollatorForCompletionOnlyLM(EOP, tokenizer=tokenizer)

# `packing=False` is kept because the completion-only collator works on one
# example per sequence. `compute_metrics` ignores the trainer's predictions and
# runs its own generation-based check.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [22]:
# Run fine-tuning. With the training arguments defined earlier, evaluation happens every
# 50 steps and reports the validation loss plus the `success_*` values from `compute_metrics`.
trainer.train()

Step,Training Loss,Validation Loss,Success Train,Success Test
50,1.2257,1.82333,1.0,0.92
100,1.2383,1.762281,1.0,0.96
150,1.2597,1.72129,1.0,1.0


[("I'd like to try it.  Do you know where I can find a court?", tensor(True)), ("I would like to be a student of Jon Hamm's. I would like to know what he would be like as a teacher.", tensor(True)), ('I would like a kitchen that is warm and welcoming.', tensor(True)), ('I would like to go to the beach more often.', tensor(True)), ('I would like to see the first video game. I would like to see the first video game.', tensor(True)), ('I would like to go to Japan and visit the temples and shrines. I would like to see the cherry blossoms in Japan', tensor(True)), ('I would like to go to the South by Southwest festival', tensor(True)), ("I'd like to go to Shanghai. I'd like to see the Bund and the Oriental Pearl Tower.", tensor(True)), ("I would like to.  I'd like to read more of it.  I'd like to read more of it.  I'd like to read more of it.  I'd like to read more of it.  I'd like to read more of it.  I'd like to read more of it.  I'd like to read more of it.  I'd like to read more of it. 

KeyboardInterrupt: 

In [19]:
#trainer.save_model("../models/mistral_FT_2")



In [21]:
# Re-create the tokenizer and attach the adapter stored under "../models/mistral_FT" to `model`.
# NOTE(review): the base-model load is commented out, so this cell reuses whatever `model`
# is currently alive in the kernel. On a fresh kernel it only works after the earlier
# model-loading cell has been run, and the `bnb_config` built here is then unused.
# NOTE(review): the only `trainer.save_model(...)` call in this notebook is commented out and
# points at "../models/mistral_FT_2" -- verify that an adapter really exists at the path loaded below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4"
)
#model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, cache_dir=os.getenv('CACHE_DIR'), device_map="auto")
#model.config.use_cache = False
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir=os.getenv('CACHE_DIR'), padding_side="right")
# Same padding-token setup as for the base model, so the embedding size matches the tokenizer.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = '[PAD]'
model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(model, "../models/mistral_FT")
