In [1]:
%load_ext autoreload
%autoreload 2
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
from qlora import *
from collections import defaultdict
import copy
import json
from os.path import exists, join, isdir
from dataclasses import dataclass, field
import sys
from typing import Optional, Dict, Sequence
import numpy as np
from tqdm import tqdm
import logging
import bitsandbytes as bnb
import pandas as pd
import importlib
from packaging import version
from packaging.version import parse
import warnings
from sklearn.metrics.pairwise import manhattan_distances
from torchmetrics.functional.pairwise import pairwise_manhattan_distance as manhattan
from torchmetrics.functional.pairwise import pairwise_cosine_similarity as cossim
import numpy as np

import torch
import transformers
from torch.nn.utils.rnn import pad_sequence
import argparse
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    set_seed,
    Seq2SeqTrainer,
    BitsAndBytesConfig,
    LlamaTokenizer

)
from datasets import load_dataset, Dataset, load_from_disk
import evaluate

from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel
)
from peft.tuners.lora import LoraLayer
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

from transformers.modeling_utils import unwrap_model
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from transformers.utils import is_peft_available
from peft import PeftModel



In [2]:
# Run configuration, written as keyword arguments so every option name is a
# plain identifier. Insertion order is preserved and determines flag order.
argdict = dict(
    model_name_or_path='./mhllama',
    output_dir='/mnt/data/sonia/ckpts/grid2/r64_a16_lr1e-3_wd0',
    num_heads=7,
    max_column_len=5,
    data_seed=42,
    do_eval=True,
    eval_dataset_size=5,
    max_eval_samples=2,
    per_device_eval_batch_size=1,
    dataloader_num_workers=1,
    group_by_length=True,
    remove_unused_columns=False,
    lora_r=64,
    lora_alpha=16,
    lora_modules='all',
    double_quant=True,
    quant_type='nf4',
    bf16=True,
    bits=4,
    dataset='/mnt/data/sonia/datasets/adult/may8.dat',
    source_max_len=60,
    target_max_len=60,
    seed=0,
)

# Render each option as a "--name=value" command-line flag.
arglist = ['--' + name + '=' + str(value) for name, value in argdict.items()]

In [3]:
# Parse the "--name=value" flags into the four argument dataclasses.
hfparser = transformers.HfArgumentParser(
    (ModelArguments, DataArguments, TrainingArguments, GenerationArguments)
)
parsed = hfparser.parse_args_into_dataclasses(args=arglist, return_remaining_strings=True)
# The last element holds the unparsed leftover strings; it is discarded here.
model_args, data_args, training_args, generation_args = parsed[:-1]

# Attach the generation settings to the training arguments before flattening,
# so `args.generation_config` is available on the merged namespace as well.
training_args.generation_config = transformers.GenerationConfig(**vars(generation_args))

# One flat namespace with every model/data/training field.
args = argparse.Namespace(
    **vars(model_args),
    **vars(data_args),
    **vars(training_args),
)

In [4]:
# Look up the most recent checkpoint under the output directory and load the
# model/tokenizer pair from it.
# NOTE(review): `completed_training` is not read by any visible cell.
checkpoint_dir, completed_training = get_last_checkpoint(args.output_dir)
model, tokenizer = get_accelerate_model(args, checkpoint_dir)
model.config.use_cache = False
    
print('loaded model')
# The seed is set only after the model has been loaded.
set_seed(args.seed)

Found a previous checkpoint at: /mnt/data/sonia/ckpts/grid2/r64_a16_lr1e-3_wd0/checkpoint-60
loading base model ./mhllama...


  return self.fget.__get__(instance, owner)()


# Train

In [6]:
# Build the data module and pull out the collator and the training split.
data_module = make_data_module(tokenizer=tokenizer, args=args)
collator = data_module['data_collator']
# NOTE(review): `datatr` is not referenced by any other visible cell.
datatr = data_module['train_dataset']
# Give the model the collator's templates (the recorded output below lists them).
model.set_templates(collator.get_templates())

["This person's age is", ' sex is', ' and country is', '. Education level is', ' occupation is', ' and income is', '']


In [46]:
from transformers.integrations import WandbCallback

# Pass every data_module entry to the trainer except the prediction split.
trainer_data_kwargs = {
    name: value for name, value in data_module.items() if name != 'predict_dataset'
}

trainerclass = Seq2SeqTrainer
trainer = trainerclass(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    **trainer_data_kwargs,
)
class evalSampleCallback(transformers.TrainerCallback):
    """Print decoded predictions for the eval split after each evaluation."""

    def on_evaluate(self, args, state, control, model, **kwargs):
        """Predict on the eval dataset and print one decoded string per example.

        Uses the notebook-level `trainer` and `data_module` objects rather than
        the arguments passed in by the Trainer.
        """
        trainer.model.eval()
        metrics = trainer.predict(test_dataset=data_module['eval_dataset'],metric_key_prefix="predict")

        def decode_example(example_logits, example_labels):
            # Labels serve only as a mask: positions equal to IGNORE_INDEX are dropped.
            kept_logits = example_logits[example_labels != IGNORE_INDEX]
            token_ids = np.argmax(kept_logits, axis=1)
            return trainer.tokenizer.decode(
                token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )

        # Decode everything first, then print, so nothing is shown if decoding fails.
        predictions = [
            decode_example(example_logits, metrics.label_ids[idx])
            for idx, example_logits in enumerate(metrics.predictions)
        ]

        for text in predictions:
            print(text)
            
class WandbMetricsCallback(WandbCallback):
    """W&B callback that also logs the model's `to_log` payload on each sub-step."""

    def on_substep_end(self, args, state, control, **kwargs):
        """Log `model.to_log` to Weights & Biases.

        The Trainer passes the model it is training to callbacks as a keyword
        argument, so that one is preferred. The notebook-level `model` is used
        only as a fallback, because a later cell rebinds that name to a
        different model.
        """
        active_model = kwargs.get('model')
        if active_model is None:
            active_model = model
        self._wandb.log(active_model.to_log)
    
    
# Register both callbacks; the classes themselves (not instances) are passed.
trainer.add_callback(evalSampleCallback)
trainer.add_callback(WandbMetricsCallback)

# NOTE(review): `dataloader_config` is not used by any visible cell, and
# `DataLoaderConfiguration` is not among the notebook's explicit imports
# (presumably it arrives via `from qlora import *` -- confirm).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [47]:
# Turn on the model's trace flag before training.
# NOTE(review): presumably this enables the "in forward. input_ids ..." prints
# seen in the training output -- confirm in the model implementation.
trainer.model.set_trace(True)

In [48]:
# Display the model's prompt template token ids alongside its head indices.
model.prompt_template, model.head_inds

(tensor([ 1576, 16500, 26552,     0,     0,     0,     0,     0,     0,     3,
          4520,   263,     0,     0,     0,     0,     0,     0,     3, 24876,
         19263,  1156,     0,     0,     0,     0,     0,     0,     3,  3841,
           297, 13457,   411,   263,     0,     0,     0,     0,     0,     0,
             3, 11619,   988,   896,  1090, 29893,   296,     0,     0,     0,
             0,     0,     0,     3, 28648,   322,   892,  2225, 23059,     0,
             0,     0,     0,     0,     0,     3, 13589,   800, 29889,   512,
           278,  4940,  1629,   896,   750,     0,     0,     0,     0,     0,
             0,     3, 11176, 14703,  5716,  1998,  1169,     0,     0,     0,
             0,     0,     0,     3,   714,  5031,   993,  8167,  1860,   322,
           892, 18973,     0,     0,     0,     0,     0,     0,     3,     2]),
 tensor([0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 3, 3,
         3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 4, 4, 4, 

In [51]:
# Decode one hard-coded sequence of token ids to inspect how a filled-in
# prompt reads; ids 0 and 3 appear as <unk> and <0x00> in the recorded output.
tokenizer.decode([1,  1576, 16500, 26552, 29871, 29906, 29900,   304, 29871, 29945,
             3,  4520,   263,  3077,  1810,   359,   446,  1026,   284,     3,
         24876, 19263,  1156, 29871, 29941,     0,     0,     0,     0,     3,
          3841,   297, 13457,   411,   263,  7463, 26602, 11619,     0,     0,
             0,     3, 11619,   988,   896,  1090, 29893,   296, 29871, 29896,
             0,     0,     0,     0,     3, 28648,   322,   892,  2225, 23059,
         29871, 29896, 29945,     0,     0,     0,     3, 13589,   800, 29889,
           512,   278,  4940,  1629,   896,   750, 29871, 29900,     0,     0,
             0,     0,     3, 11176, 14703,  5716,  1998,  1169, 29871, 29900,
             0,     0,     0,     0,     3,   714,  5031,   993,  8167,  1860,
           322,   892, 18973,   451,  1303, 29885,  4430,     0,     0,     3,
             2])

'<s>The patient aged 20 to 5<0x00> received a Musculoskeletal<0x00> diagnosis after 3<unk><unk><unk><unk><0x00> days in hospital with a internal medicine doctor<unk><unk><unk><0x00> doctor where they underwent 1<unk><unk><unk><unk><0x00> procedures and were prescribed 15<unk><unk><unk><0x00> medications. In the past year they had 0<unk><unk><unk><unk><0x00> emergency room visits 0<unk><unk><unk><unk><0x00> outpatient appointments and were ultimately not readmitted<unk><unk><0x00></s>'

In [50]:
# Start training. The recorded run below was stopped by a KeyboardInterrupt.
trainer.train()

in forward. input_ids tensor([[    1,  1576, 16500, 26552, 29871, 29906, 29900,   304, 29871, 29945,
             3,  4520,   263,  3077,  1810,   359,   446,  1026,   284,     3,
         24876, 19263,  1156, 29871, 29941,     0,     0,     0,     0,     3,
          3841,   297, 13457,   411,   263,  7463, 26602, 11619,     0,     0,
             0,     3, 11619,   988,   896,  1090, 29893,   296, 29871, 29896,
             0,     0,     0,     0,     3, 28648,   322,   892,  2225, 23059,
         29871, 29896, 29945,     0,     0,     0,     3, 13589,   800, 29889,
           512,   278,  4940,  1629,   896,   750, 29871, 29900,     0,     0,
             0,     0,     3, 11176, 14703,  5716,  1998,  1169, 29871, 29900,
             0,     0,     0,     0,     3,   714,  5031,   993,  8167,  1860,
           322,   892, 18973,   451,  1303, 29885,  4430,     0,     0,     3,
             2]], device='cuda:0') label tensor([[    1,  1576, 16500, 26552, 29871, 29906, 29900,   304, 298

KeyboardInterrupt: 

# Generation

In [5]:
# Disabled alternative: load the stock Llama-2 tokenizer instead of the one
# returned alongside the model.
# tokenizer = AutoTokenizer.from_pretrained('/mnt/data/zoo/llama2/llama2-7b-hf/',
#         padding_side="right",
#         use_fast=False, # Fast tokenizer giving issues.
#         )
# Rebuild the data module and hand the collator's templates to the model, so
# the generation section can run without the training cells.
data_module = make_data_module(tokenizer=tokenizer, args=args)
collator = data_module['data_collator']
model.set_templates(collator.get_templates())

["This person's age is", ' sex is', ' and country is', '. Education level is', ' occupation is', ' and income is', '']


In [None]:
# from peft import PeftModel
# model.set_templates(collator.get_templates())
# model = PeftModel.from_pretrained(model, join('/mnt/data/sonia/ckpts/grid2/r64_a16_lr1e-3_wd0/checkpoint-60', 'adapter_model'), is_trainable=True)
# model = model.merge_and_unload()

In [17]:
# Turn the model's trace flag off before generating.
model.set_trace(False)

In [6]:
# Collate a batch of 50 placeholder examples (each just {'length': 0}).
inputs = collator(50*[{'length': 0}])
# Move every collated entry onto the model's device.
inputs = {c:inputs[c].to(model.device) for c in inputs}

In [7]:
# Generate from the collated batch and display the result; the recorded output
# is a pair of tensors (full sequences, then per-column token blocks).
out = model.generate(**inputs)
out



(tensor([[    1,  4013,  2022,  ..., 29900, 29968,  6213],
         [    1,  4013,  2022,  ..., 29900, 29968,  6213],
         [    1,  4013,  2022,  ..., 29900, 29968,  6213],
         ...,
         [    1,  4013,  2022,  ..., 29900, 29968,  6213],
         [    1,  4013,  2022,  ..., 29900, 29968,  6213],
         [    1,  4013,  2022,  ..., 29900, 29968,  6213]], device='cuda:0'),
 tensor([[[29871, 29953, 29945,     0,     0],
          [27208,     0,     0,     0,     0],
          [ 3303,  3900,     0,     0,     0],
          [13620,   295,   943,     0,     0],
          [11080,  8455,   616,     0,  6189],
          [  975, 29871, 29945, 29900, 29968]],
 
         [[29871, 29906, 29929,     0,     0],
          [27208,     0,     0,     0,     0],
          [ 3303,  3900,     0,     0,     0],
          [13620,   295,   943,     0,     0],
          [ 6189,  1015,  8681,   312,     0],
          [  975, 29871, 29945, 29900, 29968]],
 
         [[29871, 29906, 29941,     0,     

In [None]:
# Decode the first element of the generate output back to text.
tokenizer.batch_decode(out[0])

['<s>The patient aged not read<unk><unk> None received a<unk><unk><unk><unk> None diagnosis after<unk><unk><unk><unk> None days in hospital with a<unk><unk><unk><unk> None doctor where they underwent<unk><unk><unk><unk> None procedures and were prescribed<unk><unk><unk><unk> None medications. In the past year they had<unk><unk><unk><unk> None emergency room visits 0<unk><unk> None outpatient appointments and were ultimately<unk><unk><unk><unk> None']

## Cosine distance

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# Load every saved split found under args.dataset, keyed by its directory
# entry name; entries ending in '.json' are skipped.
full_dataset = DatasetDict({
    entry: load_from_disk(os.path.join(args.dataset, entry))
    for entry in os.listdir(args.dataset)
    if not entry.endswith('.json')
})

# Real training rows as a DataFrame, without the 'length' column.
real = full_dataset['train'].to_pandas().drop(columns=['length'])
real.head()

Unnamed: 0,age,sex,native-country,education,occupation,income
0,22,Male,United States,Some-college,Protective serv,under 50K
1,36,Male,United States,HS-grad,Other service,under 50K
2,29,Female,United States,Assoc-voc,Exec managerial,under 50K
3,18,Male,United States,HS-grad,Sales,under 50K
4,67,Male,United States,Some-college,Sales,over 50K


In [81]:
# Generate synthetic rows and snap each generated column to the closest value
# seen in the real data, comparing padded token-id vectors by cosine similarity.
batch_size = 50
num_samples = 225 # real.shape[0]
# Only whole batches are generated, so the result has num_batches * batch_size
# rows, which can exceed num_samples (250 rows for 225 requested).
num_batches = (num_samples + batch_size - 1) // batch_size
inputs = collator(batch_size*[{'length': 0}])

# Tokenize each column's distinct real values once. This does not depend on
# the generated batch, so it is done outside the generation loop.
column_options = []
for col in real.columns:
    options_str = real[col].unique()
    options = tokenizer(options_str.tolist(), add_special_tokens=False, padding='max_length', return_tensors='pt',
                        max_length=args.generation_config.max_column_len, truncation=True)['input_ids']
    column_options.append((options_str, options))

# One list of predicted values per column, filled batch by batch.
preds = [[] for _ in real.columns]
for _batch in range(num_batches):
    _, batch_col_toks = model.generate(**inputs) # batch_size x num_cols x max_column_len

    for i, (options_str, options) in enumerate(column_options):
        # Index of the most similar real value for every generated row.
        nearest = cosine_similarity(batch_col_toks[:, i, :], options).argmax(axis=1)
        preds[i].extend(options_str[nearest])

# Transpose so rows are samples and columns follow real.columns order.
preds = pd.DataFrame(preds).T
preds.head()



prepared_logits_processor []
prepared_stopping_criteria [<transformers.generation.stopping_criteria.MaxLengthCriteria object at 0x7f180e5794d0>]




prepared_logits_processor []
prepared_stopping_criteria [<transformers.generation.stopping_criteria.MaxLengthCriteria object at 0x7f1876610a50>]




prepared_logits_processor []
prepared_stopping_criteria [<transformers.generation.stopping_criteria.MaxLengthCriteria object at 0x7f1876d3bf10>]




prepared_logits_processor []
prepared_stopping_criteria [<transformers.generation.stopping_criteria.MaxLengthCriteria object at 0x7f180f252110>]




prepared_logits_processor []
prepared_stopping_criteria [<transformers.generation.stopping_criteria.MaxLengthCriteria object at 0x7f180e5b5650>]


Unnamed: 0,0,1,2,3,4,5
0,62,Female,United States,5th-6th,Tech support,under 50K
1,41,Female,United States,7th-8th,Protective serv,under 50K
2,55,Female,United States,Bachelors,Transport moving,under 50K
3,47,Female,United States,Some-college,Protective serv,under 50K
4,38,Female,United States,Some-college,Exec managerial,under 50K


In [82]:
# Persist the decoded samples as a Hugging Face dataset. `Dataset` is the name
# the notebook imports explicitly; the `datasets` module itself is never
# imported by name in the import cell.
# NOTE(review): `path` is not assigned in any visible cell -- set it to the
# target directory before running this cell on a fresh kernel.
hp = Dataset.from_pandas(preds)
hp.save_to_disk(path)

(250, 6)

# Normal llama

In [2]:
from transformers import AutoTokenizer, LlamaForCausalLM

# Baseline check with the unmodified Llama-2 checkpoint. This rebinds `model`,
# `tokenizer` and `inputs`, which earlier sections of the notebook also use.
llama_dir = "/mnt/data/zoo/llama2/llama2-7b-hf/"
model = LlamaForCausalLM.from_pretrained(llama_dir)
tokenizer = AutoTokenizer.from_pretrained(llama_dir)

prompt = "Hey, are you conscious? Can you talk to me?"
inputs = tokenizer(prompt, return_tensors="pt")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [4]:
# Generate
# Sample a continuation of the prompt (sampling on, single beam) and show the
# first decoded sequence with special tokens removed.
generate_ids = model.generate(inputs.input_ids, do_sample=True, num_beams=1)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

prepared_logits_processor []
prepared_stopping_criteria [<transformers.generation.stopping_criteria.MaxLengthCriteria object at 0x7f5db05c9ad0>]


'Hey, are you conscious? Can you talk to me? I’m a doctor. I'