In [1]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GPTQConfig,
    TrainingArguments,
    TrainerCallback,
)
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
import os
import tempfile
from huggingface_hub import login
from datetime import datetime as dt
import torch
# -----------------------------------------------------------------------------

## timer
def timer(func):
    def do(x):
        before = dt.now()
        perform = func(x)
        after = dt.now()
        print(after-before)
        return perform
    return do

# %%
token = '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'
login(token = token)

# preprocessing:


def prep_data():
    data = pd.read_csv(r'../ready_data.csv')
    # context had disctionary, only interested in the 'contexts' key

    # create data for asking Qs (aq)
    data_aq = pd.DataFrame(
        {
            "content": data.apply(
                lambda x: "###human: Ask me a questions about:\n"
                + x["full_context"]
                + "\n###Response:\n"
                + x["full_questions"]
                +" 2" +",
                axis=1,
            )
        }
    )
    data_aq = Dataset.from_pandas(data_aq)
    data_aq = data_aq.train_test_split(test_size=0.016)
    return data_aq

@timer
def finetune(data):
    train_data = data["train"]
    test_data = data["test"]

    ##load model and tokenizer
    model_id = "TheBloke/Mistral-7B-v0.1-GPTQ"

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    print(tokenizer.eos_token + 'HUISDHFIUEHUIFEHIUEUIFEHW')
    quantization_config_loading = GPTQConfig(
        bits=4, disable_exllama=True, tokenizer=tokenizer
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id, quantization_config=quantization_config_loading, device_map="auto"
    )

    ##finetune:

    print(model)
    model.config.use_cache = False  # wont store intermediate states
    model.config.pretraining_tp = 1  # to replicate pre-training performance
    model.gradient_checkpointing_enable()  # compramise between forgetting activation states and remmebering them for backpropagation. Trade off computation time for GPU memory
    model = prepare_model_for_kbit_training(model)
    peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "v_proj"],
    )
    model = get_peft_model(model, peft_config)
    name = "ADDING A 2"
    training_arguments = TrainingArguments(
        output_dir=name,
        per_device_train_batch_size=8, #5 works
        gradient_accumulation_steps=1,
        optim="paged_adamw_32bit",
        learning_rate=3e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",  # so do we need the whole PeftSavingCallback function? maybe try withput and run the trainer(from last check=true)
        logging_steps=100,
        num_train_epochs=1,
        #max_steps=250,
        fp16=True,
        #push_to_hub=True,
        report_to=["tensorboard"],
    )

    # if checkpint doesnt work, might have to do save_steps=20 for example in the training argumnet above

    ############## checkpoint ##############################
    class PeftSavingCallback(TrainerCallback):
        def on_save(self, args, state, control, **kwargs):
            checkpoint_path = os.path.join(
                args.output_dir, f"checkpoint-{state.global_step}"
            )
            kwargs["model"].save_pretrained(checkpoint_path)

            if "pytorch_model.bin" in os.listdir(checkpoint_path):
                os.remove(os.path.join(checkpoint_path, "pytorch_model.bin"))

    callbacks = [PeftSavingCallback()]

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=test_data,
        peft_config=peft_config,
        dataset_text_field="content",
        args=training_arguments,
        tokenizer=tokenizer,
        callbacks=callbacks, #try if doesnt work hashing all of checkpiint stuff above and also this callback line
        packing=False,
    )

    ###################################################

    ########## set up tensorboard #####################
    #tmpdir = tempfile.TemporaryDirectory()
    #local_training_root = tmpdir.name

    #loc_checkpoint_path = os.path.join(local_training_root, name)
    #tensorboard_display_dir = f"{loc_checkpoint_path}/runs"

    #%load_ext tensorboard
    #%tensorboard --logdir f'{tensorboard_display_dir}'

    #########################################################

    #trainer.train(resume_from_checkpoint=True) #use if want to go from checkpoint
    trainer.train()
    #trainer.state.log_history()
    #trainer.save_model()
    #trainer.push_to_hub() #un hash when want to send final model to hub
    return trainer


# resume_from_checkpoint (str or bool, optional) — If a str, local path to a saved checkpoint as saved by a previous instance of Trainer. If a bool and equals True, load the last checkpoint in args.output_dir as saved by a previous instance of Trainer. If present, training will resume from the model/optimizer/scheduler states loaded here.







Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/seatond/.cache/huggingface/token
Login successful


In [2]:

# %% --------------------------------------------------------------------------
# RUN!
if __name__ == '__main__':
    trainer_obj = finetune(prep_data())



Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


</s>HUISDHFIUEHUIFEHIUEUIFEHW


You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. use_exllama, exllama_config, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (rotary_emb): MistralRotaryEmbedding()
          (k_proj): QuantLinear()
          (o_proj): QuantLinear()
          (q_proj): QuantLinear()
          (v_proj): QuantLinear()
        )
        (mlp): MistralMLP(
          (act_fn): SiLU()
          (down_proj): QuantLinear()
          (gate_proj): QuantLinear()
          (up_proj): QuantLinear()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
)




Map:   0%|          | 0/490 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


0:24:07.256782


In [4]:
model_id = "TheBloke/Mistral-7B-v0.1-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.eos_token)

</s>


In [5]:
#for inference
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, GenerationConfig, GPTQConfig
import torch

#model_id_inf = "seatond/mist_question_asking_5e4LR_2epoch" #2 epoch trained model on non enriched, non multi Q data
model_id_inf = "seatond/multiqaskingmistral"

tokenizer = AutoTokenizer.from_pretrained(model_id_inf, use_fast=True)

model = AutoPeftModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path = model_id_inf,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda:0",
    token='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'
    )
print(tokenizer.eos_token)

Downloading tokenizer_config.json:   0%|          | 0.00/965 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

Downloading adapter_config.json:   0%|          | 0.00/497 [00:00<?, ?B/s]



Downloading (…)er_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

</s>


In [10]:
context = "Halofantrine is a newly developed antimalarial drug used for the treatment of Plasmodium falciparum malaria. The introduction of this drug has been delayed because of its possible side effects, and due to insufficient studies on adverse reactions in humans. There have been no studies investigating its effect on hearing.", "Thirty guinea pigs were divided into three groups: a control group, a halofantrine therapeutic dose group and a halofantrine double therapeutic dose group. One cochlea specimen from each animal was stained with haematoxylin and eosin and the other with toluidine blue.", "No changes were detected in the control group. The halofantrine therapeutic dose group showed loss and distortion of inner hair cells and inner phalangeal cells, and loss of spiral ganglia cells. In the halofantrine double therapeutic dose group, the inner and outer hair cells were distorted and there was loss of spiral ganglia cells."
prompt_template = f'###human: Ask me question about:\n{context}\n###Response:\n'

input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.to('cuda:0')

quantization_config_loading = GPTQConfig(bits=4)

generation_config = GenerationConfig(
    do_sample=True,
    top_k=1,
    temperature=0.5,
    max_new_tokens=30,
    eos_token_id = tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

outputs = model.generate(inputs=input_ids, generation_config=generation_config)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


###human: Ask me question about:
('Halofantrine is a newly developed antimalarial drug used for the treatment of Plasmodium falciparum malaria. The introduction of this drug has been delayed because of its possible side effects, and due to insufficient studies on adverse reactions in humans. There have been no studies investigating its effect on hearing.', 'Thirty guinea pigs were divided into three groups: a control group, a halofantrine therapeutic dose group and a halofantrine double therapeutic dose group. One cochlea specimen from each animal was stained with haematoxylin and eosin and the other with toluidine blue.', 'No changes were detected in the control group. The halofantrine therapeutic dose group showed loss and distortion of inner hair cells and inner phalangeal cells, and loss of spiral ganglia cells. In the halofantrine double therapeutic dose group, the inner and outer hair cells were distorted and there was loss of spiral ganglia cells.')
###Response:
Is halofantrine 

In [11]:
print(tokenizer.eos_token)
print(9)

</s>
9
