In [6]:
import torch
from datasets import load_dataset
from peft import LoraConfig , get_peft_model , prepare_model_for_int8_training
from transformers import AutoModelForCausalLM ,AutoTokenizer ,TrainingArguments
from trl import SFTTrainer

In [10]:
def train():
    """Fine-tune XGen-7B (8k, base) on the Alpaca dataset with LoRA and push the result.

    Steps:
      1. Load the ``tatsu-lab/alpaca`` train split and the XGen tokenizer.
      2. Load the base model quantized to 4 bit and prepare it for training.
      3. Wrap the model with LoRA adapters.
      4. Run supervised fine-tuning with TRL's ``SFTTrainer`` on the ``text`` column.
      5. Push the trained artifacts to the Hugging Face Hub.

    Takes no arguments and returns ``None``. Requires network access, and
    credentials for the Hub because ``push_to_hub`` is enabled.
    """
    model_name = "Salesforce/xgen-7b-8k-base"

    train_dataset = load_dataset("tatsu-lab/alpaca", split="train")

    # The XGen tokenizer ships as custom code on the Hub, hence trust_remote_code.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # Reuse EOS as the padding token so batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    # Fixed: the method is `resize_token_embeddings`; the original misspelling
    # raised AttributeError before training could start.
    model.resize_token_embeddings(len(tokenizer))
    # NOTE(review): the model is loaded in 4 bit; newer peft releases provide
    # `prepare_model_for_kbit_training` for this case -- confirm against the
    # installed peft version before switching.
    model = prepare_model_for_int8_training(model)

    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, peft_config)

    training_args = TrainingArguments(
        output_dir="xgen_lora_train",
        # Fixed: no eval dataset is passed to the trainer, so the original
        # `per_device_eval_batch_size` had no effect; the training batch size
        # is what this run actually uses.
        per_device_train_batch_size=4,
        optim="adamw_torch",
        logging_steps=100,
        learning_rate=2e-4,
        fp16=True,
        warmup_ratio=0.1,
        lr_scheduler_type="linear",
        num_train_epochs=1,
        save_strategy="epoch",
        push_to_hub=True,
        # Fixed: `peft_config` is not a TrainingArguments parameter (TypeError).
        # The model is already wrapped by get_peft_model above, so it is not
        # needed here.
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        # Fixed: keyword was misspelled as `dataset_text_feild` (TypeError).
        dataset_text_field="text",
        max_seq_length=1024,
        tokenizer=tokenizer,
        args=training_args,
        packing=True,
    )
    trainer.train()
    trainer.push_to_hub()



In [11]:
# Run the full job: train() loads the dataset and model, trains, and pushes to the Hub.
train()

Downloading readme:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

Downloading and preparing dataset parquet/tatsu-lab--alpaca to C:/Users/sackel/.cache/huggingface/datasets/tatsu-lab___parquet/tatsu-lab--alpaca-2b32f0433506ef5f/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to C:/Users/sackel/.cache/huggingface/datasets/tatsu-lab___parquet/tatsu-lab--alpaca-2b32f0433506ef5f/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


Downloading (…)okenizer_config.json:   0%|          | 0.00/329 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)tokenization_xgen.py:   0%|          | 0.00/8.41k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Salesforce/xgen-7b-8k-base:
- tokenization_xgen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


Downloading (…)lve/main/config.json:   0%|          | 0.00/510 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00003.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00003.bin:   0%|          | 0.00/9.96G [00:00<?, ?B/s]