## ! pip install datasets
! pip install -U evaluate
! pip install -U bitsandbytes
! pip install wandb
! pip install -U transformers
! pip install -U huggingface_hub
# ! export _HF_DEFAULT_ENDPOINT=https://hf-mirror.com
! pip install peft==0.12.0
! pip install SentencePiece
! pip install adam-mini

In [2]:
import os
os.environ["HF_ENDPOINT"]="https://hf-mirror.com"
import sys
import logging
import datasets
import evaluate
import bitsandbytes
import pandas as pd
import numpy as np

import wandb
os.environ['WANDB_API_KEY'] = "a464ce6c3b972e3e7090ac20839b9a1daac1b608"

from adam_mini import Adam_mini

from transformers import AutoModelForSequenceClassification, DebertaV2Tokenizer, DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.model_selection import train_test_split

from transformers import BitsAndBytesConfig


train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)



if __name__ == '__main__':
#     os.environ["WANDB_PROJECT"] = "deberta_lora_adam-mini"
    os.environ["WANDB_PROJECT"] = "deberta_lora_base_adam-mini"
    wandb.init()
    
    train, val = train_test_split(train, test_size=.2)

    train_dict = {'label': train["sentiment"], 'text': train['review']}
    val_dict = {'label': val["sentiment"], 'text': val['review']}
    test_dict = {"text": test['review']}

    train_dataset = datasets.Dataset.from_dict(train_dict)
    val_dataset = datasets.Dataset.from_dict(val_dict)
    test_dataset = datasets.Dataset.from_dict(test_dict)

    # batch_size = 32

    model_id = "microsoft/deberta-v3-base"

    tokenizer = DebertaV2Tokenizer.from_pretrained(model_id)
    


    def preprocess_function(examples):
        return tokenizer(examples['text'], truncation=True,padding=True,max_length=510)
    # 尝试加入max_length


    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_val = val_dataset.map(preprocess_function, batched=True)
    tokenized_test = test_dataset.map(preprocess_function, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_id,
#         quantization_config = BitsAndBytesConfig(load_in_4bit=True),
        # load_in_8bit = True,
        # device_map="auto",
        # load_in_8bit=True
    )
    
    # Define LoRA Config
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        # target_modules=['q_proj', 'v_proj'],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.SEQ_CLS
    )

    # prepare int-8 model for training
    # model = prepare_model_for_int8_training(model)

    # add LoRA adaptor
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    

  from .autonotebook import tqdm as notebook_tqdm
Unable to read the token file at /var/run/secrets/kubernetes.io/serviceaccount/token due to permission error ([Errno 13] Permission denied: '/var/run/secrets/kubernetes.io/serviceaccount/token').The current user id is 1000. Consider changing the securityContext to run the container as the current user.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mskyfurynowonline[0m ([33mskyfurynowonline-yunnan-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Map: 100%|██████████| 20000/20000 [00:22<00:00, 902.47 examples/s]
Map: 100%|██████████| 5000/5000 [00:05<00:00, 892.99 examples/s]
Map: 100%|██████████| 25000/25000 [00:27<00:00, 912.53 examples/s]
  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 591,362 || all params: 185,015,044 || trainable%: 0.3196


In [None]:
    metric = evaluate.load("accuracy")
    
    optimizer = Adam_mini(
    named_parameters=model.named_parameters(),
    lr=2e-5,
    betas=(0.9, 0.999),
    weight_decay=0.01,
    # 其他参数
    )


    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return  metric.compute(predictions=predictions, references=labels)


    training_args = TrainingArguments(
        output_dir='./checkpoint',  # output directory
#         report_to="wandb",
        num_train_epochs=3,  # total number of training epochs
        per_device_train_batch_size=2,  # batch size per device during training
        per_device_eval_batch_size=4,  # batch size for evaluation
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        logging_steps=100,
        save_strategy="no",
        evaluation_strategy="steps",
#         fp16=True, # 开启混合精度 
#         gradient_accumulation_steps=4,# 累积 4 个小批次
    )

    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=tokenized_train,  # training dataset
        eval_dataset=tokenized_val,  # evaluation dataset
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        optimizers = (optimizer,None),
    )

    trainer.train()
    
    prediction_outputs = trainer.predict(tokenized_test)
    test_pred = np.argmax(prediction_outputs[0], axis=-1).flatten()


    result_output = pd.DataFrame(data={"id": test["id"], "sentiment": test_pred})
    if not os.path.exists("./result"):
        os.mkdir("./result")
    result_output.to_csv("./result/deberta_lora_base_adam-mini.csv", index=False, quoting=3)
    wandb.finish()

Downloading builder script: 4.20kB [00:00, 5.41MB/s]                   
  trainer = Trainer(


Adam-mini found the param block with name: base_model.model.deberta.encoder.layer.0.attention.self.query_proj.lora_A.default.weight torch.Size([16, 768])
Adam-mini found the param block with name: base_model.model.deberta.encoder.layer.0.attention.self.query_proj.lora_B.default.weight torch.Size([768, 16])
Adam-mini found the param block with name: base_model.model.deberta.encoder.layer.0.attention.self.value_proj.lora_A.default.weight torch.Size([16, 768])
Adam-mini found the param block with name: base_model.model.deberta.encoder.layer.0.attention.self.value_proj.lora_B.default.weight torch.Size([768, 16])
Adam-mini found the param block with name: base_model.model.deberta.encoder.layer.1.attention.self.query_proj.lora_A.default.weight torch.Size([16, 768])
Adam-mini found the param block with name: base_model.model.deberta.encoder.layer.1.attention.self.query_proj.lora_B.default.weight torch.Size([768, 16])
Adam-mini found the param block with name: base_model.model.deberta.encoder.



Adam-mini found 0 embedding layers, 0 output layers; 0 Querys and Keys;  0 Values;  0 attn_proj;  0 MLPs;


Step,Training Loss,Validation Loss,Accuracy
100,0.6967,0.694016,0.4988
200,0.6918,0.693748,0.4988
300,0.6991,0.6933,0.4988
400,0.692,0.693,0.5004
500,0.694,0.692672,0.501
600,0.6968,0.692052,0.5038
700,0.6948,0.691427,0.5932
800,0.6934,0.687974,0.5152
