# 417 try 3, try to increase dataset from 1300 to ~2000 by increase threshold from 0.65 to 0.6 
# early stopping 5
# llama2 & qlora
# r=64
# alpha=16
# learning_rate=1e-4
# threshold = 0.6
# question: title +selftext
# answer:falcon_summary

# Install packages

In [1]:
 !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 wandb

[0m

# Authenticate with wandb, gdrive

In [2]:
import wandb

# Import packages

In [3]:
# Import necessary packages for the fine-tuning process
import os                          # Operating system functionalities
import torch                       # PyTorch library for deep learning
from datasets import load_dataset,DatasetDict,Dataset  # Loading datasets for training
from transformers import (
    AutoModelForCausalLM,          # AutoModel for language modeling tasks
    AutoTokenizer,                # AutoTokenizer for tokenization
    BitsAndBytesConfig,           # Configuration for BitsAndBytes
    HfArgumentParser,             # Argument parser for Hugging Face models
    TrainingArguments,            # Training arguments for model training
    pipeline,                     # Creating pipelines for model inference
    logging,                      # Logging information during training
)
from peft import LoraConfig, PeftModel  # Packages for parameter-efficient fine-tuning (PEFT)
from trl import SFTTrainer         # SFTTrainer for supervised fine-tuning
from datetime import datetime
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
!huggingface-cli whoami

chriztopherton


Input huggingface token, no need to Add token as git credential? n

# Initialize model and dataset by name

In [5]:
# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-hf"

#model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# The instruction dataset to use
#dataset_name = "chriztopherton/Reddit_RAFT"

# Fine-tuned model name
new_model = "llama-2-7b_qlora_falcon_417_try_3"

#new_model = "mistral-2-7b-reddit_qlora_falcon"

# Define LLM model parameters

In [6]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64 # used to be 64


# Alpha parameter for LoRA scaling
lora_alpha = 16        # used to be 16 #let's try r256,alpha128 next

# Dropout probability for LoRA layers
lora_dropout = 0.1

In [7]:
################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [8]:
################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
#original code: output_dir = "./chris_ft/ft_v2"
output_dir = "./chris_ft_417/llama2_ft_417_try_3"

In [9]:
################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

# Load dataset and train/val/test split

# skip this step for now

In [10]:
data = pd.read_csv("reddit_dot_scores_quality.csv",index_col="Unnamed: 0")
filtered_data=data.loc[(data['title_query_falcon_dot'])>0.6]   #0.6
dataset = Dataset.from_pandas(filtered_data)


In [11]:
# Step 1 : Load dataset (you can process it here)
#dataset = load_dataset(dataset_name, split="train")

In [12]:
train_test_dataset = dataset.train_test_split(test_size=0.1)
# Split the 10% test + valid in half test, half valid
test_valid = train_test_dataset['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_test_dataset['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

In [13]:
train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'selftext', 'comments', 'falcon_summary', 'title_comments_dot', 'questions_comments_dot', 'title_query_comments_dot', 'title_query_falcon_dot', 'falcon_title_dot', 'falcon_questions_dot', 'falcon_comments_dot', '__index_level_0__'],
        num_rows: 2214
    })
    test: Dataset({
        features: ['title', 'selftext', 'comments', 'falcon_summary', 'title_comments_dot', 'questions_comments_dot', 'title_query_comments_dot', 'title_query_falcon_dot', 'falcon_title_dot', 'falcon_questions_dot', 'falcon_comments_dot', '__index_level_0__'],
        num_rows: 124
    })
    valid: Dataset({
        features: ['title', 'selftext', 'comments', 'falcon_summary', 'title_comments_dot', 'questions_comments_dot', 'title_query_comments_dot', 'title_query_falcon_dot', 'falcon_title_dot', 'falcon_questions_dot', 'falcon_comments_dot', '__index_level_0__'],
        num_rows: 123
    })
})

In [14]:
path_saved="./417_datasplit_0.6"
import pickle
with open(path_saved+"train_test_dataset.pkl","wb") as f:
    pickle.dump(train_test_dataset,f)

with open(path_saved+"test_valid_dataset.pkl","wb") as f:
    pickle.dump(test_valid,f)

with open(path_saved+"train_test_valid_dataset.pkl","wb") as f:
    pickle.dump(train_test_valid_dataset,f)


# start from here NO!!

In [9]:
pwd

'/app'

In [17]:
cd ..

/


In [None]:
import pickle
with open(path_saved+"train_test_dataset.pkl","wb") as f:
    train_test_dataset = pickle.load(f)

with open(path_saved+"test_valid_dataset.pkl","wb") as f:
    test_valid_dataset = pickle.load(f)

with open(path_saved+"train_test_valid_dataset.pkl","wb") as f:
    train_test_valid_dataset = pickle.load(f)


## Continue from here

In [15]:
train_test_valid_dataset['train'][0]

{'title': 'Restaurant service Netherlands',
 'selftext': 'Recently encountered this on a restaurant menu in the Netherlands. Is this normal?',
 'comments': 'Aside from everything else, thats also not... how you spell Tikkie? Right?, Im unreasonably annoyed they misspelled Tikkie, Oh, youre allergic to peanuts. Then Id advice you not to eat peanuts., deleted, Our advice is stop being allergic, You cannot GO DUTCH in Netherlands?!, I know exactly what place it is. Its in maastricht. Tap water, half a whiskey glass for cent. Water in the toilet tap tightened up so much it barely trickles so you cant drink it either. Theyre crazy., Yes, its normal not to bring your drinks into a restaurant The reason they dont want to split the bill might be time shortage. Splitting the bill can last for a long time, for a restaurant its much more easier to charge one person with the bill. Its not very common, but it happends Its pretty common to let the restaurant know whether you have any alergies The re

# we run everything on comments up until 4/16 
def transform_data(example):
    ques = example['selftext']
    ans = example['comments']
    return {
        "text": f"<s>[INST] {ques} [/INST] {ans} </s>"
    }

transformed_dataset = train_test_valid_dataset.map(transform_data)
transformed_dataset

In [15]:
def transform_data(example):
    ques = example['title']+ " " + example['selftext']
    ans = example['falcon_summary']
    return {
        "text": f"<s>[INST] {ques} [/INST] {ans} </s>"
    }

transformed_dataset = train_test_valid_dataset.map(transform_data)
transformed_dataset

Map: 100%|██████████| 2214/2214 [00:00<00:00, 12333.63 examples/s]
Map: 100%|██████████| 124/124 [00:00<00:00, 10030.93 examples/s]
Map: 100%|██████████| 123/123 [00:00<00:00, 9842.03 examples/s]


DatasetDict({
    train: Dataset({
        features: ['title', 'selftext', 'comments', 'falcon_summary', 'title_comments_dot', 'questions_comments_dot', 'title_query_comments_dot', 'title_query_falcon_dot', 'falcon_title_dot', 'falcon_questions_dot', 'falcon_comments_dot', '__index_level_0__', 'text'],
        num_rows: 2214
    })
    test: Dataset({
        features: ['title', 'selftext', 'comments', 'falcon_summary', 'title_comments_dot', 'questions_comments_dot', 'title_query_comments_dot', 'title_query_falcon_dot', 'falcon_title_dot', 'falcon_questions_dot', 'falcon_comments_dot', '__index_level_0__', 'text'],
        num_rows: 124
    })
    valid: Dataset({
        features: ['title', 'selftext', 'comments', 'falcon_summary', 'title_comments_dot', 'questions_comments_dot', 'title_query_comments_dot', 'title_query_falcon_dot', 'falcon_title_dot', 'falcon_questions_dot', 'falcon_comments_dot', '__index_level_0__', 'text'],
        num_rows: 123
    })
})

In [16]:
transformed_dataset['train'][0]

{'title': 'First timers in UK and France',
 'selftext': 'Planning a week trip in the UK. Were landing in London but flying back out of Paris. Any ideas on how to hit the UK and France in a days. Must see destinations will be, Big Ben, Buckingham Palace, Westmister Abbey, castles in Ireland a day or two in Ireland, the Louvre and the Eiffel Tower. Any tip and advice is welcome. Thanks',
 'comments': 'Hey sounds like a nice trip idea but not sure you can put Ireland into the box. Or you should fly in Ireland, then London and fly back from France but in a week seems hard to me. I am French and i know London and you need at leats just in each of those cities. If you count the time flying in and out of Ireland lets face it you easily loose hald day at the airport you will just have day in Ireland. So if you dont mind being in a rush. Or skip Ireland and explore somewhere else near London or the Mont SaintMichel in France hours from Paris., Skip Ireland. Stick to London and Paris, youll have

# Define training procedures

In [17]:
# Step 2 :Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [18]:
# Step 3 :Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

Your GPU supports bfloat16: accelerate training with bf16=True


In [19]:
# Step 4 :Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.54s/it]


In [20]:
# Step 5 :Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [21]:
# Step 6 :Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

In [22]:
project = "qlora_4-17_204pm"
base_model_name = "llama2"
run_name = base_model_name + "-" + project

# Number of training epochs
num_train_epochs = 30

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 2

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 1e-4
#learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

evaluation_strategy = "steps"

eval_steps = 25

# Save checkpoint every X updates steps
save_steps = 50 

# Log every X updates steps
logging_steps = 25

## eval steps, save steps and logging steps
## original 25,50,25
## try 50,100,50

In [23]:
# Step 7 :Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    evaluation_strategy = evaluation_strategy,
    eval_steps=eval_steps,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    report_to="wandb",
    #report_to="tensorboard",
    #run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
)

In [25]:
#%load_ext tensorboard

# old code without early stopping

# Step 8 :Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=transformed_dataset['train'],
    eval_dataset=transformed_dataset['valid'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# new code to early stop

In [24]:
from transformers import EarlyStoppingCallback
# Step 8 :Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=transformed_dataset['train'],
    eval_dataset=transformed_dataset['valid'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)] # this one is our first 5, previously 3
)

Map: 100%|██████████| 2214/2214 [00:00<00:00, 5326.03 examples/s]
Map: 100%|██████████| 123/123 [00:00<00:00, 5566.64 examples/s]


# Train

In [25]:
# Step 9 :Train model
trainer.train()

# Step 10 :Save trained model
trainer.model.save_pretrained(new_model)

[34m[1mwandb[0m: Currently logged in as: [33mchristopher-ton[0m ([33m298bwanderchat[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
25,2.7912,2.909741
50,3.0822,2.760384
75,2.5736,2.626777
100,2.604,2.59778
125,2.4833,2.546474
150,2.5647,2.545926
175,2.4667,2.514552
200,2.5734,2.516939
225,2.411,2.49438
250,2.475,2.500943




In [None]:
# %load_ext tensorboard
# %tensorboard --logdir results/runs

In [None]:
torch.cuda.empty_cache()
import gc
#del variables
gc.collect()

8972

In [None]:
torch.cuda.memory_summary(device=None, abbreviated=False)




In [None]:
!nvidia-smi



Wed Apr  3 05:04:50 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0              34W /  70W |  15005MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 250.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 203.06 MiB is free. Process 6694 has 14.55 GiB memory in use. Of the allocated memory 14.27 GiB is allocated by PyTorch, and 147.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
model.push_to_hub(new_model)
tokenizer.push_to_hub(new_model)