In [1]:
!nvidia-smi

Thu Sep 26 10:47:23 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.08             Driver Version: 535.161.08   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000000:87:00.0 Off |                    0 |
| N/A   32C    P0              79W / 400W |      3MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-80GB          On  | 00000000:B7:00.0 Off |  

In [2]:
# import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [8]:
import torch

print("Cuda available: ", torch.cuda.is_available())
print("Number of GPUs: ", torch.cuda.device_count())

Cuda available:  True
Number of GPUs:  2


In [1]:
import gc
import json
import pandas as pd
# import numpy as np

from tqdm import tqdm
from random import sample 
from random import randint
from datasets import load_dataset
from transformers import Trainer
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling

In [2]:
dir_root = "Data/"

## Prepair Train

In [3]:
def generate_prompt(data_point):
    # taken from https://github.com/tloen/alpaca-lora
    if data_point["instruction"]:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
        
### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Response:
{data_point["output"]}"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Response:
{data_point["output"]}"""

In [4]:
def generate_prompt_context(data_point):
    return f"""[INST]Below is an instruction that describes a task! Write a response that appropriately completes the request. To answer the instruction, use the context if available!
### Instruction: 
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Context: 
{data_point["context"]}
[/INST]

### Response:
{data_point["output"]}
"""

In [5]:
def generate_prompt_llama2(data_point):
    return f"""[INST]Below is an instruction that describes a task! Write a response that appropriately completes the request. To answer the instruction, use the context if available!
### Instruction: 
{data_point["instruction"]}

### Input:
{data_point["input"]}
[/INST]

### Response:
{data_point["output"]}
"""

In [6]:
def train(model, version, data_files, gp):
    try:
        output_dir = f"Models/{version}/"
        
        tokenizer = AutoTokenizer.from_pretrained(model)
        tokenizer.pad_token_id = tokenizer.eos_token_id
        print("Loaded: TOKENIZER")
    
        data = load_dataset("json", data_files=data_files)
        # data = data.map(lambda data_point: {"prompt": tokenizer(gp(data_point))})    
        data = data.shuffle().map(
            lambda data_point: tokenizer(
                gp(data_point),
                truncation=True,
                max_length=256,
                padding="max_length",
            )
        )
        print("Loaded: DATA")
        
        model = AutoModelForCausalLM.from_pretrained(model, device_map="auto")
        print("Loaded: MODEL")

        # Enable gradient checkpointing to reduce memory usage
        model.gradient_checkpointing_enable()
        
        train_arguments = TrainingArguments(
            per_device_train_batch_size=4,
            gradient_accumulation_steps=16,
            warmup_steps=2,
            num_train_epochs=5, # paper uses 3
            learning_rate=2e-4,
            logging_steps=1,
            output_dir=output_dir,
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=10,
            fp16=True)
    
        trainer = Trainer(
            model=model,
            train_dataset=data["train"],
            eval_dataset=data["test"],
            args=train_arguments,
            data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
        )
        print("Loaded: TRAINER")
    
        
        model.config.use_cache = False
        trainer.train(resume_from_checkpoint=False)
    
        model.save_pretrained(f"{output_dir}endpoint")
        print("saved: MODEL")
        
        log_history = pd.DataFrame(trainer.state.log_history)
        log_history.to_csv(f"{dir_root}log_train_history_{version}.csv")
        print("saved: HISTORY")
    finally:
        model = None
        trainer = None
        train_arguments = None
        tokenizer = None
        data = None
        gc.collect()
        torch.cuda.empty_cache()

In [25]:
from IPython import get_ipython
get_ipython().kernel.do_shutdown(restart=True)

{'status': 'ok', 'restart': True}

## GPJ-B

In [9]:
train(model = "EleutherAI/gpt-j-6b",
      version = "gptj-6b-v20240926",
      data_files = {"train" : f"{dir_root}qaa_train.json", "test" : f"{dir_root}qaa_val.json"},
      gp = generate_prompt)



Loaded: TOKENIZER


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10247 [00:00<?, ? examples/s]

Map:   0%|          | 0/2268 [00:00<?, ? examples/s]

Loaded: DATA


Some weights of the model checkpoint at EleutherAI/gpt-j-6b were not used when initializing GPTJForCausalLM: ['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.12.attn.bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.13.attn.bias', 'transformer.h.13.attn.masked_bias', 'transformer.h.14.attn.bias', 'transformer.h.14.attn.masked_bias', 'transformer.h.15.attn.bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.16.attn.bias', 'transformer.h.16.attn.masked_bias', 'transformer.h.17.attn.bias', 'transformer.h.17.attn.masked_bias', 'transformer.h.18.attn.bias', 'transformer.h.18.attn.masked_bias', 'transformer.h.19.attn.bias', 'transformer.h.19.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.20.attn.bi

Loaded: MODEL
Loaded: TRAINER


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
0,1.1562,1.026613
1,0.7324,0.948013
2,0.2731,1.142371
3,0.1081,1.300829
4,0.0615,1.385511


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


saved: MODEL
saved: HISTORY


## GPJ-B with cotext

In [10]:
train(model = "EleutherAI/gpt-j-6B",
      version = "gptj-6b-context-v20231216",
      data_files = {"train" : f"{dir_root}qaa_train_context.json", "test" : f"{dir_root}qaa_val_context.json"},
      gp = generate_prompt_context)

tokenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]



Loaded: TOKENIZER


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/20494 [00:00<?, ? examples/s]

Map:   0%|          | 0/4536 [00:00<?, ? examples/s]

Loaded: DATA


config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

Some weights of the model checkpoint at EleutherAI/gpt-j-6B were not used when initializing GPTJForCausalLM: ['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.12.attn.bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.13.attn.bias', 'transformer.h.13.attn.masked_bias', 'transformer.h.14.attn.bias', 'transformer.h.14.attn.masked_bias', 'transformer.h.15.attn.bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.16.attn.bias', 'transformer.h.16.attn.masked_bias', 'transformer.h.17.attn.bias', 'transformer.h.17.attn.masked_bias', 'transformer.h.18.attn.bias', 'transformer.h.18.attn.masked_bias', 'transformer.h.19.attn.bias', 'transformer.h.19.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.20.attn.bi

Loaded: MODEL
Loaded: TRAINER


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
0,0.4295,0.759829
1,0.2679,0.7052
2,0.1011,0.81973
4,0.0518,0.94031


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


saved: MODEL
saved: HISTORY


## opt-6.7b

In [11]:
train(model = "facebook/opt-6.7b",
      version = "opt-6.7b-v20240926",
      data_files = {"train" : f"{dir_root}qaa_train.json", "test" : f"{dir_root}qaa_val.json"},
      gp = generate_prompt)

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]



Loaded: TOKENIZER


Map:   0%|          | 0/10247 [00:00<?, ? examples/s]

Map:   0%|          | 0/2268 [00:00<?, ? examples/s]

Loaded: DATA


pytorch_model.bin.index.json:   0%|          | 0.00/41.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.36G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Loaded: MODEL
Loaded: TRAINER


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
0,1.1082,1.18015
1,0.8731,1.002593
2,0.455,1.024832
3,0.1724,1.096023
4,0.0787,1.191012


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


saved: MODEL
saved: HISTORY


## opt-6.7b with context

In [None]:
train(model = "facebook/opt-6.7b",
      version = "opt-6.7b-context-v20240927",
      data_files = {"train" : f"{dir_root}qaa_train_context.json", "test" : f"{dir_root}qaa_val_context.json"},
      gp = generate_prompt_context)



Loaded: TOKENIZER


Map:   0%|          | 0/20494 [00:00<?, ? examples/s]

Map:   0%|          | 0/4536 [00:00<?, ? examples/s]

Loaded: DATA


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded: MODEL
Loaded: TRAINER


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
0,0.6015,0.850359
1,0.3638,0.713798
2,0.1194,0.745095
4,0.0731,0.784782


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


## Llama-7b

In [13]:
train(model = "huggyllama/llama-7b",
      version = "llama-7b-v20240927",
      data_files = {"train" : f"{dir_root}qaa_train.json", "test" : f"{dir_root}qaa_val.json"},
      gp = generate_prompt)

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Loaded: MODEL
Loaded: TRAINER


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
0,1.0003,1.032789


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [14]:
train(model = "huggyllama/llama-7b",
      version = "llama-7b-context-v20240927",
      data_files = {"train" : f"{dir_root}qaa_train_context.json", "test" : f"{dir_root}qaa_val_context.json"},
      gp = generate_prompt_context)

Loaded: TOKENIZER


Map:   0%|          | 0/20494 [00:00<?, ? examples/s]

Map:   0%|          | 0/4536 [00:00<?, ? examples/s]

Loaded: DATA


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded: MODEL


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Loaded: TRAINER


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
0,0.3353,0.589571
1,0.2165,0.551165
2,0.0894,0.632585
4,0.0505,0.759167


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


saved: MODEL
saved: HISTORY


## Llama-2-7b-hf

In [15]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
train(model = "meta-llama/Llama-2-7b-hf",
      version = "llama-2-7b-hf-v20241001",
      data_files = {"train" : f"{dir_root}qaa_train.json", "test" : f"{dir_root}qaa_val.json"},
      gp = generate_prompt_llama2)

Loaded: TOKENIZER


Map:   0%|          | 0/10247 [00:00<?, ? examples/s]

Map:   0%|          | 0/2268 [00:00<?, ? examples/s]

Loaded: DATA


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded: MODEL
Loaded: TRAINER


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
0,0.8349,0.820101
1,0.4976,0.782055
2,0.2037,0.930519
3,0.0817,1.061301
4,0.0516,1.182239


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


saved: MODEL
saved: HISTORY


NameError: name 'torch' is not defined

## Llama-2-7b-hf with context

In [22]:
train(model = "meta-llama/Llama-2-7b-hf",
      version = "llama-2-7b-hf-context-v20240927",
      data_files = {"train" : f"{dir_root}qaa_train_context.json", "test" : f"{dir_root}qaa_val_context.json"},
      gp = generate_prompt_context)

Loaded: TOKENIZER


Map:   0%|          | 0/20494 [00:00<?, ? examples/s]

Map:   0%|          | 0/4536 [00:00<?, ? examples/s]

Loaded: DATA


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded: MODEL
Loaded: TRAINER


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
0,0.3483,0.608959
1,0.2237,0.563524
2,0.0912,0.640563
4,0.0498,0.776762


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


saved: MODEL
saved: HISTORY
