In [1]:
import subprocess

def run_nvidia_smi():
    try:
        # Run the nvidia-smi command
        result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        # Check if the command was successful
        if result.returncode == 0:
            print("nvidia-smi output:\n")
            print(result.stdout)
        else:
            print("Error running nvidia-smi:\n")
            print(result.stderr)

    except Exception as e:
        print(f"An error occurred: {e}")

# Call the function to run nvidia-smi
run_nvidia_smi()

nvidia-smi output:

Fri Nov  8 21:08:51 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A40                     On  |   00000000:17:00.0 Off |                    0 |
|  0%   52C    P0             85W /  300W |    3717MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A40           

In [2]:
import os
# Specify the GPU index you want to check
gpu_index = 3
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_index}"
# Check if CUDA is available
import torch

import pickle

import random
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    pipeline,
)
from huggingface_hub import login
from peft import LoraConfig, get_peft_model
import time
from tqdm import tqdm
import json
#torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm
  warn(f"Failed to load image Python extension: {e}")
2024-11-08 21:08:54.685485: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-08 21:08:54.840605: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-08 21:08:55.450512: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2

In [3]:
# Check if CUDA is available
if torch.cuda.is_available():
    num_devices = torch.cuda.device_count()
    print(f"Number of available CUDA devices: {num_devices}")
    
    for i in range(num_devices):
        device_name = torch.cuda.get_device_name(i)
        print(f"\nDevice {i}: {device_name}")
else:
    print("CUDA is not available.")

Number of available CUDA devices: 1

Device 0: NVIDIA A40


In [None]:
MODEL_ID = "meta-llama/Llama-3.1-8B"
MODEL_NAME = MODEL_ID.split("/")[1]
TOKEN = ''
login(token=TOKEN)

CHECKPOINT_PATH = "Llama-3.1-8B_1" # Define lllama model we pretrained on full corpus
training_epochs = 3
model_dir = f"{MODEL_NAME}_instruct_{training_epochs}"

In [5]:
model_dir

'Llama-3.1-8B_instruct_3'

In [6]:
# Configuration for the 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

In [7]:
# Load model and tokenizer with quantization settings
if not CHECKPOINT_PATH:
    model_q = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map='auto',
        attn_implementation="eager"
    )
else:
    model_q = AutoModelForCausalLM.from_pretrained(
        CHECKPOINT_PATH,
        quantization_config=bnb_config,
        device_map='auto',
        attn_implementation="eager"
    )
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

Loading checkpoint shards: 100%|██████████████████| 4/4 [00:04<00:00,  1.21s/it]


In [8]:
# Apply LoRA for efficient fine-tuning
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", 'k_proj', 'v_proj', 'o_proj']
)
model_q = get_peft_model(model_q, peft_config)

In [9]:
def load_json_to_dict(json_file: str) -> dict:
    """
    Load a JSON file into a dictionary.

    Args:
        json_file (str): The path to the JSON file.

    Returns:
        dict: Dictionary representation of the JSON data.
    """
    json_path = os.path.join(json_file)
    with open(json_path, "r") as f:
        data = json.load(f)
    
    return data

consolidated_data = load_json_to_dict("consolidated.json")

In [10]:
consolidated_data[14000]

{'instruction': 'Expand the following acronym into its full form: {}',
 'input': 'DDP',
 'output': 'Data Download Program'}

In [11]:
len(consolidated_data)

16615

In [12]:
prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Answer:
{}
"""

In [13]:
# Assign eos_token as pad_token
tokenizer.pad_token = tokenizer.eos_token

# Format examples and tokenize
def format_prompt(entry, print_example=False):
    # Populate the instruction template with entry input
    instruction = entry['instruction'].format(entry['input'])
    answer = entry['output']
    # Apply the full prompt template
    context = prompt_template.format(instruction, answer)

    # Optionally print for debugging or verification
    if print_example:
        print(context)
        
    # Tokenize with padding and truncation
    encoded = tokenizer(
        tokenizer.bos_token + context,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=128
    )
    
    return {
        'input_ids': encoded['input_ids'][0],
        'attention_mask': encoded['attention_mask'][0]
    }

# Create dataset by applying the format function to each entry
formatted_dataset = [format_prompt(entry) for entry in consolidated_data]

In [14]:
format_prompt(consolidated_data[7895], print_example=True)  #12345

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Provide a concise answer to the following question: How did the Federal Reserve's actions in 2003 and early 2004 reflect compliance with the Federal Reserve Act, particularly in terms of reporting to Congress?

### Answer:
The Federal Reserve submitted its Monetary Policy Report to Congress pursuant to section 2B of the Federal Reserve Act, ensuring compliance with legislative requirements for transparency and accountability regarding monetary policy decisions.



{'input_ids': tensor([128000, 128000,  39314,    374,    459,   7754,    430,  16964,    264,
           3465,     13,   9842,    264,   2077,    430,  36001,  45695,    279,
           1715,    382,  14711,  30151,    512,  61524,    264,  64694,   4320,
            311,    279,   2768,   3488,     25,   2650,   1550,    279,  12411,
          25820,    596,   6299,    304,    220,   1049,     18,    323,   4216,
            220,   1049,     19,   8881,   8907,    449,    279,  12411,  25820,
           3298,     11,   8104,    304,   3878,    315,  13122,    311,   8151,
           1980,  14711,  22559,    512,    791,  12411,  25820,  14976,   1202,
          74214,  11216,   8423,    311,   8151,  33549,    311,   3857,    220,
             17,     33,    315,    279,  12411,  25820,   3298,     11,  23391,
           8907,    449,  27743,   8670,    369,  28330,    323,  39242,   9002,
          33384,   4947,  11429,    627, 128001, 128001, 128001, 128001, 128001,
         128001

In [15]:
# Define custom dataset class for the trainer
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        values = self.data[idx]
        labels = values['input_ids'].clone()
        # Mask instruction tokens
        labels[values['attention_mask'] == 0] = -100
        return {
            'input_ids': values['input_ids'],
            'attention_mask': values['attention_mask'],
            'labels': labels
        }

In [16]:
len(formatted_dataset)

16615

In [17]:
# Split into training and validation datasets
random.seed(42)
random.shuffle(formatted_dataset)
train_examples = int(0.95 * len(formatted_dataset))

# Pass the device to the CustomDataset constructor
train_dataset = CustomDataset(formatted_dataset[:train_examples])
val_dataset = CustomDataset(formatted_dataset[train_examples:])

# Set up the trainer with causal language modeling and modified arguments
trainer = Trainer(
    model=model_q,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=TrainingArguments(
        per_device_train_batch_size=14,
        per_device_eval_batch_size=8,
        eval_strategy='steps',
        eval_steps=50,
        gradient_accumulation_steps=4,
        optim="adamw_8bit",
        warmup_steps=10,
        num_train_epochs=training_epochs,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=5,
        output_dir=model_dir,
    )
)

In [18]:
# Train the model
start_time = time.time()
trainer.train()
total_time = time.time() - start_time
# Print the total processing time
print(f"Total processing time: {total_time:.2f} seconds")

# Save the fine-tuned model and tokenizer
model_q.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

Step,Training Loss,Validation Loss
50,0.8788,0.729234
100,0.6717,0.660641
150,0.6563,0.637586
200,0.6233,0.622174
250,0.6209,0.615431
300,0.5761,0.607351
350,0.5794,0.601065
400,0.5367,0.596197
450,0.5709,0.592224
500,0.5845,0.590864


Total processing time: 3564.77 seconds


('Llama-3.1-8B_instruct_3/tokenizer_config.json',
 'Llama-3.1-8B_instruct_3/special_tokens_map.json',
 'Llama-3.1-8B_instruct_3/tokenizer.json')

In [18]:
model_ft = AutoModelForCausalLM.from_pretrained(model_dir, device_map='auto')
tokenizer_ft = AutoTokenizer.from_pretrained(model_dir)

Loading checkpoint shards: 100%|██████████████████| 4/4 [00:09<00:00,  2.28s/it]


In [22]:
import logging
logging.getLogger("transformers").setLevel(logging.CRITICAL)
# Set up the text generation pipeline
generator = pipeline("text-generation", model=model_q, tokenizer=tokenizer)

# Simulated conversation
user_input = "Expand the following acronym into its full form: ESMA."

#user_input = "Provide a link for Credit Card Accountability Responsibility and Disclosure Act law."

#user_input =  "Define the following term: National Automated Clearing House Association."

prompt = prompt_template.format(user_input, '')
# Generate a response
response = generator(prompt, max_length=200, num_return_sequences=1, do_sample=True)
response_str = response[0]['generated_text'].split('### Answer:')[1].strip()
cut_ind = response_str.find("#")
response_str = response_str[:cut_ind].strip() if cut_ind!=-1 else response_str
print("AI:", response_str)

AI: The full form of the acronym ESMA is the **Environmental Safety and Monitoring Agency**.


In [23]:
# Set up the text generation pipeline
generator = pipeline("text-generation", model=model_ft, tokenizer=tokenizer_ft)
# Simulated conversation
user_input = "Expand the following acronym into its full form: ESMA."

#user_input = "Provide a link for Credit Card Accountability Responsibility and Disclosure Act law."

#user_input =  "Define the following term: National Automated Clearing House Association."

prompt = prompt_template.format(user_input, '')#f"User: {user_input}\nAI:"
# Generate a response
response = generator(prompt, max_length=200, num_return_sequences=1, do_sample=True)

# Display the AI's response
response_str = response[0]['generated_text'].split('### Answer:')[1].strip()
cut_ind = response_str.find("#")
response_str = response_str[:cut_ind].strip() if cut_ind!=-1 else response_str
print("AI:", response_str)

AI: European Securities and Markets Authority


In [19]:
model_ft.push_to_hub("smartinez1/Llama-3.1-8B-FINLLM")

adapter_model.safetensors: 100%|███████████| 54.6M/54.6M [00:03<00:00, 16.4MB/s]


CommitInfo(commit_url='https://huggingface.co/smartinez1/Llama-3.1-8B-FINLLM/commit/e2bab10efff80a8019f489d02016811a2ac786d6', commit_message='Upload LlamaForCausalLM', commit_description='', oid='e2bab10efff80a8019f489d02016811a2ac786d6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/smartinez1/Llama-3.1-8B-FINLLM', endpoint='https://huggingface.co', repo_type='model', repo_id='smartinez1/Llama-3.1-8B-FINLLM'), pr_revision=None, pr_num=None)