In [1]:
#!pip install "runhouse[aws]" torch datasets transformers peft trl
import runhouse as rh 
import os

#os.chdir("/dir/mydir")
from LoraFineTuner import FineTuner

INFO | 2024-08-09 19:49:00.718467 | Loaded Runhouse config from /Users/paulyang/.rh/config.yaml
  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4
  from cryptography.hazmat.primitives.ciphers.algorithms import TripleDES
  "cipher": algorithms.TripleDES,
  "class": algorithms.TripleDES,
INFO | 2024-08-09 19:49:02.760484 | PyTorch version 2.4.0 available.


## Connect to cluster

In [4]:
# Runhouse installs these requirements on the remote machine for us.
# (Launching from a Docker container or a conda env is possible as well.)
ft_packages = [
    "torch",
    "tensorboard",
    "scipy",
    "peft==0.4.0",
    "bitsandbytes==0.40.2",
    "transformers==4.31.0",
    "trl==0.4.7",
    "accelerate",
]
img = rh.Image(name="ft_env").install_packages(ft_packages)

# Define the AWS cluster that uses the image above, then bring it up
# unless it is already running.
cluster = rh.compute(
    name="rh-a10x", instance_type="A10G:1", memory="32+", provider="aws", image=img
).up_if_not()

# The pretrained model in this example is downloaded from Hugging Face, so an
# HF_TOKEN must be available as an env variable; sync that secret to the cluster.
cluster.sync_secrets(["huggingface"])


## Send fine tuner to remote and instantiate / Get already instantiated remote instance
* There is a locally defined LoraFineTuner class
* Runhouse will send this class to remote compute
* Then locally we create an *instance* of this remote class, which we name `rh_finetuner` or anything else
* We call this remote instance from our local session as if it were a normal local object, and can access it by name from any Python session connected to the cluster

In [7]:
# Name of the remote *instance* of the fine tuner class (not of the class itself).
fine_tuner_remote_name = "rh_finetuner"

# Ask the cluster for an existing instance with that name; `default=None` gives
# us None instead of an error when nothing has been created yet.
fine_tuner_remote = cluster.get(fine_tuner_remote_name, default=None, remote=True)

# If there is no such instance yet, send the local FineTuner class to the cluster
# and create an instance of it named "rh_finetuner".
# If you disconnect locally after calling tune, re-running this block from another
# local session simply reconnects to the existing remote object.
if fine_tuner_remote is None:
    # No `env` argument is passed: no `env` object is defined in this notebook, and
    # the package environment is already attached to the cluster via its image.
    fine_tuner = rh.cls(FineTuner).to(cluster, name="llama3-medical-model")
    fine_tuner_remote = fine_tuner(name=fine_tuner_remote_name)

In [8]:
# With a handle to the remote instance, we can call its methods as if it were a
# local object. In the recorded run below, this call took about 10 minutes to return.
fine_tuner_remote.tune()

INFO | 2024-08-09 20:52:33.269630 | Calling rh_finetuner.tune


[36m
Map:   0%|          | 0/1700 [00:00<?, ? examples/s]
Map:  59%|█████▉    | 1000/1700 [00:00<00:00, 5934.16 examples/s][0m[36mft_env env: Calling method tune on module rh_finetuner
[0m[36mft_env env: Calling method training_params on module rh_finetuner
[0m[36mft_env env: Calling method sft_trainer on module rh_finetuner
[0m[36m
Map: 100%|██████████| 1700/1700 [00:00<00:00, 5553.61 examples/s]
Map: 100%|██████████| 1700/1700 [00:00<00:00, 5551.21 examples/s]
[0m[36m
Map:   0%|          | 0/300 [00:00<?, ? examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 5197.45 examples/s]
[0m[36m
  0%|          | 0/425 [00:00<?, ?it/s][0m[36m
  0%|          | 1/425 [00:06<43:30,  6.16s/it][0m[36m
  0%|          | 2/425 [00:12<43:20,  6.15s/it][0m[36m
  1%|          | 3/425 [00:16<38:25,  5.46s/it][0m[36m
  1%|          | 4/425 [00:20<33:05,  4.72s/it][0m[36m
  1%|          | 5/425 [00:23<28:56,  4.13s/it][0m[36m
  1%|▏         | 6/425 [00:26<25:17,  3.62s/it][0m[3

INFO | 2024-08-09 21:02:36.506223 | Time to call rh_finetuner.tune: 603.24 seconds


In [13]:
# After fine-tuning has finished, send a prompt to the remote model and
# print the text it generates.
query = "What's the best treatment for sunburn?"
generated_text = fine_tuner_remote.generate(
    query,
    max_length=1000,
)
print(generated_text)


INFO | 2024-08-09 14:41:07.557128 | Calling rh_finetuner.generate


[36mft_env env: Calling method generate on module rh_finetuner
[0m[36mft_env env: Calling method load_pipeline on module rh_finetuner
[0m

INFO | 2024-08-09 14:41:08.122288 | Time to call rh_finetuner.generate: 0.57 seconds


<|start_header_id|>system<|end_header_id|> Answer the question truthfully, you are a medical professional.<|eot_id|><|start_header_id|>user<|end_header_id|> This is the question: What's the best treatment for sunburn?<|eot_id|><|start_header_id|>assistant<|end_header_id|> The best treatment for sunburn is to prevent it.


## I can reconnect to this remote instance of the fine tuner even after my local session disconnects, or from another session
If I connect to the cluster, and get the object by name, I can call against it even if my local session ends

You can run the below code at any point, including while training is running, from anywhere (like another notebook)

In [9]:
# This cell is meant to be runnable on its own (e.g. from another notebook),
# so it carries its own import.
import runhouse as rh

# Reconnect by name to the same cluster this notebook launched ("rh-a10x"),
# rather than a hard-coded personal cluster.
# NOTE(review): assumes the cluster config saved under this name can be loaded
# from the current session - confirm for your Runhouse setup.
cluster = rh.compute(name="rh-a10x")

# Look up the existing "rh_finetuner" instance of the remote fine tuner class;
# `default=None` yields None if it has not been created on this cluster.
fine_tuner_remote_name = "rh_finetuner"
fine_tuner_remote = cluster.get(fine_tuner_remote_name, default=None, remote=True)

# Check what the training status is on remote
if fine_tuner_remote is not None:
    print(fine_tuner_remote.get_training_status())

INFO | 2024-08-09 21:09:56.365787 | SSH tunnel on to server's port 32300 via server's ssh port 22 already created with the cluster.
INFO | 2024-08-09 21:09:57.109180 | Calling rh_finetuner.get_training_status


------
[36mjamesb[0m
------
[36mft_env env: Calling method get_training_status on module rh_finetuner
[0m[36mft_env env: Calling method new_model_exists on module rh_finetuner
[0m

INFO | 2024-08-09 21:09:57.634861 | Time to call rh_finetuner.get_training_status: 0.53 seconds


{'tokenizer_loaded': True, 'model_loaded': True, 'pipeline_loaded': True, 'training_completed': True, 'epochs_trained': 2, 'eval results': None}
