In [1]:
# Local dependencies for this notebook; uncomment the next line to install them.
#!pip install "runhouse[aws]" torch datasets transformers peft trl
import runhouse as rh 
import os

# Optional: switch the working directory before importing
# (presumably so the local LoraFineTuner module is importable — adjust the path to your setup).
#os.chdir("/dir/mydir")
# FineTuner is the locally defined class that later cells send to the cluster.
from LoraFineTuner import FineTuner

INFO | 2024-08-09 19:49:00.718467 | Loaded Runhouse config from /Users/paulyang/.rh/config.yaml
  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4
  from cryptography.hazmat.primitives.ciphers.algorithms import TripleDES
  "cipher": algorithms.TripleDES,
  "class": algorithms.TripleDES,
INFO | 2024-08-09 19:49:02.760484 | PyTorch version 2.4.0 available.


## Connect to cluster
**If you would like to launch your own cluster instead of connecting to the existing one used below:**
```python
# Run `sky check` to confirm AWS credentials are set up
cluster = rh.cluster(
    name="rh-a10x",
    instance_type="A10G:1",
    memory="32+",
    provider="aws",
).up_if_not()
```

In [10]:
# Reference the cluster by name (the markdown above shows how to launch your own instead).
cluster = rh.cluster(name="/rh-alpha-testers/jamesb")

# Environment for the fine tuner on the cluster:
#   - `reqs` are installed by Runhouse on the remote side.
#   - The "huggingface" secret is passed along; an HF_TOKEN env variable is needed
#     to download the pretrained model in this example.
# Launching with a Docker container / conda env is also possible (not shown here).
env = rh.env(
    name="ft_env",
    secrets=["huggingface"],
    reqs=[
        # unpinned
        "torch", "tensorboard", "scipy", "accelerate",
        # pinned fine-tuning stack
        "peft==0.4.0", "bitsandbytes==0.40.2", "transformers==4.31.0", "trl==0.4.7",
    ],
)

## Send fine tuner to remote and instantiate / Get already instantiated remote instance
* There is a locally defined `FineTuner` class (imported from the local `LoraFineTuner` module)
* Runhouse will send this class to remote compute
* Then locally we create an *instance* of this remote class, which we name `rh_finetuner` or anything else
* We call this remote-instance from local as if it were normal/local, and can access it by name from any Python session connected to the cluster

In [11]:
# Name of the remote *instance* — not of the remote class it is created from.
fine_tuner_remote_name = "rh_finetuner"

# Ask the cluster for an object already stored under that name, i.e. an existing
# *instance* of the remote fine tuner class; we fall back to None when there is none.
fine_tuner_remote = cluster.get(fine_tuner_remote_name, remote=True, default=None)

# No instance yet: send the local class to the cluster, then construct a remote
# instance of it named "rh_finetuner".
# If you disconnect locally after calling tune, running this same block from another
# local session simply reconnects to the existing remote object.
if fine_tuner_remote is None:
    fine_tuner = rh.module(FineTuner).to(cluster, name="llama3-medical-model", env=env)
    fine_tuner_remote = fine_tuner(name=fine_tuner_remote_name)

INFO | 2024-08-09 14:40:31.580499 | Running forwarding command: ssh -T -L 32300:localhost:32300 -i ~/.ssh/sky-key -o Port=22 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o IdentitiesOnly=yes -o ExitOnForwardFailure=yes -o ServerAliveInterval=5 -o ServerAliveCountMax=3 -o ConnectTimeout=30s -o ForwardAgent=yes -o ControlMaster=auto -o ControlPath=/tmp/skypilot_ssh_6ba66db0/801cb87b25/%C -o ControlPersist=300s ubuntu@3.95.222.246


In [16]:
## With a handle to the remote *instance*, we call it as if it were a local object;
## this invokes `tune` on the `rh_finetuner` module on the cluster.
fine_tuner_remote.tune()

INFO | 2024-08-09 14:49:08.678207 | Calling rh_finetuner.tune


[36mft_env env: Calling method tune on module rh_finetuner
[0m[36mft_env env: Calling method new_model_exists on module rh_finetuner
[0m

INFO | 2024-08-09 14:49:09.238983 | Time to call rh_finetuner.tune: 0.56 seconds


In [13]:
# After fine-tuning has completed, the remote fine tuner can be queried with a prompt.
query = "What's the best treatment for sunburn?"
generated_text = fine_tuner_remote.generate(
    query,
    max_length=1000,
)
print(generated_text)


INFO | 2024-08-09 14:41:07.557128 | Calling rh_finetuner.generate


[36mft_env env: Calling method generate on module rh_finetuner
[0m[36mft_env env: Calling method load_pipeline on module rh_finetuner
[0m

INFO | 2024-08-09 14:41:08.122288 | Time to call rh_finetuner.generate: 0.57 seconds


<|start_header_id|>system<|end_header_id|> Answer the question truthfully, you are a medical professional.<|eot_id|><|start_header_id|>user<|end_header_id|> This is the question: What's the best treatment for sunburn?<|eot_id|><|start_header_id|>assistant<|end_header_id|> The best treatment for sunburn is to prevent it.


## I can reconnect to this remote instance of the fine tuner even after my local session disconnects, or from another session
If I connect to the cluster, and get the object by name, I can call against it even if my local session ends

You can run the below code at any point, including while training is running, from anywhere (like another notebook)

In [18]:
# This cell is self-contained so it can be run from a fresh session or another notebook.
import runhouse as rh

# Reattach to the cluster by name. The name must refer to the cluster the fine tuner
# was sent to.
# NOTE(review): the setup cell earlier in this notebook uses a different cluster name
# ("/rh-alpha-testers/jamesb") — confirm which one is intended and keep the two in sync.
cluster = rh.cluster(name="/paul/rh-a10x")

# Look up the existing "rh_finetuner" object, which is an *instance* of the remote
# fine tuner class; `default=None` is passed so a missing object yields None here.
fine_tuner_remote_name = "rh_finetuner"
fine_tuner_remote = cluster.get(fine_tuner_remote_name, default=None, remote=True)

# Check what the training status is on remote. Report a missing object explicitly
# instead of silently printing nothing, so "not found" is distinguishable from
# "cell did not run".
if fine_tuner_remote is not None:
    print(fine_tuner_remote.get_training_status())
else:
    print(
        f"No object named {fine_tuner_remote_name!r} was found on the cluster; "
        "check the cluster name and create the fine tuner first."
    )

INFO | 2024-08-09 14:58:13.149741 | SSH tunnel on to server's port 32300 via server's ssh port 22 already created with the cluster.
INFO | 2024-08-09 14:58:13.214864 | Calling rh_finetuner.get_training_status


-------
[36mrh-a10x[0m
-------
[36mft_env env: Calling method get_training_status on module rh_finetuner
[0m[36mft_env env: Calling method new_model_exists on module rh_finetuner
[0m

INFO | 2024-08-09 14:58:13.738335 | Time to call rh_finetuner.get_training_status: 0.52 seconds


{'base_model_loaded': False, 'tokenizer_loaded': True, 'fine_tuned_model_loaded': True, 'pipeline_loaded': True, 'training_completed': True}
