# Downloading and uploading the model to fine tune

In [12]:
!pip install huggingface_hub --upgrade --quiet
!pip install "transformers==4.30.2" "datasets[s3]==2.13.0" sagemaker --upgrade --quiet

[0m

If you are going to use SageMaker in a local environment, you need access to an IAM role with the required permissions for SageMaker. You can find out more about it [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html).



In [14]:
#uncomment the installs below if you want to work in local mode on your notebook instance (for development/debugging purposes)
#!pip install 'sagemaker[local]' --upgrade --quiet
#!pip install docker-compose --quiet

In [27]:
import os
import posixpath

import boto3
import sagemaker

#uncomment to run in local mode
#from sagemaker import LocalSession
#sess = LocalSession()
#the line below sets the container's root on the EBS volume of your instance.
#sess.config = {'local' : {'local_code' : True, 'container_root' : '/home/ec2-user/SageMaker/'}}
#if you're running local mode and run into out of space issues, consider running docker_scripts/prepare-docker.sh to set the docker root under /home/ec2-user/SageMaker

# SageMaker session used by the rest of the notebook; its region is reported below.
sess = sagemaker.Session()
region = sess.boto_region_name

#replace the below by a specific bucket if you need
sagemaker_session_bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
# S3 client used by the upload helper further down in this notebook.
s3_client = boto3.client("s3")
# Key prefix under which this notebook stores its objects in the bucket.
s3_prefix = "model-fine-tuning"

#local notebook path
notebook_home = "/home/ec2-user/SageMaker/"

print(f"sagemaker role arn: {role}")
# Report the bucket that was actually selected above, so the message stays
# correct when sagemaker_session_bucket is replaced by a specific bucket.
print(f"sagemaker bucket: {sagemaker_session_bucket}")
print(f"sagemaker session region: {region}")

sagemaker role arn: arn:aws:iam::327216439222:role/Sagemaker
sagemaker bucket: sagemaker-us-east-1-327216439222
sagemaker session region: us-east-1


## Download and upload the model to S3

In [16]:
# Hugging Face Hub repository id of the model to fine-tune.
model_id = "tiiuae/falcon-7b"
# Short model name: everything after the last "/" of the repository id.
model_name = model_id.rpartition("/")[2]

In [17]:
# Show the installed huggingface_hub package metadata (name, version, location, dependencies).
!pip show huggingface_hub

[0mName: huggingface-hub
Version: 0.16.4
Summary: Client library to download and publish models, datasets and other repos on the huggingface.co hub
Home-page: https://github.com/huggingface/huggingface_hub
Author: Hugging Face, Inc.
Author-email: julien@huggingface.co
License: Apache
Location: /home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages
Requires: filelock, fsspec, packaging, pyyaml, requests, tqdm, typing-extensions
Required-by: datasets, evaluate, transformers


In [18]:
from pathlib import Path
from huggingface_hub import snapshot_download

# Local directory that receives the model files: <notebook_home>/models/<model_name>.
model_tar_dir = Path(notebook_home) / "models" / model_name
# exist_ok=True replaces the separate "is it there?" check and removes the
# check-then-create race; parents=True creates the intermediate "models" folder.
model_tar_dir.mkdir(parents=True, exist_ok=True)

# Download model from Hugging Face into model_tar_dir.
# The cache directory is derived from notebook_home (instead of repeating the
# absolute path) so both locations move together if notebook_home is changed.
snapshot_download(model_id,
                  local_dir=str(model_tar_dir),
                  local_dir_use_symlinks=False,
                  cache_dir=os.path.join(notebook_home, "models", "tmp"))


Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

Downloading (…)f25d4eb1/config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

Downloading (…)49f25d4eb1/README.md:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading (…)d4eb1/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Downloading (…)/configuration_RW.py:   0%|          | 0.00/2.61k [00:00<?, ?B/s]

Downloading (…)4eb1/modelling_RW.py:   0%|          | 0.00/47.6k [00:00<?, ?B/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

Downloading (…)d4eb1/tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

'/home/ec2-user/SageMaker/models/falcon-7b'

In [23]:
# Echo the local directory that was used as the download target for the model files.
print(model_tar_dir)

/home/ec2-user/SageMaker/models/falcon-7b


In [24]:
# NOTE(review): cwd, p and mydirs are not referenced by any later cell visible
# in this notebook - confirm whether this cell is still needed.
# Kernel working directory, kept as a plain string.
cwd = str(Path.cwd())
# Model directory resolved against the working directory.
p = Path.cwd() / model_tar_dir
# Every path matched by the recursive '**' pattern under p, materialised as a list.
mydirs = [match for match in p.glob('**')]

In [25]:
#uploading the model and its multiple files to S3
def upload_to_s3(model_tar_dir, s3_prefix, sagemaker_session_bucket, client=None):
    """Recursively upload the contents of a local directory to an S3 bucket.

    Every regular file found under ``model_tar_dir`` is uploaded to
    ``sagemaker_session_bucket`` with the key ``<s3_prefix>/<relative path>``.
    Entries named ``.ipynb_checkpoints`` or ``.gitattributes`` are skipped at
    every directory level.

    The upload is best-effort: a failure on one entry is reported on stdout
    (together with the path that failed) and the remaining entries are still
    processed.

    Args:
        model_tar_dir: Local directory (``str`` or ``os.PathLike``) to upload.
        s3_prefix: Key prefix inside the bucket; a trailing ``/`` is optional.
        sagemaker_session_bucket: Name of the destination bucket.
        client: Object exposing ``upload_file(local_path, bucket, key)``.
            Defaults to the notebook-level ``s3_client``.

    Returns:
        None.

    Raises:
        OSError: If ``model_tar_dir`` itself cannot be listed. Errors inside
            sub-directories are caught and reported instead.
    """
    if client is None:
        # Fall back to the client created in the notebook's setup cell.
        client = s3_client

    stop_list = {'.ipynb_checkpoints', '.gitattributes'}
    # Sorted so the upload order (and the printed log) is deterministic.
    for entry in sorted(os.listdir(model_tar_dir)):
        if entry in stop_list:
            continue
        local_path = os.path.join(model_tar_dir, entry)
        # S3 keys always use "/" as separator, independent of the host OS.
        remote_path = posixpath.join(s3_prefix, entry)
        try:
            if os.path.isfile(local_path):
                client.upload_file(local_path, sagemaker_session_bucket, remote_path)
                print(f"{local_path} uploaded to s3 folder: {remote_path}")
            else:
                # Anything that is not a regular file is treated as a directory.
                upload_to_s3(local_path, remote_path, sagemaker_session_bucket, client)
        except Exception as e:
            # Keep going, but say which entry failed so it is not overlooked.
            print(f"Failed to upload {local_path} to s3://{sagemaker_session_bucket}/{remote_path}: {e}")

In [28]:
# Upload the downloaded model files under <s3_prefix>/models/<model_name>/.
# posixpath.join keeps "/" separators in the S3 key prefix on every platform.
upload_to_s3(model_tar_dir, posixpath.join(s3_prefix, "models", model_name, ''), sagemaker_session_bucket)

/home/ec2-user/SageMaker/models/falcon-7b/pytorch_model-00001-of-00002.bin uploaded to s3 folder: model-fine-tuning/models/falcon-7b/pytorch_model-00001-of-00002.bin
/home/ec2-user/SageMaker/models/falcon-7b/pytorch_model.bin.index.json uploaded to s3 folder: model-fine-tuning/models/falcon-7b/pytorch_model.bin.index.json
/home/ec2-user/SageMaker/models/falcon-7b/pytorch_model-00002-of-00002.bin uploaded to s3 folder: model-fine-tuning/models/falcon-7b/pytorch_model-00002-of-00002.bin
/home/ec2-user/SageMaker/models/falcon-7b/config.json uploaded to s3 folder: model-fine-tuning/models/falcon-7b/config.json
/home/ec2-user/SageMaker/models/falcon-7b/README.md uploaded to s3 folder: model-fine-tuning/models/falcon-7b/README.md
/home/ec2-user/SageMaker/models/falcon-7b/configuration_RW.py uploaded to s3 folder: model-fine-tuning/models/falcon-7b/configuration_RW.py
/home/ec2-user/SageMaker/models/falcon-7b/modelling_RW.py uploaded to s3 folder: model-fine-tuning/models/falcon-7b/modelling_

In [29]:
#storing the model's S3 path to reuse later
# posixpath.join keeps "/" separators in the S3 URI on every platform;
# the trailing '' makes the URI end with "/".
model_path = posixpath.join("s3://", sagemaker_session_bucket, s3_prefix, "models", model_name, '')
print(model_path)

s3://sagemaker-us-east-1-327216439222/model-fine-tuning/models/falcon-7b/


In [30]:
# Persist these variables with the IPython %store magic so they can be restored later with `%store -r`.
%store model_path
%store model_name
%store model_id

Stored 'model_path' (str)
Stored 'model_name' (str)
Stored 'model_id' (str)
