In [1]:
import os
import posixpath
import time
from pathlib import Path

import boto3
import botocore
import torch
from huggingface_hub import login, snapshot_download
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Authenticate against the Hugging Face Hub with the token held in the
# 'huggingface_token' environment variable (None when the variable is unset).
login_token = os.getenv('huggingface_token')
login(token=login_token)

In [3]:
# Set up variables for this script

In [4]:
# Hugging Face repository id, assembled as "<model_prefix>/<model_name>".
model_prefix = "ibm-granite"
model_name = "granite-embedding-278m-multilingual"
full_model_name = "/".join((model_prefix, model_name))

# Local download target: <home directory>/models/<model_name>.
root_models_path = './models'
save_path = Path.home() / root_models_path / model_name

In [5]:
# Download the model

In [6]:
# Download the repository named by full_model_name into save_path.
# snapshot_download is imported in the notebook's import cell; the call is
# left as the cell's last expression so its return value is displayed.
snapshot_download(repo_id=full_model_name, local_dir=save_path)

Fetching 12 files: 100%|██████████| 12/12 [00:03<00:00,  3.01it/s]


'/opt/app-root/src/models/granite-embedding-278m-multilingual'

In [7]:
# Get the S3 connection parameters (these are entered in the workbench setup screens)

In [8]:
# Read each S3 connection setting from the environment in one pass;
# os.environ.get yields None for any variable that is not set.
(
    aws_access_key_id,
    aws_secret_access_key,
    endpoint_url,
    region_name,
    bucket_name,
) = (
    os.environ.get(variable_name)
    for variable_name in (
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
        'AWS_S3_ENDPOINT',
        'AWS_DEFAULT_REGION',
        'AWS_S3_BUCKET',
    )
)

In [9]:
# boto3 session carrying the access key pair; endpoint and region are
# supplied later, when the S3 resource is created from this session.
session = boto3.session.Session(
    aws_secret_access_key=aws_secret_access_key,
    aws_access_key_id=aws_access_key_id,
)

In [10]:
# Client configuration requesting the 's3v4' signature version.
s3_client_config = botocore.client.Config(signature_version='s3v4')

# S3 resource created from the session for the configured endpoint and
# region, and a handle to the bucket named by bucket_name.
s3_resource = session.resource(
    's3',
    endpoint_url=endpoint_url,
    region_name=region_name,
    config=s3_client_config,
)
bucket = s3_resource.Bucket(bucket_name)
                        
def upload_directory_to_s3(local_directory, s3_prefix, target_bucket=None):
    """Recursively upload every file under ``local_directory`` to a bucket.

    Each file's object key is ``s3_prefix`` joined with the file's path
    relative to ``local_directory``. Keys are always assembled with forward
    slashes, so the key layout does not depend on the local path separator.
    One ``"<local path> -> <key>"`` line is printed per file.

    Args:
        local_directory: Directory to walk (string or path-like).
        s3_prefix: Prefix placed in front of every relative file path.
        target_bucket: Object providing ``upload_file(filename, key)``.
            When omitted, the notebook-level ``bucket`` is used.

    Raises:
        NotADirectoryError: If ``local_directory`` is not an existing
            directory (``os.walk`` would otherwise silently upload nothing).
    """
    if not os.path.isdir(local_directory):
        raise NotADirectoryError(f"Not a directory: {local_directory}")

    # Resolve the global lazily so an explicit bucket works without it.
    destination = bucket if target_bucket is None else target_bucket

    for root, _dirs, files in os.walk(local_directory):
        for filename in files:
            file_path = os.path.join(root, filename)
            relative_path = os.path.relpath(file_path, local_directory)
            # Normalise the OS separator before joining with the prefix.
            s3_key = posixpath.join(s3_prefix, relative_path.replace(os.sep, "/"))
            print(f"{file_path} -> {s3_key}")
            destination.upload_file(file_path, s3_key)
                                
def list_objects(prefix):
    """Print the key of each object returned by filtering ``bucket`` on ``Prefix=prefix``."""
    matching_objects = bucket.objects.filter(Prefix=prefix)
    for s3_object in matching_objects.all():
        print(s3_object.key)

In [11]:
# Upload the model to an S3 bucket within the cluster

In [12]:
# Upload everything under save_path, using the model name as the key prefix.
upload_directory_to_s3(local_directory=save_path, s3_prefix=model_name)

/opt/app-root/src/models/granite-embedding-278m-multilingual/README.md -> granite-embedding-278m-multilingual/README.md
/opt/app-root/src/models/granite-embedding-278m-multilingual/tokenizer.json -> granite-embedding-278m-multilingual/tokenizer.json
/opt/app-root/src/models/granite-embedding-278m-multilingual/pytorch_model.bin -> granite-embedding-278m-multilingual/pytorch_model.bin
/opt/app-root/src/models/granite-embedding-278m-multilingual/sentencepiece.bpe.model -> granite-embedding-278m-multilingual/sentencepiece.bpe.model
/opt/app-root/src/models/granite-embedding-278m-multilingual/.gitattributes -> granite-embedding-278m-multilingual/.gitattributes
/opt/app-root/src/models/granite-embedding-278m-multilingual/tokenizer_config.json -> granite-embedding-278m-multilingual/tokenizer_config.json
/opt/app-root/src/models/granite-embedding-278m-multilingual/model.safetensors -> granite-embedding-278m-multilingual/model.safetensors
/opt/app-root/src/models/granite-embedding-278m-multilin