In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import os
import time
import boto3
import botocore
from pathlib import Path

In [3]:
#Setup variables for this script

In [4]:
root_models_path = './models'
model_prefix = "ibm-granite"
model_name = "granite-3.1-8b-instruct"
full_model_name = f"{model_prefix}/{model_name}"

In [5]:
#Download model

In [6]:
tokenizer = AutoTokenizer.from_pretrained(full_model_name)
model = AutoModelForCausalLM.from_pretrained(full_model_name, torch_dtype=torch.bfloat16, device_map="auto")
model.generation_config = GenerationConfig.from_pretrained(full_model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:  83%|########3 | 4.14G/4.97G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.41G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [None]:
#Perform a quick inference test on the model

In [7]:
messages = [
    {"role": "user", "content": "How old is the universe?"}
]
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
#outputs = model.generate(input_tensor.to(model.device), max_new_tokens=100)
outputs = model.generate(input_tensor.to(model.device), max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)
result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
print(result)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


The current estimate for the age of the universe, based on data from the Planck satellite and other observations, is approximately 13.8 billion years. This value is derived from measurements of the cosmic microwave background radiation, which is the afterglow of the Big Bang. The age of the universe is a crucial piece of information in our understanding of cosmology and the evolution of the cosmos.


In [None]:
#Save the model and its tokenizer to a local folder

In [8]:
save_path = Path.home().joinpath(root_models_path, model_name)
!mkdir -p {save_path}
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
!ls {save_path}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


added_tokens.json		  model-00004-of-00004.safetensors
config.json			  model.safetensors.index.json
generation_config.json		  special_tokens_map.json
merges.txt			  tokenizer_config.json
model-00001-of-00004.safetensors  tokenizer.json
model-00002-of-00004.safetensors  vocab.json
model-00003-of-00004.safetensors


In [None]:
#Get the s3 connection parameters (these are entered in the workbench setup screens)

In [9]:
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
endpoint_url = os.environ.get('AWS_S3_ENDPOINT')
region_name = os.environ.get('AWS_DEFAULT_REGION')
bucket_name = os.environ.get('AWS_S3_BUCKET')

In [10]:
session = boto3.session.Session(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key
          )

In [11]:
s3_resource = session.resource(
               's3',
               config=botocore.client.Config(signature_version='s3v4'),
               endpoint_url=endpoint_url,
               region_name=region_name
              )
bucket = s3_resource.Bucket(bucket_name)
                        
def upload_directory_to_s3(local_directory, s3_prefix):
    for root, dirs, files in os.walk(local_directory):
        for filename in files:
            file_path = os.path.join(root, filename)
            relative_path = os.path.relpath(file_path, local_directory)
            s3_key = os.path.join(s3_prefix, relative_path)
            print(f"{file_path} -> {s3_key}")
            bucket.upload_file(file_path, s3_key)
                                
def list_objects(prefix):
    filter = bucket.objects.filter(Prefix=prefix)
    for obj in filter.all():
        print(obj.key)

In [12]:
#Upload the model to an S3 bucket within the cluster

In [16]:
full_model_name

'ibm-granite/granite-3.1-8b-instruct'

In [18]:
upload_directory_to_s3(save_path, model_name)

/opt/app-root/src/models/granite-3.1-8b-instruct/merges.txt -> granite-3.1-8b-instruct/merges.txt
/opt/app-root/src/models/granite-3.1-8b-instruct/model.safetensors.index.json -> granite-3.1-8b-instruct/model.safetensors.index.json
/opt/app-root/src/models/granite-3.1-8b-instruct/tokenizer_config.json -> granite-3.1-8b-instruct/tokenizer_config.json
/opt/app-root/src/models/granite-3.1-8b-instruct/model-00004-of-00004.safetensors -> granite-3.1-8b-instruct/model-00004-of-00004.safetensors
/opt/app-root/src/models/granite-3.1-8b-instruct/model-00002-of-00004.safetensors -> granite-3.1-8b-instruct/model-00002-of-00004.safetensors
/opt/app-root/src/models/granite-3.1-8b-instruct/special_tokens_map.json -> granite-3.1-8b-instruct/special_tokens_map.json
/opt/app-root/src/models/granite-3.1-8b-instruct/model-00003-of-00004.safetensors -> granite-3.1-8b-instruct/model-00003-of-00004.safetensors
/opt/app-root/src/models/granite-3.1-8b-instruct/tokenizer.json -> granite-3.1-8b-instruct/tokeniz