# suno/bark on SageMaker (bring your own script)

https://huggingface.co/suno/bark

https://huggingface.co/blog/optimizing-bark

https://sagemaker-examples.readthedocs.io/en/latest/introduction_to_amazon_algorithms/xgboost_abalone/xgboost_inferenece_script_mode.html

# Setup

In [1]:
!pip install -qU sagemaker boto3 bark scipy

In [2]:
%%time

import io
import os
import boto3
import sagemaker
import time
import shutil
import json

# IAM role the notebook runs with; it is passed to the model at deploy time.
role = sagemaker.get_execution_role()

# AWS region of the current boto3 session.
boto_session = boto3.Session()
region = boto_session.region_name

# S3 location for code and model artifacts. The SageMaker default bucket is
# used here; substitute another bucket name if you prefer.
sm_session = sagemaker.Session()
bucket = sm_session.default_bucket()
prefix = "bark"

  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
CPU times: user 2.33 s, sys: 1.78 s, total: 4.11 s
Wall time: 1.72 s


# Download the model and upload it to S3 (optional if the model files are already in S3)

In [None]:
#download s5cmd
!curl -L https://github.com/peak/s5cmd/releases/download/v2.0.0/s5cmd_2.0.0_Linux-64bit.tar.gz | tar -xz
!chmod 777 s5cmd

#choose which BaiChuan2 model
repo_id="suno/bark"  #change this to yours
#local_dir='/tmp/'+repo_id.split("/")[-1] #absolute or relative directory
local_dir=repo_id

#download BaiChuan2 model file from Hugging Face
model_download_path = snapshot_download(repo_id=repo_id,local_dir=local_dir,ignore_patterns=["*.msgpack","*.h5"])
print(model_download_path)
!ls $local_dir

#upload model files to s3 bucket
!./s5cmd sync $local_dir/ s3://$bucket/$repo_id/
!aws s3 ls s3://$bucket/$repo_id/

# prepare code, and deploy to sagemaker

In [3]:
# Recreate an empty directory for the inference code that gets deployed
source_dir = 'code'

if os.path.exists(source_dir):
    shutil.rmtree(source_dir)
# Create the directory from Python so that a failure raises an exception
# instead of only printing a shell error message
os.makedirs(source_dir)

In [4]:
%%writefile code/requirements.txt
transformers
s3fs
nvgpu
pynvml

Writing code/requirements.txt


In [5]:
%%writefile code/inference.py
import json
import os
import torch
import s3fs

from transformers import AutoProcessor, BarkModel


# Module-level handles. model_fn() stores the loaded processor in `processor`;
# the module-level `model` is never reassigned by the functions below.
model = None
processor = None

# Run on the first GPU when CUDA is usable, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Log the working directory and its contents (debugging aid).
cwd = os.getcwd()
print("cwd:" + cwd)
dir_list = os.listdir(cwd)
print("Files and directories in '", cwd, "' :")
# prints all files
print(dir_list)

# Local directory that model_fn() copies the model files into.
model_local_path = "/tmp/bark/"

def model_fn(model_dir):
    """
    Copy the Bark model files from S3 to local disk and load them.

    The S3 location is read from the ``model_s3`` environment variable, with a
    built-in default. The files are copied into ``model_local_path`` and the
    processor and model are then loaded from that directory only (no download
    from the Hugging Face Hub). The processor is stored in the module-level
    ``processor`` variable so that ``predict_fn`` can use it.

    Args:
        model_dir: Model directory passed in by the serving framework. It is
            only logged; the weights are not read from it.

    Returns:
        The loaded ``BarkModel``, moved to ``device``.
    """
    global processor
    print(f"model_dir: {model_dir}")

    fs = s3fs.S3FileSystem()
    model_s3 = os.environ.get("model_s3", "s3://internal-modelzoo-us-east-1/suno/bark/")

    print(f"need copy {model_s3} to {model_local_path}")
    # exist_ok=True: this function can run more than once against the same
    # filesystem (for example once per model-server worker, or on a reload);
    # without it the second call fails with FileExistsError.
    os.makedirs(model_local_path, exist_ok=True)
    fs.get(model_s3, model_local_path, recursive=True)
    dir_list = os.listdir(model_local_path)
    print("Files and directories in '", model_local_path, "' :")
    print(dir_list)

    print("download completed")

    print("model_fn start")
    processor = AutoProcessor.from_pretrained(model_local_path, local_files_only=True)
    print("model_fn start - loaded AutoProcessor")
    model = BarkModel.from_pretrained(model_local_path, local_files_only=True).to(device)
    print("model_fn start - loaded BarkModel")
    return model


def predict_fn(input_data, model):
    """
    Generate speech audio for one request.

    Args:
        input_data: Decoded request payload: a dict with a "text" entry and an
            optional "voice_preset" entry. ``None`` selects a built-in sample
            request.
        model: The loaded Bark model (expected to be what ``model_fn``
            returned). If it or the module-level ``processor`` is ``None``,
            both are loaded from ``model_local_path``.

    Returns:
        dict: ``{"output": audio}`` where ``audio`` is the generated audio as
        a numpy array, moved to the CPU and with singleton dimensions removed.

    Raises:
        KeyError: If ``input_data`` has no "text" entry.
    """
    global processor
    print("predict_fn start")
    default_voice_preset = "v2/en_speaker_6"
    if input_data is None:
        input_data = {"voice_preset": default_voice_preset, "text": "Hello, this is the default text"}

    if model is None or processor is None:
        print("model is None or processor is None. Auto loading")
        # Load strictly from the local copy, the same way model_fn does.
        processor = AutoProcessor.from_pretrained(model_local_path, local_files_only=True)
        model = BarkModel.from_pretrained(model_local_path, local_files_only=True).to(device)

    print("inputs start")
    # "voice_preset" is optional in the request: fall back to the default
    # speaker instead of failing with KeyError.
    voice_preset = input_data.get("voice_preset", default_voice_preset)
    inputs = processor(input_data["text"], voice_preset=voice_preset).to(device)

    print("model.generate start")
    audio_array = model.generate(**inputs)
    print("output start")
    output = audio_array.cpu().numpy().squeeze()
    return {"output": output}

def input_fn(request_body, request_content_type):
    """
    Decode the JSON request body into a Python object.

    Args:
        request_body: JSON document sent by the client.
        request_content_type: Content type of the request. It is accepted as
            part of the handler signature but is not inspected.

    Returns:
        The object produced by parsing ``request_body`` as JSON.
    """
    print("input_fn start")
    return json.loads(request_body)

Writing code/inference.py


In [6]:
framework_version = '2.1.0'
py_version = 'py310'

!touch dummy
!tar czvf model.tar.gz dummy
model_data = 's3://{0}/{1}/model.tar.gz'.format(bucket, 'bark')
!aws s3 cp model.tar.gz $model_data
!rm -f dummy model.tar.gz

dummy
upload: ./model.tar.gz to s3://sagemaker-us-east-1-432088571089/bark/model.tar.gz


In [7]:
from sagemaker.pytorch.model import PyTorchModel
# S3 prefix that holds the Bark model files; inference.py reads this value
# from the `model_s3` environment variable.
model_s3 = "s3://internal-modelzoo-us-east-1/suno/bark/"

# Environment variables set inside the serving container.
env = dict(
    SAGEMAKER_MODEL_SERVER_TIMEOUT='6000',
    SAGEMAKER_MODEL_SERVER_WORKERS='2',
    MMS_MAX_RESPONSE_SIZE='65535000',
    TS_MAX_RESPONSE_SIZE='65535000',
    model_s3=model_s3,
)

# Model definition: the placeholder archive plus the custom inference script
# from ./code/.
pytorchModel = PyTorchModel(
    name=None,
    role=role,
    model_data=model_data,
    source_dir="./code/",
    entry_point='inference.py',
    framework_version=framework_version,
    py_version=py_version,
    env=env,
)


In [8]:
%%time
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Requests and responses are exchanged as JSON.
json_serializer = JSONSerializer()
json_deserializer = JSONDeserializer()

# Create the endpoint on a single ml.g5.2xlarge instance.
predictor = pytorchModel.deploy(
    instance_type="ml.g5.2xlarge",
    initial_instance_count=1,
    serializer=json_serializer,
    deserializer=json_deserializer,
)

--------!

# inference and verification

In [9]:
# Keep the generated endpoint name; it is needed to re-attach to the endpoint later
endpoint_name = predictor.endpoint_name
print("endpoint_name:", endpoint_name)  # copy this endpoint_name

endpoint_name: pytorch-inference-2024-02-28-05-55-27-685


In [10]:
from bark import SAMPLE_RATE, generate_audio, preload_models
from IPython.display import Audio

In [11]:
%%time
# Test invocation 1: sample sentence with voice preset v2/en_speaker_6
input_data = dict(voice_preset="v2/en_speaker_6", text="Hello, this is the default text")
predictions = predictor.predict(input_data)
Audio(data=predictions["output"], rate=SAMPLE_RATE)


CPU times: user 51.9 ms, sys: 1.45 ms, total: 53.4 ms
Wall time: 11.4 s


In [12]:
%%time
# Test invocation 2: sample sentence with voice preset v2/en_speaker_6
input_data = dict(voice_preset="v2/en_speaker_6", text="Hello, this is the default text")
predictions = predictor.predict(input_data)
Audio(data=predictions["output"], rate=SAMPLE_RATE)


CPU times: user 39.8 ms, sys: 10.1 ms, total: 49.9 ms
Wall time: 13.3 s


In [13]:
%%time
# Test invocation 3: sample sentence with voice preset v2/en_speaker_6
input_data = dict(voice_preset="v2/en_speaker_6", text="Hello, this is the default text")
predictions = predictor.predict(input_data)
Audio(data=predictions["output"], rate=SAMPLE_RATE)


CPU times: user 32.7 ms, sys: 3.8 ms, total: 36.5 ms
Wall time: 9.91 s


In [31]:
#only for re-invoke already-created endpoint by using Predictor
import sagemaker
import numpy as np
from sagemaker import Predictor, serializers, deserializers
# Re-attach to an endpoint that already exists, without deploying again.
endpoint_name = "pytorch-inference-2024-02-28-05-55-27-685" #copy from prior step
sagemaker_session = sagemaker.Session()

# JSON in, JSON out - the same (de)serializers used at deploy time.
predictor = Predictor(
    endpoint_name,
    sagemaker_session=sagemaker_session,
    deserializer=deserializers.JSONDeserializer(),
    serializer=serializers.JSONSerializer(),
)


In [32]:
%%time
# Invoke the endpoint through the re-attached Predictor and play the result
input_data = dict(voice_preset="v2/en_speaker_6", text="Hello, this is the default text")
predictions = predictor.predict(input_data)
Audio(data=predictions["output"], rate=SAMPLE_RATE)

CPU times: user 65 ms, sys: 2.79 ms, total: 67.7 ms
Wall time: 17.3 s


In [41]:
%%time
#only for re-invoke already-created endpoint by using boto3 invoke_endpoint
import boto3,json

# Call the endpoint through the boto3 SageMaker runtime client.
endpoint_name = "pytorch-inference-2024-02-28-05-55-27-685" #copy from prior step
client = boto3.client('sagemaker-runtime')

input_data = {"voice_preset":"v2/en_speaker_6","text":"Hello, this is the default text"}
payload = json.dumps(input_data)
response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='application/json',
    Accept='application/json',
    Body=payload,
)

# Read the whole response body, decode it as UTF-8 and parse the JSON document.
raw_body = response["Body"].read()
output_json = json.loads(raw_body.decode('utf-8'))
Audio(output_json["output"], rate=SAMPLE_RATE)

CPU times: user 44.1 ms, sys: 4.02 ms, total: 48.1 ms
Wall time: 7.68 s


# backup

For debugging: run a prediction and print the exception message if it fails.

In [None]:

from IPython.display import Audio, display

try:
    input_data = {"voice_preset":"v2/en_speaker_6","text":"Hello, this is the default text"}
    predictions = predictor.predict(data = input_data)

    # Inside a try block the Audio object is not the cell's final expression,
    # so the notebook does not render it automatically; display() shows it.
    display(Audio(predictions["output"], rate=SAMPLE_RATE))
except Exception as e:
    # Debug helper: print the error message instead of a full traceback.
    print(e)

