In [None]:
import os
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv

# Pull variables from a local .env file (python-dotenv) into the process
# environment, so the os.getenv lookups in the configuration cell can be
# satisfied without exporting secrets in the shell.
load_dotenv()

In [None]:
# Runtime settings come from the environment; each value is None when the
# corresponding variable is not set.

# Root directory for model files: the download cell writes blobs beneath it
# and from_pretrained() receives it as cache_dir.
CACHE_DIR = os.environ.get("HF_HOME")

# Connection string handed to BlobServiceClient.from_connection_string().
CONNECTION_STRING = os.environ.get('AZURE_STORAGE_CONNECTION_STRING')



# Load model from blob storage

In [None]:
# Fail fast with an actionable message. Without these checks an unset
# variable surfaces later as an opaque error (e.g. os.path.join(None, ...)
# raising TypeError in the middle of the download loop).
if not CONNECTION_STRING:
    raise RuntimeError(
        "AZURE_STORAGE_CONNECTION_STRING is not set; cannot connect to blob storage."
    )
if not CACHE_DIR:
    raise RuntimeError(
        "HF_HOME is not set; there is no target directory for the downloaded files."
    )

blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING)

print("\nDownloading blobs...")
container_client = blob_service_client.get_container_client("models")

# Mirror every blob of the "models" container under CACHE_DIR, using the
# blob name as the relative path so the directory layout is preserved.
blob_list = container_client.list_blobs()
for blob in blob_list:
    print("\t" + blob.name)
    download_file_path = os.path.join(CACHE_DIR, blob.name)
    os.makedirs(os.path.dirname(download_file_path), exist_ok=True)
    with open(download_file_path, mode="wb") as f:
        # Stream straight into the file instead of readall(): model weight
        # files can be very large and should not be held in memory at once.
        container_client.download_blob(blob.name).readinto(f)
    print("success")
    print()

In [None]:
# Hugging Face Hub identifier of the checkpoint to load.
model_id = "oshizo/japanese-sexual-moderation-v2"

# Both objects are resolved through CACHE_DIR, the same directory the blob
# download cell populates.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=CACHE_DIR,
)

# problem_type="regression" is passed through to from_pretrained(); the
# inference cell reads the resulting logits directly as scores.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, cache_dir=CACHE_DIR, problem_type="regression"
)



In [None]:

# Sample inputs to score.
texts = [
    "富士山は日本で一番高い山です。",
    "こんなになるまで我慢してたんだ...大変だったね",
    "俺は倒れる京子に近づき、担ぎ上げようと太ももの下に腕を通す。",
]

# Tokenisation needs no autograd context, so it lives outside no_grad().
# truncation=True is required for max_length to take effect: with
# padding=True alone the batch is only padded to its longest member and
# over-long inputs would pass through untruncated.
encoding = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=64,
    return_tensors="pt",
)

# Inference only: skip gradient tracking.
with torch.no_grad():
    scores = model(**encoding).logits

print(scores)

# Output recorded for the sample texts above (one score per input row):
# tensor([[-0.0045],
#         [ 0.2954],
#         [ 0.4391]])