In [1]:
import os
from os.path import join as pjoin, exists as pexists
import shutil

import torch
from transformers import (
    WhisperProcessor, WhisperForConditionalGeneration,
    Speech2TextProcessor, Speech2TextForConditionalGeneration
)


KeyboardInterrupt: 

In [None]:
root = './ms_2'
models_root = pjoin(root, 'models')
os.makedirs(models_root, exist_ok=True)

a2t_model_names = [
    # "AventIQ-AI/whisper-audio-to-text",
    # "facebook/s2t-small-librispeech-asr"

    # "facebook/wav2vec2-base-960h",
    # "superb/hubert-base-superb-asr",
    # "microsoft/wavlm-base-plus",
    
    "facebook/wav2vec2-base-960h",
    # "facebook/s2t-small-librispeech-asr",
]

t2s_model_names = [
    "distilbert-base-uncased-finetuned-sst-2-english",
    "siebert/sentiment-roberta-large-english",
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
]



### Export audio to text models

In [None]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxconverter_common import float16
import onnx

import torch, os

model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.eval()

dummy = torch.randn(1, 16000)  # 1s of 16kHz audio

out_p = f"{models_root}/a2t_{model_name.replace('/', '_')}_fp32.onnx"

if not pexists(out_p):
    torch.onnx.export(
        model,
        dummy,
        out_p,
        input_names=["input_values"],
        output_names=["logits"],
        dynamic_axes={"input_values": {0: "batch", 1: "sequence"}},
        opset_version=14
    )

print(f"DONE: Saved model (fp32) {model_name} at {out_p}")

out_path_fp16 = f"{models_root}/t2s_{model_name.replace('/', '_')}_fp16.onnx"
if not pexists(out_path_fp16):
    model_fp32 = onnx.load(out_p)
    model_fp16 = float16.convert_float_to_float16(model_fp32)
    onnx.save(model_fp16, out_path_fp16)
print(f"DONE: Saved model (fp16) {model_name} at {out_path_fp16}")


out_path_int8 = f"{models_root}/t2s_{model_name.replace('/', '_')}_int8.onnx"
if not pexists(out_path_int8):
    quantize_dynamic(
        out_p,
        out_path_int8,
        weight_type=QuantType.QUInt8,
    )
print(f"DONE: Saved model (int8) {model_name} at {out_path_int8}")


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DONE: Saved model (fp32) facebook/wav2vec2-base-960h at ./ms_2\models/a2t#facebook_wav2vec2-base-960h#fp32.onnx




DONE: Saved model (fp16) facebook/wav2vec2-base-960h at ./ms_2\models/t2s#facebook_wav2vec2-base-960h#fp16.onnx
DONE: Saved model (int8) facebook/wav2vec2-base-960h at ./ms_2\models/t2s#facebook_wav2vec2-base-960h#int8.onnx


In [None]:
from optimum.exporters.onnx import main_export
from pathlib import Path

model_name = "facebook/s2t-small-librispeech-asr"
output_dir = pjoin(f"{models_root}/a2t_{model_name.replace('/', '_')}_fp32")

if not pexists(output_dir):
    os.makedirs(output_dir, exist_ok=False)
    main_export(
        model_name_or_path=model_name,
        output=output_dir,
        task="automatic-speech-recognition",
    )
print(f"DONE: Saved model {model_name} at {out_p}")

# from onnxruntime.quantization import quantize_dynamic, QuantType

# for quant in ['']
# quantize_dir = pjoin(f"{models_root}/a2t_{model_name.replace('/', '_')}_int8")
# if not pexists(quantize_dir):
#     shutil.copytree(output_dir, quantize_dir)
#     quantize_dynamic(
#         f"{quantize_dir}/encoder_model.onnx",
#         f"{quantize_dir}/encoder_model.onnx",
#         weight_type=QuantType.QUInt8
#     )
    
#     quantize_dynamic(
#         f"{quantize_dir}/decoder_model.onnx",
#         f"{quantize_dir}/decoder_model.onnx",
#         weight_type=QuantType.QUInt8
#     )

DONE: Saved model facebook/s2t-small-librispeech-asr at ./ms_2\models/a2t#superb_hubert-base-superb-asr#fp32.onnx


### Export text to sentiment models

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxconverter_common import float16
import onnx

def export_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.eval()

    dummy_inputs = tokenizer("I love this movie!", return_tensors="pt")

    out_path = f"{models_root}/t2s_{model_name.replace('/', '_')}_fp32.onnx"

    if not pexists(out_path):
        torch.onnx.export(
            model,
            (dummy_inputs["input_ids"], dummy_inputs["attention_mask"]),
            out_path,
            input_names=["input_ids", "attention_mask"],
            output_names=["logits"],
            opset_version=14,
            dynamic_axes={
                "input_ids": {0: "batch_size", 1: "sequence"},
                "attention_mask": {0: "batch_size", 1: "sequence"},
            },
        )
    print(f"DONE: Saved model (fp32) {model_name} at {out_p}")

    
    out_path_fp16 = f"{models_root}/t2s_{model_name.replace('/', '_')}_fp16.onnx"
    if not pexists(out_path_fp16):
        model_fp32 = onnx.load(out_path)
        model_fp16 = float16.convert_float_to_float16(model_fp32)
        onnx.save(model_fp16, out_path_fp16)
    print(f"DONE: Saved model (fp16) {model_name} at {out_path_fp16}")


    out_path_int8 = f"{models_root}/t2s_{model_name.replace('/', '_')}_int8.onnx"
    if not pexists(out_path_int8):
        quantize_dynamic(
            out_path,
            out_path_int8,
            weight_type=QuantType.QUInt8,
        )
    print(f"DONE: Saved model (int8) {model_name} at {out_path_int8}")


for m in t2s_model_names:
    export_model(m)

DONE: Saved model (fp32) distilbert-base-uncased-finetuned-sst-2-english at ./ms_2\models/a2t#facebook_wav2vec2-base-960h#fp32.onnx
DONE: Saved model (fp16) distilbert-base-uncased-finetuned-sst-2-english at ./ms_2\models/t2s#distilbert-base-uncased-finetuned-sst-2-english#fp16.onnx
DONE: Saved model (int8) distilbert-base-uncased-finetuned-sst-2-english at ./ms_2\models/t2s#distilbert-base-uncased-finetuned-sst-2-english#int8.onnx
DONE: Saved model (fp32) siebert/sentiment-roberta-large-english at ./ms_2\models/a2t#facebook_wav2vec2-base-960h#fp32.onnx
DONE: Saved model (fp16) siebert/sentiment-roberta-large-english at ./ms_2\models/t2s#siebert_sentiment-roberta-large-english#fp16.onnx
DONE: Saved model (int8) siebert/sentiment-roberta-large-english at ./ms_2\models/t2s#siebert_sentiment-roberta-large-english#int8.onnx


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DONE: Saved model (fp32) cardiffnlp/twitter-roberta-base-sentiment-latest at ./ms_2\models/a2t#facebook_wav2vec2-base-960h#fp32.onnx
DONE: Saved model (fp16) cardiffnlp/twitter-roberta-base-sentiment-latest at ./ms_2\models/t2s#cardiffnlp_twitter-roberta-base-sentiment-latest#fp16.onnx
DONE: Saved model (int8) cardiffnlp/twitter-roberta-base-sentiment-latest at ./ms_2\models/t2s#cardiffnlp_twitter-roberta-base-sentiment-latest#int8.onnx


In [15]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer, pipeline

from transformers import AutoTokenizer
import onnxruntime as ort
import numpy as np
import torch

# 1️⃣ 加载 tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

# 2️⃣ 载入 onnx 模型
t2s_model_p = "./ms_2/models/t2s#distilbert-base-uncased-finetuned-sst-2-english#int8.onnx"
session = ort.InferenceSession(t2s_model_p)

# 3️⃣ 文本转输入张量
text = "I love how smooth this app runs!"
inputs = tokenizer(text, return_tensors="np", padding=True, truncation=True)

# 查看模型输入名称
for inp in session.get_inputs():
    print(f"Input name: {inp.name}, shape: {inp.shape}, type: {inp.type}")

# 4️⃣ 取输入名称
input_names = [i.name for i in session.get_inputs()]
output_names = [o.name for o in session.get_outputs()]

# 5️⃣ 执行推理
outputs = session.run(output_names, {
    input_names[0]: inputs["input_ids"].astype(np.int64),
    input_names[1]: inputs["attention_mask"].astype(np.int64)
})

# 6️⃣ softmax 得出概率
logits = torch.tensor(outputs[0])
probs = torch.softmax(logits, dim=-1)
labels = ["NEGATIVE", "POSITIVE"]

print({labels[i]: float(probs[0][i]) for i in range(2)})



Input name: input_ids, shape: ['batch_size', 'sequence'], type: tensor(int64)
Input name: attention_mask, shape: ['batch_size', 'sequence'], type: tensor(int64)
{'NEGATIVE': 0.0005933608626946807, 'POSITIVE': 0.9994066953659058}


In [29]:
from huggingface_hub import snapshot_download
from transformers import WhisperForConditionalGeneration, WhisperProcessor

model_name = "AventIQ-AI/whisper-audio-to-text"
model_dir = f"./ms_2/models/a2t_{model_name.replace('/', '_')}_fp32"

if not pexists(model_dir):
    snapshot_download(
        repo_id="AventIQ-AI/whisper-audio-to-text",
        local_dir=model_dir,
        resume_download=True
    )
print(f"{model_name} is saved into {model_dir}")

model = WhisperForConditionalGeneration.from_pretrained(model_dir)
processor = WhisperProcessor.from_pretrained(model_dir)
print(f'Saved model type: {next(model.parameters()).dtype}')


import torch

model_fp16 = model.half()
model_dir_fp16 = f"./ms_2/models/a2t_{model_name.replace('/', '_')}_fp16"
if not pexists(model_dir_fp16):
    model_fp16.save_pretrained(model_dir_fp16)
    processor.save_pretrained(model_dir_fp16)
print(f"{model_name} is saved into {model_dir_fp16}")
print(f'Saved model type: {next(model_fp16.parameters()).dtype}')


from transformers import WhisperForConditionalGeneration, WhisperProcessor

model_int8 = torch.quantization.quantize_dynamic(model, 
                                                 {torch.nn.Linear}, 
                                                 dtype=torch.qint8)

model_dir_int8 = f"./ms_2/models/a2t_{model_name.replace('/', '_')}_int8"
if not pexists(model_dir_int8):
    os.makedirs(model_dir_int8, exist_ok=True)
    torch.save(model_int8.state_dict(), f"{model_dir_int8}/pytorch_model.bin")
    processor.save_pretrained(model_dir_int8)
    model.config.save_pretrained(model_dir_int8)
print(f"{model_name} is saved into {model_dir_int8}")
print(f'Saved model type: {next(model_int8.parameters()).dtype}')


AventIQ-AI/whisper-audio-to-text is saved into ./ms_2/models/a2t_AventIQ-AI_whisper-audio-to-text_fp32
Saved model type: torch.float32
AventIQ-AI/whisper-audio-to-text is saved into ./ms_2/models/a2t_AventIQ-AI_whisper-audio-to-text_fp16
Saved model type: torch.float16
AventIQ-AI/whisper-audio-to-text is saved into ./ms_2/models/a2t_AventIQ-AI_whisper-audio-to-text_int8
Saved model type: torch.float16
