Hugging Face 不会自动选择设备，需要手动将模型和输入张量都移动到 GPU 上：
```python
# Use the first CUDA device when one is present; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

sentence = 'Hello World!'

# Load the tokenizer and the model, moving the model to the chosen device.
tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased')
model = BertModel.from_pretrained('bert-large-uncased').to(device)

# The tokenized inputs must be moved to the same device as the model
# before the forward pass.
inputs = tokenizer(sentence, return_tensors="pt")
inputs = inputs.to(device)
outputs = model(**inputs)
```

In [1]:
import ffmpeg

def extractAudio(input_file, output_file, overwrite=False):
    """Extract the audio stream of ``input_file`` and write it to ``output_file``.

    Args:
        input_file: Path of the media file to read.
        output_file: Path of the audio file to create.
        overwrite: Forwarded to ``ffmpeg.run`` as ``overwrite_output``. When
            True an existing ``output_file`` is replaced without asking; the
            default (False) keeps the previous behaviour, where ffmpeg asks
            for confirmation if the file already exists.

    Returns:
        None.
    """
    # Select only the audio stream of the input, dropping any video stream.
    audio = ffmpeg.input(input_file).audio
    ffmpeg.run(ffmpeg.output(audio, output_file), overwrite_output=overwrite)

In [2]:
# Extract the audio track of the source video into an MP3 file next to it.
extractAudio("./data/bda-030.mp4","./data/bda-030.mp3")

In [4]:
import torchaudio
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

In [2]:
import os
from pydub import AudioSegment
from concurrent.futures import ThreadPoolExecutor

# Load the extracted MP3 file into memory as a pydub AudioSegment
audio_file = AudioSegment.from_file("./data/bda-030.mp3", format="mp3")

def split_audio(start_time, end_time, segment_number):
    """Export one slice of the module-level ``audio_file`` as a 16 kHz WAV file.

    Args:
        start_time: Slice start used to index ``audio_file``.
        end_time: Slice end used to index ``audio_file``.
        segment_number: Number used in the output name ``temp<N>.wav``.

    Returns:
        None. The segment is written to ``./data/temp<segment_number>.wav``.
    """
    # Destination of this segment inside ./data
    wav_path = os.path.join("./data", f"temp{segment_number}.wav")

    # Cut the requested range and resample it to the 16 kHz target rate
    clip = audio_file[start_time:end_time].set_frame_rate(16000)

    # Write the clip to disk in WAV format
    clip.export(wav_path, format="wav")

# Total length of the audio file in milliseconds
audio_duration_ms = len(audio_file)
segment_duration = 30 * 1000  # 30 seconds

# Build the (start, end) millisecond range of every segment. Ceiling division
# is used so that a duration that is an exact multiple of the segment length
# does not produce an extra, empty trailing segment. The last range may extend
# past the end of the audio, exactly as before.
segment_count = -(-audio_duration_ms // segment_duration)
time_ranges = [
    (i * segment_duration, (i + 1) * segment_duration)
    for i in range(segment_count)
]

# Export the segments on a thread pool; segments are numbered from 1.
with ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(split_audio, start, end, number)
        for number, (start, end) in enumerate(time_ranges, start=1)
    ]
    # result() re-raises any exception raised inside a worker; without this
    # a failed export would be dropped silently.
    for future in futures:
        future.result()

In [5]:
# Load the speech-to-text model and its matching processor from the same
# checkpoint (the model loaded here is a Whisper model, not Wav2Vec2).
checkpoint = "vumichien/whisper-large-v2-jp"
model = AutoModelForSpeechSeq2Seq.from_pretrained(checkpoint)
processor = AutoProcessor.from_pretrained(checkpoint)

In [3]:
import torch
from glob import glob
import os
import re

In [6]:
# Read one WAV segment with torchaudio.
# NOTE(review): split_audio exports to ./data, while this reads from
# ./data/audio — presumably the segments were moved by hand; confirm.
waveform, sample_rate = torchaudio.load("./data/audio/temp1.wav")
# Show the shape of the first channel as a NumPy array
waveform[0].numpy().shape

(480000,)

```python
# Run on the first GPU when available, otherwise on the CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(device)

# Turn the first channel of the waveform into model inputs on that device
first_channel = waveform[0].numpy()
inputs = processor(first_channel, sampling_rate=sample_rate, return_tensors="pt")
inputs = inputs.to(device)

# Generate token ids from the input features
input_features = inputs.input_features
generated_ids = model.generate(input_features)

# Decode the ids of the first (only) sequence back to text and show it
decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
transcription = decoded[0]
print(transcription)
```

In [7]:
# Pick the device: the first GPU when CUDA is available, otherwise the CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Move the model to that device (as the last expression of the cell, the
# returned module is echoed as the cell output)
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=Tru

In [15]:
# Directory that holds the temp*.wav segments to transcribe
input_directory = "./data/audio"

# The segments are processed in batches: `index` is the 1-based batch number
# and `step` the number of segments per batch.
index = 4
step = 10


def _segment_number(path):
    """Return a sort key that orders ``tempN.wav`` paths by the integer N.

    Files whose name carries no number sort after all numbered ones.
    """
    match = re.search(r"temp(\d+)\.wav$", os.path.basename(path))
    number = int(match.group(1)) if match else float("inf")
    return (number, path)


# Collect all temp*.wav files in numeric order before slicing out the batch.
# A plain sorted() is lexicographic (temp1, temp10, temp100, temp11, ...), so
# the slice would not select segments (index-1)*step+1 .. index*step and the
# counter printed below would not match the files actually processed.
wav_files = sorted(
    glob(os.path.join(input_directory, "temp*.wav")), key=_segment_number
)[(index - 1) * step:index * step]

transcriptions = []
count = 1 + (index - 1) * step
for file in wav_files:
    # Read the WAV file with torchaudio; only the first channel is used
    waveform, sample_rate = torchaudio.load(file)
    inputs = processor(waveform[0].numpy(), sampling_rate=sample_rate, return_tensors="pt").to(device)

    # Generate token ids
    generated_ids = model.generate(inputs.input_features)

    # Decode the ids into the transcription text
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print("processing audio ",count)
    count += 1
    transcriptions.append(transcription)

# Sentence boundaries recognised by split_sentences:
#   1. whitespace that follows an ASCII "." or "?", unless the period belongs
#      to an abbreviation such as "e.g." or "Mr." (the original rule);
#   2. the position right after a full-width terminator (。！？), together with
#      any whitespace that follows it. Japanese text has no spaces between
#      sentences, so rule 1 alone never splits it. No split is made when
#      another terminator or a closing bracket/quote follows directly.
# Rule 2 relies on re.split() splitting on empty matches (Python 3.7+).
_SENTENCE_BOUNDARY = re.compile(
    r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
    r"|(?<=[。！？])(?![。！？」』）])\s*"
)


def split_sentences(text):
    """Split ``text`` into sentences, keeping the terminating punctuation.

    Args:
        text: The text to split.

    Returns:
        A non-empty list of sentences. Empty pieces (for example the one
        produced by trailing whitespace) are dropped. When nothing is left,
        ``[text]`` is returned so callers can always divide by the length.
    """
    sentences = [part for part in _SENTENCE_BOUNDARY.split(text) if part]
    return sentences or [text]

#with open("./subtitle"+str(index)+".srt", "w") as f:
#    for transcription in transcriptions:
#        f.write(transcription+"\n")

def _format_srt_time(seconds):
    """Format a time offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Round to whole milliseconds once, then split with integer arithmetic.
    # Truncating the float parts separately can turn a value such as
    # 29.999999999999996 into "00:00:29,999" instead of "00:00:30,000".
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


# Write the transcriptions to an SRT subtitle file.
# Each transcription is assumed to cover one 30-second segment, and that time
# is shared equally between its sentences, so cue times are approximations.
segment_duration = 30  # seconds per transcribed segment
# Time at which the first segment of this batch starts
batch_offset = (index - 1) * step * segment_duration

# UTF-8 is set explicitly so that non-ASCII text (e.g. Japanese) is written
# the same way regardless of the platform's default encoding.
with open("./subtitle"+str(index)+".srt", "w", encoding="utf-8") as f:
    srt_counter = 1
    for i, transcription in enumerate(transcriptions):
        # Drop blank pieces so that no empty cue is written
        sentences = [s for s in split_sentences(transcription) if s.strip()]
        if not sentences:
            continue
        sentence_count = len(sentences)
        sentence_duration = segment_duration / sentence_count

        for j, sentence in enumerate(sentences):
            start_time = batch_offset + i * segment_duration + j * sentence_duration
            end_time = start_time + sentence_duration

            # Write one cue: counter, time range, text, blank separator line
            f.write(f"{srt_counter}\n")
            f.write(f"{_format_srt_time(start_time)} --> {_format_srt_time(end_time)}\n")
            f.write(f"{sentence}\n\n")
            srt_counter += 1



processing audio  31
processing audio  32
processing audio  33
processing audio  34
processing audio  35
processing audio  36
processing audio  37
processing audio  38
processing audio  39
processing audio  40
