In [1]:
!pip install transformers torchaudio

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.1->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.5.1->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.5.1->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.5.1->torchaudio)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.5.1->torchaudio)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [10]:
# List the Hugging Face hub cache directory (Colab location) to check which
# model snapshots have already been downloaded.
!ls -a /root/.cache/huggingface/hub

.  ..  .locks  models--facebook--wav2vec2-large-960h  version.txt


In [7]:
import torch
import torchaudio  # PyTorch's extension for audio I/O and processing
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Wav2Vec2ForCTC maps audio to text with CTC (Connectionist Temporal
# Classification), which suits tasks that have no explicit alignment between
# input frames and output characters.

# Inputs a reader may want to change, named once instead of repeated inline.
MODEL_ID = "facebook/wav2vec2-large-960h"
AUDIO_PATH = "54.mp3"  # may be mp3, wav, or another format torchaudio can decode

# Load the pretrained Wav2Vec 2.0 checkpoint. The id is resolved against the
# Hugging Face hub and downloaded on first use; on Colab the files are cached
# under /root/.cache/huggingface/hub.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
# The processor turns raw audio into model-ready input (normalisation, padding,
# conversion to tensors) and, on the output side, decodes token indices back
# into characters. It is told the sampling rate but does not convert it, which
# is why the audio is resampled explicitly below.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
# The CTC head takes the audio signal and returns per-time-step character
# scores (logits).
# NOTE(review): the "newly initialized: wav2vec2.masked_spec_embed" warning is
# believed to be harmless for inference (that weight is used for masking during
# training) - confirm against the transformers documentation.

# Sampling rate the checkpoint expects, read from the processor rather than
# hard-coded.
TARGET_SAMPLE_RATE = processor.feature_extractor.sampling_rate

# Load the audio file.
waveform, sample_rate = torchaudio.load(AUDIO_PATH)
# waveform is a tensor describing the signal over time, normally shaped
# (channels, samples): number of channels by samples per channel.
# sample_rate is the number of samples taken per second.
# Background: the loaded signal is in the time domain (time vs. amplitude). A
# Fourier transform would turn it into a spectrogram, where
#   x-axis: position in time,
#   y-axis: frequency components,
#   colour depth: signal intensity at that time and frequency.

# Resample when the file's rate differs from what the model expects.
if sample_rate != TARGET_SAMPLE_RATE:
    # Resample is a class; its instance implements __call__, so it can be
    # applied to the waveform like a function.
    resampler = torchaudio.transforms.Resample(
        orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE
    )
    waveform = resampler(waveform)
    sample_rate = TARGET_SAMPLE_RATE



Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Inspect the loaded waveform and its (channels, samples) shape.
print(waveform)
print(waveform.shape)

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0017, 0.0019, 0.0008]])
torch.Size([1, 4686995])


In [8]:
# Average over the channel axis to get a mono signal; keepdim retains the
# leading channel axis, so the result is still 2-D.
waveform = torch.mean(waveform, dim=0, keepdim=True)

In [4]:
# Preprocess the audio for the model. The result is dict-like and exposes the
# prepared tensor as `input_values`; return_tensors="pt" requests PyTorch tensors.
inputs = processor(
    waveform,
    sampling_rate=sample_rate,
    return_tensors="pt",
    padding=True,
)

In [15]:
# Drop all singleton axes and show the processed samples as a flat tensor.
print(torch.squeeze(inputs.input_values))

tensor([0.0016, 0.0016, 0.0016,  ..., 0.0106, 0.0119, 0.0057])


In [17]:
# Run speech recognition. Inference only, so gradient tracking is disabled.
with torch.no_grad():
    # Collapse any extra leading axes so the model always receives
    # (batch, samples). A plain squeeze(0) would strip the batch axis instead
    # whenever input_values is already 2-D.
    input_values = inputs.input_values
    input_values = input_values.reshape(-1, input_values.shape[-1])
    # Logits are the raw, unnormalised per-time-step scores over the
    # vocabulary: the values a softmax would turn into probabilities, not the
    # output of the softmax itself.
    logits = model(input_values=input_values).logits
    print("Logits (model output):", logits)

Logits (model output): tensor([[[ 17.5445, -36.8326, -36.5754,  ...,  -5.3231,  -3.9398,  -5.0937],
         [ 16.8957, -35.8509, -35.6496,  ...,  -5.1860,  -3.6696,  -4.8812],
         [ 16.1511, -34.7046, -34.5286,  ...,  -4.9961,  -3.4437,  -4.8931],
         ...,
         [ 18.6834, -38.1886, -37.8124,  ...,  -5.5980,  -4.1911,  -5.1241],
         [ 18.6549, -38.0649, -37.6845,  ...,  -5.5407,  -4.1437,  -5.0975],
         [ 17.4678, -36.7492, -36.4677,  ...,  -5.1098,  -3.9297,  -4.8781]]])


In [18]:
# Shape of the logits tensor returned by the model.
print(logits.shape)

torch.Size([1, 14646, 32])


In [20]:

# Greedy decoding: take the highest-scoring vocabulary index at every time step.
predicted_ids = logits.argmax(dim=-1)

# Decode the first (only) sequence of indices into text.
first_sequence_ids = predicted_ids[0]
transcription = processor.decode(first_sequence_ids)
print(f"Transcription: {transcription}")



Transcription: HALLO AND WET ENTO THIS RECOLLING WOT TO YOU BY THE BRITISH COUNCIL A BEACH BY JOHN RUSSEL IMAGINE A BEACH A QUIET PLACE WITH ONLY THE NOISE OF THE SEA AND THE GULLS IN THE BACKGROUND THERE ARE BOATS FLOATING NEAR THE SHORE AND A FEW PEOPLE SWIMMING IN THE WATER NEX TO THEM IT'S A HOT DAY AND THERE ARE SOME PEOPLE LYING ON THE SAND ENJOYING THE SUNSHINE AND SLOWLY GOING BROUND THERE ARE NO SHOPS NO PEOPLE MAKING NOISES KNOW LOUD MUSIC EVERYTHING IS PEACEFUL THERE IS JUST THE SEA THE SUN AND THE BEACH A LITTLE PARADISE WERE IS IT THE BEACH IS ON THE SOUTH COAST OF SCOTLAND NEAR A LITTLE TOWN CALLED GATE HOUSE OF FLEET IN THE COUNTY OF DUMFRICE AND GALLAWAY TWENTY TWO YEARS AGO MY FAMILY AND I FOUND THIS PLACE FOR THE FIRST TIME AND WE HAVE NEVER REALLY LEFT IT EVERY YEAR IN THE SUMMER WHILE OTHER PEOPLE GO ON HOLIDAY TO FOREIGN COUNDTRIES AND EXOTIC PLACES WE GO TO OUR PRIVATE PARADICE AND RELAX HERE IS A LITTLE CAMPSIHT WITH TENTS AND CARAVANS NEXT TO THE BEACH AND THIS 

In [23]:
# Peek at the first 500 predicted indices of the first sequence, then the
# overall shape of the prediction tensor.
print(predicted_ids[0, :500])
print(predicted_ids.shape)

tensor([ 0,  0,  0,  0,  0,  0,  0,  0, 11,  0,  7,  0, 15,  0,  0,  0, 15,  0,
         8,  0,  0,  0,  0,  0,  0,  0,  0,  4,  4,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  7,  0,  9,  0, 14,
         0,  4,  4, 18, 18,  5,  0,  0,  0,  0,  0,  6,  4,  0,  0,  5,  0,  9,
         0,  0,  0,  6,  0,  8,  0,  4,  6, 11, 10,  0,  0, 12,  0,  4,  0, 13,
         5,  0,  0,  0,  0, 19,  0,  8,  0,  0, 15,  0,  0, 15,  0,  0, 10,  9,
         0, 21,  0,  0,  4,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0, 18,  0,  0,  8,  0,  0,  0,  0,  0,  6,  0,  4,  4,  6,  8,
         0,  0,  4,  4,  0, 22,  0,  8, 16,  0,  0,  0,  0,  0,  4,  4,  0, 24,
         0, 22,  0,  0,  4,  4,  6, 11,  5,  4,  4, 24, 24, 13, 10,  0,  0,  6,
         6, 10,  0, 12, 12, 11,  0,  4,  0,  0, 19, 19,  8,  8, 16,  9,  9,  0,
         0,  0,  0, 19,  0, 10,  0, 15,  0,  0,  0,  0,  4,  4,  4,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0, 

In [19]:
# Show the processed audio features.
# `inputs` is the dict-like object returned by the processor and has no
# .size() method of its own; query the tensor it wraps instead.
print("Processed Inputs:", inputs)
print(inputs.input_values.size())

Processed Inputs: {'input_values': tensor([[[0.0016, 0.0016, 0.0016,  ..., 0.0106, 0.0119, 0.0057],
         [0.0016, 0.0016, 0.0016,  ..., 0.0106, 0.0119, 0.0057]]])}


In [14]:
# Show the tensor handed to the model together with its shape.
print("Input values (after processing):", inputs.input_values)
print(inputs.input_values.shape)

Input values (after processing): tensor([[[0.0016, 0.0016, 0.0016,  ..., 0.0106, 0.0119, 0.0057]]])
torch.Size([1, 1, 4686995])
