In [None]:
from google.colab import drive
drive.mount('/content/drive')
PATH = '/content/drive/My Drive/Colab_Notebooks/speech_research/切割音檔/音檔1(切割)/segment_0.m4a'
import os
os.chdir('/content/drive/My Drive/Colab_Notebooks/speech_research/切割音檔/音檔1(切割)/')
os.listdir() #確認目錄內容

In [1]:
!pip install transformers
!pip install datasets
!pip install fugashi
!pip install ipadic

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.18.0 dill-0.3.8 multiprocess-0.70.16
Collecting fugashi
  Downloading fugashi-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (600 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m600.9/600.9 kB[0m [31m2.3 MB/s[0m eta [36m

# 試作版本
# Embedding dim = 64
# Embedding vocab_size = 4000

In [2]:
from transformers import AutoTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F

class BLSTMSpeechScoring(nn.Module):
  def __init__(self, input_size=768, hidden_size=256, num_layers=1, output_size=1, embedding_dim=64, vocab_size=4000):
    super(BLSTMSpeechScoring, self).__init__()

    # 聲學特徵的 BLSTM
    self.acoustic_blstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                    num_layers=num_layers, batch_first=True, bidirectional=True)

    # 語言特徵（字符）的 BLSTM
    self.linguistic_blstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size,
                     num_layers=num_layers, batch_first=True, bidirectional=True)

    # 字符的嵌入層
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    # 處理 BLSTM 輸出的線性層，以匹配維度
    self.acoustic_linear = nn.Linear(hidden_size * 2, output_size)
    self.linguistic_linear = nn.Linear(hidden_size * 2, output_size)

    # 串接後的最終線性層
    self.final_linear = nn.Linear(output_size * 2, output_size)

  def forward(self, acoustic_input, linguistic_input):
    # 聲學輸入通過 BLSTM
    acoustic_output, _ = self.acoustic_blstm(acoustic_input)

    # 將語言輸入嵌入並通過 BLSTM
    embedded_chars = self.embedding(linguistic_input)
    linguistic_output, _ = self.linguistic_blstm(embedded_chars)

    # 對兩輸出進行全局平均池化（GAP）
    gap_acoustic = torch.mean(acoustic_output, dim=1)
    gap_linguistic = torch.mean(linguistic_output, dim=1)

    # 線性層確保維度匹配
    acoustic_features = self.acoustic_linear(gap_acoustic)
    linguistic_features = self.linguistic_linear(gap_linguistic)

    # 確保在串接之前批量大小相同，怕音檔和文字的數量不對，取完整的
    if acoustic_features.size(0) != linguistic_features.size(0):
      min_batch_size = min(acoustic_features.size(0), linguistic_features.size(0))
      acoustic_features = acoustic_features[:min_batch_size, :]
      linguistic_features = linguistic_features[:min_batch_size, :]

    # 串接特徵並最終評分
    concatenated_features = torch.cat((acoustic_features, linguistic_features), dim=1)
    concatenated_features = F.relu(concatenated_features)
    score = self.final_linear(concatenated_features)

    return score


# TEST

In [161]:
# 初始化 Tokenizer 和模型
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")
model = BLSTMSpeechScoring()

# 範例使用
acoustic_input = torch.randn(2, 1000, 768)  # 聲學特徵的示例張量
text = ["あなたの名前は何ですか", "今日の天気はどうですか"]  # 日語文本樣本
encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=100)
linguistic_input = encoded_input['input_ids']  # Tokenizer 的輸出包含 tokens 的索引

score = model(acoustic_input, linguistic_input)
print(score.shape)  # 應該是 [batch_size, output_size]
score

torch.Size([2, 1])


tensor([[0.4516],
        [0.4380]], grad_fn=<AddmmBackward0>)

# Trainer

In [3]:
class Trainer:
  def __init__(self, model, tokenizer, optimizer, loss_fn):
    self.model = model
    self.tokenizer = tokenizer
    self.optimizer = optimizer
    self.loss_fn = loss_fn

  def training_epoch(self, train_loader, batch_size):
    self.model.train()
    total_loss = 0
    for acoustic_input, text, y_batch in train_loader:
      self.optimizer.zero_grad()

      encoded_input = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=100)
      linguistic_input = encoded_input['input_ids']
      outputs = self.model(acoustic_input, linguistic_input)
      print(outputs)
      loss = self.loss_fn(outputs, y_batch) # 改
      loss.backward(retain_graph=True)

      self.optimizer.step()
      total_loss += loss.item()
    avg_loss = total_loss / len(train_loader) * batch_size# 只適用batch_size = 1
    print(f"Training Loss: {avg_loss}")

  def evaluate(self, eval_loader, batch_size):
    self.model.eval()
    total_loss = 0
    with torch.no_grad():
      for acoustic_input, text, y_batch in eval_loader:
        y_batch = y_batch[0].float()

        encoded_input = self.tokenizer(text, padding=True, truncation=True, return_tensor="pt", max_length=100)
        linguistic_input = encoded_input['input_ids']
        outputs = self.model(acoustic_input, linguistic_input)

        loss = self.loss_fn(outputs, y_batch)
        total_loss += loss.item()
      avg_loss = total_loss / len(eval_loader) * batch_size
      print(f"Evaluation Loss: {avg_loss}")

  def fit(self, epochs, batch_size, train_loader):
    for epoch in range(epochs):
      print(f"Epoch {epoch+1}/{epochs}:", end=" ")
      self.training_epoch(train_loader, batch_size)

  def pred(self, acoustic_input, text):
    self.model.eval()
    with torch.no_grad():
      encoded_input = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=100)
      linguistic_input = encoded_input['input_ids']
      outputs = self.model(acoustic_input, linguistic_input)
      print("prediction score: ", outputs) # 加分級


## Hubert只需接收音檔
# BLSTM接收 Hubert output 和 文字

# 怎麼輸入進 Hubert？
## 將音檔先全部輸入進 Hubert得到特徵，再加入文字做成data_loader？

In [4]:
from transformers import AutoProcessor, HubertModel, AutoConfig

processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
config = AutoConfig.from_pretrained("rinna/japanese-hubert-base", output_hidden_states=True)
hubert = HubertModel.from_pretrained("rinna/japanese-hubert-base", config=config)
# [batch_size, sequence_length, feature]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at rinna/japanese-hubert-base were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at rinna/japanese-hubert-base and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

# TEST

In [None]:
# 測試輸入
wav_input_16khz = torch.randn(4, 10000)
outputs = hubert(wav_input_16khz)

# skip CNN layer
transformer_hidden_states = outputs.hidden_states[8:]

# Stack transformer hidden states to have a new dimension for layers
stacked_hidden_states = torch.stack(transformer_hidden_states)

# Average across layers dimension (0) while keeping sequence_length
overall_avg_hidden_state = torch.mean(stacked_hidden_states, dim=0)

print("stacked_hidden_states: ", stacked_hidden_states.size()) # [num_layers, batch_size, sequence_length, featrues]
print("overall_avg_hidden_state: ", overall_avg_hidden_state.size()) # [batch_size, sequence_length, featrues]
print("last_hidden_state: ", outputs.last_hidden_state.size()) # [batch_size, sequence_length, featrues]

# 做 dataframe

In [5]:
text_table = ['～さい','あなた','あのかた','あのひと','いしゃ','エンジニア','おいくつ','かいしゃいん','がくせい','きょうし','ぎんこういん','けんきゅうしゃ','しゃいん','せんせい','だいがく','だれ','でんき','どなた','なんさい','びょういん','みなさん','わたし','わたしたち']

score_table = [1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
score_table = torch.tensor(score_table)

len(text_table)

23

# (預計) 使用語音辨識產生 text_table

In [6]:
import pandas as pd

df = pd.DataFrame(columns=['audio_path'])

for name in text_table:
  PATH = f'/content/{name}.mp3'
  df = df.append({'audio_path': PATH}, ignore_index=True)

df

'''
text_table = {}
for index in range(23):
  PATH = f'/content/切割音檔/A班/segment{index}
  df = df.append({'audio_path': PATH}, ignore_index=True)

  text = 語音辨識(PATH)
  text_table = text_table.append(text)
'''

  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.appe

"\ntext_table = {}\nfor index in range(23):\n  PATH = f'/content/切割音檔/A班/segment{index}\n  df = df.append({'audio_path': PATH}, ignore_index=True)\n\n  text = 語音辨識(PATH)\n  text_table = text_table.append(text)\n"

# 將音檔做處理(採樣率、單聲道)

In [7]:
import torchaudio

def process_waveforms(batch):

  waveform, sample_rate = torchaudio.load(batch['audio_path'])

  if sample_rate != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resampler(waveform)

  # 如果 waveform 是雙聲道，需要轉單聲道。給 4GE用
  if waveform.size(0) > 1:
    waveform = waveform.mean(dim=0)

  # 讓 waveform的維度正確
  if waveform.ndim > 1:
    waveform = waveform.squeeze()

  batch["speech_array"] = waveform
  batch["sample_rate"] = 16000

  return batch


In [8]:
from datasets import Dataset

df = Dataset.from_pandas(df)
data = df.map(process_waveforms, remove_columns=df.column_names)

data

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Dataset({
    features: ['speech_array', 'sample_rate'],
    num_rows: 23
})

# 送入 Hubert得到特徵

In [9]:
def get_acoustic_feature(batch):
  processed_audios = processor(batch['speech_array'],
                  sampling_rate=16000,
                  return_tensors="pt",
                  padding=True,
                  truncation=True,
                  max_length=160000)
  outputs = hubert(**processed_audios)

  # skip CNN layers
  transformer_hidden_states = outputs.hidden_states[8:]

  # Stack transformer hidden states to have a new dimension for layers
  stacked_hidden_states = torch.stack(transformer_hidden_states)

  # Average across layers dimension (0) while keeping sequence_length
  overall_avg_hidden_state = torch.mean(stacked_hidden_states, dim=0)

  return overall_avg_hidden_state # [batch_size, sequence_length, featrues]

acoustic_feature = get_acoustic_feature(data)
# acoustic_feature

In [None]:
acoustic_feature.size()

torch.Size([23, 52, 768])

# 加入文字並做成 dataset

In [10]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

batch_size = 2

def dynamic_padding(batch):
  acoustic_feature, text, score_label = zip(*batch)

  padded_feature = pad_sequence(acoustic_feature, batch_first=True, padding_value=0.0)
  score_label = torch.tensor(score_label, dtype=torch.float)
  score_label = score_label.unsqueeze(-1)
  return padded_feature, text, score_label

def make_dataloader(acoustic_feature, text, score, batch_size, collate_fn=dynamic_padding):
  dataset = list(zip(acoustic_feature, text, score))
  loader = DataLoader(dataset, shuffle=True, batch_size=batch_size, collate_fn=collate_fn)
  return loader

train_loader = make_dataloader(acoustic_feature=acoustic_feature, text=text_table, score=score_table, batch_size=batch_size)


# 放進 BLSTM

In [11]:
from torch import optim
import torch.nn as nn

# 初始化 Tokenizer 和模型
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")
blstm = BLSTMSpeechScoring()
optimizer = optim.Adam(blstm.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

tokenizer_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/478 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

In [12]:
trainer = Trainer(blstm, tokenizer, optimizer, loss_fn)

In [13]:
trainer.fit(epochs=2, batch_size=batch_size, train_loader=train_loader)

Epoch 1/2: tensor([[-0.2749],
        [-0.2797]], grad_fn=<AddmmBackward0>)
tensor([[-0.2553],
        [-0.2315]], grad_fn=<AddmmBackward0>)
tensor([[-0.2234],
        [-0.2163]], grad_fn=<AddmmBackward0>)
tensor([[-0.2027],
        [-0.1521]], grad_fn=<AddmmBackward0>)
tensor([[-0.1613],
        [-0.1701]], grad_fn=<AddmmBackward0>)
tensor([[-0.1083],
        [-0.1753]], grad_fn=<AddmmBackward0>)
tensor([[-0.0855],
        [-0.0300]], grad_fn=<AddmmBackward0>)
tensor([[-0.0518],
        [-0.0624]], grad_fn=<AddmmBackward0>)
tensor([[ 0.0035],
        [-0.1191]], grad_fn=<AddmmBackward0>)
tensor([[0.0875],
        [0.0105]], grad_fn=<AddmmBackward0>)
tensor([[ 0.1451],
        [-0.0262]], grad_fn=<AddmmBackward0>)
tensor([[-0.0109]], grad_fn=<AddmmBackward0>)
Training Loss: 2.2598682244618735
Epoch 2/2: tensor([[0.1214],
        [0.1511]], grad_fn=<AddmmBackward0>)
tensor([[0.0742],
        [0.3444]], grad_fn=<AddmmBackward0>)
tensor([[0.5122],
        [0.2986]], grad_fn=<AddmmBackward

In [33]:
acoustic_input, text, y_batch = next(iter(train_loader))

In [34]:
trainer.pred(acoustic_input, text) # 在做loss計算之前預測的分數都一樣，練太少或是長度太短？

# 100 -> 優
# .80 -> 佳
# .60 -> 可
# .40 -> 尚可
# .20 -> 差
# ..0 -> 劣

prediction score:  tensor([[0.9638],
        [1.1769]])


In [35]:
text, y_batch # 分數為零時，預測分數小於1。分數為一時，預測分數大於1。

(('あのかた', 'きょうし'),
 tensor([[0.],
         [1.]]))