In [None]:
import os
from google.colab import drive

# Mount Google Drive so the segmented audio files are reachable from Colab.
drive.mount('/content/drive')

# Path of the first audio segment.
PATH = '/content/drive/My Drive/Colab_Notebooks/speech_research/切割音檔/音檔1(切割)/segment_0.m4a'

# Work from the segment directory and list it to confirm its contents.
os.chdir('/content/drive/My Drive/Colab_Notebooks/speech_research/切割音檔/音檔1(切割)/')
os.listdir()

In [None]:
!pip install transformers
!pip install datasets
!pip install fugashi
!pip install ipadic

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.18.0 dill-0.3.8 multiprocess-0.70.16
Collecting fugashi
  Downloading fugashi-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (600 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m600.9/600.9 kB[0m [31m6.8 MB/s[0m eta [36m

# 試作版本
# Embedding dim = 32
# Embedding vocab_size = 4000

In [None]:
from transformers import AutoTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F

class BLSTMSpeechScoring(nn.Module):
  """Score an utterance from its acoustic features and its token ids.

  Two bidirectional LSTMs encode the acoustic frames and the embedded token
  ids separately. Each encoder output is projected to ``hidden_size``,
  mean-pooled over the time axis, concatenated, passed through ReLU and
  mapped to ``output_size`` values by a final linear layer.

  Args:
    input_size: Feature dimension of each acoustic frame.
    hidden_size: Hidden size of each LSTM direction.
    num_layers: Number of stacked layers in each LSTM.
    output_size: Number of values predicted per utterance.
    embedding_dim: Dimension of the token embeddings.
    vocab_size: Number of rows in the embedding table.
  """

  def __init__(self, input_size=768, hidden_size=128, num_layers=1, output_size=1, embedding_dim=32, vocab_size=4000):
    super(BLSTMSpeechScoring, self).__init__()

    # BLSTM over the acoustic features.
    self.acoustic_blstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                    num_layers=num_layers, batch_first=True, bidirectional=True)

    # BLSTM over the linguistic (token) features.
    self.linguistic_blstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size,
                     num_layers=num_layers, batch_first=True, bidirectional=True)

    # Embedding table for the token ids.
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    # Project each bidirectional output (2 * hidden_size) down to hidden_size.
    self.acoustic_linear = nn.Linear(hidden_size * 2, hidden_size)
    self.linguistic_linear = nn.Linear(hidden_size * 2, hidden_size)

    # Final layer applied to the concatenation of both pooled vectors.
    self.final_linear = nn.Linear(hidden_size * 2, output_size)

  @staticmethod
  def _run_blstm(blstm, inputs, lengths):
    """Run ``blstm`` over ``inputs``; skip padded steps when ``lengths`` is given."""
    if lengths is None:
      outputs, _ = blstm(inputs)
      return outputs

    # Packing stops the backward direction from starting inside the padding.
    packed = nn.utils.rnn.pack_padded_sequence(
        inputs, lengths.cpu(), batch_first=True, enforce_sorted=False)
    packed_outputs, _ = blstm(packed)
    # total_length keeps the time axis the same size as the padded input.
    outputs, _ = nn.utils.rnn.pad_packed_sequence(
        packed_outputs, batch_first=True, total_length=inputs.size(1))
    return outputs

  @staticmethod
  def _mean_pool(features, lengths):
    """Average ``features`` over dim 1, counting only the first ``lengths`` steps."""
    if lengths is None:
      return torch.mean(features, dim=1)

    lengths = lengths.to(features.device)
    positions = torch.arange(features.size(1), device=features.device)
    # mask[b, t, 0] is True for the real (non-padded) steps of sample b.
    mask = (positions.unsqueeze(0) < lengths.unsqueeze(1)).unsqueeze(-1)
    summed = (features * mask.to(features.dtype)).sum(dim=1)
    return summed / lengths.unsqueeze(1).to(features.dtype)

  def forward(self, acoustic_input, linguistic_input, acoustic_lengths=None, linguistic_lengths=None):
    """Compute one score vector per utterance.

    Args:
      acoustic_input: Float tensor fed to the acoustic BLSTM (batch first).
      linguistic_input: Integer token ids fed to the embedding layer (batch first).
      acoustic_lengths: Optional per-sample number of real acoustic frames.
        When given, padded frames are excluded from the BLSTM and the pooling.
        ``None`` (default) treats every frame as real, as before.
      linguistic_lengths: Optional per-sample number of real tokens, handled
        the same way as ``acoustic_lengths``.

    Returns:
      Tensor of shape ``[batch, output_size]``.
    """
    if acoustic_lengths is not None:
      acoustic_lengths = torch.as_tensor(acoustic_lengths, dtype=torch.long)
    if linguistic_lengths is not None:
      linguistic_lengths = torch.as_tensor(linguistic_lengths, dtype=torch.long)

    # Acoustic branch.
    acoustic_output = self._run_blstm(self.acoustic_blstm, acoustic_input, acoustic_lengths)

    # Linguistic branch: embed the token ids, then encode them.
    embedded_chars = self.embedding(linguistic_input)
    linguistic_output = self._run_blstm(self.linguistic_blstm, embedded_chars, linguistic_lengths)

    # Bring both branches to the same feature size.
    acoustic_features = self.acoustic_linear(acoustic_output)
    linguistic_features = self.linguistic_linear(linguistic_output)

    # Global average pooling over time (masked when lengths are provided).
    gap_acoustic = self._mean_pool(acoustic_features, acoustic_lengths)
    gap_linguistic = self._mean_pool(linguistic_features, linguistic_lengths)

    # If the two inputs disagree on batch size, keep only the leading samples
    # that both of them have so the concatenation below is well-formed.
    if gap_acoustic.size(0) != gap_linguistic.size(0):
      min_batch_size = min(gap_acoustic.size(0), gap_linguistic.size(0))
      gap_acoustic = gap_acoustic[:min_batch_size, :]
      gap_linguistic = gap_linguistic[:min_batch_size, :]

    # Concatenate both pooled vectors and produce the final score.
    concatenated_features = torch.cat((gap_acoustic, gap_linguistic), dim=1)
    concatenated_features = F.relu(concatenated_features)
    score = self.final_linear(concatenated_features)

    return score


# TEST

In [None]:
# Build the character-level Japanese tokenizer and an untrained scoring model.
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")
model = BLSTMSpeechScoring()

# Example inputs: a random acoustic tensor and two sample Japanese sentences.
acoustic_input = torch.randn(2, 1000, 768)
text = ["あなたの名前は何ですか", "今日の天気はどうですか"]

# The tokenizer output holds the token indices under 'input_ids'.
encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=100)
linguistic_input = encoded_input['input_ids']

score = model(acoustic_input, linguistic_input)
print(score.shape)  # expected: [batch_size, output_size]
score

torch.Size([2, 1])


tensor([[0.0193],
        [0.0172]], grad_fn=<AddmmBackward0>)

In [None]:
acoustic_blstm = nn.LSTM(input_size=768, hidden_size=128,
             num_layers=1, batch_first=True, bidirectional=True)

two_tensor_a, tuple_b = acoustic_blstm(acoustic_input)

In [None]:
embedding = nn.Embedding(num_embeddings=4000, embedding_dim=32)

text = "あなたの名前は何ですか"
encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=100)
linguistic_input = encoded_input['input_ids']
embedding(linguistic_input).size()

torch.Size([1, 13, 32])

In [None]:
linguistic_input.size()

torch.Size([1, 13])

# Trainer

In [None]:
class Trainer:
  """Minimal train / evaluate / predict loop for a two-input scoring model.

  The model is called as ``model(acoustic_input, input_ids)``, where
  ``input_ids`` is obtained by tokenizing the raw text of each batch.

  Args:
    model: Module called with ``(acoustic_input, input_ids)``.
    tokenizer: Callable returning a mapping that has an ``'input_ids'`` entry.
    optimizer: Optimizer used by ``training_epoch``.
    loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
  """

  def __init__(self, model, tokenizer, optimizer, loss_fn):
    self.model = model
    self.tokenizer = tokenizer
    self.optimizer = optimizer
    self.loss_fn = loss_fn

  def _tokenize(self, text):
    """Tokenize a batch of strings and return the padded ``input_ids``."""
    encoded_input = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=100)
    return encoded_input['input_ids']

  def training_epoch(self, train_loader, batch_size):
    """Run one optimisation pass over ``train_loader``.

    Args:
      train_loader: Iterable of ``(acoustic_input, text, y_batch)`` batches.
      batch_size: Unused; kept so existing callers keep working.

    Returns:
      The per-sample average loss of the epoch (also printed).
    """
    self.model.train()
    total_loss = 0.0
    total_samples = 0
    for acoustic_input, text, y_batch in train_loader:
      self.optimizer.zero_grad()

      outputs = self.model(acoustic_input, self._tokenize(text))
      loss = self.loss_fn(outputs, y_batch.float())
      # A fresh graph is built every batch, so nothing needs to be retained.
      loss.backward()
      self.optimizer.step()

      # Weight each batch by its size so a smaller last batch is not over-counted.
      # Assumes loss_fn returns the batch mean (the nn.MSELoss default).
      n_samples = y_batch.size(0)
      total_loss += loss.item() * n_samples
      total_samples += n_samples

    avg_loss = total_loss / total_samples if total_samples else float("nan")
    print(f"Training Loss: {avg_loss}")
    return avg_loss

  def evaluate(self, eval_loader, batch_size):
    """Compute the loss over ``eval_loader`` without updating the model.

    Args:
      eval_loader: Iterable of ``(acoustic_input, text, y_batch)`` batches.
      batch_size: Unused; kept so existing callers keep working.

    Returns:
      The per-sample average loss (also printed).
    """
    self.model.eval()
    total_loss = 0.0
    total_samples = 0
    with torch.no_grad():
      for acoustic_input, text, y_batch in eval_loader:
        # Compare against every label of the batch, exactly as training does.
        y_batch = y_batch.float()

        outputs = self.model(acoustic_input, self._tokenize(text))
        loss = self.loss_fn(outputs, y_batch)

        n_samples = y_batch.size(0)
        total_loss += loss.item() * n_samples
        total_samples += n_samples

    avg_loss = total_loss / total_samples if total_samples else float("nan")
    print(f"Evaluation Loss: {avg_loss}")
    return avg_loss

  def fit(self, epochs, batch_size, train_loader):
    """Call ``training_epoch`` ``epochs`` times, printing an epoch header each time."""
    for epoch in range(epochs):
      print(f"Epoch {epoch+1}/{epochs}:", end=" ")
      self.training_epoch(train_loader, batch_size)

  def pred(self, acoustic_input, text):
    """Score one batch in eval mode; prints and returns the model outputs."""
    self.model.eval()
    with torch.no_grad():
      outputs = self.model(acoustic_input, self._tokenize(text))
      print("prediction score: ", outputs)  # TODO: map the score to a grade
    return outputs


## Hubert只需接收音檔
# BLSTM接收 Hubert output 和 文字

# 怎麼輸入進 Hubert？
## 將音檔先全部輸入進 Hubert得到特徵，再加入文字做成data_loader？

In [None]:
from transformers import AutoFeatureExtractor, AutoProcessor, HubertModel, AutoConfig

HUBERT_CHECKPOINT = "rinna/japanese-hubert-base"

# Load the audio preprocessor from the SAME checkpoint as the model so the
# waveform preparation follows that checkpoint's own preprocessing config
# (previously it came from "facebook/hubert-large-ls960-ft", a different model).
processor = AutoFeatureExtractor.from_pretrained(HUBERT_CHECKPOINT)

# output_hidden_states=True makes the model return every layer's hidden state.
config = AutoConfig.from_pretrained(HUBERT_CHECKPOINT, output_hidden_states=True)
hubert = HubertModel.from_pretrained(HUBERT_CHECKPOINT, config=config)
# Hidden states are shaped [batch_size, sequence_length, feature].

preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at rinna/japanese-hubert-base were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at rinna/japanese-hubert-base and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

# TEST

In [None]:
# Smoke test: push 4 random "waveforms" of 10000 samples each through HuBERT.
wav_input_16khz = torch.randn(4, 10000)
outputs = hubert(wav_input_16khz)

# Keep only the entries of `hidden_states` from index 8 onward.
# NOTE(review): the original comment here read "skip CNN layer" — confirm that
# index 8 is the intended cut-off.
transformer_hidden_states = outputs.hidden_states[8:]

# Stack the kept states along a new leading "layer" axis ...
stacked_hidden_states = torch.stack(transformer_hidden_states)

# ... then average that axis away, which leaves the sequence axis intact.
overall_avg_hidden_state = stacked_hidden_states.mean(dim=0)

print("stacked_hidden_states: ", stacked_hidden_states.shape) # [num_layers, batch_size, sequence_length, features]
print("overall_avg_hidden_state: ", overall_avg_hidden_state.shape) # [batch_size, sequence_length, features]
print("last_hidden_state: ", outputs.last_hidden_state.shape) # [batch_size, sequence_length, features]

# 做 dataframe

In [None]:
import torch

# Word list: one entry per recorded word.
text_table = [
    '～さい', 'あなた', 'あのかた', 'あのひと', 'いしゃ',
    'エンジニア', 'おいくつ', 'かいしゃいん', 'がくせい', 'きょうし',
    'ぎんこういん', 'けんきゅうしゃ', 'しゃいん', 'せんせい', 'だいがく',
    'だれ', 'でんき', 'どなた', 'なんさい', 'びょういん',
    'みなさん', 'わたし', 'わたしたち',
]

# Two 0/1 label rows kept for reference; the "b6" row is the one used below.
# a2 [1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
# b6 [1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0]
score_table = torch.tensor(
    [1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0]
)

len(text_table)

23

# (預計) 使用語音辨識產生 text_table

In [None]:
import pandas as pd

# Build the whole table in one step from a list of paths. Growing the frame
# row by row with `DataFrame.append` raises a FutureWarning on every call,
# copies the frame each time, and no longer exists in pandas 2.0.
df = pd.DataFrame({'audio_path': [f'/content/{name}.mp3' for name in text_table]})

df

  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.append({'audio_path': PATH}, ignore_index=True)
  df = df.appe

Unnamed: 0,audio_path
0,/content/～さい.mp3
1,/content/あなた.mp3
2,/content/あのかた.mp3
3,/content/あのひと.mp3
4,/content/いしゃ.mp3
5,/content/エンジニア.mp3
6,/content/おいくつ.mp3
7,/content/かいしゃいん.mp3
8,/content/がくせい.mp3
9,/content/きょうし.mp3


# 將音檔做處理(採樣率、單聲道)

In [None]:
import torchaudio

def process_waveforms(batch):
  """Load one audio file and convert it to a 16 kHz single-channel waveform.

  Reads the file at ``batch['audio_path']``, stores the waveform under
  ``batch["speech_array"]`` and 16000 under ``batch["sample_rate"]``, and
  returns the same ``batch`` mapping.
  """
  audio, native_rate = torchaudio.load(batch['audio_path'])

  # Bring every file to 16 kHz.
  if native_rate != 16000:
    audio = torchaudio.transforms.Resample(orig_freq=native_rate, new_freq=16000)(audio)

  # More than one channel: average the channels into one (needed for "4GE").
  has_several_channels = audio.size(0) > 1
  if has_several_channels:
    audio = audio.mean(dim=0)

  # Drop any remaining size-1 dimensions so the waveform is one-dimensional.
  if audio.ndim > 1:
    audio = audio.squeeze()

  batch["speech_array"] = audio
  batch["sample_rate"] = 16000
  return batch


In [None]:
from datasets import Dataset

# Convert under a new name so `df` stays a pandas DataFrame; rebinding `df`
# to a Dataset made this cell fail when it was run a second time.
audio_dataset = Dataset.from_pandas(df)

# Replace the path column with the loaded waveform and its sample rate.
data = audio_dataset.map(process_waveforms, remove_columns=audio_dataset.column_names)

data

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Dataset({
    features: ['speech_array', 'sample_rate'],
    num_rows: 23
})

# 送入 Hubert得到特徵

In [None]:
def get_acoustic_feature(batch):
  """Return HuBERT hidden states averaged over all returned layers.

  ``batch['speech_array']`` is passed through the global ``processor``
  (16 kHz, padded, truncated to at most 160000 samples) and then through the
  global ``hubert`` model with gradients disabled.

  Returns:
    Tensor shaped [batch_size, sequence_length, features].
  """
  with torch.no_grad():
    model_inputs = processor(
        batch['speech_array'],
        sampling_rate=16000,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=160000,
    )
    hubert_outputs = hubert(**model_inputs)

  # Use every entry of `hidden_states`: stack them on a new leading layer
  # axis, then average that axis away so the sequence axis is preserved.
  layer_stack = torch.stack(tuple(hubert_outputs.hidden_states))
  return layer_stack.mean(dim=0)

acoustic_feature = get_acoustic_feature(data)
# acoustic_feature

In [None]:
type(acoustic_feature)

torch.Tensor

In [None]:
acoustic_feature.size()

torch.Size([23, 47, 768])

# 加入文字並做成 dataset

# 動態做 text_table

In [None]:
# text_table -> list
# len(text_table) == len(acoustic_feature)

# 假設檔案名: 音檔\切割音檔\A_class\A_class_audio_1\segment1

# for class_name in ['A', 'B', 'C']:
#   for person in range(10):
#     for index in range(23):
#       PATH = f'音檔\切割音檔\{class_name}_class\{class_name}_class_audio_{person+1}\segment{index+1}'
#       text = whisper(PATH)
#       text_table.append(text)

# 加分數進去

# 看哪個音檔 NaN -> drop


In [None]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

batch_size = 3

def dynamic_padding(batch):
  """Collate ``(acoustic_feature, text, score)`` samples into one batch.

  Returns:
    A tuple ``(padded_feature, text, score_label)``: the features zero-padded
    to the longest sample (batch first), the texts as a tuple, and the scores
    as a float tensor with a trailing dimension of size 1.
  """
  features, texts, scores = [], [], []
  for feature, sentence, score in batch:
    features.append(feature)
    texts.append(sentence)
    scores.append(score)

  padded = pad_sequence(features, batch_first=True, padding_value=0.0)
  # Trailing axis so the labels line up with the model's [batch, 1] output.
  labels = torch.tensor(tuple(scores), dtype=torch.float)[..., None]
  return padded, tuple(texts), labels

def make_dataloader(acoustic_feature, text, score, batch_size, collate_fn=dynamic_padding, shuffle=True):
  """Zip features, texts and scores into samples and wrap them in a DataLoader.

  Args:
    acoustic_feature: Iterable of per-sample acoustic features.
    text: Iterable of per-sample texts.
    score: Iterable of per-sample scores.
    batch_size: Number of samples per batch.
    collate_fn: Function that merges a list of samples into one batch.
    shuffle: Whether to reshuffle every epoch. Defaults to True (the previous
      hard-coded behaviour); pass False for evaluation loaders.

  Returns:
    A ``DataLoader`` over ``(acoustic_feature, text, score)`` triples.
  """
  # zip stops at the shortest input, so surplus items in a longer one are dropped.
  dataset = list(zip(acoustic_feature, text, score))
  loader = DataLoader(dataset, shuffle=shuffle, batch_size=batch_size, collate_fn=collate_fn)
  return loader

train_loader = make_dataloader(acoustic_feature=acoustic_feature, text=text_table, score=score_table, batch_size=batch_size)


In [None]:
type(acoustic_feature)

torch.Tensor

# 放進 BLSTM

In [None]:
# 初始化 Tokenizer 和模型

tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")
blstm = BLSTMSpeechScoring()
loss_fn = nn.MSELoss()

In [None]:
from torch import optim

optimizer = optim.Adam(blstm.parameters(), lr=5e-4)

trainer = Trainer(blstm, tokenizer, optimizer, loss_fn)

In [None]:
trainer.fit(epochs=3, batch_size=batch_size, train_loader=train_loader)

Epoch 1/3: tensor([[0.7508],
        [0.7279],
        [0.7976]], grad_fn=<AddmmBackward0>)
tensor([[0.8131],
        [0.5439],
        [0.6089]], grad_fn=<AddmmBackward0>)
tensor([[0.6184],
        [0.4543],
        [0.4225]], grad_fn=<AddmmBackward0>)
tensor([[0.7018],
        [0.7533],
        [0.7623]], grad_fn=<AddmmBackward0>)
tensor([[0.5804],
        [0.4874],
        [0.5956]], grad_fn=<AddmmBackward0>)
tensor([[0.3202],
        [0.4351],
        [0.6467]], grad_fn=<AddmmBackward0>)
tensor([[0.6287],
        [0.5186],
        [0.4512]], grad_fn=<AddmmBackward0>)
tensor([[0.6747],
        [0.6162]], grad_fn=<AddmmBackward0>)
Training Loss: 0.6444899677298963
Epoch 2/3: tensor([[0.6545],
        [0.5611],
        [0.3422]], grad_fn=<AddmmBackward0>)
tensor([[0.6176],
        [0.6309],
        [0.3470]], grad_fn=<AddmmBackward0>)
tensor([[0.5048],
        [0.4627],
        [0.3505]], grad_fn=<AddmmBackward0>)
tensor([[0.5838],
        [0.2720],
        [0.4310]], grad_fn=<AddmmBa

In [None]:
acoustic_input, text, y_batch = next(iter(train_loader))

In [None]:
trainer.pred(acoustic_input, text) # 在做loss計算之前預測的分數都一樣，練太少或是長度太短？

# 100 -> 優
# .80 -> 佳
# .60 -> 可
# .40 -> 尚可
# .20 -> 差
# ..0 -> 劣

prediction score:  tensor([[0.6052],
        [0.5445],
        [0.5864]])


In [None]:
text, y_batch # 分數為零時，預測分數小於1。分數為一時，預測分數大於1。

(('おいくつ', 'けんきゅうしゃ', 'がくせい'),
 tensor([[1.],
         [1.],
         [0.]]))

In [None]:
y_batch

tensor([[1.],
        [1.],
        [0.]])