In [None]:
!pip install transformers
!pip install datasets
!pip install fugashi
!pip install ipadic

!pip install flask-ngrok
!pip install flask
!pip install pyngrok

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.18.0 dill-0.3.8 multiprocess-0.70.16
Collecting fugashi
  Downloading fugashi-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (600 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m600.9/600.9 kB[0m [31m4.8 MB/s[0m eta [36m

# Load Scoring Module

In [None]:
from transformers import AutoTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F

class BLSTMSpeechScoring(nn.Module):
  """Two-branch bidirectional-LSTM scorer.

  One branch encodes a sequence of acoustic feature vectors, the other a
  sequence of token ids (embedded first). Each branch is projected to
  `hidden_size`, mean-pooled over the sequence axis, and the two pooled
  vectors are concatenated, passed through ReLU and mapped to `output_size`.

  Args:
    input_size: width of each acoustic feature vector.
    hidden_size: hidden width of each LSTM direction.
    num_layers: number of stacked LSTM layers in both branches.
    output_size: width of the final score vector.
    embedding_dim: width of the token embedding.
    vocab_size: number of rows in the embedding table.
  """

  def __init__(self, input_size=768, hidden_size=128, num_layers=1, output_size=1, embedding_dim=64, vocab_size=4000):
    super().__init__()

    # A bidirectional LSTM emits forward and backward states side by side.
    blstm_width = hidden_size * 2
    shared_lstm_kwargs = dict(hidden_size=hidden_size, num_layers=num_layers,
                              batch_first=True, bidirectional=True)

    # BLSTM over the acoustic features.
    self.acoustic_blstm = nn.LSTM(input_size=input_size, **shared_lstm_kwargs)

    # BLSTM over the linguistic (character) features.
    self.linguistic_blstm = nn.LSTM(input_size=embedding_dim, **shared_lstm_kwargs)

    # Embedding table for the character ids.
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    # Per-branch projections that bring the BLSTM outputs to a common width.
    self.acoustic_linear = nn.Linear(blstm_width, hidden_size)
    self.linguistic_linear = nn.Linear(blstm_width, hidden_size)

    # Final layer applied to the concatenated branch summaries.
    self.final_linear = nn.Linear(blstm_width, output_size)

  def forward(self, acoustic_input, linguistic_input):
    """Score a batch.

    Args:
      acoustic_input: float tensor fed to the batch-first acoustic LSTM.
      linguistic_input: integer id tensor fed to the embedding layer.

    Returns:
      Tensor with one row per example and `output_size` columns.
    """
    # Sequence outputs of both branches (the LSTM state tuples are unused).
    acoustic_seq = self.acoustic_blstm(acoustic_input)[0]
    char_seq = self.linguistic_blstm(self.embedding(linguistic_input))[0]

    # Project, then global-average-pool over the sequence axis.
    pooled_acoustic = self.acoustic_linear(acoustic_seq).mean(dim=1)
    pooled_linguistic = self.linguistic_linear(char_seq).mean(dim=1)

    # Guard against a different number of audio clips and texts: keep only
    # the leading examples that both branches have.
    shared_batch = min(pooled_acoustic.size(0), pooled_linguistic.size(0))
    pooled_acoustic = pooled_acoustic[:shared_batch, :]
    pooled_linguistic = pooled_linguistic[:shared_batch, :]

    # Fuse both summaries and produce the final score.
    fused = F.relu(torch.cat((pooled_acoustic, pooled_linguistic), dim=1))
    return self.final_linear(fused)


# Trainer

In [None]:
from sklearn.metrics import confusion_matrix

# Decision threshold: model outputs strictly above it count as class 1.
# `jugde` is the historical (misspelled) name and is kept because existing
# code reads it; `judge` is a correctly spelled alias with the same value.
jugde = 0.65
judge = jugde

class Trainer:
  """Train, evaluate and run inference with a two-input scoring model.

  The model is called as `model(acoustic_input, input_ids)`, where
  `input_ids` comes from tokenizing the raw text of each batch.

  Args:
    model: module called with (acoustic_input, linguistic_input).
    tokenizer: callable returning a mapping with an 'input_ids' tensor.
    optimizer: optimizer over the model parameters.
    loss_fn: callable `loss_fn(outputs, targets)` returning a scalar tensor.
  """

  def __init__(self, model, tokenizer, optimizer, loss_fn):
    self.model = model
    self.tokenizer = tokenizer
    self.optimizer = optimizer
    self.loss_fn = loss_fn

  def _encode(self, text):
    """Tokenize `text` and return the padded/truncated id tensor."""
    encoded_input = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=100)
    return encoded_input['input_ids']

  @staticmethod
  def _align_target(outputs, y_batch):
    """Give the target the same shape as the outputs when sizes agree.

    Without this, (B, 1) outputs against (B,) targets are broadcast inside
    the loss to a (B, B) comparison, which silently yields a wrong value.
    """
    if torch.is_tensor(y_batch) and y_batch.shape != outputs.shape and y_batch.numel() == outputs.numel():
      return y_batch.reshape(outputs.shape)
    return y_batch

  @staticmethod
  def _mean_loss(total_loss, num_batches):
    """Average loss per batch; NaN when no batch was seen."""
    return total_loss / num_batches if num_batches else float("nan")

  def training_epoch(self, train_loader):
    """Run one optimisation pass over `train_loader`.

    Each batch is an (acoustic_input, text, y_batch) triple.

    Returns:
      Mean loss per batch (NaN for an empty loader).
    """
    self.model.train()
    total_loss = 0
    num_batches = 0
    for acoustic_input, text, y_batch in train_loader:
      self.optimizer.zero_grad()

      outputs = self.model(acoustic_input, self._encode(text))
      loss = self.loss_fn(outputs, self._align_target(outputs, y_batch))
      loss.backward()

      self.optimizer.step()
      total_loss += loss.item()
      num_batches += 1
    avg_loss = self._mean_loss(total_loss, num_batches)
    print(f"Training Loss: {avg_loss}")
    return avg_loss

  def evaluate(self, eval_loader):
    """Compute the mean loss over `eval_loader` without updating weights.

    Returns:
      Mean loss per batch (NaN for an empty loader).
    """
    self.model.eval()
    total_loss = 0
    num_batches = 0
    with torch.no_grad():
      for acoustic_input, text, y_batch in eval_loader:
        outputs = self.model(acoustic_input, self._encode(text))
        loss = self.loss_fn(outputs, self._align_target(outputs, y_batch))
        total_loss += loss.item()
        num_batches += 1
    avg_loss = self._mean_loss(total_loss, num_batches)
    print(f"Evaluation Loss: {avg_loss}")
    return avg_loss

  def fit(self, epochs, train_loader):
    """Call `training_epoch` `epochs` times, printing a progress prefix."""
    for epoch in range(epochs):
      print(f"Epoch {epoch+1}/{epochs}:", end=" ")
      self.training_epoch(train_loader)

  def pred(self, acoustic_input, text):
    """Return the raw model outputs for one batch, in eval mode, no grad."""
    self.model.eval()
    with torch.no_grad():
      outputs = self.model(acoustic_input, self._encode(text))
    return outputs

  def confusion_matrix(self, eval_loader):
    """Binarise predictions at the module-level threshold and print metrics.

    Targets in `eval_loader` are expected to be 0/1 labels.

    Returns:
      (precision, recall, specificity, accuracy, f1)
    """
    self.model.eval()
    all_predictions = []
    all_true_labels = []

    with torch.no_grad():
      for acoustic_input, text, y_batch in eval_loader:
        outputs = self.pred(acoustic_input, text)
        # Comparison result stays on the outputs' device (no CPU constants).
        predicted = (outputs > jugde).float()
        all_predictions.extend(predicted.reshape(-1).tolist())
        all_true_labels.extend(y_batch.reshape(-1).tolist())

    # Inside a method the bare name resolves to the module-level
    # `confusion_matrix` import, not to this method. Fixing `labels` keeps
    # the matrix 2x2 even when only one class occurs, so the unpacking
    # below cannot fail.
    cm = confusion_matrix(all_true_labels, all_predictions, labels=[0, 1])
    print("Confusion Matrix:\n", cm)

    tn, fp, fn, tp = cm.ravel()
    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"True Positives: {tp}", end=f"\n----------------\n")

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (fp + tn) if (fp + tn) > 0 else 0
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"Specificity: {specificity:.2f}")

    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total > 0 else 0
    print(f"Accuracy: {accuracy:.2f}")

    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    print(f"F1 Score: {f1:.2f}")
    return precision, recall, specificity, accuracy, f1

In [None]:
from transformers import AutoProcessor, HubertModel, AutoConfig

# Audio pre-processor used to turn raw waveforms into HuBERT inputs.
# NOTE(review): the processor comes from "facebook/hubert-large-ls960-ft"
# while the encoder below is "rinna/japanese-hubert-base" — confirm the two
# use compatible input normalisation before changing either one.
processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")

# Ask the encoder to return the hidden states of every layer, not only the last.
config = AutoConfig.from_pretrained("rinna/japanese-hubert-base", output_hidden_states=True)
hubert = HubertModel.from_pretrained("rinna/japanese-hubert-base", config=config)
# Each hidden state is [batch_size, sequence_length, feature]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at rinna/japanese-hubert-base were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at rinna/japanese-hubert-base and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

# (預計) 使用語音辨識產生 text_table

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Speech-recognition processor and model used to transcribe uploaded audio.
whisperProcessor = WhisperProcessor.from_pretrained("jakeyoo/whisper-medium-ja")
# NOTE(review): `model` is a very generic global name for the Whisper model;
# it is read by `whisper()` below, so renaming requires updating that function.
model = WhisperForConditionalGeneration.from_pretrained("jakeyoo/whisper-medium-ja")
# Clear any decoder ids forced by the checkpoint's config.
model.config.forced_decoder_ids = None


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/830 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [None]:
import pandas as pd
import librosa
import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset

def speech_file_to_array_fn(batch):
  """Load the audio file at `batch["path"]` with librosa at 16 kHz.

  The decoded samples are stored under `batch["array"]` and the same
  (mutated) mapping is returned, as `Dataset.map` expects.
  """
  # librosa returns (samples, sampling_rate); only the samples are kept.
  batch["array"] = librosa.load(batch["path"], sr=16_000)[0]
  return batch

def whisper(audio_paths):
  """Transcribe a single audio file with the module-level Whisper model.

  Args:
    audio_paths: path of ONE audio file (the plural name is historical).

  Returns:
    The transcription as a string, with at most one trailing punctuation or
    whitespace character removed.
  """
  # Characters that may be trimmed from the end of a transcript. The previous
  # implementation dropped the last character unconditionally, which removed
  # a real character whenever the transcript did not end in punctuation.
  trailing_noise = "。．.、，,！!？? \n"

  # One-row dataset holding the audio path.
  test_dataset = Dataset(pa.Table.from_pandas(pd.DataFrame({"path": [audio_paths]})))

  # Decode the audio file into a sample array.
  test_dataset = test_dataset.map(speech_file_to_array_fn)

  # Convert the samples into the model's input features.
  input_features = whisperProcessor(test_dataset['array'], sampling_rate=16_000, return_tensors="pt").input_features

  # Inference only: no gradients needed.
  with torch.no_grad():
    predicted_ids = model.generate(input_features)

  # Decode the predicted token ids into text.
  transcription = whisperProcessor.batch_decode(predicted_ids, skip_special_tokens=True)

  # Trim a single trailing punctuation/whitespace character, if present.
  text = [
    item[:-1] if item and item[-1] in trailing_noise else item
    for item in transcription
  ]

  # Only one file was submitted, so return its transcript.
  return text[0]


# 做 dataframe

# 有 dataframe => process_waveforms(batch) => get_acoustic_feature(batch)
# 或是 dataframe => make_dataloader(df)

In [None]:
def make_dataframe(audio_path):
  """Transcribe `audio_path` and wrap path + transcript in a one-row DataFrame.

  The transcript is also printed, which makes it visible in the server log.

  Returns:
    DataFrame with the columns 'audio_path' and 'text'.
  """
  transcript = whisper(audio_path)
  print(transcript)
  return pd.DataFrame([{'audio_path': audio_path, 'text': transcript}])

# 將音檔做處理(採樣率、單聲道)

In [None]:
import torchaudio

def process_waveforms(batch):
  """Load `batch['audio_path']`, convert it to 16 kHz mono and store it.

  The resulting waveform is written to `batch["speech_array"]` and the same
  mapping is returned.
  """
  audio, native_rate = torchaudio.load(batch['audio_path'])

  # Bring every recording to the 16 kHz rate used downstream.
  if native_rate != 16000:
    audio = torchaudio.transforms.Resample(orig_freq=native_rate, new_freq=16000)(audio)

  # Multi-channel audio is averaged down to a single channel
  # (original note: needed for the "4GE" recordings).
  audio = audio.mean(dim=0) if audio.size(0) > 1 else audio

  # Drop any remaining singleton dimension (e.g. the channel axis of mono audio).
  audio = audio.squeeze() if audio.ndim > 1 else audio

  batch["speech_array"] = audio
  return batch


# 送入 Hubert得到特徵

In [None]:
def get_acoustic_feature(batch):
  """Encode `batch['speech_array']` with HuBERT and average over all layers.

  Args:
    batch: mapping (or dataset) whose 'speech_array' entry holds the waveforms.

  Returns:
    The element-wise mean of every hidden state returned by the encoder,
    i.e. a tensor of [batch_size, sequence_length, features].
  """
  with torch.no_grad():
    # Pad to the longest clip and cap inputs at `max_length`
    # (presumably counted in samples, i.e. 10 s at 16 kHz — TODO confirm).
    hubert_inputs = processor(batch['speech_array'],
                    sampling_rate=16000,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=160000)
    hubert_outputs = hubert(**hubert_inputs)

  # Stack the hidden states of all layers along a new leading "layer" axis ...
  per_layer = torch.stack(hubert_outputs.hidden_states[:])

  # ... and average that axis away, keeping the sequence dimension intact.
  return per_layer.mean(dim=0)


# 放進 BLSTM

In [None]:
# Initialise the tokenizer, the scoring model and the loss function.

# Character-level Japanese tokenizer; its ids index the scorer's embedding table.
# NOTE(review): BLSTMSpeechScoring defaults to vocab_size=4000 — confirm this
# covers every id the tokenizer can emit.
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")
blstm = BLSTMSpeechScoring()
loss_fn = nn.MSELoss()


In [None]:
# Load the trained scorer weights from disk.

model_save_path = '/content/BLSTMSpeechScoring.pth'
# map_location="cpu" lets a checkpoint saved from a GPU session be restored on
# a CPU-only runtime; load_state_dict then copies the values into `blstm`.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
blstm.load_state_dict(torch.load(model_save_path, map_location="cpu"))

# Inference mode.
blstm.eval()


# jugde = 0.65
# train : test = 6 : 4

# Confusion Matrix:
#  [[ 13  17]
#  [ 34 153]]
# True Negatives: 13
# False Positives: 17
# False Negatives: 34
# True Positives: 153
# ----------------
# Precision: 0.90
# Recall: 0.82
# Specificity: 0.43
# Accuracy: 0.76
# F1 Score: 0.86


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


BLSTMSpeechScoring(
  (acoustic_blstm): LSTM(768, 128, batch_first=True, bidirectional=True)
  (linguistic_blstm): LSTM(64, 128, batch_first=True, bidirectional=True)
  (embedding): Embedding(4000, 64)
  (acoustic_linear): Linear(in_features=256, out_features=128, bias=True)
  (linguistic_linear): Linear(in_features=256, out_features=128, bias=True)
  (final_linear): Linear(in_features=256, out_features=1, bias=True)
)

In [None]:
from torch import optim

# NOTE(review): lr=1e-50 makes every update vanishingly small; this optimizer
# presumably exists only because Trainer requires one — confirm that no
# training is meant to happen in this notebook.
optimizer = optim.Adam(blstm.parameters(), lr=1e-50)

# Bundle model, tokenizer, optimizer and loss; `scoring()` uses `trainer.pred`.
trainer = Trainer(blstm, tokenizer, optimizer, loss_fn)


# Flask

In [None]:
# Reference: https://medium.com/@jasonhey789/colab-flask-%E7%84%A1%E6%B3%95%E9%80%A3%E7%B7%9Angrok-authtoken%E7%95%B0%E5%B8%B8-ca623fa818e2
# Get your own token at https://dashboard.ngrok.com/get-started/setup/windows
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: never hardcode the authtoken in the notebook. A token that was
# previously committed here must be considered leaked and should be revoked
# in the ngrok dashboard.
# The token is read from the NGROK_AUTHTOKEN environment variable, falling
# back to an interactive prompt that does not echo the input.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:

def scoring(file_path):
  """Score one uploaded audio file.

  The file is transcribed, converted to acoustic features and passed through
  the trained scorer; the raw output is then binarised at the module-level
  threshold.

  Args:
    file_path: path of the audio file to score.

  Returns:
    "1.00" when the model output exceeds the threshold, otherwise "0.00".
  """
  df = make_dataframe(file_path)

  dataset = Dataset.from_pandas(df)
  dataset_array = dataset.map(process_waveforms, remove_columns=['audio_path'])

  acoustic_input = get_acoustic_feature(dataset_array)
  text = list(df['text'])
  # One file in, one score out: unwrap the single-element tensor to a float.
  raw_score = trainer.pred(acoustic_input, text).item()

  # The threshold constant is spelled `jugde` where it is defined; the old
  # reference to `judge` raised NameError on a fresh kernel.
  label = 1 if raw_score > jugde else 0

  return f"{float(label):.2f}"

In [None]:
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
import os
from pyngrok import ngrok

app = Flask(__name__)

@app.route("/")
def home():
  """Landing page that confirms the server is up."""
  landing_html = "<h1>Running Flask on Google Colab!</h1>"
  return landing_html

@app.route('/upload', methods=['GET', 'POST'])
def upload_file():
  """Upload endpoint.

  GET returns a minimal HTML upload form. POST expects a multipart field
  named 'audio_file', stores it in the upload directory, scores it and
  returns `{"result": <score>}` as JSON. A missing field or an empty
  filename yields a 400 response.
  """
  import uuid  # local import: only needed to generate fallback file names

  if request.method != 'POST':
    # GET: show the upload form
    return '''
    <!doctype html>
    <title>upload audio</title>
    <h1>upload audio</h1>
    <form method=post enctype=multipart/form-data>
      <input type=file name=audio_file>
      <input type=submit value=upload>
    </form>
    '''

  if 'audio_file' not in request.files:
    return '沒有音檔', 400
  file = request.files['audio_file']
  if not file.filename:
    return '沒有選擇音檔', 400

  # secure_filename() drops non-ASCII characters, so a name such as
  # "音檔.wav" collapses to "wav" and "音檔" collapses to "". In both cases
  # generate a unique stem (keeping the extension) so saving still works.
  filename = secure_filename(file.filename)
  extension = secure_filename(os.path.splitext(file.filename)[1])
  if not filename or (extension and filename == extension):
    unique_stem = uuid.uuid4().hex
    filename = f"{unique_stem}.{extension}" if extension else unique_stem

  upload_directory = '上傳目錄'
  os.makedirs(upload_directory, exist_ok=True)
  file_path = os.path.join(upload_directory, filename)
  file.save(file_path)  # persist the upload so it can be read back for scoring

  score = scoring(file_path)

  return jsonify({'result': score})


if __name__=='__main__':
  # Open the ngrok tunnel first so the public URL is printed before the
  # (blocking) Flask server starts on the same local port.
  flask_port = 5000
  public_url = ngrok.connect(flask_port)
  print("Public URL:", public_url)
  app.run(port=flask_port)

Public URL: NgrokTunnel: "https://cd88-34-138-46-116.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 11:54:22] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 11:54:22] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 11:54:28] "GET /upload HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 11:54:37] "GET /upload HTTP/1.1" 200 -


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

私たち


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 11:57:17] "POST /upload HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 11:57:25] "GET /upload HTTP/1.1" 200 -


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

わたしたち


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 11:58:41] "POST /upload HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 11:59:33] "GET /upload HTTP/1.1" 200 -


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:00:01] "GET /upload HTTP/1.1" 200 -


私たち


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:00:53] "POST /upload HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:07:53] "GET /upload HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:07:54] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:11:13] "GET /upload HTTP/1.1" 200 -


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

一写


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:14:15] "POST /upload HTTP/1.1" 200 -


舞台の下の自分が誰かはどうでもいい。みんなは、花火がステージに上がったとき、花火になることだけ知ってればいい。君だったんだね


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:16:01] "POST /upload HTTP/1.1" 200 -


舞台の下の自分が誰かはどうでもいい。みんなは、花火がステージに上がったとき、花火になることだけ知ってればいい。君だったんだね


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:16:50] "POST /upload HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:23:44] "GET /upload HTTP/1.1" 200 -


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

あなた


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:24:42] "POST /upload HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:25:24] "GET /upload HTTP/1.1" 200 -


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

我一池


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:26:36] "POST /upload HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:34:04] "GET /upload HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:34:05] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

あの人


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:35:18] "POST /upload HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:39:14] "GET /upload HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:39:14] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

教師


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:werkzeug:127.0.0.1 - - [09/Mar/2024 12:40:30] "POST /upload HTTP/1.1" 200 -
