# WhisperTranslator

N46Whisper is a Google Colab notebook application for streamlined video subtitle file generation. The original purpose of the project was to improve the productivity of Nogizaka46 (and Sakamichi groups) subbers. However, it can also be used to create subtitles in general. The application can significantly reduce the labour and time costs of sub-groups or individual subbers. However, despite its impressive performance, the Whisper model, AI translation and the application itself are not without limitations.


WhisperTranslator 是基于 [N46Whisper](https://github.com/Ayanaminn/N46Whisper) 的应用。开发初衷旨在提高各类外文视频的转录、翻译、总结效率。

此应用基于 AI 语音识别模型 [Whisper](https://github.com/openai/whisper) 的优化部署 [faster-whisper](https://github.com/guillaumekln/faster-whisper)。

应用输出文件为ass或srt格式，内置指定字幕组的字幕格式，可直接导入 [Aegisub](https://github.com/Aegisub/Aegisub) 进行后续翻译及时间轴校正。你可以根据选项决定是否启动全文摘录和总结。


## 更新/What's Latest：

2024.2.20:
* release初版，提供转录和输出为分割文章。

# 环境安装

In [None]:
# 首先需要安装pytorch
# https://pytorch.org/get-started/locally/

In [None]:
# Install the Python packages this notebook relies on.
# NOTE(review): the transcription cell invokes the `ffmpeg` command-line tool;
# confirm that the PyPI package named `ffmpeg` really provides that executable
# (a system-level ffmpeg installation may be required instead).
! pip install ffmpeg
! pip install pysubs2
! pip install faster-whisper

# 基础配置

你只需要在这里修改配置，之后只需要一路执行。

In [None]:
# Basic settings: edit the values in this cell, then run the remaining cells in order.
from typing import Optional

# Folder that contains the media files to process.
work_dir: str = ''
# Folder the results are written to.
export_dir: str = './temp'
file_type: str = "audio"  # @param ["audio","video"]
# Language code of the speech to transcribe. Whisper uses "ja" for Japanese;
# "jp" is not a valid language code and is rejected at transcription time.
language: str = "zh"  # @param ["zh","en","ja"]
# "large-v2" is the recommended model; "large-v3" is also worth trying.
model_size: str = "large-v2"  # @param ["base","small","medium", "large-v1","large-v2","large-v3"]
# Short description of the recording passed to the model as a prompt to make the
# result more accurate, e.g. "this is a meeting" or "Simplified Chinese".
initial_prompt: Optional[str] = '这个是一个会议对话'
export_srt: str = "No"  # @param ["No","Yes"]

In [None]:
# Advanced settings (safe to leave at their defaults).

# Split a single subtitle line that contains spaces into several lines (sentences).
# The resulting lines temporarily share the same timestamps and are tagged with
# adjust_required as a reminder to fix the timing so the lines do not overlap.
# Modest: start a new line only when the text after a space is longer than 5 characters.
# Aggressive: start a new line at every space.
is_split:str = "No"  # @param ["No","Yes"]
split_method:str = "Modest"  # @param ["Modest","Aggressive"]
# Subtitle style (only "default" for now).
sub_style:str = "default"

# Use VAD filter.
# Uses the [Silero VAD model](https://github.com/snakers4/silero-vad) to detect and filter out silent passages in the audio (recommended for less common languages).
# NOTE: the VAD filter has both advantages and drawbacks; decide per recording whether to enable it. [About the VAD filter](https://github.com/Ayanaminn/N46Whisper/blob/main/FAQ.md)
# NOTE(review): this value is a string, not a bool; code that consumes it should
# compare against "True" rather than rely on truthiness ("False" is a non-empty string).
is_vad_filter:str = "True" # @param ["True", "False"]

# Beam size.
# A higher beam size explores more paths during recognition, which can improve accuracy within a certain range, at the cost of higher VRAM usage. Beyond roughly 5-10 a larger beam size may reduce accuracy; see https://arxiv.org/pdf/2204.05424.pdf
# Default: 5
set_beam_size:int = 5 

# 运行

注意，如果你输出的结果是中文，最好放入gpt进行处理恢复正常的标点符号，whisper对中文符号支持不好。

In [None]:
import os
import pprint

# File extensions (lower-case, including the dot) that count as media files.
# Matching on the real extension avoids false positives such as a file named
# "results", which a bare endswith('ts') check would accept.
MEDIA_EXTENSIONS = {'.mp3', '.m4a', '.flac', '.aac', '.wav', '.mp4', '.mkv', '.ts', '.flv'}

my_root_name = work_dir.split('/')[-1]

# Collect every media file below work_dir. Entries are stored as paths relative
# to work_dir, so files located in sub-folders can still be opened later via
# Path(work_dir) / name; files directly inside work_dir keep their bare name.
media_names = []
for root, d_names, f_names in os.walk(work_dir):
    # Prune hidden directories in place so os.walk does not descend into them.
    # (Removing items from d_names while iterating over it skips entries.)
    d_names[:] = [d_name for d_name in d_names if not d_name.startswith('.')]
    for f_name in f_names:
        # only add media files
        if os.path.splitext(f_name)[1].lower() in MEDIA_EXTENSIONS:
            media_names.append(os.path.relpath(os.path.join(root, f_name), work_dir))

# Make sure the export folder exists before anything is written to it.
os.makedirs(export_dir, exist_ok=True)

print("待处理文件数：",len(media_names))
print("待处理文件：")
pprint.pprint(media_names)

In [None]:
import torch
# hf环境变量（可无视）
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.environ['HF_HOME'] = './hf-cache'

from faster_whisper import WhisperModel
import os
import ffmpeg
from tqdm import tqdm
import time
import pysubs2
import re
from pathlib import Path

def split_text(text, max_word_count):
    """Group the sentences of *text* into paragraphs of limited word count.

    The text is cut after every comma or full stop that is followed by
    whitespace. Consecutive pieces are then packed into a paragraph until
    adding the next piece would push the paragraph above ``max_word_count``
    words, at which point a new paragraph is started. A single piece that is
    longer than the limit is kept whole in a paragraph of its own.

    Args:
        text: The text to split.
        max_word_count: Maximum number of words (runs of word characters)
            per paragraph.

    Returns:
        A list of paragraph strings without surrounding whitespace. Empty or
        whitespace-only input yields an empty list.
    """
    def count_words(fragment):
        return len(re.findall(r'\b\w+\b', fragment))

    # Split after a comma or full stop followed by whitespace.
    sentences = re.split(r'(?<=[,.])\s', text)

    paragraphs = []
    pending = []        # pieces of the paragraph currently being built
    pending_words = 0

    for sentence in sentences:
        sentence_words = count_words(sentence)
        # Close the running paragraph only if it already holds real words;
        # a punctuation-only prefix stays attached to the sentence after it
        # instead of being thrown away.
        if pending_words > 0 and pending_words + sentence_words > max_word_count:
            paragraphs.append(' '.join(pending).strip())
            pending = []
            pending_words = 0
        pending.append(sentence)
        pending_words += sentence_words

    # Emit the last paragraph unless it is empty (e.g. for empty input).
    remainder = ' '.join(pending).strip()
    if remainder:
        paragraphs.append(remainder)

    return paragraphs

# Load the Whisper model, clearing the CUDA cache before and after loading.
print('加载模型 Loading model...')
torch.cuda.empty_cache()
model = WhisperModel(model_size)
torch.cuda.empty_cache()

# Files to process and, for each one, its name without the extension.
file_names = media_names
file_basenames = [Path(name).stem for name in file_names]
output_dir = Path(export_dir).parent.resolve()

import subprocess

from srt2ass import srt2ass

# The VAD setting is stored as the string "True"/"False". Any non-empty string
# is truthy, so it has to be converted explicitly; otherwise "False" would
# still switch the filter on.
use_vad_filter = str(is_vad_filter).strip().lower() == "true"

# Transcribe every collected file and write the .txt, .srt and .ass results.
for i, (file_name, file_basename) in enumerate(zip(file_names, file_basenames)):
    source_path = Path(work_dir) / file_name
    audio_path = source_path

    if file_type == "video":
        print('提取音频中 Extracting audio from video file...')
        extracted_path = Path(f'{file_basename}.mp3')
        # Pass an argument list (no shell) so that paths containing spaces or
        # shell metacharacters are handled correctly. "-y" overwrites audio
        # left over from an earlier run instead of waiting for confirmation.
        command = ['ffmpeg', '-y', '-i', str(source_path),
                   '-f', 'mp3', '-ab', '192000', '-vn', str(extracted_path)]
        try:
            return_code = subprocess.run(command, check=False).returncode
        except OSError as err:
            # e.g. the ffmpeg executable is not installed
            print(f'ffmpeg could not be started: {err}')
            return_code = None
        if return_code == 0 and extracted_path.exists():
            audio_path = extracted_path
            print('提取完毕 Done.')
        else:
            # Best effort: fall back to letting the model read the original file.
            print('音频提取失败 Audio extraction failed, transcribing the original file instead.')

    tic = time.time()
    print('识别中 Transcribe in progress...')

    segments, info = model.transcribe(audio=str(audio_path),
                                      beam_size=set_beam_size,
                                      language=language,
                                      vad_filter=use_vad_filter,
                                      initial_prompt=initial_prompt,
                                      vad_parameters=dict(min_silence_duration_ms=1000))

    # segments is a generator so the transcription only starts when you iterate over it
    # to use pysubs2, the argument must be a segment list-of-dicts
    total_duration = round(info.duration, 2)  # Same precision as the Whisper timestamps.
    results = []
    pure_texts = []

    with tqdm(total=total_duration, unit=" seconds") as pbar:
        for s in segments:
            results.append({'start': s.start, 'end': s.end, 'text': s.text})
            # Chinese output comes without punctuation, so close every segment
            # with a comma unless it already ends with a comma or full stop.
            if language == 'zh' and not s.text.endswith((',', '.', '，', '。')):
                pure_texts.append(s.text + ',')
            else:
                pure_texts.append(s.text)
            pbar.update(s.end - s.start)
    full_text = ''.join(pure_texts)

    # Time consumed
    toc = time.time()
    print('识别完毕 Done')
    print(f'Time consumpution {toc-tic}s')

    # Save the full text, one paragraph per line
    new_paragraphs = split_text(full_text, max_word_count=200)
    chunk_filename = Path(export_dir) / (file_basename + '.txt')
    with open(chunk_filename, 'w', encoding='utf-8') as file:
        file.writelines(chunk + '\n' for chunk in new_paragraphs)

    # Save srt
    # NOTE(review): export_srt is not consulted in this cell; the .srt file is
    # always written and then handed to srt2ass as its input.
    subs = pysubs2.load_from_whisper(results)
    srt_filename = Path(export_dir) / (file_basename + '.srt')
    subs.save(srt_filename)

    # Save ass
    ass_filename = srt2ass(str(srt_filename), sub_style, is_split, split_method)
    print('ASS subtitle saved as: ' + ass_filename )

    print('第',i+1,'个文件字幕生成完毕/',i+1, 'file(s) was completed!')
    torch.cuda.empty_cache()

print('所有字幕生成完毕 All done!')

# 文本翻译

待办

In [None]:
# Settings for the translation step, which the notebook marks as still to be done.
sub_source = "upload_new"  # @param ["use_transcribed","upload_new"]
# NOTE(review): keep this empty in shared copies of the notebook so no real key is published.
openai_key = '' # @param {type:"string"}
target_language = 'zh-hans'# @param ["zh-hans","english"]
# NOTE(review): the literal below ends with an escaped double quote (\"), so the
# prompt text itself ends with a stray " character - confirm whether that is intended.
prompt = "You are a language expert.Your task is to translate the input subtitle text, sentence by sentence, into the user specified target language.However, please utilize the context to improve the accuracy and quality of translation.Please be aware that the input text could contain typos and grammar mistakes, utilize the context to correct the translation.Please return only translated content and do not include the origin text.Please do not use any punctuation around the returned text.Please do not translate people's name and leave it as original language.\"" 
temperature = 0.6 
output_format = "ass"  # @param ["ass","srt"]



# 文本总结