In [2]:
import os
import random
import wikipedia
import concurrent.futures
from tqdm import tqdm

# Create the output_wiki directory that will receive the downloaded articles.
os.makedirs("output_wiki", exist_ok=True)

# Request a pool of random English Wikipedia titles. This is a random pool,
# not every title on the site.
# NOTE(review): the MediaWiki random-list API documents a per-request cap, so
# fewer than the 5000 titles asked for may come back -- confirm the limit.
random_pool = wikipedia.random(pages=5000)
# wikipedia.random returns a bare string (not a list) when exactly one title
# comes back; wrap it so list() does not split it into characters.
if isinstance(random_pool, str):
    random_pool = [random_pool]
all_titles = list(random_pool)

# Randomly pick up to 100 titles. The size is clamped to the pool size so
# random.sample cannot raise ValueError when the API returns a short pool.
selected_titles = random.sample(all_titles, min(100, len(all_titles)))

def download_page(args):
    """Download one Wikipedia article and save its text under ``output_wiki``.

    Args:
        args: An ``(i, title)`` pair. ``i`` is the number used to build the
            output file name (``output_wiki/{i:03d}.txt``) and ``title`` is
            the title of the article to fetch. Titles are expected to be
            exact article titles.

    Returns:
        None. A missing page is skipped silently; a disambiguation page or
        any other error is reported with ``print`` and skipped, so a single
        failure does not abort the rest of the batch.
    """
    i, title = args
    try:
        # Look the title up verbatim. With the library's default
        # auto_suggest=True the title may be swapped for a search suggestion,
        # which can silently fetch a different article than the one requested.
        page = wikipedia.page(title, auto_suggest=False)
    except wikipedia.exceptions.PageError:
        # No article with this title: skip it.
        return
    except wikipedia.exceptions.DisambiguationError as e:
        # Ambiguous titles are skipped; show the candidates for reference.
        print(f"Skipped disambiguation page: {e.options}")
    except Exception as e:
        # Best-effort download: report anything else and carry on.
        print(f"Error: {e}")
    else:
        filename = os.path.join("output_wiki", f"{i:03d}.txt")
        with open(filename, "w", encoding="utf-8") as f:
            f.write(page.content)
        print(f"Saved article: {page.title}")

# Fetch and save the selected articles on a thread pool, with a progress bar.
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Number the titles from 1; download_page uses the number for the file name.
    numbered_titles = enumerate(selected_titles, start=1)
    outcomes = executor.map(download_page, numbered_titles)
    # Draining the iterator is what advances the progress bar.
    list(tqdm(outcomes, total=len(selected_titles)))


  0%|          | 0/100 [00:00<?, ?it/s]

Saved article: Manteswami Kavya
Saved article: Berthelsdorf Formation
Saved article: Paraglaciecola arctica
Saved article: Glory Glory (football chant)
Saved article: Le Gheer
Saved article: Edmond N'Tiamoah


  1%|          | 1/100 [00:02<04:43,  2.87s/it]

Saved article: Horatio Stockton Howell
Saved article: 2019 World Junior Wrestling Championships
Saved article: Consumer Electronics Control
Saved article: Coal Gap School
Saved article: Una Croce senza nome
Saved article: Live and Electric at the Union Chapel
Saved article: White-rumped tanager
Saved article: Gila Wilderness
Saved article: Hassan Farid Didi
Saved article: Villa Wartholz
Saved article: Matthew Murphy
Saved article: Michigan goal




  lis = BeautifulSoup(html).find_all('li')


Skipped disambiguation page: ['Democratic Renewal Party (Angola)', 'Democratic Renovator Party (Portugal)', 'Renovation (disambiguation)', 'Democratic Party (disambiguation)']


  8%|▊         | 8/100 [00:03<00:29,  3.09it/s]

Saved article: Alileh Sar


 21%|██        | 21/100 [00:04<00:13,  5.75it/s]

Saved article: Burnham Norton
Saved article: Arnaud-François Lefèbvre
Saved article: FIL European Luge Championships 2018
Saved article: Boom (navigational barrier)
Saved article: Napan, New Brunswick
Saved article: Sovetskaya Street
Saved article: Verougstraete
Saved article: Roman Catholic Diocese of Laval
Saved article: Territorial Abbey of Montevergine
Saved article: Ricardo Cabrera Martínez
Saved article: Bobr (urban-type settlement)


 26%|██▌       | 26/100 [00:05<00:11,  6.71it/s]

Saved article: Blythe River (Tasmania)
Skipped disambiguation page: ['Walter I of Brienne', 'Walter II of Brienne', 'Walter III of Brienne', 'Walter IV of Brienne', 'Walter V of Brienne', 'Walter VI of Brienne', 'Walter IV of Enghien', 'County of Brienne']
Saved article: SMK Kok Lanas
Saved article: Luc Argand
Skipped disambiguation page: ['Buchanan County, Iowa', 'Buchanan County, Missouri', 'Buchanan County, Virginia']


 27%|██▋       | 27/100 [00:05<00:10,  6.76it/s]

Saved article: Pingyangmiao, You County


 40%|████      | 40/100 [00:05<00:04, 13.36it/s]

Saved article: The Manchester Man (novel)
Saved article: Baker Bridge train wreck
Saved article: Mangelia barbadoides


 46%|████▌     | 46/100 [00:07<00:06,  8.83it/s]

Saved article: Guy Wyndham
Saved article: McCamley
Saved article: You're Beautiful (Nathaniel Willemse song)
Saved article: Olga James


 50%|█████     | 50/100 [00:07<00:05,  9.88it/s]

Saved article: Parkenfestivalen
Saved article: Deterministic memory
Saved article: Carson City and Indian Village
Saved article: Milia-like calcinosis
Saved article: Loxocrambus mohaviellus
Saved article: Chenar Bagali
Saved article: Arnaud Desjardins


 55%|█████▌    | 55/100 [00:07<00:03, 13.28it/s]

Saved article: David Yencken
Saved article: Gianclaudio Bressa
Saved article: Jim Pena
Saved article: Book Art
Saved article: Airbus UK Broughton F.C.


 60%|██████    | 60/100 [00:08<00:03, 11.43it/s]

Skipped disambiguation page: ['Boot Hill (video game)', 'Boot Hill (film)', 'Boot Hill (role-playing game)', 'Boot Hill Bowl', 'Johnny Winter', 'Boot Hill', 'Boot Hill Museum', 'Glossary of cricket terms#B']
Saved article: Casalvecchio Siculo
Saved article: Rod Anderson (writer)
Saved article: Hendren Building
Saved article: Germany–Tanzania relations
Saved article: Tindal Bluff


 62%|██████▏   | 62/100 [00:09<00:07,  5.32it/s]

Saved article: Fengxin Road station
Saved article: European honey buzzard
Saved article: Dorothy Brandon
Saved article: 1049 Gotho
Saved article: London 1980 International Stamp Exhibition


 64%|██████▍   | 64/100 [00:09<00:06,  5.54it/s]

Saved article: Rock Creek Park Golf Course
Saved article: Safsaf massacre
Saved article: Grotella septempunctata
Saved article: Fear of the Digital Remix
Saved article: Nothing but Hope and Passion
Saved article: Alfred Worden


 77%|███████▋  | 77/100 [00:09<00:01, 13.61it/s]

Saved article: Oral pontine reticular nucleus
Saved article: Barar Deh, Dodangeh
Saved article: Uki waza


 80%|████████  | 80/100 [00:10<00:01, 11.18it/s]

Saved article: 2016 KNSB Dutch Single Distance Championships – Women's 3000 m
Saved article: 1963 Nova Scotia general election


 83%|████████▎ | 83/100 [00:11<00:02,  6.84it/s]

Saved article: USS Sagittarius
Saved article: Matam Region


 90%|█████████ | 90/100 [00:11<00:00, 10.18it/s]

Saved article: Motor unit
Saved article: Saint Michel d'Aiguilhe
Saved article: 1995 CFL season
Saved article: Rok Urbanc
Saved article: Beach Park Isles
Saved article: Mafalda of Castile


 92%|█████████▏| 92/100 [00:11<00:00, 10.52it/s]

Saved article: Herbert McCabe
Saved article: Ahrue Luster
Saved article: Peter Sainthill (died 1571)
Skipped disambiguation page: ['David Zilberman (wrestler)', 'David B. Zilberman', 'David Zilberman (economist)']
Saved article: Assistant Secretary of Defense for Health Affairs
Saved article: William Rant
Saved article: Somerset v Stewart


100%|██████████| 100/100 [00:12<00:00,  7.90it/s]

Saved article: Joel Hitt
Saved article: San Francisco Writers Grotto





In [3]:
import re
import nltk
from nltk.tokenize import sent_tokenize

# Download the NLTK data used by sent_tokenize.
nltk.download('punkt')

# Directory holding the downloaded article files.
wiki_dir = "output_wiki"

# Directory that receives the per-sentence files.
output_dir = "output_text"
os.makedirs(output_dir, exist_ok=True)

# Process every file found in output_wiki.
for filename in os.listdir(wiki_dir):
    # Read the whole article.
    file_path = os.path.join(wiki_dir, filename)
    with open(file_path, "r", encoding="utf-8") as article_file:
        content = article_file.read()

    # Split the article into individual sentences.
    sentences = sent_tokenize(content)

    # One subdirectory per article, named after the file without its extension.
    sub_dir_name, _extension = os.path.splitext(filename)
    sub_dir_path = os.path.join(output_dir, sub_dir_name)
    os.makedirs(sub_dir_path, exist_ok=True)

    # Write each sentence to its own file, numbered from 001.
    sentence_number = 0
    for sentence in sentences:
        sentence_number += 1
        sentence_filename = os.path.join(sub_dir_path, f"{sentence_number:03d}.txt")
        with open(sentence_filename, "w", encoding="utf-8") as sentence_file:
            sentence_file.write(sentence)

print("文の分割が完了しました。")

文の分割が完了しました。


[nltk_data] Downloading package punkt to /home/kobori/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import os
import textwrap
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from tqdm import tqdm  # プログレスバーを表示するためのライブラリ

# Load the translation model and its tokenizer.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Token id that forces generation to start in Japanese.
japanese_bos_id = tokenizer.lang_code_to_id["ja_XX"]

# Directory holding the per-sentence English files.
input_dir = "output_text"

# Directory that receives the translated files.
output_dir = "output_japanese_text"
os.makedirs(output_dir, exist_ok=True)

# Total number of entries in the input directory, used in the progress label.
sub_dir_names = os.listdir(input_dir)
total_subdirs = len(sub_dir_names)

# Walk each article subdirectory of output_text.
for i, sub_dir_name in enumerate(sub_dir_names, start=1):
    sub_dir_path = os.path.join(input_dir, sub_dir_name)

    # Mirror the subdirectory under output_japanese_text.
    output_sub_dir_path = os.path.join(output_dir, sub_dir_name)
    os.makedirs(output_sub_dir_path, exist_ok=True)

    # Translate every file in the subdirectory, one progress bar per article.
    files = os.listdir(sub_dir_path)
    progress_label = f"Processing subdirectory {i}/{total_subdirs}"
    for filename in tqdm(files, desc=progress_label, unit="file"):
        # Read the source text.
        with open(os.path.join(sub_dir_path, filename), "r", encoding="utf-8") as source_file:
            content = source_file.read()

        # Break the text into pieces of at most 100 characters and translate
        # each piece independently.
        translated_sentences = []
        for piece in textwrap.wrap(content, width=100):
            # Encode the piece as PyTorch tensors for the model.
            encoded = tokenizer(piece, return_tensors="pt")
            generated_tokens = model.generate(**encoded, forced_bos_token_id=japanese_bos_id)
            decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            translated_sentences.append(decoded[0])

        # Save the translations, one per line, under the same file name.
        output_file_path = os.path.join(output_sub_dir_path, filename)
        with open(output_file_path, "w", encoding="utf-8") as target_file:
            target_file.writelines(line + "\n" for line in translated_sentences)

print("翻訳が完了しました。")


Processing subdirectory 1/90: 100%|██████████| 3/3 [00:16<00:00,  5.33s/file]
Processing subdirectory 2/90: 100%|██████████| 5/5 [00:17<00:00,  3.48s/file]
Processing subdirectory 3/90: 100%|██████████| 51/51 [03:41<00:00,  4.34s/file]
Processing subdirectory 4/90: 100%|██████████| 20/20 [01:43<00:00,  5.18s/file]
Processing subdirectory 5/90: 100%|██████████| 13/13 [00:56<00:00,  4.35s/file]
Processing subdirectory 6/90: 100%|██████████| 37/37 [04:22<00:00,  7.09s/file]
Processing subdirectory 7/90: 100%|██████████| 3/3 [00:12<00:00,  4.15s/file]
Processing subdirectory 8/90: 100%|██████████| 4/4 [00:16<00:00,  4.12s/file]
Processing subdirectory 9/90: 100%|██████████| 7/7 [00:41<00:00,  5.92s/file]
Processing subdirectory 10/90: 100%|██████████| 43/43 [04:24<00:00,  6.15s/file]
Processing subdirectory 11/90: 100%|██████████| 21/21 [02:01<00:00,  5.77s/file]
Processing subdirectory 12/90: 100%|██████████| 42/42 [05:06<00:00,  7.30s/file]
Processing subdirectory 13/90: 100%|██████████|

翻訳が完了しました。





In [2]:
import os
from tqdm import tqdm
from espnet2.bin.tts_inference import Text2Speech
import soundfile as sf

# Pretrained model identifiers.
model_tag = "kan-bayashi/jsut_full_band_vits_prosody"
vocoder_tag = "parallel_wavegan/jsut_parallel_wavegan.v1"

# Build the speech synthesizer.
text2speech = Text2Speech.from_pretrained(
    model_tag=model_tag,
    vocoder_tag=vocoder_tag,
)

# Directory holding the translated Japanese text files.
input_dir = "output_japanese_text"

# Directory that receives the synthesized audio.
output_dir = "output_audio"
os.makedirs(output_dir, exist_ok=True)

# Walk each article subdirectory of output_japanese_text.
for sub_dir_name in os.listdir(input_dir):
    sub_dir_path = os.path.join(input_dir, sub_dir_name)

    # Mirror the subdirectory under output_audio.
    output_sub_dir_path = os.path.join(output_dir, sub_dir_name)
    os.makedirs(output_sub_dir_path, exist_ok=True)

    # Synthesize every file in the subdirectory.
    files = os.listdir(sub_dir_path)
    for filename in tqdm(sorted(files), desc=f"Processing {sub_dir_name}", unit="file"):
        file_path = os.path.join(sub_dir_path, filename)

        # Read the text to be spoken.
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Run speech synthesis.
        result = text2speech(content)

        # Name the WAV after its source text file rather than after the loop
        # position. A positional index drifts away from the sentence number
        # as soon as a file is missing or the names reach four digits (the
        # lexicographic sort then puts "1000.txt" before "101.txt").
        stem = os.path.splitext(filename)[0]
        output_file_path = os.path.join(output_sub_dir_path, f"{stem}.wav")

        # Hand soundfile a NumPy array explicitly instead of relying on
        # implicit conversion of the returned tensor.
        waveform = result["wav"].detach().cpu().numpy()
        sf.write(output_file_path, waveform, text2speech.fs, "PCM_16")

print("音声ファイルの作成が完了しました。")

 - discriminator_params.follow_official_norm
 - discriminator_params.scale_discriminator_params.use_weight_norm
 - discriminator_params.scale_discriminator_params.use_spectral_norm

See also:
 - https://github.com/espnet/espnet/pull/5240
 - https://github.com/espnet/espnet/pull/5249
Processing 078: 100%|██████████| 3/3 [00:22<00:00,  7.62s/file]
Processing 080: 100%|██████████| 5/5 [00:30<00:00,  6.05s/file]
Processing 074:   4%|▍         | 2/51 [00:14<05:28,  6.70s/file]