In [None]:
# 1. Fetch articles from Wikipedia
import wikipedia

# Number of article bodies to collect.
NUM_ARTICLES = 100
# wikipedia.random() documents a maximum of 10 titles per call, so titles are
# requested in batches rather than all at once.
RANDOM_BATCH_SIZE = 10
# Upper bound on page lookups so the loop always terminates even if many random
# titles turn out to be disambiguation pages or fail to resolve.
MAX_PAGE_LOOKUPS = NUM_ARTICLES * 3

wikipedia.set_lang("en")

# Plain-text bodies of randomly chosen English Wikipedia articles.
# wikipedia.random() only returns *titles*; the text has to be fetched per page.
en_wiki_articles = []
page_lookups = 0
while len(en_wiki_articles) < NUM_ARTICLES and page_lookups < MAX_PAGE_LOOKUPS:
    titles = wikipedia.random(pages=RANDOM_BATCH_SIZE)
    if isinstance(titles, str):
        # A single title is returned as a bare string rather than a list.
        titles = [titles]
    for title in titles:
        if len(en_wiki_articles) >= NUM_ARTICLES or page_lookups >= MAX_PAGE_LOOKUPS:
            break
        page_lookups += 1
        try:
            # auto_suggest=False looks up the exact random title instead of a
            # search suggestion that may point at a different article.
            page = wikipedia.page(title, auto_suggest=False)
        except (wikipedia.DisambiguationError, wikipedia.PageError):
            # Random titles can be disambiguation pages or fail to resolve; skip them.
            continue
        en_wiki_articles.append(page.content)

In [None]:
# 2. Split each article into individual sentences
import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize needs the Punkt sentence-splitter data. Recent NLTK releases load
# it from the "punkt_tab" resource while older ones use "punkt"; requesting both
# keeps this cell working on either. nltk.download() reports a failed download by
# returning False rather than raising, so an unavailable resource is harmless here.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource, quiet=True)

# Flat list of every sentence from every article, in article order.
sentences = [
    sentence
    for article in en_wiki_articles
    for sentence in sent_tokenize(article)
]

In [None]:

# 3. Translate the English sentences into Japanese
from transformers import MarianMTModel, MarianTokenizer

# NOTE(review): confirm this checkpoint ID exists on the Hugging Face Hub before
# running; the OPUS-MT English->Japanese checkpoints may be published under a
# different ID (e.g. "Helsinki-NLP/opus-mt-en-jap") - TODO verify.
model_name = "Helsinki-NLP/opus-mt-en-ja"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# ja_sentences[i] is the translation of sentences[i]; the two lists stay aligned.
ja_sentences = []
for en_sentence in sentences:
    # truncation=True caps the input at the tokenizer's maximum length. Wikipedia
    # text contains lists/tables that the sentence splitter can leave as one very
    # long "sentence", which would otherwise overflow the model's input limit.
    encoded_en = tokenizer.encode(en_sentence, return_tensors="pt", truncation=True)
    encoded_ja = model.generate(encoded_en)
    # generate() returns a batch; a single sentence was passed in, so take row 0.
    ja_sentence = tokenizer.decode(encoded_ja[0], skip_special_tokens=True)
    ja_sentences.append(ja_sentence)

In [None]:
# 4. Convert each Japanese sentence into an audio file
import os

import scipy.io.wavfile
from transformers import AutoProcessor, BarkModel

os.makedirs("audio_files", exist_ok=True)

# This cell previously loaded a Wav2Vec2 CTC checkpoint. That is a speech
# recognition (audio -> text) model, so it cannot synthesize speech, and its
# tokenizer has no method for writing audio. Bark is a text-to-speech model
# available through transformers that supports Japanese.
# The tts_ prefix keeps these objects from overwriting the translation
# tokenizer/model created in the previous step.
tts_model_name = "suno/bark-small"
# Selects a Japanese speaker voice for Bark.
TTS_VOICE_PRESET = "v2/ja_speaker_0"

tts_processor = AutoProcessor.from_pretrained(tts_model_name)
tts_model = BarkModel.from_pretrained(tts_model_name)
# Sampling rate of the generated waveform, needed for the WAV header.
tts_sample_rate = tts_model.generation_config.sample_rate

# NOTE(review): Bark generates a bounded length of audio per call, so very long
# sentences may be cut short - TODO confirm and chunk the text if that matters.
for i, ja_sentence in enumerate(ja_sentences):
    if not ja_sentence.strip():
        # Nothing to synthesize; keep i tied to the sentence's position so the
        # file names still line up with ja_sentences.
        continue
    tts_inputs = tts_processor(ja_sentence, voice_preset=TTS_VOICE_PRESET)
    speech = tts_model.generate(**tts_inputs)
    # Drop the batch dimension and move to NumPy so SciPy can write the samples.
    waveform = speech.cpu().numpy().squeeze()
    filename = f"audio_files/sentence_{i}.wav"
    scipy.io.wavfile.write(filename, rate=tts_sample_rate, data=waveform)
    print(f"Saved audio file: {filename}")