In [7]:
import os
import random
import wikipedia

# Create the directory that receives one text file per downloaded article.
os.makedirs("output_wiki", exist_ok=True)

# How many random titles to request, and how many of them to download.
NUM_CANDIDATE_TITLES = 5000
NUM_ARTICLES = 100

# Request a pool of random English Wikipedia titles. This is a random sample,
# not the full list of titles.
# NOTE(review): the MediaWiki API is expected to cap the number of titles per
# request, so fewer than NUM_CANDIDATE_TITLES may come back -- confirm the cap.
random_result = wikipedia.random(pages=NUM_CANDIDATE_TITLES)
if isinstance(random_result, str):
    # wikipedia.random() returns a bare string when exactly one title comes
    # back; wrap it so list() does not split it into characters.
    random_result = [random_result]
all_titles = list(random_result)

# Pick the titles to download; never ask for more than are available, because
# random.sample raises ValueError when the sample is larger than the population.
selected_titles = random.sample(all_titles, min(NUM_ARTICLES, len(all_titles)))

# Download each selected article and save its plain-text content.
# The file number is the position in selected_titles, so skipped articles
# leave gaps in the numbering.
for i, title in enumerate(selected_titles, start=1):
    try:
        # Titles returned by wikipedia.random() are already exact, so disable
        # auto_suggest: it can silently resolve to a different article.
        page = wikipedia.page(title, auto_suggest=False)
        # Fetch the content inside the try block: it triggers another request,
        # and a failure here should skip this article, not abort the loop.
        content = page.content
    except wikipedia.exceptions.PageError:
        # No article exists under this title; report it and move on.
        print(f"Skipped missing page: {title}")
        continue
    except wikipedia.exceptions.DisambiguationError as e:
        # Disambiguation pages have no single article body; skip them.
        print(f"Skipped disambiguation page: {e.options}")
    except Exception as e:
        # Best-effort download: report any other failure and keep going.
        print(f"Error: {e}")
    else:
        filename = os.path.join("output_wiki", f"{i:03d}.txt")
        with open(filename, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"Saved article: {page.title}")

Saved article: Centrolepis aristata
Saved article: Rolls-Royce Gnome
Saved article: Badah railway station
Saved article: Georg von der Marwitz
Saved article: Taujėnai Manor
Saved article: Mount Cary
Saved article: Tasman (National Provincial Championship)
Saved article: Deerwood Arboretum and Nature Area
Saved article: Return on investment
Saved article: Ohtlik lend
Saved article: Jason Burnell
Saved article: Games People Play (book)
Saved article: Spain at the 1952 Winter Olympics
Saved article: 2011 Lexus of Las Vegas Open
Saved article: Jack Wild
Saved article: Allan Luke
Saved article: Reseda Beach
Saved article: Vilela people
Saved article: List of newspapers in Bahrain
Saved article: Squatting in Fiji
Saved article: Ph.D. (Art Farmer album)
Saved article: Euphemia Mondich
Saved article: Isoetes riparia
Saved article: Australian Salaried Medical Officers' Federation




  lis = BeautifulSoup(html).find_all('li')


Skipped disambiguation page: ['Bill Hume (footballer)', 'Bill Hume (cartoonist)', 'Billy Hume', 'Willie Hume', 'William Hume (Cape politician)', 'William Errington Hume', 'William H. Hume', 'William J. Hume', 'William Fraser Hume', 'William Hume-Williams', 'William Hume Blake', 'William Hume-Rothery', 'William Hulme (disambiguation)']
Saved article: Duplicate poker
Saved article: Arthur Sorin
Saved article: Sartor (surname)
Saved article: 1956 Ball State Cardinals football team
Saved article: Neil Lewis (journalist)
Saved article: Pemagatsel
Saved article: Feneley
Saved article: 2007–08 Euroleague
Saved article: René Olivares
Saved article: Lancaster, Newfoundland and Labrador
Saved article: Chicago P.D. season 8
Saved article: Bengt Bengtsson Oxenstierna
Saved article: Ženje
Saved article: Mshindo Msolla
Saved article: The Adventures of Milo and Otis
Saved article: Monmouth Oaks
Saved article: Harold Rosenwald
Saved article: South Carolina Highway 248
Saved article: Moallem Kalayeh Ru

In [8]:
import re
import nltk
from nltk.tokenize import sent_tokenize

# Download the Punkt tokenizer data that sent_tokenize relies on.
nltk.download('punkt')

# Directory holding the downloaded articles (one text file per article).
wiki_dir = "output_wiki"

# Directory that receives one sub-directory per article.
output_dir = "output_text"
os.makedirs(output_dir, exist_ok=True)

# Process every article file; sorted so the run order is deterministic.
for filename in sorted(os.listdir(wiki_dir)):
    file_path = os.path.join(wiki_dir, filename)

    # Skip anything that is not a regular file (for example a
    # .ipynb_checkpoints directory); open() would fail on it.
    if not os.path.isfile(file_path):
        continue

    # Read the whole article.
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Split the article into individual sentences.
    sentences = sent_tokenize(content)

    # The sub-directory is named after the article file without its extension.
    sub_dir_name = os.path.splitext(filename)[0]
    sub_dir_path = os.path.join(output_dir, sub_dir_name)
    os.makedirs(sub_dir_path, exist_ok=True)

    # Write each sentence to its own numbered file (001.txt, 002.txt, ...).
    for i, sentence in enumerate(sentences, start=1):
        sentence_filename = os.path.join(sub_dir_path, f"{i:03d}.txt")
        with open(sentence_filename, "w", encoding="utf-8") as f:
            f.write(sentence)

print("文の分割が完了しました。")

[nltk_data] Downloading package punkt to /home/souta-pqr/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


文の分割が完了しました。


In [1]:
import os
import pysbd
from transformers import pipeline
from tqdm import tqdm  # プログレスバーを表示するためのライブラリ

# Load the English -> Japanese translation pipeline once; it is reused for
# every file below.
fugu_translator = pipeline('translation', model='staka/fugumt-en-ja')

# Build the English sentence segmenter once. It does not depend on the file
# being processed, so there is no need to re-create it inside the loop.
seg_en = pysbd.Segmenter(language="en", clean=False)

# Input directory: one sub-directory per article, one text file per sentence.
input_dir = "output_text"

# Output directory: mirrors the layout of input_dir.
output_dir = "output_japanese_text"
os.makedirs(output_dir, exist_ok=True)

# List the article sub-directories a single time so the progress total and the
# loop always agree. Stray non-directory entries are skipped because
# os.listdir() on them would fail; sorting keeps the numbering stable.
sub_dir_names = sorted(
    name
    for name in os.listdir(input_dir)
    if os.path.isdir(os.path.join(input_dir, name))
)
total_subdirs = len(sub_dir_names)

# Translate every sentence file of every article.
for i, sub_dir_name in enumerate(sub_dir_names, start=1):
    sub_dir_path = os.path.join(input_dir, sub_dir_name)

    # Create the matching output sub-directory.
    output_sub_dir_path = os.path.join(output_dir, sub_dir_name)
    os.makedirs(output_sub_dir_path, exist_ok=True)

    # One progress bar per article sub-directory.
    files = sorted(os.listdir(sub_dir_path))
    for filename in tqdm(files, desc=f"Processing subdirectory {i}/{total_subdirs}", unit="file"):
        file_path = os.path.join(sub_dir_path, filename)

        # Read the English text.
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Re-segment the text into sentences before translating.
        sentences_en = seg_en.segment(content)

        # Translate all sentences of this file in one pipeline call.
        translations = fugu_translator(sentences_en)

        # Write one translated sentence per line, under the same file name.
        output_file_path = os.path.join(output_sub_dir_path, filename)
        with open(output_file_path, "w", encoding="utf-8") as f:
            for translation in translations:
                f.write(translation['translation_text'] + "\n")

print("翻訳が完了しました。")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# 4. 音声ファイルに変換する
import os
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

os.makedirs("audio_files", exist_ok=True)

# Imports needed only by this step; kept local so the cell runs on a fresh
# kernel without relying on names imported by earlier cells.
import wave

import numpy as np
from transformers import pipeline

# The previous version loaded a Wav2Vec2ForCTC checkpoint, which is a
# speech-recognition (audio -> text) model and cannot synthesize speech, and it
# iterated over `ja_sentences`, which no cell defined.
# NOTE(review): "suno/bark-small" is used here as a text-to-speech model that is
# documented to handle Japanese -- confirm the choice (quality / speed) before a
# full run; synthesis of many sentences is slow on CPU.
model_name = "suno/bark-small"
tts = pipeline("text-to-speech", model=model_name)

# Collect the translated sentences written by the translation step:
# output_japanese_text/<article>/<file>.txt, one sentence per line.
japanese_dir = "output_japanese_text"
ja_sentences = []
for sub_dir_name in sorted(os.listdir(japanese_dir)):
    sub_dir_path = os.path.join(japanese_dir, sub_dir_name)
    if not os.path.isdir(sub_dir_path):
        continue
    for text_name in sorted(os.listdir(sub_dir_path)):
        text_path = os.path.join(sub_dir_path, text_name)
        with open(text_path, "r", encoding="utf-8") as f:
            # Ignore blank lines so no empty audio file is generated.
            ja_sentences.extend(line.strip() for line in f if line.strip())

# Synthesize each sentence and store it as a 16-bit mono PCM WAV file.
for i, ja_sentence in enumerate(ja_sentences):
    speech = tts(ja_sentence)

    # The pipeline returns float samples; drop any leading batch/channel axis.
    samples = np.squeeze(np.asarray(speech["audio"], dtype=np.float32))
    # Convert to little-endian 16-bit PCM, clipping to avoid wrap-around.
    pcm = (np.clip(samples, -1.0, 1.0) * 32767.0).astype("<i2")

    filename = f"audio_files/sentence_{i}.wav"
    with wave.open(filename, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # 2 bytes per sample = 16-bit audio
        wav_file.setframerate(int(speech["sampling_rate"]))
        wav_file.writeframes(pcm.tobytes())
    print(f"Saved audio file: {filename}")