In [37]:
from gtts import gTTS
import os.path

def download_sound(word, dirpath="public/sounds", slow=False):
    """Synthesize ``word`` with gTTS (zh-TW voice) and cache it as ``{dirpath}/{word}.mp3``.

    Nothing is done when the target file already exists. Failures are
    reported on stdout instead of being raised, so a batch run keeps going.

    Args:
        word: Text to speak; it is also used verbatim as the file name.
        dirpath: Directory that receives the mp3. It is created when missing.
        slow: Forwarded to gTTS to request the slower reading speed.

    Returns:
        None.
    """
    target_path = f"{dirpath}/{word}.mp3"
    # The audio is first written next to the target and only renamed once it
    # is complete; otherwise a failed synthesis could leave a truncated file
    # that the isfile() check below would treat as a finished download.
    partial_path = f"{target_path}.part"
    try:
        if os.path.isfile(target_path):
            return

        parent_dir = os.path.dirname(target_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Create a gTTS object with Chinese (Taiwan) language
        tts = gTTS(text=word, lang='zh-TW', slow=slow)
        tts.save(partial_path)
        os.replace(partial_path, target_path)
    except Exception as e:
        print(f"An error occurred: {e}")
        # Best-effort cleanup of an incomplete download.
        if os.path.isfile(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass
        

In [44]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def _save_remote_file(url, dest_path, headers, timeout):
    """Download ``url`` into ``dest_path``; report failures on stdout and return success as a bool."""
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        parent_dir = os.path.dirname(dest_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(dest_path, "wb") as f:
            f.write(response.content)
        return True
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return False


def extract_stroke_order(character="天", dirpath="public", timeout=30):
    """Download the stroke-order animation and stroke chart for ``character``.

    The strokeorder.com page of the character is fetched, every ``<img>``
    inside a ``div.stroke-article-content`` is collected, and the images whose
    ``src`` starts with ``/assets/bishun/animation/`` or
    ``/assets/bishun/stroke/`` are saved as
    ``{dirpath}/animation/{character}.gif`` and
    ``{dirpath}/stroke/{character}.png``.

    Args:
        character: A single character, or a string that is processed one
            character at a time.
        dirpath: Root directory of the ``animation`` and ``stroke`` folders.
            It is also used for every character of a multi-character string.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        The list of image ``src`` values found on the fetched page(s). The
        list is empty when ``character`` is empty, when both files are already
        on disk, or when the page could not be fetched or parsed.
    """
    if len(character) > 1:
        collected = []
        for c in character:
            # Forward dirpath/timeout so a custom target is honoured for every character.
            collected.extend(extract_stroke_order(c, dirpath=dirpath, timeout=timeout))
        return collected
    if len(character) == 0:
        return []

    animation_path = f"{dirpath}/animation/{character}.gif"
    stroke_path = f"{dirpath}/stroke/{character}.png"

    # Both assets cached: skip the network entirely.
    if os.path.isfile(animation_path) and os.path.isfile(stroke_path):
        return []

    base_url = "https://www.strokeorder.com"
    url = f"{base_url}/chinese/{character}"
    # Browser-like User-Agent sent with the page request and the image requests.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Fetch the webpage content
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the HTML content and collect every non-empty <img src>
        soup = BeautifulSoup(response.text, 'html.parser')
        image_sources = []
        for div in soup.find_all('div', class_='stroke-article-content'):
            for img in div.find_all('img'):
                src = img.get('src')
                if src:
                    image_sources.append(src)

        for src in image_sources:
            if src.startswith("/assets/bishun/animation/"):
                _save_remote_file(urljoin(base_url, src), animation_path, headers, timeout)
            elif src.startswith("/assets/bishun/stroke/"):
                _save_remote_file(urljoin(base_url, src), stroke_path, headers, timeout)

        return image_sources

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return []
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

In [61]:
from pypinyin import pinyin, lazy_pinyin, Style
import itertools

def get_pinyin(word):
    """Return every pinyin spelling of ``word``.

    ``pinyin(..., heteronym=True)`` yields one list of candidate readings per
    character; each element of the result joins one choice per character, in
    the order produced by the Cartesian product of those lists.
    """
    readings_per_char = pinyin(word, heteronym=True)
    return ["".join(choice) for choice in itertools.product(*readings_per_char)]
    
# Demo: one candidate spelling is returned per combination of character readings.
get_pinyin("請問")

['qǐngwèn', 'qìngwèn', 'qíngwèn']

In [54]:
%pip install googletrans

import asyncio
from googletrans import Translator

async def translate(text, dest="id", src="zh-tw"):
    """Translate ``text`` with googletrans and return the translated string.

    Args:
        text: Text to translate.
        dest: Target language code passed to googletrans (default ``"id"``).
        src: Source language code passed to googletrans. Defaults to
            ``"zh-tw"``, the value that used to be hard-coded.

    Returns:
        The ``text`` attribute of the googletrans result.
    """
    # A fresh Translator is opened per call; the async context manager
    # releases it once the request has finished.
    async with Translator() as translator:
        result = await translator.translate(text, src=src, dest=dest)
        return result.text
    
# Demo call: translate one phrase with the default target language and show it.
# Top-level `await` is used here, so this runs as a notebook cell rather than a plain script.
res = await translate("請問")
print(res)

Note: you may need to restart the kernel to use updated packages.
Permisi



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [68]:
import json
import itertools

def permute(segment = "[在] [安寧,特曼,我,他,你,我們,他們,你們] [家]"):
    """Expand a sentence template into every concrete token sequence.

    The template is a space-separated list of slots. A slot may be wrapped in
    ``[...]`` and lists its alternatives separated by ``;``. The result is the
    Cartesian product of all slots, in template order.

    Note that ``,`` is NOT a separator: the commas in the default value stay
    inside a single alternative.

    Args:
        segment: Template string, e.g. ``"[我;你] [好]"``.

    Returns:
        A list of tuples, one per combination. Blank templates give ``[]``.
    """
    slots = [
        token.removeprefix("[").removesuffix("]").split(";")
        for token in segment.split(" ")
        # Repeated, leading or trailing spaces would otherwise become empty
        # slots and inject "" tokens into every combination.
        if token
    ]
    if not slots:
        return []
    return list(itertools.product(*slots))

BASE_URL = "https://pratamov.github.io/zh-tw"
BASE_URL = "http://localhost:3000/zh-tw"

result = {}
with open("data.txt", "r", encoding="utf-8") as f:
    for line in f.readlines():
        if line.strip():
            title, segment = line.strip().split(" -> ")
            if title not in result:
                result[title] = []
            result[title] += permute(segment=segment.strip())

keywords = ["?", "!", ","]
dictionary = {}
decks = []

if not os.path.exists("dictionary.json"):
    with open("dictionary.json", "w", encoding="utf-8") as f:
        json.dump({}, f, ensure_ascii=False, indent=4)
        
with open("dictionary.json", "r", encoding="utf-8") as f:
    dictionary = json.load(f)
    
with open("data.result.txt", "w", encoding="utf-8") as f:
    
    for k, v in result.items():
        
        words = set()
        for item in v: words.update(item)
        for item in keywords: words.discard(item)
        
        f.write(f"# !{k}:{len(v)}-ITEMS:{len(words)}-WORDS:{words}\n")
        
        result_item = {
            "title": k,
            "item_count": len(v),
            "word_count": len(words),
            "items": []
        }
        
        for word in words:
            if word not in keywords:
                if word not in dictionary:
                    chars = [c for c in word]
                    animation = [f"{BASE_URL}/animation/{c}.gif" for c in chars]
                    stroke = [f"{BASE_URL}/stroke/{c}.png" for c in chars]
                    dictionary[word] = {
                        "animation": animation,
                        "stroke": stroke,
                        "meaning": await translate(word),
                        "pinyin": get_pinyin(word)
                    }
        
        for item in v:
            sentence = "".join([i for i in item if i not in keywords])
            download_sound(sentence)
            extract_stroke_order(sentence)
            item_str = " ".join([i for i in item])
            f.write(f"{item_str}\t{sentence}\n")
            result_item["items"].append({
                "item": item_str,
                "sound": f"{BASE_URL}/sounds/{sentence}.mp3"
            })
        
        decks.append(result_item)

with open("decks.json", "w", encoding="utf-8") as f:
    json.dump(decks, f, ensure_ascii=False, indent=4)

with open("dictionary.json", "w", encoding="utf-8") as f:
    json.dump(dictionary, f, ensure_ascii=False, indent=4)

js_content = f"""const contentData = {json.dumps(decks, indent=2, ensure_ascii=False)};\nexport default contentData;"""
with open("src/data/contents.js", "w", encoding="utf-8") as f:
    f.write(js_content)

js_content = f"""const dictionaryData = {json.dumps(dictionary, indent=2, ensure_ascii=False)};\nexport default dictionaryData;"""
with open("src/data/dictionary.js", "w", encoding="utf-8") as f:
    f.write(js_content)

# def import_decks(filepath="decks.txt"):
#     decks, _decks = {"decks": []}, {}
#     with open(filepath, "r", encoding="utf-8") as f:
#         for line in f.read().split("\n"):
#             if not line.startswith("#"):
#                 segments = line.split("\t")
#                 if len(segments) == 5:
#                     try:
#                         download_sound(segments[0])
#                         extract_stroke_order(segments[4].strip())
                        
#                         if not os.path.os.path.isfile(f"public/sounds/{segments[0].strip()}.mp3"):
#                             continue
                        
#                         is_complete = True
#                         for char in segments[4].strip():
#                             if not os.path.os.path.isfile(f"public/stroke/{char}.png"):
#                                 is_complete = False
#                             if not os.path.os.path.isfile(f"public/animation/{char}.gif"):
#                                 is_complete = False
#                         if not is_complete: continue
                        
#                         chars = [c for c in segments[4].strip()]
#                         strokes = [f"stroke/{c}.png" for c in chars]
#                         animations = [f"animation/{c}.gif" for c in chars]
                        
#                         if segments[2].strip() not in _decks: 
#                             _decks[segments[2].strip()] = []
                            
#                         _decks[segments[2].strip()].append({
#                             "mp3": f"sounds/{segments[0].strip()}.mp3",
#                             "word": segments[0].strip(),
#                             "means": segments[1].strip(),
#                             "pinyin": segments[3].strip(),
#                             "stroke": strokes,
#                             "animation": animations
#                         })
#                     except: print(f"Error {line}")
#                 else: print(line)
#     for k, v in _decks.items():
#         decks["decks"].append({
#             "name": k,
#             "items": v
#         })
        
        
#     with open("public/decks.json", "w", encoding="utf-8") as f:
#         json.dump(decks, f, indent=4, ensure_ascii=False)
    
#     js_content = f"""const data = {json.dumps(decks, indent=2, ensure_ascii=False)};\nexport default data;"""
#     with open("src/data/data.js", "w", encoding="utf-8") as f:
#         f.write(js_content)
    
# import_decks()