<a href="https://colab.research.google.com/github/shiwoei-ai/ai_tech_and_biz_app_course/blob/main/compare_dnn_cnn_202509.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**步驟 1：環境設定與導入函式庫**

In [None]:
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML

# 下載台北思源黑體
!wget -O TaipeiSansTCBeta-Regular.ttf https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.font_manager import fontManager

fontManager.addfont('TaipeiSansTCBeta-Regular.ttf')
mpl.rc('font', family='Taipei Sans TC Beta')

# 安裝Gradio
!pip install --upgrade gradio
import gradio, sys
print("Gradio version:", gradio.__version__)
print("Python:", sys.version)

**步驟 2：載入、檢視與預處理MNIST資料集**

In [None]:
# 1. Load the MNIST handwritten-digit dataset.
#    It holds 60,000 training images and 10,000 test images,
#    each a 28x28 grayscale picture.
(X_train_raw, y_train_raw), (X_test_raw, y_test_raw) = mnist.load_data()

# Dump the raw pixel values of the first training image.
print("第一張圖的實際樣子：\n")
np.set_printoptions(linewidth=np.inf)  # no line wrapping when printing arrays
print(X_train_raw[0])

# Show the first 12 training images in a 3x4 grid.
print("\n訓練資料最前面的12張圖：\n")
plt.figure(figsize=(5,5))
for position, digit_image in enumerate(X_train_raw[:12], start=1):
    plt.subplot(3, 4, position)
    plt.imshow(digit_image, cmap='gray')
plt.tight_layout()
plt.show()

# 2. Normalization.
#    Scale pixel values from the 0-255 range down to 0-1, which helps the
#    models converge faster.
X_train_normalized, X_test_normalized = (
    pixels / 255.0 for pixels in (X_train_raw, X_test_raw)
)

# Print the first image again, now normalized.
print("\n再看看第一張圖正規化後長什麼樣子：\n")
print(X_train_normalized[0])

# 3. One-hot encode the class labels.
#    Each digit 0-9 becomes a vector, e.g. 3 -> [0,0,0,1,0,0,0,0,0,0].
n_classes = 10
y_train, y_test = (
    to_categorical(raw_labels, n_classes)
    for raw_labels in (y_train_raw, y_test_raw)
)

print(f"訓練資料維度: {X_train_normalized.shape}")
print(f"測試資料維度: {X_test_normalized.shape}")
print(f"訓練標籤維度: {y_train.shape}")
print(f"測試標籤維度: {y_test.shape}")

**步驟 3：建立與訓練 DNN 模型**

In [None]:
# 1. A DNN consumes flat vectors, so flatten each 28x28 image into one row
#    (784 values). The sample count and feature width are taken from the data
#    instead of being hard-coded, so this cell also works on a subset.
X_train_dnn = X_train_normalized.reshape(len(X_train_normalized), -1)
X_test_dnn = X_test_normalized.reshape(len(X_test_normalized), -1)
n_features = X_train_dnn.shape[1]

# 2. Define the DNN architecture.
model_dnn = Sequential([
    # input_shape only needs to be given on the first layer
    Dense(64, activation='relu', input_shape=(n_features,)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    # Output layer: one neuron per class, softmax turns the scores into probabilities
    Dense(n_classes, activation='softmax')
])

# 3. Compile the model.
#    optimizer: SGD
#    loss: categorical_crossentropy, the usual choice for multi-class problems
#    metrics: accuracy is the figure we care about
model_dnn.compile(loss='categorical_crossentropy',
                  optimizer=SGD(learning_rate=0.087),
                  metrics=['accuracy'])

# 4. Show the model summary.
print("--- DNN 模型結構 ---")
model_dnn.summary()

# 5. Train the model.
print("\n--- 開始訓練 DNN 模型 ---")
history_dnn = model_dnn.fit(X_train_dnn, y_train,
                      epochs=20,
                      batch_size=128,
                      validation_split=0.1,  # hold out part of the training data for validation
                      verbose=1)

**步驟 4：評估 DNN 模型成效**

In [None]:
# 1. Score the DNN on the test set.
print("--- 評估 DNN 模型 ---")
loss_dnn, acc_dnn = model_dnn.evaluate(X_test_dnn, y_test, verbose=0)
print(f"DNN測試集準確率 (Accuracy): {acc_dnn:.4f}")

# 2. Predict class probabilities, then take the most likely class per sample.
y_pred_dnn_proba = model_dnn.predict(X_test_dnn)
y_pred_dnn = y_pred_dnn_proba.argmax(axis=1)

# 3. Plot the confusion matrix (true labels on the y axis, predictions on the x axis).
cm_dnn = confusion_matrix(y_test_raw, y_pred_dnn)
plt.figure(figsize=(10, 8))
sns.heatmap(cm_dnn, annot=True, fmt='d', cmap='Blues').set(
    title='DNN 模型 - 混淆矩陣 (Confusion Matrix)',
    ylabel='真實標籤 (Actual Label)',
    xlabel='預測標籤 (Predicted Label)',
)
plt.show()

print("\n從準確率與混淆矩陣可以看出，DNN表現不錯，但在某些數字上仍有改進空間。")
print("接下來，我們將嘗試使用專為圖像設計的CNN模型，看看是否能獲得更好的結果。")

**步驟 5：建立與訓練 CNN 模型**

In [None]:
# 1. A CNN expects (samples, height, width, channels).
#    MNIST is grayscale, so there is a single channel. The sample count is
#    inferred (-1) instead of hard-coded so this cell also works on a subset.
X_train_cnn = X_train_normalized.reshape(-1, 28, 28, 1)
X_test_cnn = X_test_normalized.reshape(-1, 28, 28, 1)

# 2. Define the CNN architecture (layer parameters kept from the original notebook).
model_cnn = Sequential([
    # First convolution layer: 32 filters of size 3x3.
    # input_shape=(28, 28, 1) means 28x28 single-channel (grayscale) input.
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
    # Second convolution layer: 64 filters of size 3x3.
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    # 2x2 max pooling halves the feature maps, cutting parameters and
    # keeping the strongest responses.
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Flatten the 2D feature maps into a vector for the dense layers.
    Flatten(),
    # Fully connected layer.
    Dense(128, activation='relu'),
    Dropout(0.5),
    # Output layer.
    Dense(n_classes, activation='softmax')
])

# 3. Compile the model.
model_cnn.compile(optimizer='adam',
          loss='categorical_crossentropy',
          metrics=['accuracy'])

# 4. Show the model summary.
print("--- CNN 模型結構 ---")
model_cnn.summary()

# 5. Train the model.
#    Validation uses a 10% split of the training data rather than the test
#    set, so the test set is only touched by the final evaluation and the
#    CNN is trained under the same data protocol as a validation_split=0.1 run.
print("\n--- 開始訓練 CNN 模型 ---")
history_cnn = model_cnn.fit(X_train_cnn, y_train,
                      epochs=10,
                      batch_size=128,
                      validation_split=0.1,
                      verbose=1)

**步驟 6：評估 CNN 模型成效**

In [None]:
# 1. Score the CNN on the test set.
print("--- 評估 CNN 模型 ---")
loss_cnn, acc_cnn = model_cnn.evaluate(X_test_cnn, y_test, verbose=0)
print(f"CNN 測試集準確率 (Accuracy): {acc_cnn:.4f}")

# 2. Predict class probabilities, then take the most likely class per sample.
y_pred_cnn_proba = model_cnn.predict(X_test_cnn)
y_pred_cnn = y_pred_cnn_proba.argmax(axis=1)

# 3. Plot the confusion matrix (true labels on the y axis, predictions on the x axis).
cm_cnn = confusion_matrix(y_test_raw, y_pred_cnn)
plt.figure(figsize=(10, 8))
sns.heatmap(cm_cnn, annot=True, fmt='d', cmap='Blues').set(
    title='CNN 模型 - 混淆矩陣 (Confusion Matrix)',
    ylabel='真實標籤 (Actual Label)',
    xlabel='預測標籤 (Predicted Label)',
)
plt.show()

**步驟 7：比較 DNN 與 CNN 成效**

In [None]:
# Final comparison of the two models.
# Builds an HTML table with one row per model showing the test loss and test
# accuracy (4 decimal places) taken from loss_dnn/acc_dnn and loss_cnn/acc_cnn,
# then renders it in the notebook output.
# NOTE(review): the CNN row is always the one styled bold/blue, regardless of
# which model actually scored higher.
comparison_html = f"""
<h3 style="font-family: 'Microsoft JhengHei', sans-serif;">最終成效總結</h3>
<table border="1" style="width:60%; text-align:center; font-family: 'Arial', sans-serif; border-collapse: collapse;">
  <tr style="background-color:#f2f2f2;">
    <th style="padding: 8px;">模型 (Model)</th>
    <th style="padding: 8px;">測試集損失 (Test Loss)</th>
    <th style="padding: 8px;">測試集準確率 (Test Accuracy)</th>
  </tr>
  <tr>
    <td style="padding: 8px;"><b>深度神經網路 (DNN)</b></td>
    <td style="padding: 8px;">{loss_dnn:.4f}</td>
    <td style="padding: 8px;"><font size='+1'>{acc_dnn:.4f}</font></td>
  </tr>
  <tr>
    <td style="padding: 8px;"><b>卷積神經網路 (CNN)</b></td>
    <td style="padding: 8px;"><b>{loss_cnn:.4f}</b></td>
    <td style="padding: 8px;"><font size='+1' color='blue'><b>{acc_cnn:.4f}</b></font></td>
  </tr>
</table>
"""

display(HTML(comparison_html))

**步驟 8：互動式預測 Web App**

In [None]:
# 單數字辨識

import numpy as np
from PIL import Image
import gradio as gr
import inspect

# 1.影像前處理
def preprocess_image(img_np: np.ndarray) -> np.ndarray:
    """Turn a drawn image into a 28x28 float32 array scaled to 0-1.

    The drawing is converted to grayscale, inverted if it is mostly bright
    (dark strokes on a light background), cropped to the pixels brighter than
    20, padded to a square, resized to 20x20 and pasted into the middle of a
    28x28 black frame.

    Args:
        img_np: Image array accepted by ``Image.fromarray``, or None.

    Returns:
        A (28, 28) float32 array; all zeros when ``img_np`` is None or no
        pixel is brighter than the threshold.
    """
    blank = np.zeros((28, 28), dtype=np.float32)
    if img_np is None:
        return blank

    pixels = np.array(Image.fromarray(img_np).convert("L")).astype(np.uint8)
    # Tolerance: a mostly-bright picture is dark ink on light paper, so invert it.
    if pixels.mean() > 127:
        pixels = 255 - pixels

    ink = pixels > 20
    if not ink.any():
        return blank

    # Bounding box of the ink (end indices are exclusive).
    ink_rows = np.flatnonzero(ink.any(axis=1))
    ink_cols = np.flatnonzero(ink.any(axis=0))
    top, bottom = ink_rows[0], ink_rows[-1] + 1
    left, right = ink_cols[0], ink_cols[-1] + 1
    glyph = pixels[top:bottom, left:right]

    # Centre the crop on a square black tile so resizing keeps the aspect ratio.
    height, width = glyph.shape
    edge = max(height, width)
    y0 = (edge - height) // 2
    x0 = (edge - width) // 2
    tile = np.zeros((edge, edge), dtype=np.uint8)
    tile[y0:y0 + height, x0:x0 + width] = glyph

    shrunk = Image.fromarray(tile).resize((20, 20), Image.Resampling.LANCZOS)
    framed = np.zeros((28, 28), dtype=np.uint8)
    framed[4:24, 4:24] = np.array(shrunk)
    return framed.astype("float32") / 255.0

# 2.推論
def recognize_digit_dual(drawing):
    """Classify one drawn digit with both the DNN and the CNN.

    Args:
        drawing: None, an image array, or a dict holding the image under the
            key "image" or, failing that, "composite".

    Returns:
        A pair of dicts (DNN first, CNN second) mapping the labels "0".."9"
        to float probabilities. Both dicts are empty when there is no image.
    """
    if drawing is None:
        return {}, {}
    if isinstance(drawing, dict):
        # Compare against None explicitly: `a or b` would ask a NumPy array
        # for its truth value, which raises ValueError for multi-element arrays.
        arr = drawing.get("image")
        if arr is None:
            arr = drawing.get("composite")
    else:
        arr = drawing
    if arr is None:
        return {}, {}

    processed = preprocess_image(np.array(arr))
    # Same 28x28 image, shaped the way each model expects its input.
    dnn_proba = model_dnn.predict(processed.reshape(1, 784), verbose=0)[0]
    cnn_proba = model_cnn.predict(processed.reshape(1, 28, 28, 1), verbose=0)[0]

    labels = [str(i) for i in range(10)]
    return (
        {label: float(p) for label, p in zip(labels, dnn_proba)},
        {label: float(p) for label, p in zip(labels, cnn_proba)},
    )

# 3.黑色底圖
def make_black_canvas(w=336, h=336, channels=3):
    """Return an all-zero uint8 image.

    The array is (h, w) when ``channels`` is 1 and (h, w, channels) otherwise.
    """
    shape = (h, w) if channels == 1 else (h, w, channels)
    return np.zeros(shape, dtype=np.uint8)

# Black background image used as the initial (and reset) value of the editor.
black_canvas = make_black_canvas(336, 336, 3)

# Choose the drawing component:
#   - use gr.ImageEditor when this Gradio build has it;
#   - otherwise use gr.Sketchpad, styled black/white if its constructor
#     signature mentions the three styling arguments checked below.
use_image_editor = hasattr(gr, "ImageEditor")
sketchpad_has_style_args = all(
    x in str(inspect.signature(gr.Sketchpad.__init__))
    for x in ["background_color", "brush_color", "stroke_width"]
) if hasattr(gr, "Sketchpad") else False

with gr.Blocks(title="✏️ MNIST 手寫數字辨識 - DNN vs. CNN（黑底白筆版）") as demo:
    gr.Markdown("把數字寫在**左側黑色畫布**中央，按提交；右側會顯示兩個模型的 top-3 結果。")

    with gr.Row():
        # Left: canvas + buttons
        with gr.Column(scale=1):
            if use_image_editor:
                draw = gr.ImageEditor(
                    value=black_canvas, height=360, width=360,
                    label="左側畫布（請選白色筆刷、筆寬約 20）"
                )
                # Clearing puts the black background back.
                reset_value = black_canvas
            elif sketchpad_has_style_args:
                draw = gr.Sketchpad(
                    height=336, width=336,
                    background_color="black", brush_color="white", stroke_width=20,
                    label="左側黑色畫布"
                )
                reset_value = None
            else:
                # Fallback canvas; the label tells the user that a white
                # background will be inverted automatically.
                draw = gr.Sketchpad(
                    height=336, width=336,
                    label="左側畫布（白底備案，系統會自動反相）"
                )
                reset_value = None

            with gr.Row():
                btn_submit = gr.Button("提交", variant="primary")
                btn_clear  = gr.Button("清除", variant="secondary")

        # Right: results
        with gr.Column(scale=1):
            dnn_label = gr.Label(num_top_classes=3, label="DNN 模型預測結果")
            cnn_label = gr.Label(num_top_classes=3, label="CNN 模型預測結果")

    # Submit: run both models on the drawing and fill the two label widgets.
    btn_submit.click(fn=recognize_digit_dual, inputs=draw, outputs=[dnn_label, cnn_label])

    def _reset():
        # Restore the canvas and empty both prediction labels.
        return reset_value, {}, {}
    btn_clear.click(fn=_reset, inputs=None, outputs=[draw, dnn_label, cnn_label])

# debug=True keeps this cell running (per the launch message in the cell
# output) so errors and logs stay visible; interrupt it before running later cells.
demo.launch(debug=True)


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://ee918205ce1a197d8d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
# 多數字辨識

import numpy as np
from PIL import Image, ImageDraw, ImageFont
import gradio as gr
from collections import deque

def _center_resize_to_mnist(gray: np.ndarray) -> np.ndarray:
    # gray: uint8 0~255，黑底白字
    thresh = 20
    fg = gray > thresh
    if not np.any(fg):
        return np.zeros((28, 28), dtype=np.float32)

    rows = np.where(fg.any(axis=1))[0]
    cols = np.where(fg.any(axis=0))[0]
    rmin, rmax = rows[0], rows[-1] + 1
    cmin, cmax = cols[0], cols[-1] + 1
    cropped = gray[rmin:rmax, cmin:cmax]

    h, w = cropped.shape
    side = max(h, w)
    square = np.zeros((side, side), dtype=np.uint8)
    r_off = (side - h) // 2
    c_off = (side - w) // 2
    square[r_off:r_off + h, c_off:c_off + w] = cropped

    img20 = Image.fromarray(square).resize((20, 20), Image.Resampling.LANCZOS)
    canvas = np.zeros((28, 28), dtype=np.uint8)
    canvas[4:24, 4:24] = np.array(img20)
    return (canvas.astype("float32") / 255.0)

def _connected_components_bboxes(bin_fg: np.ndarray, min_area=60, max_components=80):
    H, W = bin_fg.shape
    visited = np.zeros_like(bin_fg, dtype=bool)
    bboxes = []
    dirs = [(1,0),(-1,0),(0,1),(0,-1)]

    for r0 in range(H):
        for c0 in range(W):
            if not bin_fg[r0, c0] or visited[r0, c0]:
                continue
            q = deque([(r0, c0)])
            visited[r0, c0] = True
            rmin=rmax=r0; cmin=cmax=c0; area=0
            while q:
                r, c = q.popleft()
                area += 1
                if r < rmin: rmin = r
                if r > rmax: rmax = r
                if c < cmin: cmin = c
                if c > cmax: cmax = c
                for dr, dc in dirs:
                    nr, nc = r+dr, c+dc
                    if 0 <= nr < H and 0 <= nc < W and bin_fg[nr, nc] and not visited[nr, nc]:
                        visited[nr, nc] = True
                        q.append((nr, nc))
            if area >= min_area:
                bboxes.append((rmin, rmax+1, cmin, cmax+1, area))
                if len(bboxes) >= max_components:
                    break
        if len(bboxes) >= max_components:
            break
    return bboxes

def detect_multi_digits(drawing):
    """Locate every digit drawn on the canvas and classify each with both models.

    Args:
        drawing: None, an image array, or a dict holding the image under the
            key "image" or, failing that, "composite".

    Returns:
        A 4-tuple ``(annotated, rows, seq_text_cnn, seq_text_dnn)``:
        ``annotated`` is an RGB array of the input with a green box and the
        CNN prediction drawn on each detected region (None when there is no
        image); ``rows`` is a list of per-region dicts (id, bbox, prediction
        and confidence of each model, area); the two strings are the
        predicted digits joined in left-to-right order.
    """
    if drawing is None:
        return None, [], "", ""
    if isinstance(drawing, dict):
        # Compare against None explicitly: `a or b` would ask a NumPy array
        # for its truth value, which raises ValueError for multi-element arrays.
        arr = drawing.get("image")
        if arr is None:
            arr = drawing.get("composite")
    else:
        arr = drawing
    if arr is None:
        return None, [], "", ""

    rgb = np.array(arr)
    gray = np.array(Image.fromarray(rgb).convert("L")).astype(np.uint8)

    # A mostly-bright picture is dark ink on a light background: invert it.
    if gray.mean() > 127:
        gray = 255 - gray

    fg = gray > 20
    bboxes = _connected_components_bboxes(fg, min_area=60, max_components=80)

    # Left-to-right reading order (ties broken top-to-bottom).
    bboxes.sort(key=lambda b: (b[2], b[0]))

    draw_img = Image.fromarray(rgb).convert("RGB")
    if not bboxes:
        return np.array(draw_img), [], "", ""

    # Run each model once on the whole batch instead of once per digit.
    digits = np.stack([
        _center_resize_to_mnist(gray[r0:r1, c0:c1])
        for r0, r1, c0, c1, _area in bboxes
    ])
    probas_dnn = model_dnn.predict(digits.reshape(len(bboxes), 784), verbose=0)
    probas_cnn = model_cnn.predict(digits.reshape(len(bboxes), 28, 28, 1), verbose=0)

    draw = ImageDraw.Draw(draw_img)
    try:
        font = ImageFont.load_default()
    except (OSError, ImportError):
        # Best effort: font=None lets ImageDraw pick its own default.
        font = None

    rows = []
    seq_cnn, seq_dnn = [], []
    for idx, (bbox, proba_d, proba_c) in enumerate(
        zip(bboxes, probas_dnn, probas_cnn), start=1
    ):
        rmin, rmax, cmin, cmax, area = (int(v) for v in bbox)

        pred_d, conf_d = int(np.argmax(proba_d)), float(np.max(proba_d))
        pred_c, conf_c = int(np.argmax(proba_c)), float(np.max(proba_c))

        seq_dnn.append(str(pred_d))
        seq_cnn.append(str(pred_c))

        rows.append({
            "id": idx,
            "bbox": (rmin, cmin, rmax, cmax),
            "pred_dnn": pred_d, "conf_dnn": round(conf_d, 4),
            "pred_cnn": pred_c, "conf_cnn": round(conf_c, 4),
            "area": area
        })

        # rmax/cmax are exclusive, while rectangle() takes inclusive corners.
        draw.rectangle([cmin, rmin, cmax - 1, rmax - 1], outline=(0,255,0), width=2)
        draw.text((cmin+2, rmin+2), f"{pred_c}", fill=(255,255,0), font=font)

    annotated = np.array(draw_img)
    seq_text_cnn = "".join(seq_cnn)
    seq_text_dnn = "".join(seq_dnn)
    return annotated, rows, seq_text_cnn, seq_text_dnn

def make_black_canvas(h=360, w=720, channels=3):
    """Return an all-zero uint8 array of shape (h, w, channels)."""
    canvas_shape = (h, w, channels)
    return np.zeros(canvas_shape, dtype=np.uint8)

# Black background for the wide multi-digit canvas. Keyword arguments keep the
# height/width assignment unambiguous whatever the positional order of
# make_black_canvas is.
black_canvas_big = make_black_canvas(h=360, w=720, channels=3)

with gr.Blocks(title="🧩 多數字辨識（DNN 與 CNN）") as multi_digits_app:
    gr.Markdown("在左側**黑色大畫布**寫下多個數字（建議筆寬≈20、數字間留些間隔）。提交後右側會顯示：\
    1) 綠框＋CNN數字的標註圖，2) CNN 與 DNN 的由左到右預測序列。")

    with gr.Row():
        # Left: canvas + buttons
        with gr.Column(scale=1):
            if hasattr(gr, "ImageEditor"):
                canvas = gr.ImageEditor(
                    value=black_canvas_big,
                    height=380, width=760,
                    label="多數字畫布（黑底）：右上角選白色筆刷、筆寬約 20"
                )
            else:
                canvas = gr.Sketchpad(
                    height=380, width=760,
                    label="多數字畫布（若為白底，系統會自動反相）"
                )
            with gr.Row():
                btn_run = gr.Button("提交", variant="primary")
                btn_clear = gr.Button("清除", variant="secondary")

        # Right: results
        with gr.Column(scale=1):
            annotated_out = gr.Image(label="標註結果（綠框 + CNN 預測數字）")
            seq_out_cnn = gr.Textbox(label="由左到右的 CNN 預測序列", interactive=False)
            seq_out_dnn = gr.Textbox(label="由左到右的 DNN 預測序列", interactive=False)

    def _run_multi(drawing):
        """Run detection and keep only the values that have an output widget.

        detect_multi_digits also returns a per-digit table, but this UI has no
        table component, so that value is dropped here.
        """
        annotated, _rows, seq_text_cnn, seq_text_dnn = detect_multi_digits(drawing)
        return annotated, seq_text_cnn, seq_text_dnn

    # Submit: run inference and fill the three result widgets.
    btn_run.click(
        fn=_run_multi,
        inputs=canvas,
        outputs=[annotated_out, seq_out_cnn, seq_out_dnn]
    )

    def _reset_multi():
        # One value per output below: black canvas, no image, two empty strings.
        return black_canvas_big, None, "", ""
    btn_clear.click(
        fn=_reset_multi,
        inputs=None,
        outputs=[canvas, annotated_out, seq_out_cnn, seq_out_dnn]
    )

multi_digits_app.launch(debug=True)
