In [3]:
# train_gillam_bow.py
# baseline: Bag-of-Words + TwoLayerNet (from scratch, fully in one file)

import os
from pathlib import Path
from collections import Counter

import numpy as np
import pandas as pd

from utils import extract_utterances  # clean_text 포함된 Utterance 제공


# ============================================================
# 0. 순수 NumPy로 구현한 신경망 구성요소들 (layers, optimizer, TwoLayerNet)
# ============================================================

# --- basic functions ---

def softmax(x):
    """Return the softmax of ``x`` taken along its last axis.

    Args:
        x: Array-like of scores. A 1-D input is treated as a single
            distribution; for 2-D and higher inputs every slice along the
            last axis is normalised independently.

    Returns:
        An array with the same shape as ``x`` whose entries along the last
        axis are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # A 0-d input has no axis to reduce over, so reduce over everything.
    axis = -1 if x.ndim > 0 else None
    # Subtracting the maximum does not change the result but keeps exp()
    # from overflowing on large scores.
    exps = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exps / np.sum(exps, axis=axis, keepdims=True)


def cross_entropy_error(y, t):
    """Mean cross-entropy between predicted probabilities and targets.

    Args:
        y: Softmax output, either one sample (1-D) or a batch (2-D).
        t: Targets, either one-hot (same number of elements as ``y``) or
            integer class labels.

    Returns:
        The negative log-probability of the target class, averaged over
        the batch.
    """
    # Promote a single sample to a batch of one.
    if y.ndim == 1:
        y, t = y.reshape(1, -1), t.reshape(1, -1)

    # One-hot targets have as many elements as y; reduce them to class ids.
    class_ids = t.argmax(axis=1) if t.size == y.size else t

    n_samples = y.shape[0]
    target_probs = y[np.arange(n_samples), class_ids]
    # The small constant guards against log(0).
    return -np.sum(np.log(target_probs + 1e-7)) / n_samples


# --- layers ---

class Affine:
    """Fully connected layer computing ``x . W + b``.

    ``params`` holds ``[W, b]`` and ``grads`` holds matching gradient
    buffers that ``backward`` overwrites in place.
    """

    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]
        self.x = None  # input of the most recent forward pass

    def forward(self, x):
        """Apply the affine map to ``x`` and remember ``x`` for backward."""
        weight, bias = self.params
        self.x = x
        return np.dot(x, weight) + bias

    def backward(self, dout):
        """Store the parameter gradients and return the input gradient."""
        weight, _ = self.params
        grad_input = np.dot(dout, weight.T)
        grad_weight = np.dot(self.x.T, dout)
        grad_bias = dout.sum(axis=0)

        # Write into the existing buffers so outside references stay valid.
        weight_buf, bias_buf = self.grads
        weight_buf[...] = grad_weight
        bias_buf[...] = grad_bias
        return grad_input


class Sigmoid:
    """Element-wise logistic sigmoid layer (no trainable parameters)."""

    def __init__(self):
        self.params = []
        self.grads = []
        self.out = None  # output of the most recent forward pass

    def forward(self, x):
        """Return ``1 / (1 + exp(-x))`` without overflowing for large ``|x|``.

        The naive formula evaluates ``exp(-x)``, which overflows for large
        negative ``x``. Here the exponent is always non-positive, so the
        intermediate value stays within (0, 1].
        """
        decay = np.exp(-np.abs(x))
        # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- same function.
        out = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
        self.out = out
        return out

    def backward(self, dout):
        """Scale ``dout`` by the sigmoid derivative ``out * (1 - out)``."""
        return dout * (1.0 - self.out) * self.out


class SoftmaxWithLoss:
    """Softmax activation fused with the cross-entropy loss."""

    def __init__(self):
        self.params = []
        self.grads = []
        self.y = None  # softmax output of the last forward pass
        self.t = None  # targets of the last forward pass (one-hot or integer labels)

    def forward(self, x, t):
        """Return the cross-entropy loss of scores ``x`` against targets ``t``."""
        self.t = t
        self.y = softmax(x)
        return cross_entropy_error(self.y, self.t)

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the scores.

        The batch size is taken from the stored softmax output, so a single
        1-D sample is handled as a batch of one. Previously the batch size
        came from ``t.shape[0]``, which for a 1-D one-hot target is the
        number of classes and for a scalar label does not exist.
        """
        probs = self.y
        targets = np.asarray(self.t)

        single_sample = probs.ndim == 1
        if single_sample:
            probs = probs.reshape(1, -1)
            targets = targets.reshape(1, -1)
        batch_size = probs.shape[0]

        if targets.size == probs.size:
            # One-hot targets.
            dx = (probs - targets) / batch_size
        else:
            # Integer labels: subtract 1 at each sample's target class.
            dx = probs.copy()
            dx[np.arange(batch_size), targets] -= 1
            dx /= batch_size

        if single_sample:
            # Hand back the same 1-D shape the caller passed in.
            dx = dx.reshape(-1)
        return dx * dout


# --- optimizer ---

class SGD:
    """Plain stochastic gradient descent with a fixed learning rate."""

    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        """Step every parameter in place against its paired gradient."""
        step_size = self.lr
        for weight, gradient in zip(params, grads):
            # In-place subtraction keeps the arrays shared with the layers.
            weight -= step_size * gradient


# --- TwoLayerNet ---

class TwoLayerNet:
    """
    Two-layer network: Affine -> Sigmoid -> Affine, trained with a
    softmax cross-entropy loss (hand-written, no framework).
    """

    def __init__(self, input_size, hidden_size, output_size):
        def small_random(rows, cols):
            # Small Gaussian initial weights.
            return 0.01 * np.random.randn(rows, cols).astype(np.float32)

        hidden_affine = Affine(
            small_random(input_size, hidden_size),
            np.zeros(hidden_size, dtype=np.float32),
        )
        output_affine = Affine(
            small_random(hidden_size, output_size),
            np.zeros(output_size, dtype=np.float32),
        )

        self.layers = [hidden_affine, Sigmoid(), output_affine]
        self.loss_layer = SoftmaxWithLoss()

        # Flat lists of every layer's parameters and gradient buffers.
        self.params = [p for layer in self.layers for p in layer.params]
        self.grads = [g for layer in self.layers for g in layer.grads]

    def predict(self, x):
        """Run ``x`` through every layer and return the raw class scores."""
        activation = x
        for layer in self.layers:
            activation = layer.forward(activation)
        return activation

    def forward(self, x, t):
        """Return the loss of the network's scores for ``x`` against ``t``."""
        return self.loss_layer.forward(self.predict(x), t)

    def backward(self, dout=1):
        """Back-propagate from the loss layer through the layers in reverse."""
        gradient = self.loss_layer.backward(dout)
        for layer in self.layers[::-1]:
            gradient = layer.backward(gradient)
        return gradient


# ============================================================
# 1. CHA → 한 아이의 텍스트로 변환
# ============================================================

def load_child_text(cha_path: Path, speakers=('CHI',)) -> str:
    """
    Return the clean_text of every utterance by the given speakers in one
    .cha file, joined with single spaces.

    Utterances whose clean_text is empty are dropped, so a file with no
    usable utterances yields an empty string.
    """
    utterances = extract_utterances(str(cha_path), list(speakers))
    return " ".join(utt.clean_text for utt in utterances if utt.clean_text)


# ============================================================
# 2. CSV에서 subjects 로딩
# ============================================================

def load_split_csv(csv_path: Path, base_dir: Path, speakers=('CHI',)):
    """
    Read a split CSV (gillam_train/dev/test.csv) and build one text per row.

    Each row's "filename" column is resolved against base_dir and loaded
    with load_child_text; its "group" column ("SLI" or "TD") becomes the
    integer label 0 or 1. Rows that produce only whitespace are reported
    and skipped.

    Returns (texts, labels), where labels is an int64 array.
    """
    group_to_id = {"SLI": 0, "TD": 1}
    table = pd.read_csv(csv_path)

    kept_texts, kept_labels = [], []
    for _, record in table.iterrows():
        # e.g. 'gillam/SLI/5m/xxx.cha'
        cha_file = (base_dir / record["filename"]).resolve()
        transcript = load_child_text(cha_file, speakers=speakers)

        if transcript.strip():
            kept_texts.append(transcript)
            kept_labels.append(group_to_id[record["group"]])
        else:
            # Skip subjects with no usable speech (could be made an assert).
            print(f"[WARN] Empty utterance: {cha_file}")

    return kept_texts, np.array(kept_labels, dtype=np.int64)


# ============================================================
# 3. Vocab + Bag-of-Words
# ============================================================

def build_vocab(texts, min_freq=1, max_size=None):
    """Build a frequency-ordered vocabulary from whitespace-split texts.

    Args:
        texts: Iterable of strings; each is lower-cased and split on
            whitespace.
        min_freq: Tokens occurring fewer times than this are left out.
        max_size: Upper bound on the vocabulary size, counting the
            ``<unk>`` entry. ``None`` means unbounded. ``<unk>`` is always
            present, so the result never has fewer than one entry.

    Returns:
        ``(vocab, word_to_id)``: the token list with ``"<unk>"`` at index 0
        followed by tokens in descending frequency, and the matching
        token -> index mapping.
    """
    counter = Counter()
    for text in texts:
        counter.update(text.lower().split())

    # "<unk>" is reserved for index 0. If the corpus contains that literal
    # token, adding it again would leave vocab longer than word_to_id and
    # produce ids beyond the BoW vector length.
    counter.pop("<unk>", None)

    vocab = ["<unk>"]
    for word, count in counter.most_common():
        if count < min_freq:
            # most_common() is sorted by count, so nothing later qualifies.
            break
        # Check before appending so max_size is never exceeded.
        if max_size is not None and len(vocab) >= max_size:
            break
        vocab.append(word)

    word_to_id = {word: index for index, word in enumerate(vocab)}
    return vocab, word_to_id


def text_to_bow(text, word_to_id):
    """Return a float32 count vector of ``text`` over ``word_to_id``.

    The text is lower-cased and split on whitespace; tokens missing from
    ``word_to_id`` are counted under the ``"<unk>"`` entry.
    """
    tokens = text.lower().split()
    slots = [word_to_id.get(token, word_to_id["<unk>"]) for token in tokens]

    bow = np.zeros(len(word_to_id), dtype=np.float32)
    for slot in slots:
        bow[slot] += 1.0
    return bow


def build_bow_matrix(texts, labels, word_to_id):
    """Return the stacked BoW matrix and one-hot targets for a split.

    X has one row per text (N, V); T is the two-class one-hot encoding of
    ``labels`` (SLI=0, TD=1).
    """
    rows = [text_to_bow(document, word_to_id) for document in texts]
    features = np.stack(rows, axis=0)
    one_hot_table = np.identity(2, dtype=np.float32)
    targets = one_hot_table[labels]
    return features, targets


# ============================================================
# 4. 학습 & 평가 유틸
# ============================================================

def evaluate_accuracy(model, X, T):
    """Score ``model`` on ``X`` against one-hot targets ``T``.

    Returns ``(accuracy, y_true, y_pred)`` where the label arrays are the
    row-wise argmax of ``T`` and of ``model.predict(X)``.
    """
    y_true = np.argmax(T, axis=1)
    y_pred = np.argmax(model.predict(X), axis=1)
    hits = y_true == y_pred
    return np.mean(hits), y_true, y_pred


def print_classification_metrics(y_true, y_pred, label_names=("SLI", "TD")):
    """
    Print accuracy, a confusion matrix, per-class precision/recall/F1 and
    their macro averages.

    Args:
        y_true: Integer class ids of the ground truth.
        y_pred: Integer class ids predicted by the model.
        label_names: Display name for each class; class ``i`` is
            ``label_names[i]``. The confusion matrix and the per-class
            table both follow this sequence, so any number of classes is
            supported.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    names = [str(name) for name in label_names]
    num_classes = len(names)

    # Overall accuracy.
    acc = (y_true == y_pred).mean()

    # confusion[i, j] = number of samples of true class i predicted as j.
    confusion = np.zeros((num_classes, num_classes), dtype=np.int64)
    for i in range(num_classes):
        for j in range(num_classes):
            confusion[i, j] = np.sum((y_true == i) & (y_pred == j))

    row_labels = [f"true_{name}" for name in names]
    label_width = max((len(label) for label in row_labels), default=0)

    print("\nConfusion Matrix (rows = true, cols = pred):")
    # Spacing is chosen to reproduce the original two-class SLI/TD layout.
    print(" " * (label_width + 2) + "   ".join(f"pred_{name}" for name in names))
    for row_label, counts in zip(row_labels, confusion):
        cells = "   ".join(f"{int(count):7d}" for count in counts)
        print(f"{row_label:<{label_width}}   {cells}")

    # Per-class metrics; a zero denominator yields 0.0 rather than an error.
    per_class = []
    for i, label in enumerate(names):
        tp = np.sum((y_true == i) & (y_pred == i))
        fp = np.sum((y_true != i) & (y_pred == i))
        fn = np.sum((y_true == i) & (y_pred != i))
        support = np.sum(y_true == i)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = (
            2 * precision * recall / (precision + recall)
            if (precision + recall) > 0
            else 0.0
        )

        per_class.append((label, precision, recall, f1, support))

    macro_precision = np.mean([row[1] for row in per_class])
    macro_recall = np.mean([row[2] for row in per_class])
    macro_f1 = np.mean([row[3] for row in per_class])

    print("\nPer-class metrics:")
    print("Class   Precision   Recall   F1-score   Support")
    for label, p, r, f1, sup in per_class:
        print(f"{label:5s}  {p:9.4f}  {r:7.4f}  {f1:9.4f}   {sup:7d}")

    print("\nMacro-averaged:")
    print(f"Precision: {macro_precision:.4f}, Recall: {macro_recall:.4f}, F1: {macro_f1:.4f}")
    print(f"\nOverall Accuracy: {acc:.4f}")


# ============================================================
# 5. 메인: 학습 루프
# ============================================================

def main():
    """Train the BoW + TwoLayerNet baseline and report dev/test results.

    Reads gillam_train/dev/test.csv from the current working directory,
    builds the vocabulary from the training split only, trains with
    mini-batch SGD while printing the dev accuracy after every epoch, and
    finally prints the test accuracy and classification metrics.
    """
    # Paths are relative to the working directory (e.g. run from /DL_project2/gillam).
    root = Path(".").resolve()
    train_path = root / "gillam_train.csv"
    dev_path = root / "gillam_dev.csv"
    test_path = root / "gillam_test.csv"

    # 1) Load texts and labels for each split (CHA -> clean_text).
    print("Loading train/dev/test splits ...")
    chi_only = ("CHI",)
    train_texts, train_labels = load_split_csv(train_path, root, speakers=chi_only)
    dev_texts, dev_labels = load_split_csv(dev_path, root, speakers=chi_only)
    test_texts, test_labels = load_split_csv(test_path, root, speakers=chi_only)

    print(f"Train subjects: {len(train_texts)}, Dev: {len(dev_texts)}, Test: {len(test_texts)}")

    # 2) The vocabulary comes from the training split only.
    vocab, word_to_id = build_vocab(train_texts, min_freq=1, max_size=None)
    print(f"Vocab size: {len(vocab)}")

    # 3) Convert every split to a BoW matrix with one-hot targets.
    X_train, T_train = build_bow_matrix(train_texts, train_labels, word_to_id)
    X_dev, T_dev = build_bow_matrix(dev_texts, dev_labels, word_to_id)
    X_test, T_test = build_bow_matrix(test_texts, test_labels, word_to_id)

    input_size = X_train.shape[1]  # vocabulary size
    hidden_size = 100              # untuned starting point
    output_size = 2                # SLI / TD

    print(f"Input size: {input_size}, Hidden size: {hidden_size}, Output size: {output_size}")

    # 4) Model and optimizer.
    model = TwoLayerNet(
        input_size=input_size,
        hidden_size=hidden_size,
        output_size=output_size,
    )
    optimizer = SGD(lr=0.1)

    # 5) Training hyperparameters.
    max_epoch = 30
    batch_size = 8

    data_size = X_train.shape[0]
    # Whole batches only; any remainder is left out of the epoch.
    max_iters = max(1, data_size // batch_size)

    # 6) Training loop.
    for epoch_no in range(1, max_epoch + 1):
        # Reshuffle the training set at the start of every epoch.
        order = np.random.permutation(data_size)
        X_train, T_train = X_train[order], T_train[order]

        loss_sum, n_batches = 0.0, 0
        for start in range(0, max_iters * batch_size, batch_size):
            stop = start + batch_size
            loss = model.forward(X_train[start:stop], T_train[start:stop])
            model.backward()
            optimizer.update(model.params, model.grads)

            loss_sum += loss
            n_batches += 1

        avg_loss = loss_sum / max(1, n_batches)

        # Report the dev accuracy at the end of each epoch.
        dev_acc, _, _ = evaluate_accuracy(model, X_dev, T_dev)
        print(f"[Epoch {epoch_no:02d}] loss={avg_loss:.4f}, dev_acc={dev_acc:.4f}")

    # 7) Final evaluation on the test split.
    test_acc, y_true, y_pred = evaluate_accuracy(model, X_test, T_test)
    print(f"\n[TEST] accuracy = {test_acc:.4f}")
    print_classification_metrics(y_true, y_pred, label_names=("SLI", "TD"))


# Run the training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()


Loading train/dev/test splits ...
Train subjects: 540, Dev: 68, Test: 68
Vocab size: 3700
Input size: 3700, Hidden size: 100, Output size: 2
[Epoch 01] loss=0.5827, dev_acc=0.7500
[Epoch 02] loss=0.5550, dev_acc=0.7500
[Epoch 03] loss=0.4818, dev_acc=0.7500
[Epoch 04] loss=0.4826, dev_acc=0.8088
[Epoch 05] loss=0.4789, dev_acc=0.8235
[Epoch 06] loss=0.4488, dev_acc=0.6618
[Epoch 07] loss=0.4757, dev_acc=0.7353
[Epoch 08] loss=0.4411, dev_acc=0.7353
[Epoch 09] loss=0.4226, dev_acc=0.7941
[Epoch 10] loss=0.4288, dev_acc=0.7941
[Epoch 11] loss=0.4231, dev_acc=0.7941
[Epoch 12] loss=0.4016, dev_acc=0.8382
[Epoch 13] loss=0.4092, dev_acc=0.7647
[Epoch 14] loss=0.4214, dev_acc=0.7500
[Epoch 15] loss=0.4005, dev_acc=0.7500
[Epoch 16] loss=0.3929, dev_acc=0.7941
[Epoch 17] loss=0.3716, dev_acc=0.8088
[Epoch 18] loss=0.3525, dev_acc=0.8088
[Epoch 19] loss=0.3975, dev_acc=0.7353
[Epoch 20] loss=0.3629, dev_acc=0.8382
[Epoch 21] loss=0.3489, dev_acc=0.8235
[Epoch 22] loss=0.3828, dev_acc=0.8235
[