In [1]:
# train_gillam_w2v.py
# B) Word2Vec (Simple CBOW, no one-hot) + TwoLayerNet classifier
# - Word2Vec: 정수 ID + Embedding, full softmax
# - Classifier: TwoLayerNet (from scratch)
# - Feature: 각 아이(CHI 발화 전체)의 단어 임베딩 평균

import os
from pathlib import Path
from collections import Counter

import numpy as np
import pandas as pd

from utils import extract_utterances  # clean_text 포함된 Utterance 제공


# ============================================================
# 0. 공통 유틸 (softmax, cross-entropy 등)
# ============================================================

def softmax(x):
    if x.ndim == 2:
        x = x - x.max(axis=1, keepdims=True)
        y = np.exp(x)
        y /= y.sum(axis=1, keepdims=True)
    else:
        x = x - np.max(x)
        y = np.exp(x) / np.sum(np.exp(x))
    return y


def cross_entropy_error(y, t):
    """y: softmax 출력, t: one-hot 또는 정수 라벨"""
    if y.ndim == 1:
        y = y.reshape(1, -1)
        t = t.reshape(1, -1)

    # one-hot이면 정수 라벨로 변환
    if t.size == y.size:
        t = t.argmax(axis=1)

    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size


# ============================================================
# 1. 레이어들 (Embedding, MatMul, Affine, Sigmoid, SoftmaxWithLoss)
# ============================================================

class Embedding:
    def __init__(self, W):
        # W: (V, D)
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, idx):
        # idx: (N,)
        W, = self.params
        self.idx = idx
        out = W[idx]  # (N, D)
        return out

    def backward(self, dout):
        # dout: (N, D)
        W, = self.params
        dW = self.grads[0]
        dW[...] = 0  # 초기화
        np.add.at(dW, self.idx, dout)  # 같은 단어 여러 번 등장할 수 있으므로 add.at 사용
        return None


class MatMul:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.x = None

    def forward(self, x):
        W, = self.params
        self.x = x
        out = x.dot(W)
        return out

    def backward(self, dout):
        W, = self.params
        dx = dout.dot(W.T)
        dW = self.x.T.dot(dout)
        self.grads[0][...] = dW
        return dx


class Affine:
    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]
        self.x = None

    def forward(self, x):
        W, b = self.params
        self.x = x
        out = x.dot(W) + b
        return out

    def backward(self, dout):
        W, b = self.params
        dx = dout.dot(W.T)
        dW = self.x.T.dot(dout)
        db = np.sum(dout, axis=0)

        self.grads[0][...] = dW
        self.grads[1][...] = db
        return dx


class Sigmoid:
    def __init__(self):
        self.params = []
        self.grads = []
        self.out = None

    def forward(self, x):
        out = 1 / (1 + np.exp(-x))
        self.out = out
        return out

    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out
        return dx


class SoftmaxWithLoss:
    def __init__(self):
        self.params = []
        self.grads = []
        self.y = None
        self.t = None

    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        loss = cross_entropy_error(self.y, self.t)
        return loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]

        # one-hot 인 경우
        if self.t.size == self.y.size:
            dx = (self.y - self.t) / batch_size
        else:
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx /= batch_size

        return dx * dout


# ============================================================
# 2. Optimizer (SGD)
# ============================================================

class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        for p, g in zip(params, grads):
            p -= self.lr * g


# ============================================================
# 3. TwoLayerNet (Classifier)
# ============================================================

class TwoLayerNet:
    """
    입력층 - 은닉층 - 출력층 2층 신경망 (classifier)
    """
    def __init__(self, input_size, hidden_size, output_size):
        I, H, O = input_size, hidden_size, output_size

        W1 = 0.01 * np.random.randn(I, H).astype(np.float32)
        b1 = np.zeros(H, dtype=np.float32)
        W2 = 0.01 * np.random.randn(H, O).astype(np.float32)
        b2 = np.zeros(O, dtype=np.float32)

        self.layers = [
            Affine(W1, b1),
            Sigmoid(),
            Affine(W2, b2)
        ]
        self.loss_layer = SoftmaxWithLoss()

        self.params = []
        self.grads = []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, x):
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def forward(self, x, t):
        score = self.predict(x)
        loss = self.loss_layer.forward(score, t)
        return loss

    def backward(self, dout=1):
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout


# ============================================================
# 4. Simple CBOW (Word2Vec, no one-hot)
# ============================================================

class SimpleCBOW:
    """
    window_size = 1 가정 (양 옆 1개씩, context 2개)
    contexts: (N, 2) 정수 ID
    target  : (N,)   정수 ID
    """
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        W_in = 0.01 * np.random.randn(V, H).astype(np.float32)
        W_out = 0.01 * np.random.randn(H, V).astype(np.float32)

        self.in_layer0 = Embedding(W_in)
        self.in_layer1 = Embedding(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        layers = [self.in_layer0, self.in_layer1, self.out_layer]
        self.params = []
        self.grads = []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        self.word_vecs = W_in  # 입력 가중치가 곧 단어 임베딩

    def forward(self, contexts, target):
        # contexts: (N, 2) int
        # target  : (N,)   int
        h0 = self.in_layer0.forward(contexts[:, 0])
        h1 = self.in_layer1.forward(contexts[:, 1])
        h = 0.5 * (h0 + h1)  # (N, H)
        score = self.out_layer.forward(h)  # (N, V)
        loss = self.loss_layer.forward(score, target)
        return loss

    def backward(self, dout=1):
        ds = self.loss_layer.backward(dout)      # (N, V)
        dh = self.out_layer.backward(ds)         # (N, H)
        dh *= 0.5
        self.in_layer1.backward(dh)              # in_layer backward는 dW만 누적, dx는 사용 안 함
        self.in_layer0.backward(dh)
        return None


# ============================================================
# 5. Word2Vec 데이터 유틸
# ============================================================

def build_vocab(texts, min_freq=1, max_size=None):
    counter = Counter()
    for t in texts:
        counter.update(t.lower().split())

    vocab = ["<unk>"]
    for w, c in counter.most_common():
        if c < min_freq:
            continue
        vocab.append(w)
        if max_size is not None and len(vocab) >= max_size:
            break

    word_to_id = {w: i for i, w in enumerate(vocab)}
    return vocab, word_to_id


def build_corpus(texts, word_to_id):
    """
    여러 텍스트를 하나의 corpus로 이어붙인 단어 ID 시퀀스 (1D ndarray) 생성
    """
    corpus = []
    unk_id = word_to_id["<unk>"]
    for t in texts:
        for w in t.lower().split():
            corpus.append(word_to_id.get(w, unk_id))
    return np.array(corpus, dtype=np.int64)


def create_contexts_target(corpus, window_size=1):
    """
    window_size=1 기준 CBOW용 contexts, target 생성
    contexts: (N, 2) 정수 ID
    target  : (N,)   정수 ID
    """
    contexts = []
    targets = []
    N = len(corpus)
    for idx in range(window_size, N - window_size):
        target = corpus[idx]
        left = corpus[idx - window_size]
        right = corpus[idx + window_size]
        contexts.append([left, right])
        targets.append(target)
    return np.array(contexts, dtype=np.int64), np.array(targets, dtype=np.int64)


def train_word2vec(texts, vocab, word_to_id,
                   embedding_dim=50,
                   window_size=1,
                   lr=0.1,
                   batch_size=64,
                   max_epoch=10):
    """
    train_texts(=주로 train split)로 Word2Vec(Simple CBOW) 학습
    one-hot을 전혀 만들지 않으므로 메모리 사용량이 훨씬 적음.
    """
    print("Building corpus for Word2Vec...")
    corpus = build_corpus(texts, word_to_id)
    vocab_size = len(vocab)
    print(f"Corpus length: {len(corpus)}, Vocab size: {vocab_size}")

    contexts_ids, target_ids = create_contexts_target(corpus, window_size=window_size)
    print(f"Training pairs: {len(target_ids)}")

    model = SimpleCBOW(vocab_size, embedding_dim)
    optimizer = SGD(lr=lr)

    data_size = target_ids.shape[0]
    max_iters = max(1, data_size // batch_size)

    for epoch in range(max_epoch):
        idx = np.random.permutation(data_size)
        contexts_ids = contexts_ids[idx]
        target_ids_shuf = target_ids[idx]

        total_loss, loss_cnt = 0.0, 0

        for it in range(max_iters):
            batch_ctx = contexts_ids[it * batch_size:(it + 1) * batch_size]
            batch_tgt = target_ids_shuf[it * batch_size:(it + 1) * batch_size]

            loss = model.forward(batch_ctx, batch_tgt)
            model.backward()
            optimizer.update(model.params, model.grads)

            total_loss += loss
            loss_cnt += 1

        avg_loss = total_loss / max(1, loss_cnt)
        print(f"[Word2Vec][Epoch {epoch+1:02d}] loss={avg_loss:.4f}")

    print("Word2Vec training done.")
    return model.word_vecs  # (V, embedding_dim)


def text_to_w2v_vec(text, word_to_id, word_vecs):
    """
    한 텍스트를 단어 임베딩 평균으로 변환
    """
    tokens = text.lower().split()
    ids = []
    unk_id = word_to_id["<unk>"]
    for w in tokens:
        ids.append(word_to_id.get(w, unk_id))

    if len(ids) == 0:
        return np.zeros(word_vecs.shape[1], dtype=np.float32)
    ids = np.array(ids, dtype=np.int64)
    vecs = word_vecs[ids]  # (L, D)
    return vecs.mean(axis=0)


def build_w2v_matrix(texts, labels, word_to_id, word_vecs):
    X = np.stack([text_to_w2v_vec(t, word_to_id, word_vecs) for t in texts])  # (N, D)
    T = np.eye(2, dtype=np.float32)[labels]  # one-hot (SLI=0, TD=1)
    return X, T


# ============================================================
# 6. Gillam 데이터 로더 (CHA → 텍스트)
# ============================================================

def load_child_text(cha_path: Path, speakers=('CHI',)) -> str:
    utts = extract_utterances(str(cha_path), list(speakers))
    texts = [u.clean_text for u in utts if u.clean_text]
    return " ".join(texts)


def load_split_csv(csv_path: Path, base_dir: Path, speakers=('CHI',)):
    df = pd.read_csv(csv_path)

    texts = []
    labels = []

    label_to_idx = {"SLI": 0, "TD": 1}

    for _, row in df.iterrows():
        cha_rel = row["filename"]          # 예: 'gillam/SLI/5m/xxx.cha'
        cha_path = (base_dir / cha_rel).resolve()

        text = load_child_text(cha_path, speakers=speakers)

        if len(text.strip()) == 0:
            print(f"[WARN] Empty utterance: {cha_path}")
            continue

        texts.append(text)
        labels.append(label_to_idx[row["group"]])

    labels = np.array(labels, dtype=np.int64)
    return texts, labels


# ============================================================
# 7. 평가
# ============================================================

def evaluate_accuracy(model, X, T):
    y_true = np.argmax(T, axis=1)
    scores = model.predict(X)
    y_pred = np.argmax(scores, axis=1)
    acc = (y_true == y_pred).mean()
    return acc, y_true, y_pred


def print_classification_metrics(y_true, y_pred, label_names=("SLI", "TD")):
    """
    accuracy, confusion matrix, per-class precision/recall/F1, macro F1 출력
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # 전체 accuracy
    acc = (y_true == y_pred).mean()

    # confusion matrix (2x2)
    # 가정: 0 → SLI, 1 → TD
    tp_1 = np.sum((y_true == 1) & (y_pred == 1))
    tn_1 = np.sum((y_true == 0) & (y_pred == 0))
    fp_1 = np.sum((y_true == 0) & (y_pred == 1))
    fn_1 = np.sum((y_true == 1) & (y_pred == 0))

    print("\nConfusion Matrix (rows = true, cols = pred):")
    print("          pred_SLI   pred_TD")
    print(f"true_SLI   {tn_1:7d}   {fp_1:7d}")
    print(f"true_TD    {fn_1:7d}   {tp_1:7d}")

    # per-class metrics
    per_class = []
    for i, label in enumerate(label_names):
        tp = np.sum((y_true == i) & (y_pred == i))
        fp = np.sum((y_true != i) & (y_pred == i))
        fn = np.sum((y_true == i) & (y_pred != i))
        support = np.sum(y_true == i)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall    = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1        = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0

        per_class.append((label, precision, recall, f1, support))

    macro_precision = np.mean([x[1] for x in per_class])
    macro_recall    = np.mean([x[2] for x in per_class])
    macro_f1        = np.mean([x[3] for x in per_class])

    print("\nPer-class metrics:")
    print("Class   Precision   Recall   F1-score   Support")
    for label, p, r, f1, sup in per_class:
        print(f"{label:5s}  {p:9.4f}  {r:7.4f}  {f1:9.4f}   {sup:7d}")

    print("\nMacro-averaged:")
    print(f"Precision: {macro_precision:.4f}, Recall: {macro_recall:.4f}, F1: {macro_f1:.4f}")
    print(f"\nOverall Accuracy: {acc:.4f}")


# ============================================================
# 8. 메인
# ============================================================

def main():
    base_dir = Path(".").resolve()

    train_csv = base_dir / "gillam_train.csv"
    dev_csv   = base_dir / "gillam_dev.csv"
    test_csv  = base_dir / "gillam_test.csv"

    # 1) train/dev/test 텍스트 & 라벨 로딩
    print("Loading train/dev/test splits ...")
    train_texts, train_labels = load_split_csv(train_csv, base_dir, speakers=("CHI",))
    dev_texts, dev_labels     = load_split_csv(dev_csv, base_dir, speakers=("CHI",))
    test_texts, test_labels   = load_split_csv(test_csv, base_dir, speakers=("CHI",))

    print(f"Train subjects: {len(train_texts)}, Dev: {len(dev_texts)}, Test: {len(test_texts)}")

    # 2) train 텍스트 기준 vocab 생성
    vocab, word_to_id = build_vocab(train_texts, min_freq=1, max_size=None)
    print(f"Vocab size: {len(vocab)}")

    # 3) Word2Vec(Simple CBOW) 학습 (train 텍스트만 사용)
    embedding_dim = 50
    w2v_lr = 0.05          # 살짝 낮춰도 됨
    w2v_batch_size = 128   # 배치 크기 키워도 메모리 OK
    w2v_epochs = 5         # 처음엔 5 정도로 짧게

    word_vecs = train_word2vec(
        train_texts,
        vocab,
        word_to_id,
        embedding_dim=embedding_dim,
        window_size=1,
        lr=w2v_lr,
        batch_size=w2v_batch_size,
        max_epoch=w2v_epochs
    )

    # 4) 각 subject 텍스트를 임베딩 평균 벡터로 변환
    X_train, T_train = build_w2v_matrix(train_texts, train_labels, word_to_id, word_vecs)
    X_dev, T_dev     = build_w2v_matrix(dev_texts, dev_labels, word_to_id, word_vecs)
    X_test, T_test   = build_w2v_matrix(test_texts, test_labels, word_to_id, word_vecs)

    input_size = embedding_dim         # Word2Vec 차원
    hidden_size = 100                  # classifier hidden size
    output_size = 2                    # SLI / TD

    print(f"Classifier input size: {input_size}, hidden size: {hidden_size}")

    model = TwoLayerNet(input_size=input_size,
                        hidden_size=hidden_size,
                        output_size=output_size)
    optimizer = SGD(lr=0.1)

    max_epoch = 30
    batch_size = 8
    data_size = X_train.shape[0]
    max_iters = max(1, data_size // batch_size)

    # 5) classifier 학습 루프
    for epoch in range(max_epoch):
        idx = np.random.permutation(data_size)
        X_train = X_train[idx]
        T_train = T_train[idx]

        total_loss, loss_cnt = 0.0, 0

        for it in range(max_iters):
            batch_x = X_train[it * batch_size:(it + 1) * batch_size]
            batch_t = T_train[it * batch_size:(it + 1) * batch_size]

            loss = model.forward(batch_x, batch_t)
            model.backward()
            optimizer.update(model.params, model.grads)

            total_loss += loss
            loss_cnt += 1

        avg_loss = total_loss / max(1, loss_cnt)

        dev_acc, _, _ = evaluate_accuracy(model, X_dev, T_dev)
        print(f"[Classifier][Epoch {epoch+1:02d}] loss={avg_loss:.4f}, dev_acc={dev_acc:.4f}")

    # 6) 최종 test 평가 + metrics 출력
    test_acc, y_true, y_pred = evaluate_accuracy(model, X_test, T_test)
    print(f"\n[TEST] accuracy = {test_acc:.4f}")
    print_classification_metrics(y_true, y_pred, label_names=("SLI", "TD"))


if __name__ == "__main__":
    main()


Loading train/dev/test splits ...
Train subjects: 540, Dev: 68, Test: 68
Vocab size: 3700
Building corpus for Word2Vec...
Corpus length: 203605, Vocab size: 3700
Training pairs: 203603
[Word2Vec][Epoch 01] loss=8.2143
[Word2Vec][Epoch 02] loss=8.1877
[Word2Vec][Epoch 03] loss=7.7430
[Word2Vec][Epoch 04] loss=7.0919
[Word2Vec][Epoch 05] loss=6.6839
Word2Vec training done.
Classifier input size: 50, hidden size: 100
[Classifier][Epoch 01] loss=0.6236, dev_acc=0.7353
[Classifier][Epoch 02] loss=0.6269, dev_acc=0.7353
[Classifier][Epoch 03] loss=0.6452, dev_acc=0.7353
[Classifier][Epoch 04] loss=0.6128, dev_acc=0.7353
[Classifier][Epoch 05] loss=0.6413, dev_acc=0.7353
[Classifier][Epoch 06] loss=0.6133, dev_acc=0.7353
[Classifier][Epoch 07] loss=0.6496, dev_acc=0.7353
[Classifier][Epoch 08] loss=0.6476, dev_acc=0.7353
[Classifier][Epoch 09] loss=0.6143, dev_acc=0.2647
[Classifier][Epoch 10] loss=0.6226, dev_acc=0.7353
[Classifier][Epoch 11] loss=0.6236, dev_acc=0.7353
[Classifier][Epoch 12