# 第8章: ニューラルネット

## 70. 単語埋め込みの読み込み

In [None]:
import numpy as np
from gensim.models import KeyedVectors

# Pretrained word2vec vectors, stored in the binary word2vec format.
model = KeyedVectors.load_word2vec_format(
    "../ch06/GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)

# One extra row on top of the word2vec vocabulary: id 0 is the padding token.
embedding_dim = model.vector_size
vocab_size = 1 + len(model.key_to_index)

# Row 0 is left as zeros for <PAD>; the loop below fills rows 1 onwards.
embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype=np.float32)

id2token = {0: "<PAD>"}
token2id = {"<PAD>": 0}

# Ids follow the iteration order of key_to_index, shifted by one for <PAD>.
for idx, word in enumerate(model.key_to_index, start=1):
    token2id[word], id2token[idx] = idx, word
    embedding_matrix[idx] = model[word]

print(f"Embedding matrix shape: {embedding_matrix.shape}")

## 71. データセットの読み込み

In [None]:
import pandas as pd
import torch
from tqdm import tqdm
from gensim.models import KeyedVectors

# Pretrained word2vec vectors; only the vocabulary is used in this cell.
model = KeyedVectors.load_word2vec_format(
    "../ch06/GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)

# Id 0 is reserved for padding; words get ids 1.. in key_to_index order.
id2token = {0: "<PAD>"}
token2id = {"<PAD>": 0}
for idx, word in enumerate(model.key_to_index, start=1):
    id2token[idx] = word
    token2id[word] = idx
    
def load_sst(path):
    """Load an SST-2 style TSV file as a list of tensor examples.

    The file is read with pandas (tab-separated) and must provide
    ``sentence`` and ``label`` columns.  Each sentence is split on
    whitespace and mapped through the module-level ``token2id``; tokens
    that are not in the mapping are dropped, and sentences left without
    any known token are skipped.

    Args:
        path: Location of the TSV file.

    Returns:
        A list of dicts with the keys ``"text"`` (the raw sentence),
        ``"label"`` (float32 tensor of shape ``(1,)``) and ``"input_ids"``
        (1-D int64 tensor of token ids).
    """
    df = pd.read_csv(path, sep="\t")
    examples = []
    # Walk the two columns directly instead of ``df.iterrows()``, which
    # builds a pandas Series for every single row.
    rows = zip(df["sentence"], df["label"])
    for text, label in tqdm(rows, total=len(df)):
        label = float(label)
        input_ids = [token2id[t] for t in text.split() if t in token2id]
        if not input_ids:
            # Nothing in this sentence is covered by the vocabulary.
            continue
        examples.append({
            "text": text,
            "label": torch.tensor([label], dtype=torch.float32),
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
        })
    return examples

# Read both SST-2 splits (training first, then dev).
train_data, dev_data = (
    load_sst("../ch07/SST-2/train.tsv"),
    load_sst("../ch07/SST-2/dev.tsv"),
)

# Report the number of usable examples and show the first training example.
print(f"#train: {len(train_data)}, #dev: {len(dev_data)}")
print("Example: ", train_data[0])

## 72. Bag of wordsモデルの構築

In [None]:
import torch
import torch.nn as nn

class MeanEmmbeddingClassifier(nn.Module):
    """Binary classifier over the mean of pretrained word embeddings.

    Token id 0 is treated as padding and excluded from the mean.
    """

    def __init__(self, embedding_matrix, freeze_embedding=True):
        """Build the embedding table and the single-output linear layer.

        Args:
            embedding_matrix: 2-D array of shape (vocab_size, embedding_dim)
                holding the pretrained vectors.
            freeze_embedding: When True the embedding weights are not
                trained.
        """
        super().__init__()
        _, embedding_dim = embedding_matrix.shape

        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(
            pretrained, freeze=freeze_embedding
        )
        self.linear = nn.Linear(embedding_dim, 1)

    def forward(self, input_ids):
        """Return one logit per sequence.

        Args:
            input_ids: Integer tensor of shape (batch, seq_len); positions
                equal to 0 are padding.

        Returns:
            Tensor of shape (batch,) with the raw (pre-sigmoid) scores.
        """
        # (batch, seq_len, 1) marker of the non-padding positions.
        token_mask = input_ids.ne(0).unsqueeze(-1)

        # Sum the vectors of the real tokens only.
        summed = (self.embedding(input_ids) * token_mask).sum(dim=1)

        # Clamp so that an all-padding row divides by 1 instead of 0.
        n_tokens = token_mask.sum(dim=1).clamp(min=1)

        return self.linear(summed / n_tokens).squeeze(1)

## 73. モデルの学習 - 77. GPU上での学習

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from gensim.models import KeyedVectors
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score

# Pretrained word2vec vectors, stored in the binary word2vec format.
w2v = KeyedVectors.load_word2vec_format(
    "../ch06/GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)

# One extra row on top of the word2vec vocabulary: id 0 is the padding token.
embedding_dim = w2v.vector_size
vocab_size = 1 + len(w2v.key_to_index)

# Row 0 is left as zeros for <PAD>; the loop below fills rows 1 onwards.
embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype=np.float32)

id2token = {0: "<PAD>"}
token2id = {"<PAD>": 0}
# Ids follow the iteration order of key_to_index, shifted by one for <PAD>.
for idx, word in enumerate(w2v.key_to_index, start=1):
    token2id[word], id2token[idx] = idx, word
    embedding_matrix[idx] = w2v[word]
    
def load_sst(path):
    """Load an SST-2 style TSV file as a list of tensor examples.

    The file is read with pandas (tab-separated) and must provide
    ``sentence`` and ``label`` columns.  Each sentence is split on
    whitespace and mapped through the module-level ``token2id``; tokens
    that are not in the mapping are dropped, and sentences left without
    any known token are skipped.

    Args:
        path: Location of the TSV file.

    Returns:
        A list of dicts with the keys ``"text"`` (the raw sentence),
        ``"label"`` (float32 tensor of shape ``(1,)``) and ``"input_ids"``
        (1-D int64 tensor of token ids).
    """
    df = pd.read_csv(path, sep="\t")
    examples = []
    # Walk the two columns directly instead of ``df.iterrows()``, which
    # builds a pandas Series for every single row.
    rows = zip(df["sentence"], df["label"])
    for text, label in tqdm(rows, total=len(df)):
        label = float(label)
        input_ids = [token2id[t] for t in text.split() if t in token2id]
        if not input_ids:
            # Nothing in this sentence is covered by the vocabulary.
            continue
        examples.append({
            "text": text,
            "label": torch.tensor([label], dtype=torch.float32),
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
        })
    return examples

# Read both SST-2 splits (training first, then dev).
train_data, dev_data = (
    load_sst("../ch07/SST-2/train.tsv"),
    load_sst("../ch07/SST-2/dev.tsv"),
)

class MeanEmmbeddingClassifier(nn.Module):
    """Binary classifier over the mean of pretrained word embeddings.

    Token id 0 is treated as padding and excluded from the mean.
    """

    def __init__(self, embedding_matrix, freeze_embedding=True):
        """Build the embedding table and the single-output linear layer.

        Args:
            embedding_matrix: 2-D array of shape (vocab_size, embedding_dim)
                holding the pretrained vectors.
            freeze_embedding: When True the embedding weights are not
                trained.
        """
        super().__init__()
        _, embedding_dim = embedding_matrix.shape

        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(
            pretrained, freeze=freeze_embedding
        )
        self.linear = nn.Linear(embedding_dim, 1)

    def forward(self, input_ids):
        """Return one logit per sequence.

        Args:
            input_ids: Integer tensor of shape (batch, seq_len); positions
                equal to 0 are padding.

        Returns:
            Tensor of shape (batch,) with the raw (pre-sigmoid) scores.
        """
        # (batch, seq_len, 1) marker of the non-padding positions.
        token_mask = input_ids.ne(0).unsqueeze(-1)

        # Sum the vectors of the real tokens only.
        summed = (self.embedding(input_ids) * token_mask).sum(dim=1)

        # Clamp so that an all-padding row divides by 1 instead of 0.
        n_tokens = token_mask.sum(dim=1).clamp(min=1)

        return self.linear(summed / n_tokens).squeeze(1)
    
def collate(batch):
    """Merge a list of examples into one padded batch.

    Args:
        batch: Sequence of dicts, each with an ``"input_ids"`` tensor and a
            ``"label"`` tensor.

    Returns:
        Dict with ``"input_ids"`` (the sequences stacked batch-first and
        padded by ``pad_sequence``) and ``"labels"`` (the label tensors
        concatenated in batch order).
    """
    sequences = []
    targets = []
    for example in batch:
        sequences.append(example["input_ids"])
        targets.append(example["label"])

    labels = torch.cat(targets)
    # pad_sequence pads with its default value 0, the id the model masks out.
    padded_ids = pad_sequence(sequences, batch_first=True)
    return {"input_ids": padded_ids, "labels": labels}

# Training batches are reshuffled on every pass; ``collate`` pads the
# variable-length sequences of each batch.
train_loader = DataLoader(train_data, batch_size=64, shuffle=True, collate_fn=collate)
# Evaluation data is iterated in a fixed order: shuffling it only changes
# which examples share a batch, which makes the mean of the per-batch dev
# losses vary from one pass to the next.
dev_loader = DataLoader(dev_data, batch_size=64, shuffle=False, collate_fn=collate)

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Default freeze_embedding=True: the pretrained embeddings stay fixed.
model = MeanEmmbeddingClassifier(embedding_matrix).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# The model outputs raw logits, so the sigmoid is applied inside the loss.
criterion = nn.BCEWithLogitsLoss()

def train_model(model, train_loader, dev_loader):
    """Train for one pass over ``train_loader`` and evaluate afterwards.

    Uses the module-level ``device``, ``optimizer`` and ``criterion``, and
    calls ``eval_model`` for the accuracies.

    Returns:
        Tuple ``(model, mean train batch loss, mean dev batch loss,
        train accuracy, dev accuracy)``.  Both losses are unweighted means
        over the batches of the respective loader.
    """

    def batch_loss(batch):
        # Move one batch to the target device and score it.
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        return criterion(model(input_ids), labels)

    # Optimisation pass over the training data.
    model.train()
    train_losses = []
    for batch in train_loader:
        optimizer.zero_grad()
        loss = batch_loss(batch)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    # Loss on the dev data, in eval mode and without gradient tracking.
    model.eval()
    with torch.no_grad():
        dev_losses = [batch_loss(batch).item() for batch in dev_loader]

    # Accuracies are measured after the update pass (train first, then dev).
    accuracy_on_train = eval_model(model, train_loader)
    accuracy_on_dev = eval_model(model, dev_loader)

    return (
        model,
        np.mean(train_losses),
        np.mean(dev_losses),
        accuracy_on_train,
        accuracy_on_dev,
    )

def eval_model(model, data_loader):
    """Return the accuracy of ``model`` on the batches of ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.
    A sequence is predicted as 1.0 when the sigmoid of its logit is above
    0.5, otherwise as 0.0.  Inputs are moved to the module-level ``device``.

    Args:
        model: Module returning one logit per sequence.
        data_loader: Iterable of dicts with ``"input_ids"`` and ``"labels"``.

    Returns:
        The value of ``accuracy_score`` over all examples of the loader.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            logits = model(batch["input_ids"].to(device))
            preds = (torch.sigmoid(logits) > 0.5).float()

            all_preds.extend(preds.cpu().numpy())
            # Labels are only compared on the host, so they are not sent to
            # the device and back.
            all_labels.extend(batch["labels"].cpu().numpy())

    return accuracy_score(all_labels, all_preds)

import os

# Number of passes over the training data.  Kept under its own name so the
# loop variable ``epoch`` below does not overwrite the configured count.
num_epochs = 100

# Per-epoch history of the values returned by train_model.
train_loss = []
dev_loss = []
train_acc = []
dev_acc = []

for epoch in tqdm(range(num_epochs)):
    model, train_l, dev_l, train_a, dev_a = train_model(model, train_loader, dev_loader)
    train_loss.append(train_l)
    dev_loss.append(dev_l)
    train_acc.append(train_a)
    dev_acc.append(dev_a)

    # Report every tenth epoch, starting with epoch 0.
    if epoch % 10 == 0:
        print(f"[Epoch {epoch}]")
        print(f"Train loss: {train_l:.4f}, Dev loss: {dev_l:.4f}")
        print(f"Train acc : {train_a:.4f}, Dev acc : {dev_a:.4f}")

path_saved_model = "./models/model_ex73.pt"
# torch.save does not create missing directories; create the target first so
# the trained weights are not lost after the full training run.
os.makedirs(os.path.dirname(path_saved_model), exist_ok=True)
torch.save(model.state_dict(), path_saved_model)

## 78. 単語埋め込みのファインチューニング

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from gensim.models import KeyedVectors
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score

# Pretrained word2vec vectors, stored in the binary word2vec format.
w2v = KeyedVectors.load_word2vec_format(
    "../ch06/GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)

# One extra row on top of the word2vec vocabulary: id 0 is the padding token.
embedding_dim = w2v.vector_size
vocab_size = 1 + len(w2v.key_to_index)

# Row 0 is left as zeros for <PAD>; the loop below fills rows 1 onwards.
embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype=np.float32)

id2token = {0: "<PAD>"}
token2id = {"<PAD>": 0}
# Ids follow the iteration order of key_to_index, shifted by one for <PAD>.
for idx, word in enumerate(w2v.key_to_index, start=1):
    token2id[word], id2token[idx] = idx, word
    embedding_matrix[idx] = w2v[word]
    
def load_sst(path):
    """Load an SST-2 style TSV file as a list of tensor examples.

    The file is read with pandas (tab-separated) and must provide
    ``sentence`` and ``label`` columns.  Each sentence is split on
    whitespace and mapped through the module-level ``token2id``; tokens
    that are not in the mapping are dropped, and sentences left without
    any known token are skipped.

    Args:
        path: Location of the TSV file.

    Returns:
        A list of dicts with the keys ``"text"`` (the raw sentence),
        ``"label"`` (float32 tensor of shape ``(1,)``) and ``"input_ids"``
        (1-D int64 tensor of token ids).
    """
    df = pd.read_csv(path, sep="\t")
    examples = []
    # Walk the two columns directly instead of ``df.iterrows()``, which
    # builds a pandas Series for every single row.
    rows = zip(df["sentence"], df["label"])
    for text, label in tqdm(rows, total=len(df)):
        label = float(label)
        input_ids = [token2id[t] for t in text.split() if t in token2id]
        if not input_ids:
            # Nothing in this sentence is covered by the vocabulary.
            continue
        examples.append({
            "text": text,
            "label": torch.tensor([label], dtype=torch.float32),
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
        })
    return examples

# Read both SST-2 splits (training first, then dev).
train_data, dev_data = (
    load_sst("../ch07/SST-2/train.tsv"),
    load_sst("../ch07/SST-2/dev.tsv"),
)

class MeanEmmbeddingClassifier(nn.Module):
    """Binary classifier over the mean of pretrained word embeddings.

    Token id 0 is treated as padding and excluded from the mean.
    """

    def __init__(self, embedding_matrix, freeze_embedding=True):
        """Build the embedding table and the single-output linear layer.

        Args:
            embedding_matrix: 2-D array of shape (vocab_size, embedding_dim)
                holding the pretrained vectors.
            freeze_embedding: When True the embedding weights are not
                trained; pass False to fine-tune them.
        """
        super().__init__()
        _, embedding_dim = embedding_matrix.shape

        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(
            pretrained, freeze=freeze_embedding
        )
        self.linear = nn.Linear(embedding_dim, 1)

    def forward(self, input_ids):
        """Return one logit per sequence.

        Args:
            input_ids: Integer tensor of shape (batch, seq_len); positions
                equal to 0 are padding.

        Returns:
            Tensor of shape (batch,) with the raw (pre-sigmoid) scores.
        """
        # (batch, seq_len, 1) marker of the non-padding positions.
        token_mask = input_ids.ne(0).unsqueeze(-1)

        # Sum the vectors of the real tokens only.
        summed = (self.embedding(input_ids) * token_mask).sum(dim=1)

        # Clamp so that an all-padding row divides by 1 instead of 0.
        n_tokens = token_mask.sum(dim=1).clamp(min=1)

        return self.linear(summed / n_tokens).squeeze(1)
    
def collate(batch):
    """Merge a list of examples into one padded batch.

    Args:
        batch: Sequence of dicts, each with an ``"input_ids"`` tensor and a
            ``"label"`` tensor.

    Returns:
        Dict with ``"input_ids"`` (the sequences stacked batch-first and
        padded by ``pad_sequence``) and ``"labels"`` (the label tensors
        concatenated in batch order).
    """
    sequences = []
    targets = []
    for example in batch:
        sequences.append(example["input_ids"])
        targets.append(example["label"])

    labels = torch.cat(targets)
    # pad_sequence pads with its default value 0, the id the model masks out.
    padded_ids = pad_sequence(sequences, batch_first=True)
    return {"input_ids": padded_ids, "labels": labels}

# Training batches are reshuffled on every pass; ``collate`` pads the
# variable-length sequences of each batch.
train_loader = DataLoader(train_data, batch_size=64, shuffle=True, collate_fn=collate)
# Evaluation data is iterated in a fixed order: shuffling it only changes
# which examples share a batch, which makes the mean of the per-batch dev
# losses vary from one pass to the next.
dev_loader = DataLoader(dev_data, batch_size=64, shuffle=False, collate_fn=collate)

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# freeze_embedding=False: the pretrained embeddings are fine-tuned together
# with the linear layer.
model = MeanEmmbeddingClassifier(embedding_matrix, freeze_embedding=False).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# The model outputs raw logits, so the sigmoid is applied inside the loss.
criterion = nn.BCEWithLogitsLoss()

def train_model(model, train_loader, dev_loader):
    """Train for one pass over ``train_loader`` and evaluate afterwards.

    Uses the module-level ``device``, ``optimizer`` and ``criterion``, and
    calls ``eval_model`` for the accuracies.

    Returns:
        Tuple ``(model, mean train batch loss, mean dev batch loss,
        train accuracy, dev accuracy)``.  Both losses are unweighted means
        over the batches of the respective loader.
    """

    def batch_loss(batch):
        # Move one batch to the target device and score it.
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        return criterion(model(input_ids), labels)

    # Optimisation pass over the training data.
    model.train()
    train_losses = []
    for batch in train_loader:
        optimizer.zero_grad()
        loss = batch_loss(batch)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    # Loss on the dev data, in eval mode and without gradient tracking.
    model.eval()
    with torch.no_grad():
        dev_losses = [batch_loss(batch).item() for batch in dev_loader]

    # Accuracies are measured after the update pass (train first, then dev).
    accuracy_on_train = eval_model(model, train_loader)
    accuracy_on_dev = eval_model(model, dev_loader)

    return (
        model,
        np.mean(train_losses),
        np.mean(dev_losses),
        accuracy_on_train,
        accuracy_on_dev,
    )

def eval_model(model, data_loader):
    """Return the accuracy of ``model`` on the batches of ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.
    A sequence is predicted as 1.0 when the sigmoid of its logit is above
    0.5, otherwise as 0.0.  Inputs are moved to the module-level ``device``.

    Args:
        model: Module returning one logit per sequence.
        data_loader: Iterable of dicts with ``"input_ids"`` and ``"labels"``.

    Returns:
        The value of ``accuracy_score`` over all examples of the loader.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            logits = model(batch["input_ids"].to(device))
            preds = (torch.sigmoid(logits) > 0.5).float()

            all_preds.extend(preds.cpu().numpy())
            # Labels are only compared on the host, so they are not sent to
            # the device and back.
            all_labels.extend(batch["labels"].cpu().numpy())

    return accuracy_score(all_labels, all_preds)

import os

# Number of passes over the training data.  Kept under its own name so the
# loop variable ``epoch`` below does not overwrite the configured count.
num_epochs = 100

# Per-epoch history of the values returned by train_model.
train_loss = []
dev_loss = []
train_acc = []
dev_acc = []

for epoch in tqdm(range(num_epochs)):
    model, train_l, dev_l, train_a, dev_a = train_model(model, train_loader, dev_loader)
    train_loss.append(train_l)
    dev_loss.append(dev_l)
    train_acc.append(train_a)
    dev_acc.append(dev_a)

    # Report every tenth epoch, starting with epoch 0.
    if epoch % 10 == 0:
        print(f"[Epoch {epoch}]")
        print(f"Train loss: {train_l:.4f}, Dev loss: {dev_l:.4f}")
        print(f"Train acc : {train_a:.4f}, Dev acc : {dev_a:.4f}")

path_saved_model = "./models/model_ex78.pt"
# torch.save does not create missing directories; create the target first so
# the trained weights are not lost after the full training run.
os.makedirs(os.path.dirname(path_saved_model), exist_ok=True)
torch.save(model.state_dict(), path_saved_model)

## 79. アーキテクチャの変更