In [1]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
%cd /content/gdrive/My Drive/Colab Notebooks/tensorflow-ml-nlp-tf2/5.TEXT_SIM

/content/gdrive/My Drive/Colab Notebooks/tensorflow-ml-nlp-tf2/5.TEXT_SIM


In [3]:
import os

if not os.path.exists('./data_out'):
  os.makedirs('./data_out')
else:
  print("folder already exists")

folder already exists


In [0]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn.functional as F
from torch import nn, optim

## 데이터 불러오기

In [0]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'
DATA_CONFIGS = 'data_configs.json'

q1_data = np.load(open(DATA_IN_PATH + TRAIN_Q1_DATA_FILE, 'rb'))
q2_data = np.load(open(DATA_IN_PATH + TRAIN_Q2_DATA_FILE, 'rb'))
labels = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE, 'rb'))
prepro_configs = json.load(open(DATA_IN_PATH + DATA_CONFIGS, 'r'))

## 하이퍼파라미터 정의

In [0]:
model_name = 'malstm_similarity'
BATCH_SIZE = 128
NUM_EPOCHS = 10
VALID_SPLIT = 0.1

kargs = {
    'vocab_size': prepro_configs['vocab_size'],
    'embedding_dimension': 100,
    'lstm_dimension': 150,
}

## DataLoader

In [0]:
split = int(len(q1_data) * (1-VALID_SPLIT))

q1_train, q1_val = q1_data[:split], q2_data[split:]
q2_train, q2_val = q2_data[:split], q2_data[split:]
labels_train, labels_val = labels[:split], labels[split:]

In [0]:
q1_train, q1_val = torch.LongTensor(q1_train), torch.LongTensor(q1_val)
q2_train, q2_val = torch.LongTensor(q2_train), torch.LongTensor(q2_val)
labels_train, labels_val = torch.FloatTensor(labels_train), torch.FloatTensor(labels_val)

In [0]:
from torch.utils.data import Dataset, DataLoader

class TrainData(Dataset):
  def __init__(self):
    self.q1 = q1_train
    self.q2 = q2_train
    self.y = labels_train

  def __getitem__(self, s):
    return self.q1[s], self.q2[s], self.y[s]

  def __len__(self):
    return self.y.shape[0]

class ValData(Dataset):
  def __init__(self):
    self.q1 = q1_val
    self.q2 = q2_val
    self.y = labels_val

  def __getitem__(self, s):
    return self.q1[s], self.q2[s], self.y[s]

  def __len__(self):
    return self.y.shape[0]

In [0]:
dataset = TrainData()
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

## 모델 정의

In [0]:
class MaLSTMSimilarity(nn.Module):
  def __init__(self, **kargs):
    super(MaLSTMSimilarity, self).__init__()
    self.vocab_size = kargs['vocab_size']
    self.embedding_dim = kargs['embedding_dimension']
    self.hidden_dim = kargs['lstm_dimension']
    self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=0)
    self.lstm = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_dim, batch_first=True)
    self.fc = nn.Linear(11, 1)

  def forward(self, x):
    x1, x2 = x
    x1, x2 = self.embedding(x1), self.embedding(x2)
    x1, _ = self.lstm(x1)
    x2, _ = self.lstm(x2)
    out = torch.square(x1-x2).sum(axis=-1)
    out = self.fc(out)
    return torch.sigmoid(out)

In [12]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [0]:
q1_val, q2_val, labels_val = q1_val.to(device), q2_val.to(device), labels_val.to(device)

In [0]:
model = MaLSTMSimilarity(**kargs).to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss().to(device)

In [15]:
model

MaLSTMSimilarity(
  (embedding): Embedding(76529, 100, padding_idx=0)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=11, out_features=1, bias=True)
)

In [0]:
def train(train_loader=train_loader, model=model, optimizer=optimizer, num_epochs=NUM_EPOCHS):
  model.train()

  for epoch in range(1, NUM_EPOCHS+1):
    for q1, q2, labels in train_loader:
      q1, q2, labels = q1.to(device), q2.to(device), labels.to(device)
      y_pred = model((q1, q2)).view_as(labels)
      loss = criterion(y_pred, labels)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    print(f"Epoch: {epoch}, Train Loss: {loss.item()}")

In [17]:
train()

Epoch: 1, Train Loss: 0.12939806282520294
Epoch: 2, Train Loss: 1.9411799907684326
Epoch: 3, Train Loss: 0.08472463488578796
Epoch: 4, Train Loss: 0.12880872189998627
Epoch: 5, Train Loss: 0.0007647815509699285
Epoch: 6, Train Loss: 0.003443141933530569
Epoch: 7, Train Loss: 0.0
Epoch: 8, Train Loss: 0.02088308148086071
Epoch: 9, Train Loss: 0.0602726936340332
Epoch: 10, Train Loss: 0.001825448009185493


In [0]:
val_dataset = ValData()
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [0]:
def eval():
  model.eval()
  total_loss = 0
  acc = 0

  with torch.no_grad():
    for q1, q2, labels in val_loader:
      q1, q2, labels = q1.to(device), q2.to(device), labels.to(device)
      output = model((q2, q2)).view_as(labels)
      loss = criterion(output, labels) * BATCH_SIZE
      total_loss += loss.item()
      pred = (output.data >= 0.5).float()
      acc += (pred == labels).sum()

  return total_loss / len(val_dataset), acc.item() / len(val_dataset)

In [20]:
eval()

(0.03256742157129627, 1.0)

In [0]:
torch.save(model.state_dict(), f'./{model_name}.pth')