In [1]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
%cd /content/gdrive/My Drive/Colab Notebooks/tensorflow-ml-nlp-tf2/5.TEXT_SIM

/content/gdrive/My Drive/Colab Notebooks/tensorflow-ml-nlp-tf2/5.TEXT_SIM


In [3]:
import os

if not os.path.exists('./data_out'):
  os.makedirs('./data_out')
else:
  print("folder already exists")

folder already exists


In [0]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn.functional as F
from torch import nn, optim

## 데이터 불러오기

In [0]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'
DATA_CONFIGS = 'data_configs.json'

q1_data = np.load(open(DATA_IN_PATH + TRAIN_Q1_DATA_FILE, 'rb'))
q2_data = np.load(open(DATA_IN_PATH + TRAIN_Q2_DATA_FILE, 'rb'))
labels = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE, 'rb'))
prepro_configs = json.load(open(DATA_IN_PATH + DATA_CONFIGS, 'r'))

## 하이퍼파라미터 정의

In [0]:
BATCH_SIZE = 1024
NUM_EPOCHS = 100
VALID_SPLIT = 0.1
MAX_LEN = 31

kargs = {'vocab_size': prepro_configs['vocab_size'],
         'word_embedding_dimension': 100,
         'conv_num_filters': 300,
         'conv_window_size': 3,
         'max_pool_seq_len': MAX_LEN,
         'sent_embedding_dimension': 128,
         'dropout_rate': 0.2,
         'hidden_dimension': 200,
         'output_dimension':1}

## DataLoader

In [0]:
split = int(len(q1_data) * (1-VALID_SPLIT))

q1_train, q1_val = q1_data[:split], q2_data[split:]
q2_train, q2_val = q2_data[:split], q2_data[split:]
labels_train, labels_val = labels[:split], labels[split:]

In [0]:
q1_train, q1_val = torch.LongTensor(q1_train), torch.LongTensor(q1_val)
q2_train, q2_val = torch.LongTensor(q2_train), torch.LongTensor(q2_val)
labels_train, labels_val = torch.FloatTensor(labels_train), torch.FloatTensor(labels_val)

In [0]:
from torch.utils.data import Dataset, DataLoader

class TrainData(Dataset):
  def __init__(self):
    self.q1 = q1_train
    self.q2 = q2_train
    self.y = labels_train

  def __getitem__(self, s):
    return self.q1[s], self.q2[s], self.y[s]

  def __len__(self):
    return self.y.shape[0]

class ValData(Dataset):
  def __init__(self):
    self.q1 = q1_val
    self.q2 = q2_val
    self.y = labels_val

  def __getitem__(self, s):
    return self.q1[s], self.q2[s], self.y[s]

  def __len__(self):
    return self.y.shape[0]

In [0]:
dataset = TrainData()
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

## 모델 정의

In [0]:
class CNNSimilarity(nn.Module):
  def __init__(self, **kargs):
    super(CNNSimilarity, self).__init__()
    self.vocab_size = kargs['vocab_size']
    self.word_embedding_dimension = kargs['word_embedding_dimension']
    self.conv_num_filters = kargs['conv_num_filters']
    self.conv_window_size = kargs['conv_window_size']
    self.max_pool_seq_len = kargs['max_pool_seq_len']
    self.sent_embedding_dimension = kargs['sent_embedding_dimension']
    self.dropout_rate = kargs['dropout_rate']
    self.hidden_dimension = kargs['hidden_dimension']
    self.output_dimension = kargs['output_dimension']

    self.word_embedding = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.word_embedding_dimension,
                                       padding_idx=0)
    self.conv = nn.Conv1d(in_channels=self.word_embedding_dimension, out_channels=self.conv_num_filters, kernel_size=self.conv_window_size)
    self.max_pool = nn.MaxPool1d(2, 1)
    self.dropout = nn.Dropout(self.dropout_rate)
    self.fc1 = nn.Linear(300*8*2, self.sent_embedding_dimension)
    self.fc2 = nn.Linear(self.sent_embedding_dimension, self.output_dimension)

  def forward(self, x):
    x1, x2 = x
    x1, x2 = self.word_embedding(x1), self.word_embedding(x2)
    x1, x2 = torch.transpose(x1, 1, 2), torch.transpose(x1, 1, 2)
    x1, x2 = F.relu(self.conv(x1)), F.relu(self.conv(x2))
    x1, x2 = self.max_pool(x1), self.max_pool(x2)
    x1, x2 = self.dropout(x1), self.dropout(x2)
    x = torch.cat([x1, x2], dim=-1)
    x = x.view(x.shape[0], -1)
    x = self.dropout(x)
    x = self.fc1(x)
    x = self.dropout(x)
    x = self.fc2(x)
    return torch.sigmoid(x)

In [12]:
CNNSimilarity(**kargs)((q1_train[:3], q2_train[:3]))

tensor([[0.4398],
        [0.5243],
        [0.5424]], grad_fn=<SigmoidBackward>)

In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [0]:
q1_val, q2_val, labels_val = q1_val.to(device), q2_val.to(device), labels_val.to(device)

In [0]:
model = CNNSimilarity(**kargs).to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss().to(device)

In [16]:
model

CNNSimilarity(
  (word_embedding): Embedding(76529, 100, padding_idx=0)
  (conv): Conv1d(100, 300, kernel_size=(3,), stride=(1,))
  (max_pool): MaxPool1d(kernel_size=2, stride=1, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=4800, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)

In [0]:
def train(train_loader=train_loader, model=model, optimizer=optimizer, num_epochs=NUM_EPOCHS):
  model.train()

  for epoch in range(1, NUM_EPOCHS+1):
    for q1, q2, labels in train_loader:
      q1, q2, labels = q1.to(device), q2.to(device), labels.to(device)
      y_pred = model((q1, q2)).view_as(labels)
      loss = criterion(y_pred, labels)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    if epoch % 5 == 0:
      print(f"Epoch: {epoch}, Train Loss: {loss.item()}")

In [18]:
train()

Epoch: 5, Train Loss: 0.484086811542511
Epoch: 10, Train Loss: 0.40599653124809265
Epoch: 15, Train Loss: 0.24335156381130219
Epoch: 20, Train Loss: 0.2927812337875366
Epoch: 25, Train Loss: 0.2018425017595291
Epoch: 30, Train Loss: 0.1750444620847702
Epoch: 35, Train Loss: 0.17149774730205536
Epoch: 40, Train Loss: 0.13104240596294403
Epoch: 45, Train Loss: 0.1063234955072403
Epoch: 50, Train Loss: 0.1349501609802246
Epoch: 55, Train Loss: 0.12976115942001343
Epoch: 60, Train Loss: 0.11855975538492203
Epoch: 65, Train Loss: 0.1282782405614853
Epoch: 70, Train Loss: 0.09749212861061096
Epoch: 75, Train Loss: 0.1278562992811203
Epoch: 80, Train Loss: 0.11569090932607651
Epoch: 85, Train Loss: 0.11369975656270981
Epoch: 90, Train Loss: 0.1340484768152237
Epoch: 95, Train Loss: 0.06406896561384201
Epoch: 100, Train Loss: 0.12295356392860413


In [0]:
val_dataset = ValData()
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [0]:
def eval():
  model.eval()
  total_loss = 0
  acc = 0

  with torch.no_grad():
    for q1, q2, labels in val_loader:
      q1, q2, labels = q1.to(device), q2.to(device), labels.to(device)
      output = model((q2, q2)).view_as(labels)
      loss = criterion(output, labels) * BATCH_SIZE
      total_loss += loss.item()
      pred = (output.data >= 0.5).float()
      acc += (pred == labels).sum()

  return total_loss / len(val_dataset), acc.item() / len(val_dataset)

In [21]:
eval()

(2.1404534317766433, 0.6496164539577262)

In [0]:
torch.save(model.state_dict(), './CNN_for_TextSimilarity.pth')

## 테스트 파일 제출

In [0]:
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

test_q1_data = np.load(open(DATA_IN_PATH + TEST_Q1_DATA_FILE, 'rb'))
test_q2_data = np.load(open(DATA_IN_PATH + TEST_Q2_DATA_FILE, 'rb'))
test_id_data = np.load(open(DATA_IN_PATH + TEST_ID_DATA_FILE, 'rb'), allow_pickle=True)

In [24]:
new_model = CNNSimilarity(**kargs)
new_model.load_state_dict(torch.load('./CNN_for_TextSimilarity.pth'))

<All keys matched successfully>

In [0]:
predictions = new_model((torch.LongTensor(test_q1_data), torch.LongTensor(test_q2_data))).view(-1, 1) >= 0.5

In [0]:
predictions = np.array(predictions)

In [0]:
output = pd.DataFrame( data={"test_id":test_id_data, "is_duplicate": list(predictions)} )
output.to_csv("cnn_predict.csv", index=False, quoting=3)