In [32]:
# mount google drive 

import os, sys 
from google.colab import drive

# Mount Google Drive at /content/gdrive (requires the Colab runtime).
drive.mount('/content/gdrive')
# Switch the working directory to the project folder so that the relative
# paths used by later cells (./data, ./vec) resolve inside it.
%cd /content/gdrive/MyDrive/가사유사도기반추천

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/가사유사도기반추천


In [33]:
# ### 0. Install BERT

# Install the required package (transformers) with pip through the IPython shell
get_ipython().system('pip install transformers')



In [34]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from transformers import BertTokenizer
from transformers import BertForPreTraining, BertConfig, BertModel

from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np
import time
import datetime
import os

In [35]:
# Select the compute device first so the notebook also runs on CPU-only runtimes.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Report the GPU only when one is present: torch.cuda.get_device_name() raises
# when CUDA is unavailable, which would abort the cell before the CPU fallback
# above could take effect.
if device == "cuda":
    print("GPU : ", torch.cuda.get_device_name(0))
else:
    print("GPU : ", "not available - falling back to CPU")

GPU :  Tesla P100-PCIE-16GB


In [36]:
# ### 1. Input pipeline setup

# Path to the CSV dataset file
PATH = './data/remove_meaningless.csv'

# Load the dataset with pandas
dataset = pd.read_csv(PATH)


# Maximum input sequence length
MAX_LEN = 512
# Batch size
batch_size = 64

In [37]:
def make_input_from_dataset(dataset, MAX_LEN=512, batch_size=64) :
  """Tokenize the ``lyrics`` column and wrap it in a PyTorch ``DataLoader``.

  Every lyric is converted with ``str()``, tokenized with the
  ``bert-base-multilingual-cased`` tokenizer, framed as
  ``[CLS] tokens... [SEP]`` and right-padded with id 0 up to ``MAX_LEN``.
  Lyrics that are too long are cut *before* the special tokens are added, so
  the closing ``[SEP]`` is always kept (truncating the already-framed sequence
  would drop it).

  Args:
    dataset: Object indexable by ``'lyrics'`` that yields one lyric per record
      (the notebook passes the DataFrame returned by ``pd.read_csv``).
    MAX_LEN: Fixed length of every returned sequence, including the two
      special tokens. Must be at least 2.
    batch_size: Number of records per batch.

  Returns:
    A ``DataLoader`` (no shuffling, dataset order preserved) over a
    ``TensorDataset`` of ``(input_ids, attention_mask)``; both are int64
    tensors of shape ``(number_of_records, MAX_LEN)``.

  Raises:
    ValueError: If ``MAX_LEN`` is smaller than 2.
  """
  if MAX_LEN < 2:
    raise ValueError("MAX_LEN must be at least 2 to hold [CLS] and [SEP].")

  # Load the multilingual BERT tokenizer (cased, so no lower-casing).
  tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

  lyrics = [str(text) for text in dataset['lyrics']]

  # Pre-allocate the padded id matrix and its mask; 0 is the padding value,
  # and mask entries stay 0 (ignored) wherever no real token is written.
  input_ids = np.zeros((len(lyrics), MAX_LEN), dtype=np.int64)
  attention_masks = np.zeros((len(lyrics), MAX_LEN), dtype=np.int64)

  # Room left for word pieces once [CLS] and [SEP] are accounted for.
  max_body_tokens = MAX_LEN - 2

  # Fill one row at a time so only a single token list is alive at once.
  for row, text in enumerate(lyrics):
    body_tokens = tokenizer.tokenize(text)[:max_body_tokens]
    framed_tokens = [tokenizer.cls_token] + body_tokens + [tokenizer.sep_token]
    ids = tokenizer.convert_tokens_to_ids(framed_tokens)

    input_ids[row, :len(ids)] = ids
    # Padding = 0, real token = 1.
    attention_masks[row, :len(ids)] = 1

  # Convert to PyTorch tensors.
  dataset_inputs = torch.from_numpy(input_ids)
  dataset_masks = torch.from_numpy(attention_masks)

  # Bundle input ids and attention masks into a DataLoader.
  final_data = TensorDataset(dataset_inputs, dataset_masks)
  final_dataloader = DataLoader(final_data, batch_size=batch_size)

  return final_dataloader


# Build the DataLoader of (input_ids, attention_mask) batches for the whole dataset.
final_dataloader = make_input_from_dataset(dataset,MAX_LEN,batch_size)

KeyboardInterrupt: ignored

In [None]:
# ### 2. BERT

# Load the pretrained bert-base-multilingual-cased model
model = BertModel.from_pretrained("bert-base-multilingual-cased")


# Helper for displaying elapsed time
def format_time(elapsed):
    """Return ``elapsed`` seconds, rounded to the nearest whole second, as a
    ``datetime.timedelta`` string such as ``"0:01:30"``."""
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)


# Array that accumulates the final vectors. It starts as a single all-zero
# 1x768 placeholder row so batches can be concatenated onto it; a later cell
# strips that first row off again.
final_vectors = np.zeros([1,768],dtype=float)

# Reference time for the elapsed-time progress messages
t0 = time.time()
# Switch the model to evaluation mode
model.eval()

# Move the model to the selected device
model.to(device)


In [None]:
# Run the model over the DataLoader one batch at a time.
# Per-batch results are gathered in a list and joined once after the loop:
# concatenating onto final_vectors inside the loop would re-copy every
# previously collected row on each step.
batch_vectors = []
total_batches = len(final_dataloader)

for step, batch in enumerate(final_dataloader):
    # Progress report every 20 batches (skipping the very first one)
    if step % 20 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('Batch {:>5,} / {:>5,} - Time Running.. {:}.'.format(step, total_batches, elapsed))

    # Move the batch tensors to the selected device
    b_input_ids, b_input_mask = (t.to(device) for t in batch)

    # Inference only, so no gradients are tracked
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Take the second element of the model output (the pooled output per the
    # Hugging Face BertModel documentation), move it to the CPU and keep it
    # as a NumPy array.
    batch_vectors.append(outputs[1].detach().cpu().numpy())

# Append all collected batches to final_vectors in a single copy. Whatever
# final_vectors already holds (including its placeholder first row) is kept.
if batch_vectors:
    final_vectors = np.concatenate([final_vectors] + batch_vectors, axis=0)

print("Done.")

In [None]:
# Remove the all-zero placeholder row that final_vectors was initialised with.
# The row is dropped only while it is still that placeholder, so re-running
# this cell does not strip a real embedding off the front.
if len(final_vectors) > 0 and not final_vectors[0].any():
    final_vectors = final_vectors[1:]


# Check that the loaded dataset and the final array hold the same number of
# records. The result is only reported (not raised), as before.
if len(dataset) != len(final_vectors):
    print('Exception is occured. ', 'Records of Dataset and Final vector shape is Not match.')
else:
    print("최종 Context Vector Shape", final_vectors.shape)

In [None]:
## Save the final NumPy array
# np.save does not create missing folders, so make sure the target directory
# exists first (no-op when it is already there).
os.makedirs('./vec', exist_ok=True)
np.save('./vec/mul_vec_ts',final_vectors)

In [None]:
# Print the final vectors array for a quick visual check.
print(final_vectors)