In [1]:
import os, sys
import re
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from glob import glob
from konlpy.tag import Mecab
from gensim.models.ldamodel import LdaModel
from datetime import datetime
import warnings

# Add the parent directory to the import search path.
# NOTE(review): presumably so the local `topic_transformer` package resolves - confirm.
sys.path.append('../')
# Raw string: in a plain literal '\m' is an invalid escape sequence that only
# works by accident and triggers a warning on recent Python versions.
mecab = Mecab(r'C:\mecab\mecab-ko-dic')
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')



# Load Train Data

In [2]:
# Of the 2017 NTIS data, only records whose research-content summary
# ('요약문_연구내용') is longer than len_cut characters are used for training.

path = r'D:\data\ICT 트렌드 분석 DATA\NTIS'

files = glob(os.path.join(path, '*.xlsx'))
if not files:
    # Fail early with a clear message instead of an obscure error further down.
    raise FileNotFoundError('No .xlsx files found in {}'.format(path))

# Takes a long time: every workbook is parsed with openpyxl.
# Each file is read exactly once and all frames are concatenated in one call.
# (The previous loop reassigned `data` on every iteration, so only the last
# file survived - concatenated with a second copy of itself.)
data = pd.concat([pd.read_excel(file, engine='openpyxl') for file in files],
                 ignore_index=True)

# Replace missing cells with empty strings so len() below works on every row.
data = data.fillna('')

len_cut = 256

# Keep only rows whose summary is strictly longer than len_cut.
data = data[data['요약문_연구내용'].apply(lambda x: len(x) > len_cut)]
doc_list = data['요약문_연구내용'].to_list()

In [3]:
def normalize_text(text):
    """Collapse every run of whitespace to a single space and trim both ends.

    Args:
        text: Value to normalize. It is converted with ``str()`` first, so
            non-string inputs are accepted.

    Returns:
        str: The text with each whitespace run (spaces, tabs, newlines, ...)
        replaced by one space and no leading/trailing whitespace.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    # re.sub caches compiled patterns, so no per-call re.compile is needed.
    return re.sub(r'\s+', ' ', str(text)).strip()

In [4]:
# Load the saved gensim LDA model from disk.
ldamodel_path = r'D:\notebooks\kisdi\lda_100\ldamodel'
ldamodel = LdaModel.load(ldamodel_path)

# Whitespace-normalize every training document.
train_data = list(map(normalize_text, doc_list))

# Transformer

In [5]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR
from topic_transformer import TopicTransformer_TEHead, datasets
from tensorboardX import SummaryWriter

In [6]:
# Create the TensorBoard log directory if it is missing. exist_ok=True avoids
# the check-then-create race of a separate os.path.isdir() test.
os.makedirs('./log', exist_ok=True)

In [7]:
# Training hyper-parameters and target device (first GPU when available, else CPU).
epochs = 20
batch_size = 16
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# dataloader
# dic_path is a raw string: '\m' in a plain literal is an invalid escape
# sequence that only works by accident and warns on recent Python versions.
# NOTE(review): num_topics=100 presumably has to match the loaded LDA model - confirm.
dataset = datasets.SentenceLabelDataset(dic_path=r'C:\mecab\mecab-ko-dic',
                                        train_docs=train_data,
                                        lda_model=ldamodel,
                                        num_topics=100)
# Shuffled mini-batches of `batch_size` samples.
dataloader = DataLoader(dataset, batch_size = batch_size, shuffle=True)

In [8]:
# topictransformer
model = TopicTransformer_TEHead(output_dim = 100,
                             transformer_model_name='xlm-roberta-base')

# optimizer, scheduler
optimizer = Adam(model.parameters(), lr=1e-5)
# Multiply the base learning rate by 0.95 for every scheduler step (one per epoch).
scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch:0.95**epoch)

model.to(device)
model.train()

# Freeze every parameter except the head layers (downstream mlp): only
# parameters whose name starts with 'head_layers' stay trainable.
for name, param in model.named_parameters():
    param.requires_grad = name.startswith('head_layers')

writer = SummaryWriter("./log/" + datetime.now().strftime("%Y%m%d-%H%M%S"))
# train
try:
    for epoch in trange(epochs, desc="Epoch"):
        # Reset the accumulator each epoch; otherwise the previous epoch's mean
        # leaks into the next epoch's sum.
        epoch_loss = 0.0
        for batch in dataloader:
            optimizer.zero_grad()
            features, labels = batch
            labels = labels.to(device)

            # Use the configured `device` rather than a hard-coded 'cuda:0' so
            # the loop also runs on CPU-only machines.
            outputs = model(list(features), device=device, ptm_freeze=True)
            # NOTE(review): the recorded run of this cell stopped here with
            # "AttributeError: 'TopicTransformer_TEHead' object has no attribute 'loss'".
            # Confirm which loss API topic_transformer exposes for this class.
            loss_value = model.loss(outputs, labels)
            loss_value.backward()
            # Clip the global gradient norm to 5.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()

            epoch_loss += loss_value.item()
        # Mean batch loss of this epoch.
        total_loss = epoch_loss / len(dataloader)
        writer.add_scalar("loss", total_loss, epoch)
        print('Epoch {} total loss : {:.5f}'.format(epoch, total_loss))
        scheduler.step()

        print('================================')
finally:
    # Flush and release the event file even if training is interrupted.
    writer.close()

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch:   0%|          | 0/20 [00:00<?, ?it/s]


AttributeError: 'TopicTransformer_TEHead' object has no attribute 'loss'

# Model save

In [10]:
# Save the model's state_dict to a file in the current working directory.
# NOTE(review): the file name says "MLP" and "epoch40", but the visible training
# cell builds a TopicTransformer_TEHead and sets epochs = 20 - confirm the name.
torch.save(model.state_dict(), 'TT_MLP_epoch40.pt')

In [9]:

# Restore weights from a checkpoint file. map_location remaps the stored tensors
# onto the configured `device`, so a checkpoint saved on GPU also loads on a
# CPU-only machine.
# NOTE(review): the recorded run failed with missing "head_layers.*" keys and
# unexpected "lstm.*" / "fc1.*" / "fc2.*" keys, i.e. this checkpoint appears to
# come from a different architecture - confirm the file matches the model.
model.load_state_dict(torch.load('topictransformer.pt', map_location=device))

RuntimeError: Error(s) in loading state_dict for TopicTransformer_MLP:
	Missing key(s) in state_dict: "head_layers.0.weight", "head_layers.0.bias", "head_layers.1.weight", "head_layers.1.bias", "head_layers.2.weight", "head_layers.2.bias", "head_layers.3.weight", "head_layers.3.bias". 
	Unexpected key(s) in state_dict: "lstm.weight_ih_l0", "lstm.weight_hh_l0", "lstm.bias_ih_l0", "lstm.bias_hh_l0", "lstm.weight_ih_l1", "lstm.weight_hh_l1", "lstm.bias_ih_l1", "lstm.bias_hh_l1", "fc1.weight", "fc1.bias", "fc2.weight", "fc2.bias". 