In [1]:
import os, re
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
from glob import glob
from natsort import natsorted
from konlpy.tag import Mecab
from gensim import corpora
from gensim.models.ldamodel import LdaModel

mecab = Mecab('C:\mecab\mecab-ko-dic')

# prepare data

In [2]:
# 2017년 NTIS 데이터 중 len(x)>256 이상만 학습에 사용

path = r'D:\data\ICT 트렌드 분석 DATA\NTIS'

files = glob(os.path.join(path, '*.xlsx'))

# 오래걸림
for i,file in enumerate(files):
    data = pd.read_excel(file, engine='openpyxl')
    if i>=1:
        data2 = pd.read_excel(file, engine='openpyxl')
        data = pd.concat([data,data2])

data = data.fillna('')

len_cut = 256

data = data[data['요약문_연구내용'].apply(lambda x: len(x) > len_cut)]

# sampled_data = data.sample(1000)

sample_list = data['요약문_연구내용'].to_list()

In [3]:
def normalize_text(text):
    doublespace_pattern = re.compile('\s+')
    text = str(text)
    text = doublespace_pattern.sub(' ', text)
    return text.strip()

In [4]:
stopwords = []
train_data = []
for sample in tqdm(sample_list):
    a = normalize_text(sample)
    noun_list = [noun for noun in mecab.nouns(a) if len(noun)>1 and noun not in stopwords]
    train_data.append(noun_list)

100%|████████████████████████████████████████████████████████████████████████| 110308/110308 [01:49<00:00, 1011.71it/s]


# make LDA Model

In [7]:
dictionary = corpora.Dictionary(train_data)

dictionary.filter_extremes(no_below=20, no_above=0.5)

corpus = [dictionary.doc2bow(text) for text in train_data]
num_topics = 100
ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary,)

In [None]:
# save model

lda_save_path = r'D:\notebooks\kisdi\lda_100\ldamodel'

ldamodel.save(lda_save_path)

In [None]:
train_docs = [' '.join(i) for i in train_data]

sample_list = [normalize_text(doc) for doc in sample_list]

# transformer

In [9]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

import warnings
warnings.filterwarnings('ignore')

In [10]:
class CustomDataset(Dataset):
    def __init__(self, train_docs, corpus, lda_model, num_topics=20):
        self.train_docs = train_docs
        self.num_topics = num_topics
        
        self.labels = []
        for doc in range(len(train_docs)):
            probabilities = [b for (a,b) in lda_model.get_document_topics(corpus[doc], minimum_probability=1e-5)]
            if len(probabilities)<self.num_topics:
                continue
            self.labels.append(probabilities)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        doc = self.train_docs[idx]
        label = torch.Tensor(self.labels[idx])

        return doc, label

In [13]:
class TopicTransformer(nn.Module):
    def __init__(self, output_dim, transformer_model=None, transformer_model_name=None, max_length=128):
        super(TopicTransformer, self).__init__()
        
        
        if transformer_model==None and transformer_model_name==None:
            print("ERROR : Cannot Load Transformer Model")
            return -1
        if transformer_model != None:
            self.tokenizer = AutoTokenizer.from_pretrained(transformer_model.config._name_or_path)
            self.transformer_model = transformer_model
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)
            self.transformer_model = AutoModel.from_pretrained(transformer_model_name)
        
        self.hidden_dim = self.transformer_model.config.hidden_size
        self.output_dim = output_dim
        self.max_length = max_length
        
        self.head_layer = nn.Linear(self.hidden_dim, self.output_dim)
        
    def forward(self, input_x, device='cuda:0'):
        
        # Non-Tokenized Input
        if type(input_x) == list or type(input_x) == tuple :
            tokenized_sentence_list = self.tokenizer(input_x, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt').to(device)
        else:
            tokenized_sentence_list = input_x.to(device)
            
        # Transformer forward
        x = self.transformer_model(**tokenized_sentence_list).last_hidden_state
        
        # Avg Pooling
        pooling_mask = tokenized_sentence_list.attention_mask.unsqueeze(-1).expand(x.size()).float()
        sum_mask = pooling_mask.sum(1)
        x = (x*pooling_mask).sum(1) / sum_mask
        
        # Topic Head
        x = F.relu(x)
        x = self.head_layer(x)
        
        return F.softmax(x, dim=1)
    
    def loss(self, pred, label):
        return 100 * F.mse_loss(pred, label)
    
    

In [None]:
epochs = 20
batch_size = 16
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# dataloader
dataset = CustomDataset(sample_list, corpus, ldamodel, num_topics=100)
dataloader = DataLoader(dataset, batch_size = batch_size, shuffle=True)

# topictransformer
model = TopicTransformer(output_dim = 100,
                         transformer_model_name = 'xlm-roberta-base')

# optimizer, scheduler
optimizer = Adam(model.parameters(), lr=1e-5)
scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch:0.95**epoch)

model.to(device)
model.train()

# layer freezing
# for param in model.parameters():
#     param.requires_grad = False

# no_freeze = list(model.state_dict().keys())[-6:]

# for name, param in model.named_parameters():
#     if name in no_freeze:
#         param.requires_grad = True

# train
global_step = 0
total_loss = 0
for epoch in trange(epochs, desc="Epoch"):
    training_steps = 0
    for i,batch in enumerate(dataloader):
        optimizer.zero_grad()
        features, labels = batch
        labels = labels.to(device)

        loss_value = model.loss(model(list(features)), labels)
        if i==0:
            print("Epoch {} first loss value : {}".format(epoch, loss_value.item()))
        loss_value.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()
        
        total_loss += loss_value.item()
    total_loss /= len(dataloader)
    print('Epoch {} total loss : {:.5f}'.format(epoch, total_loss))
    scheduler.step()

    training_steps += 1
    global_step += 1

    print('================================')

In [24]:
# model save
torch.save(model.state_dict(), 'tt_20_100topics_freeze.pt')
# model.load_state_dict(torch.load('topictransformer.pt'))

# evaluation

In [27]:
class TestDataset(Dataset):
    def __init__(self, test_docs, num_topics=100):
        self.test_docs = test_docs
        self.num_topics = num_topics

    def __len__(self):
        return len(self.test_docs)

    def __getitem__(self, idx):
        doc = self.test_docs[idx]

        return doc, _

In [26]:
model = TopicTransformer(output_dim = 100,
                         transformer_model_name = 'xlm-roberta-base')
model.load_state_dict(torch.load('tt_20_100topics_freeze.pt'))

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [28]:
# scopus evaluate
scopus_path = r'D:\data\ICT 트렌드 분석 DATA_2\SCOPUS'
scopus_data = pd.read_csv(os.path.join(scopus_path, 'SCOPUS ABSTRACT(2017).csv'), encoding='ISO-8859-1')

test_list = []
for i,a in scopus_data.iterrows():
    test_list.append(normalize_text(a['abstract']))

In [29]:
test_list[0]

'{¨Ï 2013 IEEE.This paper proposes a novel semiautomatic system domain data analysis method. The method is based on the iterative acquisition and analysis of a large body of bibliometric data, generation of domain taxonomies, and creation of domain models. The method was applied on a smart grid case study through collection and analysis of more than 6000 documents. We have found that our method produces domain models of comparable quality to the traditional manually produced domain models in a more cost-effective way.}}'

In [31]:
with torch.no_grad():
    pred = model(test_list[:16], device='cpu')
pred = pred.to('cpu')

In [32]:
for idx,i in enumerate(pred):
    print(f"{test_list[idx]}")
    print(f"argmax : {i.argmax().item()+1}, value : {i.max().item()}\n")
    print('================================')

{¨Ï 2013 IEEE.This paper proposes a novel semiautomatic system domain data analysis method. The method is based on the iterative acquisition and analysis of a large body of bibliometric data, generation of domain taxonomies, and creation of domain models. The method was applied on a smart grid case study through collection and analysis of more than 6000 documents. We have found that our method produces domain models of comparable quality to the traditional manually produced domain models in a more cost-effective way.}}
argmax : 10, value : 0.060139868408441544

{¨Ï 2013 IEEE.To minimize costs, a buying firm would seek sources which offer a more affordable price for the required products. On the basis of a principal-agent framework, this paper presents a buyer's supplier switching model under asymmetric information to minimize the buying cost considering the volume-dependent switching cost, the competitive reactions and economies of scale effects of the incumbent supplier. The proposed 

## lda infer

In [94]:
dictionary = dictionary.load(r'./ldamodel_100.id2word')

In [75]:
lda_inp = '''{IEEE 2013.노드 수가 많은 네트워크 시스템에서는 모든 네트워크 노드에 입력 신호를 적용하여 제어할 수 없다. 본 논문에서, 우리는 네트워크 시스템의 노드가 시스템의 몇 개의 노드에 의해 제어될 수 있도록 네트워크 토폴로지를 설계함으로써 이 문제를 해결할 수 있음을 보여준다. 네트워크 시스템의 구조적 제어 가능성과 토폴로지 설계 문제 사이의 기본적인 연결을 제공하는 이론적 프레임워크가 개발되었다. 결과는 또한 새로운 제어 노드를 도입하지 않고도 네트워크 시스템에 새로운 노드를 추가할 수 있는 방법을 조명한다. 따라서, 결과는 제어 가능한 네트워크를 얻기 위해 위상 설계를 다루는 데 유용하다. 또한 결과는 동일한 노드가 여러 개 있는 네트워크 시스템이 어떤 상황에서 제어 불가능한지도 보여준다. 많은 응용 프로그램에서 동일한 노드의 그룹이 서로 연결되며 이를 그룹 네트워크라고 합니다. 여기서는 네트워크 수준(즉, 그룹의 상호 연결)과 노드 수준(즉, 그룹 내 노드의 상호 연결) 모두에서 적절한 토폴로지 설계에 대한 정보를 제공하는 여러 네트워크 시스템 그룹에 대한 구조적 제어 가능성 문제를 해결한다.}}'''

In [39]:
nouns = [i for i in mecab.nouns(lda_inp) if len(i)>1]

tmp = dictionary.doc2bow(nouns)

print('\n'.join([str(a) for i,a in (ldamodel.get_document_topics(tmp, minimum_probability=1e-5))]))

0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.15137996
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.058793165
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.17392823
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.000107548265
0.15347907
0.02134233