In [2]:
import os, re
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
from glob import glob
from natsort import natsorted
from konlpy.tag import Mecab

# Raw string: the Windows path contains backslashes, and `\m` is not a valid
# Python escape sequence (newer interpreters warn about it). The resulting path
# value is unchanged.
mecab = Mecab(r'C:\mecab\mecab-ko-dic')

In [3]:
import sys
# Make the parent directory importable (the local `topic_transformer` package is
# imported further down). Guarded so re-running this cell does not keep
# appending duplicate entries to sys.path.
if '../' not in sys.path:
    sys.path.append('../')

In [4]:
# From the 2017 NTIS data, only records whose research-goal summary is longer
# than 256 characters are used for training.

path = r'D:\data\ICT 트렌드 분석 DATA\NTIS'

# glob() returns matches in arbitrary order; sort naturally so the positional
# pick below selects the same workbook on every run and every machine.
files = natsorted(glob(os.path.join(path, '*.xlsx')))

# Slow: reads an entire Excel workbook.
# NOTE(review): index 1 is presumably the 2017 file - confirm against the directory listing.
data = pd.read_excel(files[1], engine='openpyxl')

# Missing cells become empty strings so the length filter below treats them as too short.
data = data.fillna('')

len_cut = 256

# Vectorized length filter; astype(str) keeps a stray non-string cell from
# raising inside len() the way the per-row lambda would.
data = data[data['요약문_연구목표'].astype(str).str.len() > len_cut]

# sampled_data = data.sample(1000)

sample_list = data['요약문_연구목표'].to_list()

In [5]:
# Compiled once at definition time. Raw string so that `\s` reaches the regex
# engine as the whitespace class instead of being an invalid string escape.
_WHITESPACE_RUN = re.compile(r'\s+')


def normalize_text(text):
    """Collapse every run of whitespace to a single space and trim both ends.

    Args:
        text: Any object. It is converted with ``str()`` first, so non-string
            values are normalized as their string form (e.g. ``None`` -> ``'None'``).

    Returns:
        str: The text with whitespace runs (spaces, tabs, newlines, ...) replaced
        by one space and no leading or trailing whitespace.
    """
    return _WHITESPACE_RUN.sub(' ', str(text)).strip()

In [6]:
# Nouns dropped from every document before topic modelling.
stopwords = [
    '개발', '기술', '분석', '연구', '시스템', '목표',
    '방법', '기반', '가능', '필요', '확보', '이용',
    '특성', '변화', '사용', '기관', '시험', '기능',
    '기존', '영향', '기법', '과제', '활용', '적용',
]

In [7]:
# Tokenize each document into nouns, keeping only nouns longer than one
# character that are not stop words.
# The stop words are copied into a set once, outside the loop, so each
# membership test is a hash lookup instead of a scan of the list.
stopword_set = set(stopwords)

train_data = []
for sample in tqdm(sample_list):
    normalized = normalize_text(sample)
    train_data.append([
        noun
        for noun in mecab.nouns(normalized)
        if len(noun) > 1 and noun not in stopword_set
    ])

100%|██████████████████████████████████████████████████████████████████████████| 28961/28961 [00:21<00:00, 1344.04it/s]


In [8]:
from gensim import corpora
from gensim.models.ldamodel import LdaModel



In [9]:
# Build the token <-> id vocabulary and the bag-of-words corpus from the
# tokenized documents.
dictionary = corpora.Dictionary(train_data)
corpus = [dictionary.doc2bow(text) for text in train_data]
num_topics = 20
# LDA training is stochastic; a fixed seed makes the topics (and everything
# derived from this model further down) reproducible across kernel restarts.
ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, random_state=42)

In [10]:
# Formatted summary of the learned topics, 10 words per topic. Each entry prints
# as a (topic_id, '0.031*"word" + ...') tuple, as shown in the output below.
topics = ldamodel.print_topics(num_words=10)

In [11]:
# Show one topic summary per line.
for topic_entry in topics:
    print(topic_entry)

(0, '0.031*"나노" + 0.027*"소재" + 0.023*"공정" + 0.015*"구조" + 0.014*"제조" + 0.012*"합성" + 0.012*"효율" + 0.011*"화학" + 0.011*"표면" + 0.011*"입자"')
(1, '0.043*"생산" + 0.027*"식품" + 0.022*"소재" + 0.018*"생물" + 0.015*"산업" + 0.013*"바이오" + 0.012*"품종" + 0.012*"자원" + 0.010*"국내" + 0.010*"활성"')
(2, '0.027*"서비스" + 0.025*"데이터" + 0.019*"정보" + 0.019*"관리" + 0.016*"지원" + 0.016*"구축" + 0.011*"제공" + 0.010*"플랫" + 0.010*"운영" + 0.009*"기업"')
(3, '0.025*"안전" + 0.022*"해양" + 0.022*"환경" + 0.015*"평가" + 0.013*"발생" + 0.013*"관리" + 0.010*"수산" + 0.010*"위험" + 0.009*"재난" + 0.009*"조사"')
(4, '0.032*"센서" + 0.019*"모델" + 0.019*"해석" + 0.013*"근대" + 0.013*"측정" + 0.010*"검증" + 0.009*"실험" + 0.009*"예측" + 0.009*"데이터" + 0.009*"결과"')
(5, '0.024*"설계" + 0.022*"구현" + 0.020*"제어" + 0.019*"제작" + 0.019*"모듈" + 0.017*"센서" + 0.017*"전력" + 0.015*"소자" + 0.015*"측정" + 0.014*"전기"')
(6, '0.023*"참여" + 0.023*"평가" + 0.016*"성능" + 0.016*"이상" + 0.015*"설계" + 0.013*"에너지" + 0.013*"제작" + 0.012*"모듈" + 0.011*"공정" + 0.011*"개선"')
(7, '0.054*"설계" + 0.047*"평가" + 0.026*"성능" + 0.018*

In [12]:
# Re-join each document's kept nouns into one space-separated string.
train_docs = list(map(' '.join, train_data))

In [13]:
# Whitespace-normalize the raw documents in place of the originals; these
# strings are what the transformer dataset below receives.
sample_list = list(map(normalize_text, sample_list))

# transformer

In [14]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR
from topic_transformer import TopicTransformer, datasets

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, not just a specific noisy one - consider narrowing by category or
# module so real problems stay visible.
warnings.filterwarnings('ignore')

In [None]:
# Training hyperparameters.
epochs = 1
batch_size = 16
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# dataloader
# NOTE(review): presumably each item is (document text, label derived from the
# LDA model) - confirm in topic_transformer.datasets.SentenceLabelDataset.
dataset = datasets.SentenceLabelDataset(sample_list, corpus, ldamodel)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# topictransformer
# The output size is tied to the LDA topic count instead of repeating the
# literal 20, so the two cannot drift apart.
model = TopicTransformer(output_dim=num_topics,
                         transformer_model_name='xlm-roberta-base')

# optimizer, scheduler (learning rate is multiplied by 0.95 per epoch)
optimizer = Adam(model.parameters(), lr=1e-4)
scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: 0.95 ** epoch)

model.to(device)
model.train()

# train
global_step = 0   # optimizer steps taken across all epochs
total_loss = 0
for epoch in trange(epochs, desc="Epoch"):
    # Reset per epoch: otherwise the previous epoch's average leaks into this
    # epoch's running sum and the reported loss is wrong from epoch 2 onwards.
    total_loss = 0.0
    training_steps = 0   # optimizer steps taken in this epoch
    for batch in dataloader:
        optimizer.zero_grad()
        features, labels = batch
        labels = labels.to(device)

        loss_value = model.loss(model(list(features)), labels)
        loss_value.backward()
        # Clip the global gradient norm to 5 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()

        total_loss += loss_value.item()
        # Count steps per batch (they were previously bumped once per epoch).
        training_steps += 1
        global_step += 1

    # Mean loss over the batches seen this epoch; max() avoids a division by
    # zero when the dataloader is empty.
    total_loss /= max(training_steps, 1)
    print('Epoch {} total loss : {:.5f}'.format(epoch, total_loss))
    scheduler.step()

    print('================================')

# evaluation

In [23]:
class TestDataset(Dataset):
    """Dataset over raw documents, used for inference only.

    Each item is ``(doc, 0)``. The ``0`` is a constant placeholder label so that
    batches can still be unpacked as a ``(features, labels)`` pair, the same
    shape the inference loop expects.

    Args:
        test_docs: Indexable collection of documents.
        num_topics: Stored on the instance; not used by the dataset itself.
    """

    def __init__(self, test_docs, num_topics=15):
        self.test_docs = test_docs
        self.num_topics = num_topics

    def __len__(self):
        """Return the number of documents."""
        return len(self.test_docs)

    def __getitem__(self, idx):
        """Return ``(document at idx, placeholder label)``."""
        doc = self.test_docs[idx]

        # The original returned the bare name `_`, which is undefined in plain
        # Python and, inside IPython, resolves to whatever global `_` happens to
        # hold. Return an explicit constant instead.
        return doc, 0

In [24]:
# Location of the SCOPUS export used as the evaluation set.
scopus_path = r'D:\downloads\ICT 트렌드 분석 DATA_2\SCOPUS'

# Read the 2017 abstracts file, decoding it as ISO-8859-1.
scopus_data = pd.read_csv(
    os.path.join(scopus_path, 'SCOPUS ABSTRACT(2017).csv'),
    encoding='ISO-8859-1',
)

In [25]:
# Whitespace-normalize every abstract. Iterating the column directly avoids
# iterrows(), which builds a whole Series per row just to read one field.
# NOTE: normalize_text() applies str() to its input, so a missing abstract
# (NaN) ends up as the literal string 'nan'.
test_list = [normalize_text(abstract) for abstract in scopus_data['abstract']]

In [None]:
# Smoke-test inference on a single abstract.
model.to(device)
model.eval()
with torch.no_grad():
    # Wrap the abstract in a one-element list. list(some_str) would split the
    # string into individual characters, so the model would receive one
    # "document" per character instead of a batch holding one document.
    pred = model([test_list[20]], device=device)

In [None]:
# Run the model over every test abstract, one document per batch, in file order.
test_batch_size = 1
test_dataset = TestDataset(test_list, num_topics=20)
test_dataloader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False)

preds = []
model.eval()
model.to(device)
# A single no_grad context around the whole loop instead of re-entering it for
# every batch.
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        # Unpack into a named throwaway rather than `_`: at cell scope that
        # assignment rebinds the global `_`, which TestDataset.__getitem__
        # reads when building the next item.
        features, unused_labels = batch
        pred = model(list(features))
        preds.append(pred)

# model save

In [None]:
# Persist only the learned weights (state_dict), not the module object.
torch.save(model.state_dict(), 'topictransformer.pt')

# Reload the weights into the existing model. map_location puts the tensors on
# the device chosen earlier, so a checkpoint written on a GPU machine also loads
# on a CPU-only one.
model.load_state_dict(torch.load('topictransformer.pt', map_location=device))