In [1]:
import os, re
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
from glob import glob
from natsort import natsorted
from konlpy.tag import Mecab

# Noun tagger used by the tokenisation cell below. The dictionary path is a raw
# string so the Windows backslashes stay literal instead of relying on the
# invalid escape sequence that a plain literal would contain.
mecab = Mecab(r'C:\mecab\mecab-ko-dic')

In [2]:
import sys

# Make the parent directory importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if '../' not in sys.path:
    sys.path.append('../')

In [3]:
# From the 2017 NTIS data, use only the records whose research-goal summary is
# longer than `len_cut` characters for training.

path = r'D:\data\ICT 트렌드 분석 DATA\NTIS'

# NOTE(review): glob() returns matches in arbitrary order, so `files[1]` is the
# 2017 workbook only if the directory listing happens to come back that way — confirm.
files = glob(os.path.join(path, '*.xlsx'))

# Slow: reads a whole Excel workbook.
data = pd.read_excel(files[1], engine='openpyxl')

# Missing cells become empty strings, which the length filter below drops.
data = data.fillna('')

len_cut = 256

# Measure the length of the string form: non-string cells (numbers, dates)
# survive fillna('') and would make a bare len(x) raise TypeError.
data = data[data['요약문_연구목표'].astype(str).str.len() > len_cut]

# sampled_data = data.sample(1000)

sample_list = data['요약문_연구목표'].to_list()

In [4]:
def normalize_text(text):
    """Collapse every run of whitespace in ``text`` into one space and trim the ends.

    Args:
        text: Any object; it is converted with ``str()`` first, so non-string
            values are normalised in their string form.

    Returns:
        str: The whitespace-normalised text.
    """
    # Raw string: a backslash-s inside a plain literal is an invalid escape
    # sequence (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', str(text)).strip()

In [5]:
# Nouns excluded from the tokenised documents (see the `noun not in stopwords`
# filter in the tokenisation cell).
stopwords = [
    '개발', '기술', '분석', '연구', '시스템', '목표',
    '방법', '기반', '가능', '필요', '확보', '이용',
    '특성', '변화', '사용', '기관', '시험', '기능',
    '기존', '영향', '기법', '과제', '활용', '적용',
]

In [6]:
# Tokenise each document into nouns, keeping only nouns longer than one
# character that are not in `stopwords`. One noun list per document.
train_data = []
for sample in tqdm(sample_list):
    cleaned = normalize_text(sample)
    noun_list = []
    for noun in mecab.nouns(cleaned):
        if len(noun) <= 1 or noun in stopwords:
            continue
        noun_list.append(noun)
    train_data.append(noun_list)

100%|██████████████████████████████████████████████████████████████████████████| 28961/28961 [00:21<00:00, 1327.71it/s]


In [7]:
from gensim import corpora
from gensim.models.ldamodel import LdaModel



In [8]:
# Build the vocabulary and the bag-of-words corpus from the tokenised documents.
dictionary = corpora.Dictionary(train_data)
corpus = [dictionary.doc2bow(text) for text in train_data]
num_topics = 20
# LDA training is stochastic; a fixed seed makes the learned topics reproducible
# across kernel restarts.
ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, random_state=42)

In [9]:
# One (topic_id, 'weight*"word" + ...') entry per topic, limited to the 10
# highest-weighted words of each topic.
topics = ldamodel.print_topics(num_words=10)

In [10]:
# Print one (topic_id, terms) tuple per line.
for topic_id, topic_terms in topics:
    print((topic_id, topic_terms))

(0, '0.023*"장치" + 0.021*"차량" + 0.016*"전기" + 0.013*"자동" + 0.012*"제어" + 0.012*"배터리" + 0.012*"자동차" + 0.011*"발생" + 0.011*"모터" + 0.010*"개선"')
(1, '0.039*"공정" + 0.025*"소재" + 0.022*"제조" + 0.013*"코팅" + 0.013*"제품" + 0.012*"평가" + 0.011*"표면" + 0.011*"이상" + 0.010*"제작" + 0.009*"최적화"')
(2, '0.027*"생산" + 0.024*"소재" + 0.017*"식품" + 0.017*"물질" + 0.013*"반응" + 0.011*"생물" + 0.011*"화학" + 0.011*"품종" + 0.010*"활성" + 0.010*"촉매"')
(3, '0.040*"전지" + 0.031*"태양" + 0.023*"특허" + 0.021*"방송" + 0.016*"제작" + 0.015*"효율" + 0.014*"기기" + 0.013*"에너지" + 0.013*"게임" + 0.012*"웨어"')
(4, '0.030*"서비스" + 0.029*"정보" + 0.015*"영상" + 0.015*"데이터" + 0.012*"제공" + 0.011*"플랫" + 0.009*"통신" + 0.009*"콘텐츠" + 0.009*"관리" + 0.008*"환경"')
(5, '0.034*"공간" + 0.030*"이론" + 0.019*"문제" + 0.011*"함수" + 0.010*"차원" + 0.010*"무인" + 0.010*"수학" + 0.008*"일반" + 0.008*"결과" + 0.007*"항공기"')
(6, '0.044*"해양" + 0.017*"처리" + 0.016*"비용" + 0.013*"오염" + 0.013*"시설" + 0.012*"배출" + 0.012*"회수" + 0.012*"공급" + 0.011*"생산" + 0.011*"운전"')
(7, '0.049*"데이터" + 0.026*"에너지" + 0.017*"관리" + 0

In [11]:
# Re-join each document's noun list into a single space-separated string.
train_docs = list(map(' '.join, train_data))

In [12]:
# Whitespace-normalise the raw documents in place of the original list.
sample_list = list(map(normalize_text, sample_list))

# transformer

In [13]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR
from topic_transformer import TopicTransformer, datasets

import warnings
warnings.filterwarnings('ignore')

In [None]:
epochs = 1
batch_size = 16
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# dataloader: documents plus whatever labels SentenceLabelDataset builds from
# the LDA corpus/model (its implementation is not shown in this notebook).
dataset = datasets.SentenceLabelDataset(sample_list, corpus, ldamodel)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# topictransformer
# NOTE(review): assumes output_dim is meant to equal the LDA topic count, so
# `num_topics` is reused instead of repeating the literal 20 — confirm.
model = TopicTransformer(output_dim=num_topics,
                         transformer_model_name='xlm-roberta-base')

# optimizer, scheduler: the learning rate is multiplied by 0.95 per scheduler step
# (one step per epoch below).
optimizer = Adam(model.parameters(), lr=1e-4)
scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: 0.95 ** epoch)

model.to(device)
model.train()

# train
global_step = 0  # optimizer steps taken over the whole run
total_loss = 0   # mean batch loss of the most recently finished epoch
for epoch in trange(epochs, desc="Epoch"):
    training_steps = 0  # optimizer steps taken in this epoch
    # Accumulate per epoch; carrying the previous epoch's mean into the next
    # sum would skew every reported value after the first epoch.
    epoch_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad()
        features, labels = batch
        labels = labels.to(device)

        loss_value = model.loss(model(list(features)), labels)
        loss_value.backward()
        # Clip the total gradient norm to 5 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()

        epoch_loss += loss_value.item()
        # Count steps per batch, not per epoch.
        training_steps += 1
        global_step += 1

    # max(..., 1) avoids ZeroDivisionError when the dataloader is empty.
    total_loss = epoch_loss / max(len(dataloader), 1)
    print('Epoch {} total loss : {:.5f}'.format(epoch, total_loss))
    scheduler.step()

    print('================================')

# evaluation

In [23]:
class TestDataset(Dataset):
    """Map-style dataset that serves raw test documents for inference.

    Args:
        test_docs: Indexable, sized collection of documents.
        num_topics: Stored on the instance; not used by this class itself.
    """

    def __init__(self, test_docs, num_topics=15):
        self.test_docs = test_docs
        self.num_topics = num_topics

    def __len__(self):
        """Return the number of documents."""
        return len(self.test_docs)

    def __getitem__(self, idx):
        """Return ``(document, 0)`` for the document at ``idx``.

        The second element is a constant dummy label so callers can keep
        unpacking ``features, _ = batch``. The original returned the bare name
        ``_``, which is undefined outside IPython and otherwise holds whatever
        the last cell output happened to be.
        """
        return self.test_docs[idx], 0

In [24]:
# Read the SCOPUS abstract CSV, decoding it as ISO-8859-1.
scopus_path = r'D:\downloads\ICT 트렌드 분석 DATA_2\SCOPUS'
scopus_csv = os.path.join(scopus_path, 'SCOPUS ABSTRACT(2017).csv')
scopus_data = pd.read_csv(scopus_csv, encoding='ISO-8859-1')

In [25]:
# Whitespace-normalise every abstract. Iterating the column directly avoids
# iterrows(), which builds a full Series per row just to read one field.
# NOTE(review): normalize_text() applies str(), so a missing abstract (NaN)
# becomes the literal text 'nan' — confirm that is intended.
test_list = [normalize_text(abstract) for abstract in scopus_data['abstract']]

In [None]:
model.to(device)
model.eval()
with torch.no_grad():
    # Wrap the single abstract in a one-element list, matching the list of
    # documents the model receives during training. list(some_str) would
    # instead split the abstract into individual characters.
    pred = model([test_list[20]], device=device)

In [None]:
# Run the model over every test document, one document per batch, in the
# original order (shuffle=False), collecting the raw outputs in `preds`.
test_batch_size = 1
test_dataset = TestDataset(test_list, num_topics=20)
test_dataloader = DataLoader(test_dataset, batch_size = test_batch_size, shuffle=False)

preds = []
model.eval()
model.to(device)
for batch in tqdm(test_dataloader):
    # Inference only: no gradients are tracked for the forward pass.
    with torch.no_grad():
        # The second element of each batch is the dataset's placeholder and is ignored.
        features, _ = batch
        # NOTE(review): the single-document prediction cell passes device=device
        # to the model but this call does not — confirm which is intended.
        pred = model(list(features))
        # NOTE(review): outputs are stored as returned; if they are tensors on
        # the GPU this keeps all of them in device memory — confirm.
        preds.append(pred)