In [None]:
!pip install krwordrank
!pip install soynlp
!pip install konlpy
!pip install koalanlp

In [None]:
from krwordrank.hangle import normalize
from krwordrank.word import KRWordRank

from soynlp.tokenizer import NounLMatchTokenizer
from soynlp.noun import LRNounExtractor_v2

from gensim.models import FastText

from itertools import chain

from koalanlp import API
from koalanlp.proc import SentenceSplitter
from koalanlp.Util import initialize, finalize

import re
import joblib
import pickle
import json


# Load the example data (a pickled object holding retrieved articles).
# NOTE(review): pickle can execute arbitrary code while loading; only open
# files that come from a trusted source.
article_path = '/content/drive/MyDrive/2023_1 KPMG Ideathon/retrieved_article.bin'
with open(article_path, 'rb') as article_file:
    data = pickle.load(article_file, encoding='utf-8')

# Only the first record is used; it provides the question and its texts.
data = data[0]
query = data['question']
texts = data['text']

print("Query ", query)


# koalanlp: split every retrieved text into sentences.
initialize(HNN='2.0.3')
try:
    splitter = SentenceSplitter(API.HNN)

    # Flatten the per-text sentence lists into one list of sentences.
    retrieved_texts = list(chain.from_iterable(splitter(text) for text in texts))
finally:
    # Run koalanlp's shutdown step even when splitting raises, so the session
    # started by initialize() is not left open.
    finalize()


# Extractors: KR-WordRank for ranked keywords, soynlp for nouns.
wordrank_extractor = KRWordRank(min_count=5, max_length=100, verbose=False)
noun_extractor = LRNounExtractor_v2(verbose=False)

# Normalized sentence list (english and number flags enabled).
# NOTE: this rebinds `texts`; from here on it holds the normalized sentences,
# not the raw texts loaded from the data file.
texts = [normalize(text, english=True, number=True) for text in retrieved_texts]

# Run KR-WordRank: `keywords` maps each keyword to its score.
keywords, rank, graph = wordrank_extractor.extract(texts, beta=0.85, max_iter=10)

# Run soynlp: `nouns` is keyed by the extracted nouns.
nouns = noun_extractor.train_extract(texts)

# Intersection of the KR-WordRank keyword dictionary and the soynlp noun
# dictionary, keeping the KR-WordRank score as the value.
nouns_keywords = {
    keyword: score for keyword, score in keywords.items() if keyword in nouns
}

# Keywords contained in the query. The query is stripped of whitespace once,
# up front, instead of once per candidate keyword.
query_without_spaces = re.sub(r'\s', '', query)
query_keywords = [
    keyword for keyword in nouns_keywords if keyword in query_without_spaces
]

# Extract noun keywords related to specific words.
# NOTE: `noun_extractor` is rebound here from the soynlp noun extractor to a
# tokenizer built from the keyword dictionary; the name is kept so that code
# relying on this rebinding keeps working.
noun_extractor = NounLMatchTokenizer(nouns_keywords)
fastText_corpus = [noun_extractor.tokenize(sent) for sent in texts]

# gensim 4.0 renamed the embedding-dimension argument from `size` to
# `vector_size`; passing `size` there raises TypeError. Pick whichever name the
# installed FastText constructor accepts so both major versions work.
import inspect

_fasttext_init_params = inspect.signature(FastText.__init__).parameters
_dimension_kwarg = "vector_size" if "vector_size" in _fasttext_init_params else "size"
fastText = FastText(fastText_corpus, window=5, workers=10, **{_dimension_kwarg: 100})


# For each query keyword, collect its five most similar keywords from the
# FastText model and write the results to a JSON file.
output_path = "./keyword_outputs.json"

keyword_outputs = []
for keyword in query_keywords:
    neighbours = fastText.wv.most_similar(keyword, topn=5)
    keyword_outputs.append(
        {"keyword": keyword, "similar_keywords": dict(neighbours)}
    )

# ensure_ascii=False keeps non-ASCII keywords readable in the output file.
with open(output_path, 'w', encoding='utf-8') as out_file:
    json.dump(keyword_outputs, out_file, ensure_ascii=False, indent=4)