In [13]:
# Mount Google Drive at /content/drive; the dataset glob in load_all_data reads from this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%bash
# Install the C++ toolchain, JDK 8 and Python headers, then JPype1 and KoNLPy.
# -y answers apt's confirmation prompt: a notebook cell has no interactive
# stdin, so without it the install can abort at "Do you want to continue?".
apt-get update
apt-get install -y g++ openjdk-8-jdk python-dev python3-dev
pip3 install JPype1
pip3 install konlpy

In [2]:
%env JAVA_HOME "/usr/lib/jvm/java-8-openjdk-amd64"

env: JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64"


In [4]:
# Provides SentenceTransformer, used below to embed documents and candidate phrases.
!pip install sentence-transformers 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.1 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 24.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 58.1 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 65.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 53.5 MB/s 
Building wheels for collected 

In [5]:
from glob import glob
import re

import numpy as np
import pandas as pd
import itertools

from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

import torch

In [6]:
def load_data(file, sheet_name):
    """Read a single worksheet of an Excel workbook.

    Args:
        file: Path (or other object accepted by ``pd.read_excel``) of the workbook.
        sheet_name: Worksheet to read, forwarded to ``pd.read_excel``.

    Returns:
        Whatever ``pd.read_excel`` returns for that sheet.
    """
    sheet = pd.read_excel(file, sheet_name=sheet_name)
    return sheet

In [14]:
def load_all_data():
    """Load the '뉴스' sheet of every workbook in the dataset folder into one frame.

    Each workbook keeps its own row labels (no re-indexing), so the same index
    label can occur once per workbook in the result.

    Returns:
        pd.DataFrame: All sheets stacked row-wise, or an empty DataFrame when
        the glob matches no files.
    """
    files = glob('/content/drive/MyDrive/aida/dataset/임대차3법(54,752건)/*.xlsx')
    frames = [load_data(file, '뉴스') for file in files]
    if not frames:
        # pd.concat raises on an empty list; keep the "no files -> empty frame" contract.
        return pd.DataFrame()
    # One concat instead of DataFrame.append in a loop: append re-copied the
    # accumulated frame on every iteration and was removed in pandas 2.0.
    return pd.concat(frames)

news = load_all_data()

In [15]:
# Row and column counts of the combined frame returned by load_all_data().
news.shape

(17514, 4)

In [16]:
# Patterns are compiled once and written as raw strings so that regex escapes
# such as \. \s \w are not treated as (invalid) Python string escapes.
_EMAIL_RE = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z-.]+)")
# Deliberately a normal string: \r, \n and \xa0 are valid Python escapes here.
_META_RE = re.compile("[\r\n\xa0]")
_BYLINE_RE = re.compile(r"(\.\s+[ㄱ-ㅎ가-힣]+\s[기]+[자]+)")
_SYMBOL_RE = re.compile(r"[^\w\s^.]")


def text_cleaning(x):
    """Normalise one article body for keyword extraction.

    The steps run in this order:
      1. remove e-mail addresses;
      2. remove carriage returns, newlines and non-breaking spaces;
      3. remove a trailing-style byline of the form ". <Hangul name> 기자"
         (the leading period is removed with it);
      4. replace every character that is not a word character, whitespace,
         "^" or "." with a single space.

    Args:
        x: Value to clean; it is converted with ``str()`` first, so non-string
            input (e.g. ``None``) is cleaned as its string representation.

    Returns:
        str: The cleaned text.
    """
    text = _EMAIL_RE.sub("", str(x))
    text = _META_RE.sub("", text)
    text = _BYLINE_RE.sub("", text)
    return _SYMBOL_RE.sub(" ", text)

In [17]:
# Use the row at position 0 as the column labels, then remove every row whose
# index label is 0 (presumably the header row carried in by each workbook —
# confirm against the source files) and every row with a missing value.
news.columns = news.iloc[0]
news = (
    news
    .drop(0, axis=0)
    .dropna(axis=0, how='any')
)

# Clean the article body column in place of the raw text.
news['내용'] = news['내용'].map(text_cleaning)

print(news.shape)
news.head()

(17497, 4)


Unnamed: 0,작성일,작성자,제목,내용
1,2020/11/17,MBC연예,"'100분토론' 서민 주거 안정 위한 '임대차 3법', 10명 중 6명은 모른다?",오늘 17일 방송되는 MBC 100분 토론 에서는 전세난 해법은 이라는 주제...
2,2020/08/12,오마이뉴스,임대차3법 탓에 10억→14억?... 언론은 어떻게 왜곡하나,보도 검증 기사에 거론된 아파트들 찾아가보니... 원래 14억대 거래... 10...
3,2020/07/29,연합뉴스,"""임대차 3법, 주거안정 기여 기대""…매물잠김 우려도(종합)",전문가들 긍정 평가속 시행 초기 부작용 우려 의사봉 두드리는 윤호중 법사위원장 ...
4,2020/07/29,연합뉴스,"'전세난민' 사라지나…전문가 ""임대차3법, 주거안정에 도움""",전세매물 잠김현상 4년 주기 전셋값 폭등 우려도 의사봉 두드리는 윤호중 법사위원장 ...
5,2020/11/16,한겨레,"[안재승 칼럼] ‘동네북’ 임대차 3법, 더 강력해져야 한다",임대차 3법이 동네북 신세다. 보수 야당과 언론이 임대차 3법이 전세난을 불렀다고 ...


In [18]:
def max_sum_sim(doc_embedding, candidate_embeddings, words, top_n, nr_candidates):
    """Choose ``top_n`` mutually dissimilar keywords among the best candidates.

    The ``nr_candidates`` candidates most similar to the document are
    shortlisted, then every ``top_n``-sized combination of the shortlist is
    scored by the sum of its pairwise cosine similarities and the combination
    with the smallest sum is returned.

    Args:
        doc_embedding: 2-D array-like with a single row, the document embedding.
        candidate_embeddings: 2-D array-like, one row per entry of ``words``.
        words: Sequence of candidate phrases, aligned with ``candidate_embeddings``.
        top_n: Number of keywords to return. Values larger than the shortlist
            are clamped to the shortlist size.
        nr_candidates: Size of the shortlist taken from the most
            document-similar candidates.

    Returns:
        list: The selected entries of ``words``.
    """
    # Similarity of each candidate to the document, and of candidates to each other.
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    distances_candidates = cosine_similarity(candidate_embeddings,
                                            candidate_embeddings)

    # argsort is ascending, so the tail holds the most document-similar candidates.
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    # Look phrases up in the `words` argument, not in a notebook-level global.
    words_vals = [words[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # Asking for more keywords than were shortlisted would yield no combination at all.
    top_n = min(top_n, len(words_idx))

    min_sim = np.inf
    candidate = ()
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        # Each unordered pair is counted twice (i, j) and (j, i); the ranking is unaffected.
        sim = sum(distances_candidates[i][j] for i in combination for j in combination if i != j)
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [19]:
# An already fine-tuned open-source model is available, so it is used as-is
# instead of spending compute on fine-tuning.
# NOTE(review): this class is not instantiated in the visible cells (the
# extraction loop builds a plain SentenceTransformer) — confirm it is still needed.
class mySBERT(SentenceTransformer):
    """SentenceTransformer variant whose ``encode`` defaults to token-level output.

    Differences from the base class visible here: ``encode`` defaults
    ``output_value`` to ``'token_embeddings'``, ``tokenize`` calls the first
    module's tokenizer directly, and ``to`` only records the requested device.
    """

    def __init__(self, path, modules=None):
        """Build the base model from ``path``/``modules`` and cache device and tokenizer."""
        super().__init__(path, modules)
        # NOTE(review): assumes the installed base class allows assigning
        # `device` as a plain attribute — confirm against the library version.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = self._first_module().tokenizer

    def encode(self, sentences, batch_size=32, show_progress_bar=None, output_value='token_embeddings', convert_to_numpy=True, convert_to_tensor=False, is_pretokenized=False):
        """Forward every argument positionally to the base ``encode``.

        NOTE(review): ``is_pretokenized`` is passed as the seventh positional
        argument — confirm the installed ``SentenceTransformer.encode`` still
        has a matching parameter in that position.
        """
        return super().encode(sentences, batch_size, show_progress_bar, output_value, convert_to_numpy, convert_to_tensor, is_pretokenized)

    def tokenize(self, sentences):
        """Tokenize ``sentences`` with padding and truncation, requesting ``"pt"`` tensors."""
        return self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    
    def to(self, device):
        """Record ``device`` and return ``self``.

        NOTE(review): the base implementation is not called here, so presumably
        no parameters are moved — verify this is intended.
        """
        self.device = device
        return self

In [21]:
okt = Okt()
model = SentenceTransformer('jhgan/ko-sbert-multitask')

# Tunables for this demo run, kept together so the printed label cannot drift
# from the values actually passed to max_sum_sim.
N_DOCS = 10           # number of leading articles to process
TOP_N = 5             # keywords requested per article
NR_CANDIDATES = 10    # shortlist size handed to max_sum_sim
n_gram_range = (1,2)  # unigrams and bigrams; loop-invariant, so set once

print(f'nr_candidates={NR_CANDIDATES} : ')

# Never index past the end of the frame when it holds fewer than N_DOCS rows.
for i in range(min(N_DOCS, len(news))):
    # Keep only the tokens that Okt tags as nouns and join them into one string.
    tokenized_doc = okt.pos(news.iloc[i]['내용'], norm=True, stem=True)
    tokenized_nouns = ' '.join([word[0] for word in tokenized_doc if word[1] == 'Noun'])

    try:
        count = CountVectorizer(ngram_range=n_gram_range).fit([tokenized_nouns])
    except ValueError:
        # CountVectorizer raises ValueError when the text yields an empty
        # vocabulary (e.g. no nouns survive tokenisation); report no keywords
        # for this article instead of stopping the whole run.
        keywords = []
    else:
        # `candidates` stays a notebook-level name on purpose: other cells may read it.
        candidates = count.get_feature_names_out()

        candidate_embeddings = model.encode(candidates, convert_to_tensor=True)
        doc_embedding = model.encode([tokenized_nouns], convert_to_tensor=True)

        # Do not request more keywords than there are candidate phrases.
        keywords = max_sum_sim(
            doc_embedding.cpu(),
            candidate_embeddings.cpu(),
            candidates,
            top_n=min(TOP_N, len(candidates)),
            nr_candidates=NR_CANDIDATES,
        )

    print(f'{i}th instance')
    print(keywords)
    print('---------------------------------------------------')

nr_candidates=10 : 
0th instance
['임대차 전월세', '이후 전월세', '분석 전월세', '비율 전월세', '토론 전월세']
---------------------------------------------------
1th instance
['임대차 때문', '시세 계약', '전월세 시행', '전세 시세', '임대차 부작용']
---------------------------------------------------
2th instance
['주택임대차보호법 개정', '시행 월세', '전망 전월세', '지적 주택임대차보호법', '임대차 개정안']
---------------------------------------------------
3th instance
['고려 전월세', '전세 매물', '임대차 개정안', '부담 전세', '통과 주택임대차보호법']
---------------------------------------------------
4th instance
['임대차 동네', '철렁 집주인', '청구권 전월세', '집주인 반발', '집주인 권리']
---------------------------------------------------
5th instance
['상률 주택임대차보호법', '임대료 인상', '임대료 폭등', '주장 주택임대차보호법', '권제 전월세']
---------------------------------------------------
6th instance
['청구권 전월세', '전월세 고제', '제한 집주인', '전셋집 워낙', '집주인 입자']
---------------------------------------------------
7th instance
['임대차 셋값', '전월세 거래', '아파트 셋값', '전셋집 자금', '전세 보증금']
---------------------------------------------------
8th instance
['임대차 소급', '세