In [1]:
%%bash
apt-get update
apt-get install g++ openjdk-8-jdk python-dev python3-dev
pip3 install JPype1
pip3 install konlpy

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:2 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]
Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Fetched 261 kB in 2s (135 kB/s)
Reading package lis

In [2]:
%env JAVA_HOME "/usr/lib/jvm/java-8-openjdk-amd64"

env: JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64"


In [3]:
!pip install sentence-transformers 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
from glob import glob
import re
import json
from tqdm import tqdm

import numpy as np
import pandas as pd
import itertools

from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

import torch


In [6]:
def load_data(file, sheet_name):
    return pd.read_excel(file, sheet_name=sheet_name)

In [7]:
def load_all_data(path):
    files = glob(path+'/*.xlsx')
    df = pd.DataFrame()
    for file in files:
        df = df.append(load_data(file, '뉴스'))
    return df

In [8]:
def text_cleaning(x):
    mail_del = re.sub("([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z-.]+)", "", str(x))
    meta_del = re.sub("[\r\n\xa0]", "", str(mail_del))
    name_del = re.sub("(\.\s+[ㄱ-ㅎ가-힣]+\s[기]+[자]+)", "", str(meta_del))
    clean_text = re.sub("[^\w\s^.]", " ", name_del)
    
    return clean_text

In [9]:
def max_sum_sim(doc_embedding, candidate_embeddings, words, top_n, nr_candidates):
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    distances_candidates = cosine_similarity(candidate_embeddings, 
                                            candidate_embeddings)
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [candidates[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum([distances_candidates[i][j] for i in combination for j in combination if i != j])
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [10]:
class mySBERT(SentenceTransformer):
    def __init__(self, path, modules=None):
        super().__init__(path, modules)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = self._first_module().tokenizer

    def encode(self, sentences, batch_size=32, show_progress_bar=None, output_value='token_embeddings', convert_to_numpy=True, convert_to_tensor=False, is_pretokenized=False):
        return super().encode(sentences, batch_size, show_progress_bar, output_value, convert_to_numpy, convert_to_tensor, is_pretokenized)

    def tokenize(self, sentences):
        return self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    
    def to(self, device):
        self.device = device
        return self

In [12]:
for i in range(1,5):
    path = "/content/drive/MyDrive/aida/dataset"
    save_path = "/content/drive/MyDrive/aida"
    if i==1:
        path += "/임대차3법(54,752건)"
        save_path += "/rent.json"
    if i==2:
        path += "/중대재해처벌법(40,056건)"
        save_path += "/accident.json"
    if i==3:
        path += "/차별금지법(59,421건)" 
        save_path += "/discrimination.json"
    if i==4:
        path += "/탄소중립(59,295건)" 
        save_path += "/neutral.json"

    print(path)
    print(save_path)

    data = load_all_data(path)
    data.columns = data.iloc[0]
    data.drop(0, inplace=True, axis=0)
    data = data.dropna(axis=0, how='any')

    data['내용'] = data['내용'].map(text_cleaning)

    okt = Okt()
    model = SentenceTransformer('jhgan/ko-sbert-multitask')

    keywords = []
    np.random.seed(42)
    for i in tqdm(np.random.choice(data.index, 10000)):
        tokenized_doc = okt.pos(data.iloc[i]['내용'], norm=True, stem=True)
        tokenized_nouns = ' '.join([word[0] for word in tokenized_doc if word[1] == 'Noun'])

        n_gram_range = (1,2)

        count = CountVectorizer(ngram_range=n_gram_range).fit([tokenized_nouns])
        candidates = count.get_feature_names_out()

        candidate_embeddings = model.encode(candidates, convert_to_tensor=True)
        doc_embedding = model.encode([tokenized_nouns], convert_to_tensor=True)
        
        keywords.append({data.iloc[i]['작성일']:max_sum_sim(doc_embedding.cpu(), candidate_embeddings.cpu(), candidates, top_n=5, nr_candidates=10)})

    save_path = ""
    if i==1:
        save_path = "/content/drive/MyDrive/aida/rent.json"
    if i==2:
        save_path = "/content/drive/MyDrive/aida/accident.json"
    if i==3:
        save_path = "/content/drive/MyDrive/aida/discrimination.json" 
    if i==4:
        save_path = "/content/drive/MyDrive/aida/neutral.json" 
      
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(keywords, f, ensure_ascii=False, indent=4)

100%|██████████| 10000/10000 [57:40<00:00,  2.89it/s]


FileNotFoundError: ignored