In [14]:
import pickle
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from kiwipiepy import Kiwi
from typing import Tuple
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
kiwi = Kiwi()

# Helpers

In [3]:
def extract_words_with_target_tags(sentence: str, tags: Tuple[str] = ('NNG', 'NNP')):
    parsed_sentence = kiwi.analyze(sentence)[0][0]
    
    target_tokens = [token.form for token in parsed_sentence if token.tag in tags]
    
    return target_tokens

In [36]:
class UnionFind:
    def __init__(self, n):
        self.parent = [i for i in range(n)]
        self.rank = [0] * n

    def find(self, x):
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x, y):
        root_x = self.find(x)
        root_y = self.find(y)

        if root_x != root_y:
            if self.rank[root_x] < self.rank[root_y]:
                self.parent[root_x] = root_y
            elif self.rank[root_x] > self.rank[root_y]:
                self.parent[root_y] = root_x
            else:
                self.parent[root_x] = root_y
                self.rank[root_y] += 1

# Data Load

In [4]:
data = {key: value for key, value in pd.read_pickle('./reports_1_3.pkl').get('q1').items() if value.get('company_info')}

## Preprocess

In [74]:
for key, value in tqdm(data.items()):
    value.update({
        'company_info': re.sub('\n{2,}', '', BeautifulSoup(value.get('company_info')).text),
        'business_info': re.sub('\n{2,}', '', BeautifulSoup(value.get('business_info')).text),
    })

100%|██████████| 790/790 [00:07<00:00, 98.83it/s] 


In [75]:
for key, value in tqdm(data.items()):
    value.update({
        'company_info': extract_words_with_target_tags(value.get('company_info')),
        'business_info': extract_words_with_target_tags(value.get('business_info')),
    })

100%|██████████| 790/790 [01:24<00:00,  9.31it/s]


In [79]:
# with open('./reports_1_3_tokenized.pkl', 'wb') as file:
#     pickle.dump(data, file)

In [6]:
company_info = [' '.join(value.get('company_info')) for value in data.values()]
business_info = [' '.join(value.get('business_info')) for value in data.values()]

In [65]:
company_names = pd.DataFrame([value.get('company_name (kor)') for value in data.values()], columns=['company_name'])

# Model

## TF-IDF

### company info

In [75]:
THRESHOLD = 0.6

company_info_by_vectorized = TfidfVectorizer().fit_transform(company_info).toarray()
company_info_similarity = np.tril(cosine_similarity(company_info_by_vectorized))
np.fill_diagonal(company_info_similarity, 0)
similar_company_info_index = np.where(company_info_similarity > THRESHOLD)

union_find = UnionFind(len(business_info))
for x, y in zip(*similar_company_info_index):
    union_find.union(x, y)
    
groups = {i: [] for i in range(len(business_info))}
for i, parent in enumerate(union_find.parent):
    groups[parent].append(i)

In [76]:
groups

{0: [],
 1: [1],
 2: [2, 3],
 3: [],
 4: [4, 657],
 5: [5],
 6: [6],
 7: [],
 8: [8, 549],
 9: [9],
 10: [157],
 11: [],
 12: [173, 238, 261, 346, 411, 671],
 13: [13],
 14: [14],
 15: [],
 16: [16],
 17: [0,
  7,
  10,
  15,
  17,
  20,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  33,
  36,
  48,
  53,
  54,
  55,
  56,
  59,
  62,
  70,
  75,
  78,
  79,
  80,
  81,
  86,
  89,
  91,
  96,
  100,
  104,
  105,
  116,
  119,
  121,
  122,
  129,
  132,
  138,
  147,
  148,
  155,
  161,
  162,
  163,
  164,
  166,
  168,
  170,
  171,
  174,
  186,
  188,
  192,
  193,
  197,
  202,
  203,
  204,
  206,
  210,
  211,
  219,
  222,
  223,
  228,
  229,
  230,
  231,
  236,
  242,
  244,
  246,
  248,
  249,
  250,
  251,
  252,
  253,
  268,
  278,
  285,
  288,
  293,
  295,
  296,
  299,
  301,
  302,
  303,
  310,
  311,
  313,
  331,
  344,
  351,
  360,
  361,
  362,
  363,
  364,
  367,
  374,
  376,
  382,
  386,
  387,
  399,
  401,
  402,
  403,
  404,
  405,
  408,
  413

In [77]:
company_names.iloc[groups.get(12)]

Unnamed: 0,company_name
173,대영포장보통주
238,디와이보통주
261,모두투어리츠보통주
346,서울식품공업보통주
411,아세아제지보통주
671,한국수출포장공업보통주


### business info

In [69]:
THRESHOLD = 0.4

business_info_by_vectorized = TfidfVectorizer().fit_transform(business_info).toarray()
business_info_similarity = np.tril(cosine_similarity(business_info_by_vectorized))
np.fill_diagonal(business_info_similarity, 0)
similar_business_info_index = np.where(business_info_similarity > THRESHOLD)

union_find = UnionFind(len(business_info))
for x, y in zip(*similar_business_info_index):
    union_find.union(x, y)
    
groups = {i: [] for i in range(len(business_info))}
for i, parent in enumerate(union_find.parent):
    groups[parent].append(i)

In [71]:
groups

{0: [0, 242, 430, 613],
 1: [420],
 2: [2, 3],
 3: [],
 4: [10, 33, 376, 455, 498, 499, 560],
 5: [5],
 6: [],
 7: [7],
 8: [8, 549],
 9: [9],
 10: [],
 11: [],
 12: [119, 678],
 13: [],
 14: [14, 418],
 15: [25, 155, 326],
 16: [17, 23, 26, 28, 47, 80, 261, 438, 592, 675, 688],
 17: [],
 18: [18],
 19: [116, 225, 363],
 20: [20],
 21: [21, 22],
 22: [],
 23: [],
 24: [4,
  6,
  11,
  12,
  13,
  15,
  16,
  24,
  37,
  38,
  41,
  45,
  49,
  52,
  57,
  62,
  70,
  82,
  88,
  95,
  96,
  112,
  114,
  125,
  154,
  156,
  157,
  174,
  190,
  192,
  198,
  204,
  219,
  243,
  244,
  249,
  251,
  255,
  263,
  264,
  267,
  279,
  304,
  305,
  306,
  311,
  313,
  314,
  316,
  338,
  351,
  358,
  360,
  364,
  386,
  389,
  395,
  402,
  404,
  414,
  441,
  442,
  443,
  445,
  446,
  447,
  448,
  453,
  460,
  477,
  478,
  492,
  494,
  495,
  500,
  519,
  535,
  562,
  563,
  565,
  569,
  573,
  590,
  601,
  611,
  615,
  618,
  625,
  633,
  640,
  645,
  657,
  663,
  

In [72]:
company_names.iloc[groups.get(16)]

Unnamed: 0,company_name
17,DL이앤씨보통주
23,GS건설보통주
26,HDC현대산업개발보통주
28,HL D&I보통주
47,LF보통주
80,SK디앤디보통주
261,모두투어리츠보통주
438,에이자기관리부동산투자회사보통주
592,케이탑리츠보통주
675,한국자산신탁보통주
