In [54]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from kiwipiepy import Kiwi
from typing import Tuple
import gensim
from gensim import corpora
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import plotly.express as px
import pyLDAvis.gensim_models

In [31]:
data = pd.read_pickle('./data/reports_1_summarized_by_gpt_.pkl')

In [32]:
company_name_company = np.array([value.get('company_abbreviation_name (kor)') for key, value in data.items() if value.get('company_info_summarized')])
embedding_of_company_info_summarized = np.array([value.get('embedding_of_company_info_summarized') for key, value in data.items() if value.get('company_info_summarized')])
company_name_business = np.array([value.get('company_abbreviation_name (kor)') for key, value in data.items() if value.get('business_info_summarized')])
embedding_of_business_info_summarized = np.array([value.get('embedding_of_business_info_summarized') for key, value in data.items() if value.get('business_info_summarized')])

# Company Info

In [28]:
company_info = pd.concat([
    pd.DataFrame(company_name_company, columns=['company_name']),
    pd.DataFrame(TSNE(n_components=2).fit_transform(embedding_of_company_info_summarized), columns=['x', 'y'])
], axis=1)

In [41]:
company_info.loc[company_info.company_name.str.contains('금융|은행|신한'), 'cluster'] = '금융'

In [43]:
px.scatter(
    company_info,
    x='x',
    y='y',
    color='cluster'
)

# Business Info

In [117]:
business_info = pd.concat([
    pd.DataFrame(company_name_business, columns=['company_name']),
    pd.DataFrame(TSNE(n_components=2).fit_transform(embedding_of_business_info_summarized), columns=['x', 'y'])
], axis=1)

In [118]:
business_info.loc[business_info.company_name.str.contains('금융|은행|신한'), 'cluster'] = '금융'

In [119]:
business_info.loc[business_info.company_name.str.contains('삼성물산|현대건설|DL|GS건설|대우건설|SK에코|현대산업'), 'cluster'] = '건설'

In [120]:
business_info.loc[business_info.company_name.str.contains('증권'), 'cluster'] = '증권'

In [121]:
business_info.loc[business_info.company_name.str.contains('화학|케미컬|케미칼'), 'cluster'] = '화학'

In [122]:
business_info.loc[business_info.company_name.str.contains('홀딩스'), 'cluster'] = '지주사'

In [124]:
business_info.loc[business_info.company_name.str.contains('전기'), 'cluster'] = '전기'

In [107]:
business_info.to_csv('./names.csv', index=False)

In [137]:
stock_code = pd.DataFrame({key: [value.get('company_abbreviation_name (kor)')] for key, value in data.items() if value.get('business_info_summarized')}).T.reset_index().rename(columns={'index': 'stock_code', 0: 'company_name'})

In [143]:
stock_code.query('company_name=="삼성물산"')

Unnamed: 0,stock_code,company_name
297,28260,삼성물산


In [144]:
print(data.get('028260').get('business_info_summarized'))

The document provides an overview of the business sectors of the company, which include construction, resources development, steel, chemical, industrial materials, fashion, resort operations, and bio-pharmaceutical businesses. The document highlights key activities and projects in each sector, as well as the financial performance of each business segment. Additionally, it discusses the strategies and objectives for each sector, such as growth initiatives in the construction sector, international trade activities in the trading sector, expansion of brand presence in the fashion sector, development of lifestyle services in the resort sector, and advancements in bio-pharmaceutical production. Financial figures and percentages for each sector are also provided for the first quarter of the year 2022.


: 

In [125]:
px.scatter(
    business_info,
    x='x',
    y='y',
    color='cluster',
    hover_name='company_name'
)

## KMeans

### Reduction

In [34]:
business_info = pd.concat([
    pd.DataFrame(company_name_business, columns=['company_name']),
    pd.DataFrame(TSNE(n_components=2).fit_transform(embedding_of_business_info_summarized), columns=['x', 'y'])
], axis=1)

kmeans = KMeans(n_clusters=50)
clusters = kmeans.fit(business_info.iloc[:, 1:]).labels_
business_info['cluster'] = clusters

In [20]:
px.scatter(
    business_info.astype({'cluster': 'str'}).sort_values('cluster'),
    x='x',
    y='y',
    color='cluster',
    hover_name='company_name'
)

In [None]:
business_info = pd.concat([
    pd.DataFrame(company_name_business, columns=['company_name']),
    pd.DataFrame(TSNE(n_components=2).fit_transform(embedding_of_business_info_summarized), columns=['x', 'y'])
], axis=1)

In [35]:
for cluster in business_info.cluster.unique():
    print(business_info.query(f'cluster=={cluster}').company_name.values.reshape(-1))

['AJ네트웍스' 'KG모빌리티' 'LS ELECTRIC' 'SNT다이내믹스' 'SNT모티브' 'SNT홀딩스' '기아' '남선알미늄'
 '롯데렌탈' '성문전자' '신풍' 'HD한국조선해양' '우신시스템' '이아이디' '이엔플러스' '한국단자']
['AK홀딩스' 'SK디앤디' 'SK케미칼' '강남제비스코' '노루페인트' '노루홀딩스' '롯데케미칼' '미원상사' '삼영무역'
 '삼화페인트' '시디즈' '애경케미칼' 'SK디스커버리' 'OCI홀딩스' '조광페인트' '한화솔루션']
['BGF리테일' 'BGF']
['BNK금융지주' 'DGB금융지주' 'JB금융지주' 'KB금융' '다우기술' '신한지주' '오뚜기' '우리금융지주' '참엔지니어링'
 '카카오뱅크' '코오롱' '코오롱인더']
['BYC' 'HDC' 'HDC현대산업개발' '계룡건설' '대동' '디와이파워' '신세계건설' 'HD현대건설기계' 'HD현대'
 'HD현대인프라코어' '태영건설' '한국내화' '현대건설' '현대글로비스' '현대로템' '현대리바트' '현대엘리베이' '현대차'
 '현대지에프홀딩스' '현대코퍼레이션' '현대홈쇼핑' '화성산업' '효성']
['CJ' 'KH 필룩스' 'SPC삼립' '대한제당' '동서' '동화약품' '메타랩스' '보락' '사조동아원' '삼성물산'
 '성보화학' 'CJ제일제당' 'HLB글로벌' '일동홀딩스' 'KT&G' '풀무원' '현대퓨처넷']
['CJ씨푸드' '고려산업' '대상' '대한제분' '동원산업' '동원수산' '사조대림' '사조산업' '사조씨푸드' '샘표식품'
 '신라교역' '우성']
['CS홀딩스' 'DSR' 'DSR제강' 'NI스틸' '고려제강' '대한제강' '동일산업' '동일제강' '만호제강' '세아특수강'
 '영흥' '조선선재' '하이스틸' '한국주강' '한국주철관' '한국특강' '한일철강' '화인베스틸' '황금에스티']
['DB' 'NAVER' 'NHN' 'SK' '대교' '롯데정보통신' '비상교육' '신세계 I&C' '씨아이테크' '아시아나IDT'
 '웅진씽크

### Original

In [33]:
business_info = pd.concat([
    pd.DataFrame(company_name_business, columns=['company_name']),
], axis=1)

clusters = kmeans.fit(embedding_of_business_info_summarized).labels_
business_info['cluster'] = clusters


business_info = pd.concat([
    pd.DataFrame(company_name_business, columns=['company_name']),
    pd.DataFrame(TSNE(n_components=2).fit_transform(embedding_of_business_info_summarized), columns=['x', 'y']),
    pd.DataFrame(clusters, columns=['cluster'])
], axis=1)

In [32]:
for cluster in business_info.cluster.unique():
    print(business_info.query(f'cluster=={cluster}').company_name.values.reshape(-1))

['AJ네트웍스' 'CS홀딩스' 'KCTC' 'LS ELECTRIC' 'LX인터내셔널' 'S-Oil' 'SGC에너지' 'STX'
 '경동나비엔' '대한해운' '동방' '세방' 'CJ대한통운' 'HD한국조선해양' 'HMM' '엔케이' '예스코홀딩스' '우진'
 '이아이디' '인터지스' '일진하이솔루스' 'GS글로벌' 'KC그린홀딩스' 'KSS해운' '팬오션' '포스코인터내셔널'
 'POSCO홀딩스' '한전기술' '한국주철관' '한국카본' '한익스프레스' '한전산업' '한창']
['AK홀딩스' 'KPX홀딩스' 'LS' 'SJM홀딩스' '녹십자홀딩스' '농심홀딩스' '대상홀딩스' '세아홀딩스' 'LX홀딩스'
 '종근당홀딩스' '진양홀딩스' '코스맥스비티아이' '크라운해태홀딩스' '티와이홀딩스' '평화홀딩스' '하이트진로홀딩스'
 '한국콜마홀딩스' '한세예스24홀딩스']
['BGF리테일' '광주신세계' 'GKL' '대한항공' '동양고속' '롯데관광개발' '세기상사' '아시아나항공' '에어부산'
 '제주항공' '진에어' '천일고속' '티웨이항공' '한국공항' '한국항공우주' '호텔신라']
['BGF' 'HL홀딩스' 'HS애드' 'SG글로벌' '극동유화' '금호타이어' '넥센' '넥센타이어' '대한화섬' '동성케미컬'
 '동아타이어' '미원화학' '미창석유' '삼성공조' '상신브레이크' '서연' '세방전지' '세원정공' '시디즈' 'CR홀딩스'
 '신도리코' '신풍' '엘브이엠씨홀딩스' '영보화학' '우진아이엔에스' '제일기획' '태광산업' '태원물산' '테이팩스'
 '한국앤컴퍼니' '한화솔루션' '화승코퍼레이션' '효성티앤씨']
['BNK금융지주' 'DGB금융지주' 'JB금융지주' '모두투어리츠' '삼성카드' '신한지주' '신흥' '아센디오' '참엔지니어링'
 '카카오뱅크' '케이탑리츠' '한국자산신탁' '한국전자홀딩스' '한국금융지주']
['BYC' 'SIMPAC' '경인전자' '대림통상' '동원시스템즈' '락앤락' '모나미' '벽산' '삼익악기' 'SUN&L'
 '

In [36]:
px.scatter(
    business_info.astype({'cluster': 'str'}).sort_values('cluster'),
    x='x',
    y='y',
    color='cluster',
    hover_name='company_name'
)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

## TF-IDF

In [41]:
kiwi = Kiwi()

In [42]:
def extract_words_with_target_tags(sentence: str, tags: Tuple[str] = ('NNG', 'NNP')):
    parsed_sentence = kiwi.analyze(sentence)[0][0]
    
    target_tokens = [token.form for token in parsed_sentence if token.tag in tags]
    
    return target_tokens

In [43]:
data = {key: value for key, value in pd.read_pickle('./data/reports_1_3.pkl').get('q1').items() if value.get('company_info')}

In [44]:
for key, value in tqdm(data.items()):
    value.update({
        'company_info': re.sub('\n{2,}', '', BeautifulSoup(value.get('company_info')).text),
        'business_info': re.sub('\n{2,}', '', BeautifulSoup(value.get('business_info')).text),
    })

for key, value in tqdm(data.items()):
    value.update({
        'company_info': extract_words_with_target_tags(value.get('company_info')),
        'business_info': extract_words_with_target_tags(value.get('business_info')),
    })

company_info = [' '.join(value.get('company_info')) for value in data.values()]
business_info = [' '.join(value.get('business_info')) for value in data.values()]

company_names = pd.DataFrame([value.get('company_name (kor)') for value in data.values()], columns=['company_name'])

100%|██████████| 790/790 [00:06<00:00, 119.88it/s]
100%|██████████| 790/790 [00:47<00:00, 16.65it/s]


In [18]:
# THRESHOLD = 0.4

business_info_by_vectorized = TfidfVectorizer().fit_transform(business_info).toarray()
# business_info_similarity = np.tril(cosine_similarity(business_info_by_vectorized))
# np.fill_diagonal(business_info_similarity, 0)
# similar_business_info_index = np.where(business_info_similarity > THRESHOLD)

In [22]:
business_info = pd.concat([
    pd.DataFrame(company_names, columns=['company_name']),
    pd.DataFrame(TSNE(n_components=2).fit_transform(business_info_by_vectorized), columns=['x', 'y'])
], axis=1)

kmeans = KMeans(n_clusters=50)
clusters = kmeans.fit(business_info.iloc[:, 1:]).labels_
business_info['cluster'] = clusters

In [23]:
px.scatter(
    business_info.astype({'cluster': 'str'}).sort_values('cluster'),
    x='x',
    y='y',
    color='cluster',
    hover_name='company_name'
)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

# Company Info

## LDA

In [None]:
kiwi = Kiwi()

In [None]:
def extract_words_with_target_tags(sentence: str, tags: Tuple[str] = ('NNG', 'NNP')):
    parsed_sentence = kiwi.analyze(sentence)[0][0]
    
    target_tokens = [token.form for token in parsed_sentence if token.tag in tags]
    
    return target_tokens

In [None]:
data = {key: value for key, value in pd.read_pickle('./data/reports_1_3.pkl').get('q1').items() if value.get('company_info')}

In [None]:
for key, value in tqdm(data.items()):
    value.update({
        'company_info': re.sub('\n{2,}', '', BeautifulSoup(value.get('company_info')).text),
        'business_info': re.sub('\n{2,}', '', BeautifulSoup(value.get('business_info')).text),
    })

for key, value in tqdm(data.items()):
    value.update({
        'company_info': extract_words_with_target_tags(value.get('company_info')),
        'business_info': extract_words_with_target_tags(value.get('business_info')),
    })

company_info = [' '.join(value.get('company_info')) for value in data.values()]
business_info = [' '.join(value.get('business_info')) for value in data.values()]

company_names = pd.DataFrame([value.get('company_abbreviation_name (kor)') for value in data.values()], columns=['company_name'])

100%|██████████| 790/790 [00:06<00:00, 119.88it/s]
100%|██████████| 790/790 [00:47<00:00, 16.65it/s]


In [48]:
company_info = [sentence.split() for sentence in company_info]
business_info = [sentence.split() for sentence in business_info]

company_info = [[word for word in sentence if len(word)>1] for sentence in company_info]

In [184]:
dictionary = corpora.Dictionary(company_info)
corpus = [dictionary.doc2bow(text) for text in company_info]

In [185]:
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=100,
    id2word=dictionary,
    passes=15,
)

In [186]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

In [138]:
topic_data = pd.DataFrame(columns=['company_name', 'topic_num', 'ratio'])
for i, topic_list in tqdm(enumerate(lda_model[corpus])):
    
    sorted_ratio = sorted(topic_list, key=lambda x: x[-1], reverse=True)
    topic_num, ratio = sorted_ratio[-1]
    temp = pd.DataFrame({'company_name': company_names.company_name[i], 'topic_num': topic_num+1, 'ratio': ratio}, index=[0])

    topic_data = pd.concat([
        topic_data,
        temp
    ])


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.

790it [00:00, 1086.62it/s]


In [162]:
topic_data.query('topic_num==25')

Unnamed: 0,company_name,topic_num,ratio
0,광동제약,25,0.01292
0,대신증권,25,0.074125
0,롯데칠성,25,0.023187
0,삼성증권,25,0.054875
0,유안타증권,25,0.034803
0,페이퍼코리아,25,0.066799
