In [1]:
import re
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
!pip install stanza
import stanza
stanza.download("ko")
nlp_stanza = stanza.Pipeline(lang="ko", processors="tokenize, pos, lemma, depparse")

Collecting stanza
  Downloading stanza-1.11.0-py3-none-any.whl.metadata (14 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.11.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.15.0 stanza-1.11.0


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: ko (Korean) ...


Downloading https://huggingface.co/stanfordnlp/stanza-ko/resolve/v1.11.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/ko/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ko (Korean):
| Processor | Package        |
------------------------------
| tokenize  | kaist          |
| pos       | kaist_nocharlm |
| lemma     | kaist_nocharlm |
| depparse  | kaist_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Read every file in the corpus directory into `corpus` (one string per file).
path = 'kor/'
corpus = []
# os.listdir returns entries in arbitrary order; sorting keeps the document
# order (and everything derived from it downstream) stable between runs.
for filename in tqdm(sorted(os.listdir(path))):
    filepath = os.path.join(path, filename)
    # Skip sub-directories (e.g. notebook checkpoint folders): open() would fail on them.
    if not os.path.isfile(filepath):
        continue
    with open(filepath, encoding='utf-8') as txt:
        corpus.append(txt.read())

100%|██████████| 29/29 [00:00<00:00, 15918.70it/s]


In [15]:
def cleanizer(corpus):
    """Merge the corpus documents into one cleaned string.

    Parameters
    ----------
    corpus : iterable of str, or str
        The documents to merge. A bare string is treated as a single document.

    Returns
    -------
    str
        All documents separated by a single space, with newlines replaced by
        spaces and ASCII letters, digits and round brackets removed.
    """
    # A plain string must not be split into characters by the join below.
    documents = [corpus] if isinstance(corpus, str) else list(corpus)
    # Join with a space: without a separator the last word of one document
    # fuses with the first word of the next one.
    merged = ' '.join(documents)
    merged = merged.replace('\n', ' ')
    # One pass removes the same characters the separate substitutions did.
    clean_corpus = re.sub(r'[A-Za-z\d()]', '', merged)
    return clean_corpus


In [16]:
# Merge and clean the loaded documents into a single string,
# then preview its first 100 characters.
clean_corpus_kr = cleanizer(corpus)
clean_corpus_kr[:100]

'백설白雪이 분분紛紛한 날에 - 임의직  백설이 분분한 날에 천지天地가 다 희거다 우의羽衣를 떨쳐 입고 구당丘堂에 올라가니 어즈버 천산백옥경天山白玉京을 미처 본가 하노라 임의직任義直'

In [17]:
# Run the Stanza pipeline (tokenize, pos, lemma, depparse) over the whole cleaned corpus string.
corp_doc = nlp_stanza(clean_corpus_kr)

In [18]:
def stanza_to_df(corp_doc):
    """Flatten a parsed document into a DataFrame with one row per word.

    Parameters
    ----------
    corp_doc
        Parsed document exposing ``sentences``; every sentence exposes
        ``words`` and every word provides ``id``, ``text``, ``lemma``,
        ``upos``, ``deprel`` and ``head``.

    Returns
    -------
    pandas.DataFrame
        Columns ``sent_id`` (1-based running sentence number), ``id``,
        ``token``, ``lemma``, ``pos``, ``synt_tag``, ``head_id`` and
        ``head_tok``. ``head_tok`` is the text of the governing word in the
        same sentence, or ``'ROOT'`` for a word whose ``head`` is 0.
    """
    columns = ['sent_id', 'id', 'token', 'lemma', 'pos', 'synt_tag', 'head_id', 'head_tok']
    list_of_rows = []
    for sent_id, sentence in enumerate(corp_doc.sentences, start=1):
        for word in sentence.words:
            # `head` is a 1-based position inside the sentence and 0 marks the
            # root. Indexing with 0 - 1 would silently wrap around and report
            # the sentence's last word as the head, so the root gets a marker.
            if word.head > 0:
                head_tok = sentence.words[word.head - 1].text
            else:
                head_tok = 'ROOT'
            list_of_rows.append([
                sent_id, word.id, word.text, word.lemma,
                word.upos, word.deprel, word.head, head_tok,
            ])

    df_sentence = pd.DataFrame(list_of_rows, columns=columns)
    return df_sentence


In [19]:
# Turn the parsed document into a one-row-per-word table and preview the first ten rows.
df_kr = stanza_to_df(corp_doc)
df_kr[:10]

Unnamed: 0,sent_id,id,token,lemma,pos,synt_tag,head_id,head_tok
0,1,1,백설白雪이,백설+白雪+이,NOUN,nsubj,2,분분紛紛한
1,1,2,분분紛紛한,분분紛紛+하+ㄴ,VERB,amod,3,날에
2,1,3,날에,날+에,ADV,obl,11,희거다
3,1,4,-,-,PUNCT,punct,3,날에
4,1,5,임의직,임의직,NOUN,compound,6,백설이
5,1,6,백설이,백설+이,NOUN,nsubj,7,분분한
6,1,7,분분한,분분+하+ㄴ,VERB,acl,8,날에
7,1,8,날에,날+에,ADV,obl,11,희거다
8,1,9,천지天地가,천지天地+가,NOUN,nsubj,11,희거다
9,1,10,다,다,ADV,advmod,11,희거다


In [21]:
# Keep only the rows whose dependency relation is not 'punct',
# then summarise the remaining table.
df_kr_clean = df_kr[df_kr['synt_tag'].ne('punct')]
df_kr_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11473 entries, 0 to 13164
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sent_id   11473 non-null  int64 
 1   id        11473 non-null  int64 
 2   token     11473 non-null  object
 3   lemma     11473 non-null  object
 4   pos       11473 non-null  object
 5   synt_tag  11473 non-null  object
 6   head_id   11473 non-null  int64 
 7   head_tok  11473 non-null  object
dtypes: int64(3), object(5)
memory usage: 806.7+ KB


In [33]:
# Collect the tokens of every sentence into its own list; the resulting
# list of token lists is what the Word2Vec training cell consumes.
tokens_by_sentence = df_kr_clean.groupby('sent_id')['token']
grouped_tokens = tokens_by_sentence.apply(list)
sentences = list(grouped_tokens)

In [30]:
# Save the table without 'punct' rows to out.csv, omitting the index column.
df_kr_clean.to_csv('out.csv', index=False)

In [22]:
# Flat list of all remaining tokens (sentence boundaries are not kept here).
tokens_kr = df_kr_clean.get('token')
tokens_list_kr = tokens_kr.tolist()
# Vocabulary size: number of distinct surface forms.
unique_token_count = len(set(tokens_list_kr))
print(unique_token_count)

6076


In [24]:
def skipgrammer(tokens, window_size):
    """Build (target, context) pairs for every token and its neighbours.

    For each position, every token at most ``window_size`` positions to the
    left or right (clipped to the sequence bounds) is paired with the token at
    that position. Pairs are emitted in order of target position, then in
    order of context position; a token is never paired with its own position.

    Parameters
    ----------
    tokens : sequence
        Indexable sequence of tokens.
    window_size : int
        Number of neighbours considered on each side of the target.

    Returns
    -------
    list of tuple
        ``(target, context)`` pairs.
    """
    total = len(tokens)
    pairs = []
    for center in range(total):
        lo = max(0, center - window_size)
        hi = min(total, center + window_size + 1)
        pairs.extend(
            (tokens[center], tokens[neighbour])
            for neighbour in range(lo, hi)
            if neighbour != center
        )
    return pairs

In [25]:
# Build (target, context) pairs over the flat token list, looking 3 tokens to each side.
# The list is not split by sentence, so windows can span sentence boundaries.
skip_grams_zorg = skipgrammer(tokens_list_kr, 3)

In [26]:
from collections import Counter

In [27]:
# Count every skip-gram pair that contains the target word on either side.
target = '지난'
target_skipgrams = [pair for pair in skip_grams_zorg if target in pair]

target_skipgram_counts = Counter(target_skipgrams)
print(target_skipgram_counts.most_common())

[(('지난', '월'), 33), (('월', '지난'), 33), (('지난', '일'), 22), (('일', '지난'), 22), (('지난', '일부터'), 6), (('일부터', '지난'), 6), (('지난', '열린'), 6), (('열린', '지난'), 6), (('앞서', '지난'), 4), (('지난', '앞서'), 4), (('지난', '일까지'), 4), (('일까지', '지난'), 4), (('뉴욕한인회', '지난'), 4), (('지난', '뉴욕한인회'), 4), (('동부한인회가', '지난'), 4), (('지난', '동부한인회가'), 4), (('지난', '에서'), 4), (('에서', '지난'), 4), (('지난', '년'), 3), (('년', '지난'), 3), (('‘', '지난'), 3), (('지난', '‘'), 3), (('퀸즈', '지난'), 3), (('지난', '퀸즈'), 3), (('서울=연합뉴스', '지난'), 3), (('지난', '서울=연합뉴스'), 3), (('김', '지난'), 3), (('지난', '김'), 3), (('지난', '김경화'), 3), (('김경화', '지난'), 3), (('지난', '코리안퍼레이드'), 3), (('코리안퍼레이드', '지난'), 3), (('지난', '참가'), 3), (('참가', '지난'), 3), (('허성호', '지난'), 2), (('지난', '허성호'), 2), (('지난', '미국'), 2), (('미국', '지난'), 2), (('이명석', '지난'), 2), (('회장', '지난'), 2), (('지난', '이명석'), 2), (('지난', '회장'), 2), (('지난', '저녁'), 2), (('저녁', '지난'), 2), (('지난', '뉴욕한인회장우측이'), 2), (('뉴욕한인회장우측이', '지난'), 2), (('지난', '오는'), 2), (('오는', '지난'), 2), (('있다', '지난'), 2), (('지난', '있다'), 2

In [29]:
!pip install gensim
import gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [34]:
# Train CBOW (sg=0) embeddings on the per-sentence token lists.
# The seed is spelled out and training is limited to one worker thread:
# gensim documents that a fixed seed gives repeatable vectors only with
# workers=1, because thread scheduling otherwise changes the update order.
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for repeatability
# across interpreter launches — set it in the environment if that matters.
w2v_kr = gensim.models.Word2Vec(
    sentences,
    vector_size=300,
    window=5,
    min_count=2,
    sg=0,
    epochs=5,
    seed=1,
    workers=1,
)

In [35]:
# Vocabulary that survived min_count, as a key -> index mapping.
freq_tokens_kr = w2v_kr.wv.key_to_index
# First 200 keys in the mapping's iteration order
# (presumably most frequent first — confirm against the gensim docs).
words_kr = list(freq_tokens_kr)[:200]
print(words_kr)

['월', '있다', '그는', '년', '있는', '일', '지난', '했다', '수', '이', '회장은', '박', '재외동포', '전', '등', '한인', '청년', '세', '김', '한국', '및', '이날', '회장', '모국', '밝혔다', '한', '‘', '이번', '통해', '이어', '하고', '연수', '캄보디아', '차세대', '함께', '기자', '만', '우리', '초청', '위한', '그', '서울', '서울=연합뉴스', '회장이', '한인회', '더', '두', '같은', '제차', '것은', '것이다', '한다', '하는', '대', '위해', '뉴욕한인회', '에서', '열린', '제대', '해외', '며', '고', '신임', '명이', '한국에서', '는', '큰', '한국어', '당시', '그의', '세계', '이미지', '제공', '재외동포협력센터', '이명석', '낙스', '명', '차세대동포', '후', '원', '일부터', '대통령은', '때', '외교부', '취·창업', '글로벌', '김영근', '메이필드호텔에서', '재일본대한민국청년회', '말했다', '여', '대해', '특히', '최근', '손민호', '또', '▲', '뒤', '할', '있도록', '우즈베키스탄', '참여했다', '설계', '등을', '창업', "연수'", '강서구', '대한', '이기훈', '않았다', '이런', '온', '도산', '행사를', '차', '민주평통', '코리안', '청년들이', '일본', '박현수', '있었다', '설명했다', '지역', '이에', '많은', '가운데', '이사장', '이후', '권예순', '추성희', '하지만', '모두', '억', '사라', '날', '그가', '화백의', '참여해', '대한인국민회', '현지', '월드옥타', '박종범', '개국', '다양한', '덧붙였다', '사람이', '적극', '싶다', '방문', '번째', '된다', '회장으로', '기업', '따르면', '미국', '것', '

In [40]:
# Concatenate all tokens into one string, each token followed by a single space
# (so the result ends with a trailing space). A single join builds the same
# string without an index loop or repeated string re-allocation.
big_string = ''.join(token + ' ' for token in tokens_list_kr)

In [38]:
!pip install wordcloud

import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS



In [36]:
# Print the ten entries most similar to '서울' according to the trained vectors.
# The printed label 'Для корпуса: ' is Russian for "For the corpus: ".
print('Для корпуса: ', w2v_kr.wv.most_similar('서울', topn=10))

Для корпуса:  [('하는', 0.2567870318889618), ('의원', 0.23014937341213226), ('월', 0.21951939165592194), ('회장', 0.21694637835025787), ('년', 0.21612884104251862), ('대', 0.20777520537376404), ('설계', 0.20474907755851746), ('양해각서에', 0.20105962455272675), ('그는', 0.19945412874221802), ('및', 0.18957750499248505)]


In [37]:
# Export the trained word vectors to korea.bin in the binary word2vec format.
w2v_kr.wv.save_word2vec_format('korea.bin', binary=True)

In [42]:
# Build the vectorizer input: the whole token list joined by spaces forms ONE
# document. Despite the variable name, no stop-word filtering is applied here.
text_no_stop = ' '.join(tokens_list_kr)
clean_texts = [text_no_stop]

In [43]:
# Score unigrams with TF-IDF and print the 20 highest-weighted terms.
# NOTE(review): clean_texts holds a single document, so every term has the same
# document frequency and the ranking reduces to normalised term counts.
# NOTE(review): the vectorizer's default token pattern appears to keep only
# tokens of two or more word characters, which would drop single-character
# tokens such as '월' — confirm against the scikit-learn docs.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
tfidf_matrix = tfidf_vectorizer.fit_transform(clean_texts)
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf = tfidf_matrix.toarray().ravel()
# Indices of the 20 largest weights, highest first.
top_indices = tfidf.argsort()[::-1][:20]
print(list(zip(feature_names[top_indices], tfidf[top_indices])))

[('있다', np.float64(0.3102742369657244)), ('그는', np.float64(0.26430768334117266)), ('있는', np.float64(0.23749386039351747)), ('지난', np.float64(0.20301894517510363)), ('했다', np.float64(0.19535785290434501)), ('서울', np.float64(0.15322184541517256)), ('회장은', np.float64(0.1417302070090346)), ('재외동포', np.float64(0.12257747633213804)), ('연수', np.float64(0.11874693019675873)), ('한국', np.float64(0.11491638406137941)), ('청년', np.float64(0.11491638406137941)), ('한인', np.float64(0.1110858379260001)), ('회장', np.float64(0.10342474565524147)), ('이날', np.float64(0.09959419951986216)), ('모국', np.float64(0.09959419951986216)), ('이번', np.float64(0.09193310724910353)), ('밝혔다', np.float64(0.08810256111372422)), ('창업', np.float64(0.08810256111372422)), ('차세대', np.float64(0.08427201497834491)), ('통해', np.float64(0.08427201497834491))]
