In [None]:
!pip install gensim
import gensim
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import pandas as pd
!pip install wordcloud
from wordcloud import WordCloud
! pip install adjustText
from sklearn.manifold import TSNE
import numpy as np
from adjustText import adjust_text
from sklearn.cluster import KMeans
import plotly.graph_objects as go
import plotly.express as px
from collections import Counter

In [2]:
# Load the article table from the CSV export into a DataFrame.
korea_df = pd.read_csv(filepath_or_buffer="/content/out.csv")

In [None]:
# Preview the first five rows to sanity-check the loaded table.
korea_df.head(n=5)

In [4]:
# Load the pretrained embeddings stored in binary word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format(
    fname='korea.bin',
    binary=True,
)

In [None]:
# `model.key_to_index` maps each token to its row index in the vector table,
# NOT to a frequency. Feeding it to WordCloud as "frequencies" would draw the
# last-indexed tokens largest and give the first token a weight of 0.
# Convert the index into a descending rank weight instead, so that the token
# at index 0 gets the largest weight.
# NOTE(review): this assumes the vocabulary in korea.bin is ordered by
# descending corpus frequency (the usual word2vec layout) - confirm.
vocab_index_kr = model.key_to_index
vocab_size_kr = len(vocab_index_kr)
freq_tokens_kr = {
    token: vocab_size_kr - index
    for token, index in vocab_index_kr.items()
}

# Key order is preserved, so the first 200 keys are still the first 200
# vocabulary entries.
words_kr = list(freq_tokens_kr)[:200]
print(words_kr)

In [None]:
# Word cloud of the vocabulary weights, rendered with WordCloud's default font.
# NOTE(review): the default font presumably lacks Hangul glyphs, which would
# explain why the cloud is rendered again with an explicit font_path further
# down - confirm.
cloud_options = dict(width=800, height=400, background_color='white')
wordcloud = WordCloud(**cloud_options).generate_from_frequencies(freq_tokens_kr)

cloud_fig, cloud_ax = plt.subplots(figsize=(12, 6))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis("off")
cloud_fig.tight_layout(pad=0)
plt.show()

In [11]:
# TrueType font file used for the Korean labels in the word cloud and plots.
fpath = '/content/NanumGothic-Regular.ttf'

In [None]:
# Word cloud of the vocabulary weights, drawn with the font file at `fpath`.
wordcloud = WordCloud(
    font_path=fpath,
    width=800,
    height=400,
    background_color='white',
).generate_from_frequencies(freq_tokens_kr)

cloud_fig, cloud_ax = plt.subplots(figsize=(12, 6))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis("off")
cloud_fig.tight_layout(pad=0)
plt.show()

In [65]:
# Unsized handle on the font file; used to look up the family name for plotly.
font_prop1 = FontProperties(fname=fpath)
# 10-pt variant of the same font for matplotlib text labels.
font_prop = FontProperties(size=10, fname=fpath)

In [None]:
# Stack the embeddings of the selected words, one row per word.
word_vectors = np.array([model[word] for word in words_kr])

# Project the embeddings to 2-D for plotting.
# The iteration count is left at the library default of 1000 (the value that
# used to be passed explicitly): the keyword was renamed from `n_iter` to
# `max_iter` in scikit-learn 1.5 and `n_iter` was subsequently removed, so
# passing `n_iter=` raises TypeError on current releases.
tsne = TSNE(n_components=2, random_state=17, perplexity=30)
tsne_vectors = tsne.fit_transform(word_vectors)

# Cluster in the original embedding space (not on the 2-D projection); the
# labels are only used to colour the points in the plots below.
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=17)
clusters = kmeans.fit_predict(word_vectors)
print(clusters)
print(len(clusters))

In [17]:
import warnings

# Silence DeprecationWarning globally (any message, any module).
warnings.simplefilter("ignore", DeprecationWarning)

In [None]:
# Static t-SNE map of the selected words, coloured by k-means cluster.
tsne_fig, tsne_ax = plt.subplots(figsize=(14, 12))

# Map every cluster label to a Spectral colour in one vectorised call and draw
# all points with a single scatter() instead of one call per word.
point_colors = plt.cm.Spectral(clusters / n_clusters)
tsne_ax.scatter(
    tsne_vectors[:, 0],
    tsne_vectors[:, 1],
    c=point_colors,
    marker='o',
    alpha=0.6,
)

# Label each point, nudged slightly up and to the right of its marker.
for (x, y), word in zip(tsne_vectors, words_kr):
    tsne_ax.text(x + 0.1, y + 0.1, word, fontsize=10, fontproperties=font_prop)

tsne_ax.set_title("Топ-200 слов")
plt.show()

In [None]:
# First 100 vocabulary entries, in the vocabulary's own order.
words_kr_100 = list(freq_tokens_kr)[:100]
print(words_kr_100)

In [31]:
# Interactive t-SNE map: one trace per word so every point carries its own
# cluster colour and hover text.
palette = px.colors.qualitative.Plotly

word_traces = []
for i, word in enumerate(words_kr):
    x, y = tsne_vectors[i]
    word_traces.append(
        go.Scatter(
            x=[x],
            y=[y],
            # To hide the text labels, use mode="markers" instead.
            mode="markers+text",
            text=[word],
            textposition="top center",
            marker=dict(
                size=8,
                color=palette[clusters[i] % len(palette)],  # wrap around the palette
                opacity=0.7,
            ),
            hovertemplate=f"Слово: {word}<br>Кластер: {clusters[i]}<br>t-SNE 1: {x:.2f}<br>t-SNE 2: {y:.2f}",
            textfont=dict(family=font_prop1.get_name(), size=12),
        )
    )

fig = go.Figure(data=word_traces)
fig.update_layout(
    template="plotly_white",
    title="Топ-200 слов",
    width=1000,
    height=800,
    hovermode="closest",
    showlegend=False,
)
fig.show()

In [None]:
# The 20 nearest neighbours of the two query words combined; most_similar
# yields (word, similarity) pairs and only the words are kept.
top_words = [
    neighbor
    for neighbor, _similarity in model.most_similar(positive=['문화','대한'], topn=20)
]
print(top_words)

In [None]:
# 2-D t-SNE map of the two query words and their nearest neighbours.
words = top_words + ['문화','대한']
# Build the embedding matrix once (previously it was wrapped in np.array twice).
vectors = np.array([model[word] for word in words])
tsne = TSNE(n_components=2, random_state=42, perplexity=5)
vectors_2d = tsne.fit_transform(vectors)

neighbors_fig, neighbors_ax = plt.subplots(figsize=(10, 6))

# One colour per word, sampled evenly from tab20, drawn with a single
# vectorised scatter() call instead of one call per point.
colors = plt.cm.tab20(np.linspace(0, 1, len(words)))
neighbors_ax.scatter(
    vectors_2d[:, 0],
    vectors_2d[:, 1],
    c=colors,
    s=100,
    marker='o',
    alpha=0.7,
)

# Initial label positions are offset from the markers; adjust_text then moves
# them apart and draws thin connector lines.
texts = [
    neighbors_ax.text(x + 0.5, y + 0.5, word, fontsize=12, fontproperties=font_prop)
    for (x, y), word in zip(vectors_2d, words)
]
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))

# NOTE(review): 'Arial Unicode MS' may not be installed in the runtime; if it
# is missing matplotlib presumably warns and falls back to its default font -
# confirm and drop the family if so.
neighbors_ax.set_title(
    "Ближайшие соседи слов 'культура','Корея' в южнокорейских статьях о диаспорах",
    fontsize=16, fontfamily='Arial Unicode MS', pad=15, weight='bold',
)
neighbors_fig.tight_layout()
plt.show()