In [72]:
import numpy as np
import json

In [54]:
corpus = np.load('corpus.npy')

In [55]:
k = 10
objective = list()
centers = corpus[np.random.choice(np.arange(corpus.shape[0]), k, replace=False)]
groups = np.array([np.argmin([np.linalg.norm(center - x) for center in centers]) for x in corpus])
objective.append(
    np.array([np.linalg.norm(corpus[i] - centers[groups[i]]) for i in range(corpus.shape[0])]).sum()
)

i = 0
while True:
    centers = np.array([corpus[groups == g].mean(axis=0) for g in range(k)])
    groups = np.array([np.argmin([np.linalg.norm(center - x) for center in centers]) for x in corpus])
    objective.append(
        np.array([np.linalg.norm(corpus[i] - centers[groups[i]]) for i in range(corpus.shape[0])]).mean()
    )
    if i > 2 and ((objective[-1] - objective[-2]) + (objective[-2] - objective[-3])) < 1.0:
        break
    i += 1

In [56]:
objective[-1]

np.float64(28.881554138543702)

In [57]:
corpus[groups == 0][0]

array([0, 0, 1, ..., 0, 0, 0])

How many articles in each cluster

In [58]:
{g: corpus[groups == g].shape[0] for g in range(10)}

{0: 123, 1: 70, 2: 225, 3: 224, 4: 167, 5: 53, 6: 631, 7: 283, 8: 56, 9: 1172}

Identify the words present in each cluster

In [59]:
words = np.load('words.npy')

In [60]:
cluster_word_histogram = corpus[groups == 5].sum(axis=0)

In [61]:
non_zero_i = np.where(cluster_word_histogram > 0)

In [62]:
cluster_hist_norm = cluster_word_histogram / np.linalg.norm(cluster_word_histogram)

In [63]:
cluster_hist_norm.dtype

dtype('float64')

In [64]:
print(*words[non_zero_i])



In [65]:
values_i = np.argsort(cluster_hist_norm)[::-1]

In [66]:
cluster_word_histogram[values_i[0]]

np.int64(1872)

In [67]:
words[values_i]

array(['russia', 'rain', 'russian', ..., 'locker', 'contender',
       'accuracy'], dtype='<U16')

In [68]:
cluster_word_histogram.argmax()

np.int64(24)

In [70]:
words[24]

np.str_('russia')

In [71]:
corpus

array([[4, 2, 5, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 7, 9, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [96]:
articles = list()
with open('articles.json', 'r', encoding='utf-8') as f:
    articles = json.load(f)

In [91]:
articles = np.array(articles)

In [97]:
articles[0]

['2023-10-07',
 '10-year-old boy killed in Kharkiv was asleep when Russian missile hit, Ukraine says',
 'https://edition.cnn.com/2023/10/06/europe/10-year-old-ukraine-boy-asleep-when-russia-drone-strike-intl-hnk/index.html',
 '10-year-old-ukraine-boy-asleep-when-russia-drone-strike-intl-hnk',
 'europe']

In [92]:
articles[groups == 9]

IndexError: boolean index did not match indexed array along axis 0; size of axis is 3015 but size of corresponding boolean axis is 3004

In [98]:
import os
article_names = list(map(lambda name: name[:-5], os.listdir('nouns/')))

In [100]:
for article in articles:
    if article[3] in article_names:
        continue
    articles.remove(article)

In [101]:
len(articles)

3015

In [81]:
articles

array(['10-year-old-ukraine-boy-asleep-when-russia-drone-strike-intl-hnk',
       '1000-russian-tanks-destroyed-ukraine-war-intl-hnk-ml',
       '2015-paris-terror-attacks-fast-facts', ...,
       'zlatan-ibrahimovic-oldest-ever-goalscorer-serie-a-spt-intl',
       'zlatan-ibrahimovic-retires-intl-hnk-spt',
       'zoo-elephant-peels-bananas-scn'], dtype='<U97')

In [87]:
nodes = list()

In [88]:
for article in articles[groups == 0]:
    nodes.append({
        'id': article[1],
        'url': article[2]
    })


In [89]:
nodes

[{'id': 'l', 'url': 'e'},
 {'id': 'l', 'url': 'e'},
 {'id': 'n', 'url': 'i'},
 {'id': 'r', 'url': 'm'},
 {'id': 'a', 'url': 'k'},
 {'id': 'e', 'url': 'l'},
 {'id': 'e', 'url': 'l'},
 {'id': 'e', 'url': 'l'},
 {'id': 'o', 'url': 'x'},
 {'id': 'h', 'url': 'i'},
 {'id': 'u', 'url': 'r'},
 {'id': 'u', 'url': 'r'},
 {'id': 'u', 'url': 'r'},
 {'id': 'v', 'url': 'a'},
 {'id': 'v', 'url': 'a'},
 {'id': 'e', 'url': 'n'},
 {'id': 'e', 'url': 'o'},
 {'id': 'g', 'url': 'o'},
 {'id': 'h', 'url': 'e'},
 {'id': 'o', 'url': 's'},
 {'id': 'r', 'url': 'e'},
 {'id': 'u', 'url': 'k'},
 {'id': 'u', 'url': 'k'},
 {'id': 'u', 'url': 'k'},
 {'id': 'u', 'url': 'k'},
 {'id': 'u', 'url': 'k'},
 {'id': 'a', 'url': 'k'},
 {'id': 'e', 'url': 'd'},
 {'id': 'i', 'url': '6'},
 {'id': 'o', 'url': 'l'},
 {'id': 'o', 'url': 's'},
 {'id': 'a', 'url': 'v'},
 {'id': 'a', 'url': 'v'},
 {'id': 'o', 'url': 'r'},
 {'id': 'e', 'url': 'n'},
 {'id': 'o', 'url': 'l'},
 {'id': 'r', 'url': 'i'},
 {'id': 'r', 'url': 'i'},
 {'id': 'r',