In [93]:
import os
import json
import numpy as np

In [94]:
ldir = os.listdir('nouns/')

In [95]:
ldir

['10-year-old-ukraine-boy-asleep-when-russia-drone-strike-intl-hnk.json',
 '1000-russian-tanks-destroyed-ukraine-war-intl-hnk-ml.json',
 '2015-paris-terror-attacks-fast-facts.json',
 '2023-march-madness-mens-final-four-preview-spt-intl.json',
 '2023-nfl-draft-how-to-watch-preview-spt-intl.json',
 '2023-womens-world-cup-media-rights-infantino-spt-intl.json',
 '70-shot-rally-kubler-khachanov-australian-open-spt-intl.json',
 'aaron-judge-catch-yankees-jake-burger-spt-intl.json',
 'aaron-judge-opening-day-home-run-giants-spt-intl.json',
 'aaron-rodgers-darkness-retreat-to-start-intl-spt.json',
 'aaron-rodgers-decision-darkness-spt-intl.json',
 'aaron-rodgers-introduced-new-york-jets-spt-intl.json',
 'aaron-rodgers-joe-douglas-new-york-jets-nfl-spt-intl.json',
 'aaron-rodgers-new-york-jets-spt-intl.json',
 'aaron-rodgers-nfl-future-darkness-intl-spt.json',
 'aaron-rodgers-out-for-2023-season-injury-spt-intl.json',
 'aaron-rodgers-packers-jets-nfl-spt-intl.json',
 'aaron-rodgers-pebble-beach

In [96]:
articles = dict()

In [97]:
for filename in ldir:
    with open(f'nouns/{filename}') as f:
        articles[filename[:-5]] = json.load(f)

In [98]:
articles

{'10-year-old-ukraine-boy-asleep-when-russia-drone-strike-intl-hnk': ['boy',
  'strike',
  'city',
  'kharkiv',
  'friday',
  'picture',
  'show',
  'body',
  'defense',
  'show',
  'body',
  'blanket',
  'dust',
  'body',
  'morning',
  'kharkiv',
  'boy',
  'ministry',
  'iskander',
  'missile',
  'reach',
  'kharkiv',
  'border',
  'city',
  'grandmother',
  'brother',
  'state',
  'administration',
  'chief',
  'president',
  'volodymyr',
  'address',
  'brother',
  'russia',
  'result',
  'morning',
  'october',
  'severity',
  'prosecutor',
  'attack',
  'kharkiv',
  'day',
  'missile',
  'strike',
  'village',
  'hroza',
  'thursday',
  'child',
  'hroza',
  'village',
  'region',
  'grocery',
  'store',
  'cafe',
  'lady',
  'olena',
  'attack',
  'cruel',
  'sense',
  'war',
  'federation',
  'minister',
  'dmitro',
  'kuleba',
  'support',
  'war',
  'russia',
  'strike'],
 '1000-russian-tanks-destroyed-ukraine-war-intl-hnk-ml': ['russia',
  'tank',
  'fleet',
  'war',
  'inf

In [99]:
noun_frequency = dict()

In [100]:
for article_nouns in articles.values():
    for noun in set(article_nouns):
        if not noun_frequency.get(noun):
            noun_frequency[noun] = 1
        else:
            noun_frequency[noun] += 1

In [101]:
# How many word-articles count dictionary
noun_frequency

{'body': 333,
 'iskander': 13,
 'brother': 109,
 'strike': 190,
 'support': 621,
 'state': 750,
 'volodymyr': 175,
 'dust': 19,
 'friday': 307,
 'border': 148,
 'russia': 504,
 'war': 607,
 'federation': 192,
 'kuleba': 15,
 'picture': 104,
 'result': 275,
 'grocery': 7,
 'store': 32,
 'day': 800,
 'ministry': 350,
 'sense': 134,
 'administration': 130,
 'chief': 186,
 'prosecutor': 90,
 'olena': 6,
 'reach': 183,
 'morning': 267,
 'blanket': 11,
 'cruel': 19,
 'kharkiv': 37,
 'defense': 499,
 'address': 105,
 'hroza': 6,
 'child': 109,
 'region': 330,
 'lady': 20,
 'dmitro': 1,
 'boy': 57,
 'missile': 120,
 'president': 864,
 'severity': 19,
 'grandmother': 16,
 'october': 80,
 'attack': 301,
 'show': 269,
 'city': 740,
 'thursday': 340,
 'cafe': 8,
 'minister': 584,
 'village': 121,
 'information': 268,
 'evidence': 225,
 'round': 312,
 'group': 646,
 'fleet': 46,
 'half': 273,
 'replacement': 19,
 'fighting': 52,
 'blast': 37,
 'moscow': 239,
 'source': 138,
 'storage': 27,
 'monito

In [102]:
# Total amount of nouns identified in all the articles
len(noun_frequency)

21676

In [103]:
word_dict_occ = dict()
for (key, value) in noun_frequency.items():
    if value >= 20:
        word_dict_occ[key] = value
word_dict = word_dict_occ

In [104]:
# Total amount of nouns in at least 20 articles
len(word_dict)

2035

Words that can be found in at least 20 articles

In [105]:
words = np.array(list(word_dict.keys()))

In [106]:
words

array(['body', 'brother', 'strike', ..., 'braverman', 'francis',
       'gerasimov'], dtype='<U16')

In [107]:
np.save('words.npy', words)

### Compute word histogram for articles ###

In [108]:
zero_histogram = {word: 0 for word in word_dict.keys()}
corpus = dict()

for article_name, article_nouns in articles.items():
    histogram = zero_histogram.copy()
    for word_element in words:
        histogram[word_element] = article_nouns.count(word_element)
    corpus[article_name] = histogram

In [109]:
with open('corpus.json', 'w', encoding='utf-8') as file:
    json.dump(corpus, file, indent=2)

### Process corpus as numpy array ###

In [110]:
corpus = None

In [111]:
with open('corpus.json', 'r') as file:
    corpus = json.load(file)

In [112]:
array_corpus = np.array([[freq for freq in article_nouns.values()] for article_nouns in corpus.values()])

In [113]:
np.save('corpus.npy', array_corpus)

In [114]:
with open('artice_names.json', 'w', encoding='utf-8') as f:
    json.dump(list(corpus.keys()), f)