In [13]:
%pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [1]:
from lxml import etree
from pathlib import Path

In [2]:
# Collect every English Perseus edition (TEI XML) below the tlg0012 directory.
# glob() hands paths back in directory-listing order, which is not guaranteed
# to be the same between runs or machines, so sort them to keep the
# processing order (and the printed output) reproducible.
files = sorted(Path("tlg0012").glob("./**/*perseus-eng*.xml"))

In [3]:
# Namespace URIs used when querying the TEI XML files.
TEI_NS = "http://www.tei-c.org/ns/1.0"
XML_NS = "http://www.w3.org/XML/1998/namespace"

# Prefix -> URI mapping passed as `namespaces=` to XPath queries.
NAMESPACES = dict(tei=TEI_NS, xml=XML_NS)

In [4]:
# For each TEI file, pull every text node found under a <div subtype="card">
# and save the non-blank ones, one per line, to a .txt file in the current
# working directory named after the XML file.
for file in files:
    print(file)
    tree = etree.parse(file)
    text = tree.xpath("//tei:div[@subtype='card']//text()", namespaces=NAMESPACES)

    # Discard text nodes that are empty or whitespace-only.
    cleaned_text = [t for t in text if t.strip() != ""]

    # Files without any card text produce no output file at all.
    if len(cleaned_text) > 0:
        # Let pathlib derive the output name: it works with any path separator
        # and swaps only the final ".xml" suffix, unlike split("/") + replace().
        out_name = file.with_suffix(".txt").name

        # The translations contain non-ASCII characters (e.g. em dashes), so
        # write UTF-8 explicitly instead of relying on the platform default.
        with open(out_name, "w", encoding="utf-8") as f:
            f.write('\n'.join(cleaned_text))
    

tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml
tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml
tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml
tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml
tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml


In [5]:
from collections import Counter

# Plain-text editions previously written to the current working directory.
text_files = [txt_path for txt_path in Path(".").glob("tlg0012.tlg00*.perseus-eng*.txt")]

# File name -> Counter of that file's lowercased, whitespace-separated words.
counts = {
    str(txt_path): Counter(txt_path.read_text().lower().split())
    for txt_path in text_files
}


In [6]:
term = 'odysseus'

# Document frequency: how many of the per-file Counters have `term` as a key.
# (The variable is called df_ulysses although the term counted is 'odysseus'.)
df_ulysses = sum(1 for els in counts.values() if term in els)

df_ulysses

4

In [7]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [8]:
import nltk

# Download the "punkt" and "punkt_tab" NLTK data packages (skipped by NLTK
# when they are already up to date). Presumably these are the resources that
# word_tokenize relies on in the tokenization cell -- TODO confirm in NLTK docs.
nltk.download("punkt")
# Last expression of the cell: its return value is what the notebook displays.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [9]:
from nltk.tokenize import word_tokenize

# File name -> list of tokens produced by NLTK's word_tokenize for that file.
tokenized_texts = {}

# Only editions numbered eng1 through eng4 are picked up by this pattern.
text_files = Path(".").glob("tlg0012.tlg00*.perseus-eng[1-4].txt")

for txt_path in text_files:
    name = str(txt_path)

    # Lowercase the whole text first so that tokens are case-insensitive.
    tokens = word_tokenize(txt_path.read_text().lower())

    print(f"There are {len(tokens)} tokens in {name}.")

    tokenized_texts[name] = tokens


There are 200630 tokens in tlg0012.tlg001.perseus-eng3.txt.
There are 135463 tokens in tlg0012.tlg002.perseus-eng4.txt.
There are 175611 tokens in tlg0012.tlg001.perseus-eng4.txt.
There are 152631 tokens in tlg0012.tlg002.perseus-eng3.txt.


In [10]:
from collections import Counter

# Using our `tokenized_texts` dictionary, we'll iterate
# through each key-value pair — remember, the keys are
# filenames and the values are lists of tokens.
# We'll get a count of the tokens by passing the list to
# `Counter`, then we'll change the value for that key to
# a dictionary with its own keys, `tokens` and `counts`.
#
# The cell is safe to run more than once: if a value has
# already been converted to a dictionary, its original token
# list is reused instead of counting the dictionary itself.
# The Counter is not stored in a variable named `counts`, so
# the per-file `counts` dictionary built earlier in the
# notebook is left untouched.

for filename, entry in tokenized_texts.items():
    tokens = entry["tokens"] if isinstance(entry, dict) else entry

    # Replacing the value of an existing key while iterating is safe;
    # only adding or removing keys would invalidate the iteration.
    tokenized_texts[filename] = {"tokens": tokens, "counts": Counter(tokens)}

In [11]:
from math import log10

# Document frequency (DF): number of documents whose counts contain the term.
df_achilles = sum(
    1 for values in tokenized_texts.values() if "achilles" in values["counts"]
)
df_odysseus = sum(
    1 for values in tokenized_texts.values() if "odysseus" in values['counts']
)

# N: total number of documents in the corpus.
n_docs = len(tokenized_texts.keys())

# Inverse document frequency: log10(N / DF). A term found in every document
# gives log10(1) == 0.
idf_achilles = log10(n_docs / df_achilles)
idf_odysseus = log10(n_docs / df_odysseus)

print(idf_achilles)

0.0


In [12]:
# Report, for every document, the term frequency (TF) and TF-IDF of the two
# names. `filename`, `values` and `total_terms` keep their names because
# later cells read them after this loop finishes.
for filename in tokenized_texts:
    values = tokenized_texts[filename]
    term_counts = values['counts']
    total_terms = len(values['tokens'])

    # TF: occurrences of the term divided by the document's token count.
    tf_achilles = term_counts['achilles'] / total_terms
    tf_odysseus = term_counts['odysseus'] / total_terms

    # TF-IDF: the document-level TF weighted by the corpus-level IDF.
    tf_idf_achilles = tf_achilles * idf_achilles
    tf_idf_odysseus = tf_odysseus * idf_odysseus

    report = f"""In {filename}:
TF of achilles: {tf_achilles}
TF of odysseus: {tf_odysseus}
TF-IDF of achilles: {tf_idf_achilles}
TF-IDF of odysseus: {tf_idf_odysseus}
"""
    print(report)

In tlg0012.tlg001.perseus-eng3.txt:
TF of achilles: 0.002043562777251657
TF of odysseus: 0.000637990330459054
TF-IDF of achilles: 0.0
TF-IDF of odysseus: 0.0

In tlg0012.tlg002.perseus-eng4.txt:
TF of achilles: 0.0001254955227626732
TF of odysseus: 0.0042816119530794386
TF-IDF of achilles: 0.0
TF-IDF of odysseus: 0.0

In tlg0012.tlg001.perseus-eng4.txt:
TF of achilles: 0.002403038534032606
TF of odysseus: 0.0007061061095261686
TF-IDF of achilles: 0.0
TF-IDF of odysseus: 0.0

In tlg0012.tlg002.perseus-eng3.txt:
TF of achilles: 0.0001048279838302835
TF of odysseus: 0.0041603606082643765
TF-IDF of achilles: 0.0
TF-IDF of odysseus: 0.0



In [14]:
# Small demonstration: a list may hold duplicates, a set keeps each value once.
my_list = [1, 1, 2, 3, 3]

unique_values = set(my_list)
unique_values

{1, 2, 3}

In [15]:
# File name -> set of tokens that occur in that file and in none of the others.
non_universal_terms = {}

for filename, values in tokenized_texts.items():
    # Vocabularies (Counter keys) of every document except the current one.
    other_vocabularies = [
        other_values['counts'].keys()
        for other_file, other_values in tokenized_texts.items()
        if other_file != filename
    ]

    # Remove from this document's vocabulary anything seen in another document.
    non_universal_terms[filename] = set(values['counts'].keys()).difference(
        *other_vocabularies
    )

non_universal_terms

{'tlg0012.tlg001.perseus-eng3.txt': {'draggeth',
  'upbare',
  'apportion',
  'midmost',
  'reviler',
  'son—he',
  'chiefly',
  'over-powering',
  'alcmaon',
  'heaped-up',
  'rum',
  'tumultuously',
  'raze',
  'ln',
  'overcometh',
  'transgresseth',
  'eteocles',
  'switft-footed',
  'ones—even',
  'speweth',
  'hecabe',
  '315.1',
  'swiftships',
  'breaketh',
  'oncoming',
  'waging',
  'centre',
  'asclepius',
  '13.1',
  'astypylus',
  'fire-dogs',
  'spurted',
  'rambling',
  'hike',
  'counteth',
  '—no',
  'prevail—thereon',
  'endlessly',
  'them—even',
  'gorged',
  'bethinking',
  'layeth',
  'ox-goad',
  'maeonian',
  'evippus',
  'lifetime',
  'half-divine—of',
  'company.',
  'discern',
  'coon',
  'avoweth',
  'unmarked',
  'harry',
  'vuhtures',
  'iphinous',
  'seemeth',
  'sinneth',
  'stratia',
  'chid',
  'lyres',
  'dawned',
  'raiseth',
  'pheme',
  'loud-lowing',
  'pike',
  'man-slaying',
  'bottom-land',
  'breathing-space',
  'presumptuous',
  'cherishes',


In [34]:
# Document frequency (DF): number of documents whose counts contain "clymenus".
df_clymenus = 0

for filename, values in tokenized_texts.items():
    if "clymenus" in values["counts"]:
        df_clymenus += 1

if df_clymenus == 0:
    # log10(N / 0) would raise ZeroDivisionError, and there is no TF to report.
    print("clymenus does not occur in any document.")
else:
    # IDF is a corpus-level value. N is counted here rather than read from a
    # variable left behind by an earlier cell.
    idf_clymenus = log10(len(tokenized_texts) / df_clymenus)

    # TF and TF-IDF are per-document values, so compute them inside a loop
    # for each document that contains the term. Previously they were computed
    # once after the loop, from whatever `values`, `filename` and
    # `total_terms` happened to be left over from the last iteration.
    for filename, values in tokenized_texts.items():
        if "clymenus" not in values["counts"]:
            continue

        total_terms = len(values["tokens"])
        tf_clymenus = values["counts"]["clymenus"] / total_terms
        tf_idf_clymenus = tf_clymenus * idf_clymenus

        print(f"""In {filename}:
TF of clymenus: {tf_clymenus}
TF-IDF of clymenus: {tf_idf_clymenus}
""")

In tlg0012.tlg002.perseus-eng3.txt:
TF of clymenus: 6.551748989392718e-06
TF-IDF of clymenus: 3.944545939736766e-06



In [36]:
# Document frequency (DF): number of documents whose counts contain "irus".
df_irus = 0

for filename, values in tokenized_texts.items():
    if "irus" in values["counts"]:
        df_irus += 1

if df_irus == 0:
    # log10(N / 0) would raise ZeroDivisionError, and there is no TF to report.
    print("irus does not occur in any document.")
else:
    # IDF is a corpus-level value. N is counted here rather than read from a
    # variable left behind by an earlier cell.
    idf_irus = log10(len(tokenized_texts) / df_irus)

    # TF and TF-IDF are per-document values, so compute them inside a loop
    # for each document that contains the term. Previously they were computed
    # once after the loop, from whatever `values`, `filename` and
    # `total_terms` happened to be left over from the last iteration.
    for filename, values in tokenized_texts.items():
        if "irus" not in values["counts"]:
            continue

        total_terms = len(values["tokens"])
        tf_irus = values["counts"]["irus"] / total_terms
        tf_idf_irus = tf_irus * idf_irus

        print(f"""In {filename}:
TF of irus: {tf_irus}
TF-IDF of irus: {tf_idf_irus}
""")

In tlg0012.tlg002.perseus-eng3.txt:
TF of irus: 7.862098787271262e-05
TF-IDF of irus: 4.73345512768412e-05



### TF-IDF questions

In the sets that we created, I noticed that there were still a few errors with the tokenization. Some words joined by dashes were kept as single tokens, like “son—he” and “me—my,” and there were also a few tokens with trailing periods (like “fortune.” and “much.”). I was able to test the TF and TF-IDF of some of the unique names mentioned in the documents, like Clymenus and Irus. In this case, TF-IDF weighs how important each term is in a document in comparison to the entire corpus. Higher values show that a word is more distinctive, because it is used in that document but appears in few of the other documents. Lower values (or values of 0, like we saw for “Achilles” and “Odysseus”) occur when the term is less distinctive, because it appears in more of the documents; a term that appears in every document has an IDF of log10(1) = 0, so its TF-IDF is 0 no matter how often it is used. The names “Clymenus” and “Irus,” even though they did not have particularly high term frequencies, both had higher TF-IDF values because they only appeared in one of the documents.

Finding the terms unique to each document and their TF-IDF scores could be a useful way to pick out “keywords” and get an idea of what each text in a corpus is about, without picking up the words that are common to all of the documents. It might also be helpful for identifying which documents are the most similar by comparing the relative scores of each term, which could be useful for dividing a large corpus of documents into different categories based on the keywords or themes they share. The reading from Prasoon Singh also mentioned that TF-IDF is one of the most common calculations used for text-based recommender systems in digital libraries. I can imagine this would be useful when searching for academic articles on a specific topic – if there are words that are uniquely common to the article you are reading, the system can recommend other texts that use similar terms without capturing terms that occur frequently in the entire corpus of articles.