# Build Stoplist - Top 50 Words

This notebook builds a stoplist by analyzing word frequencies across documents in the `docs` directory.


In [1]:
import pathlib, re, collections

DOCS = pathlib.Path("docs")
OUT = pathlib.Path("stoplist_top50.txt")

TOKEN_SPLIT = re.compile(r"[^A-Za-z]+")


In [2]:
def build_stoplist_top50():
    counter = collections.Counter()

    files = sorted(DOCS.glob("*.txt"))
    if not files:
        raise SystemExit("No files found in ./docs. Run fetch_wiki_docs.py first.")

    for p in files:
        txt = p.read_text(encoding="utf-8").lower()
        # drop header line "# TITLE: ..."
        txt = re.sub(r"^# title: .*?\n\n", "", txt, flags=re.IGNORECASE)
        tokens = [t for t in TOKEN_SPLIT.split(txt) if t and (len(t) > 1 or t in ("a", "i"))]
        counter.update(tokens)

    top50 = counter.most_common(50)

    # save
    with OUT.open("w", encoding="utf-8") as f:
        f.write("Top 50 words (word\tcount)\n")
        for w, c in top50:
            f.write(f"{w}\t{c}\n")
    
    return top50


In [3]:
# Run the function to build the stoplist
top50 = build_stoplist_top50()
print(f"Saved: {OUT.resolve()}")


Saved: /Users/ramidaood/UNI/אחזור_1/stoplist_top50.txt


In [4]:
# Generate and display markdown table
from IPython.display import Markdown, display

markdown_table = "## Top 50 Words\n\n| Rank | Word | Count |\n|------|------|-------|\n"
for i, (w, c) in enumerate(top50, 1):
    markdown_table += f"| {i} | {w} | {c} |\n"

display(Markdown(markdown_table))


## Top 50 Words

| Rank | Word | Count |
|------|------|-------|
| 1 | the | 391 |
| 2 | of | 208 |
| 3 | and | 164 |
| 4 | in | 156 |
| 5 | a | 135 |
| 6 | on | 114 |
| 7 | to | 87 |
| 8 | was | 80 |
| 9 | retrieved | 79 |
| 10 | with | 74 |
| 11 | you | 67 |
| 12 | for | 54 |
| 13 | orlandi | 49 |
| 14 | by | 48 |
| 15 | county | 47 |
| 16 | is | 46 |
| 17 | otto | 43 |
| 18 | from | 42 |
| 19 | at | 36 |
| 20 | music | 36 |
| 21 | it | 35 |
| 22 | were | 33 |
| 23 | as | 32 |
| 24 | his | 32 |
| 25 | dance | 29 |
| 26 | chaplin | 29 |
| 27 | route | 25 |
| 28 | census | 24 |
| 29 | so | 24 |
| 30 | an | 21 |
| 31 | can | 21 |
| 32 | or | 20 |
| 33 | espa | 20 |
| 34 | single | 19 |
| 35 | washington | 19 |
| 36 | season | 19 |
| 37 | library | 18 |
| 38 | we | 18 |
| 39 | rave | 18 |
| 40 | july | 17 |
| 41 | not | 17 |
| 42 | de | 17 |
| 43 | government | 17 |
| 44 | archived | 17 |
| 45 | original | 17 |
| 46 | beatport | 17 |
| 47 | think | 17 |
| 48 | state | 16 |
| 49 | has | 16 |
| 50 | columbia | 16 |
