In [1]:
import spacy
import networkx as nx
import matplotlib.pyplot as plt
from pathlib import Path

# load spaCy model (NER-capable)
nlp = spacy.load("en_core_web_sm")

print("spaCy OK:", spacy.__version__)
print("NetworkX OK:", nx.__version__)


  import pkg_resources


spaCy OK: 3.4.3
NetworkX OK: 2.8.8


In [2]:
# Path sanity check (optional)
import os
print("CWD:", os.getcwd())

# Load the text
text_path = Path("20th_century_events.txt")
assert text_path.exists(), "Couldn't find 20th_century_events.txt in this folder."
text = text_path.read_text(encoding="utf-8")

print("Characters in text:", len(text))
print(text[:600])  # quick preview


CWD: /Users/renatabatista/Other Docs/Germany/CareerFoundry/Data Specialization/JupyterLab/20th-century
Characters in text: 96901
This is a timeline of the 20th century .
1900s
1901
January 1 : The Australian colonies federate .
January 22 : Edward VII became King of England and India after Queen Victoria 's death.
March 2 : The Platt Amendment provides for Cuban independence in exchange for the withdrawal of American troops.
June : Emily Hobhouse reports on the poor conditions in 45 British internment camps for Boer women and children in South Africa .
September 6 : The assassination of William McKinley ushered in office Vice President Theodore Roosevelt after McKinley's death on September 14.
September 7 : The Eight-Na


In [3]:
import re, unicodedata
from collections import Counter
from pathlib import Path

src = Path("20th_century_events.txt")
text = src.read_text(encoding="utf-8")

# Non-ASCII characters present?
non_ascii = sorted({ch for ch in text if ord(ch) > 127})
print("Unique non-ASCII chars:", non_ascii[:50])
print("Count non-ASCII:", sum(ord(ch)>127 for ch in text))

# Show the 15 most common weird whitespace/hyphen characters, if any
odd_chars = ['\u00A0','\u2009','\u2010','\u2011','\u2013','\u2014','\u2018','\u2019','\u201C','\u201D','\u2022','\u2212']
present = [(repr(c), text.count(c)) for c in odd_chars if c in text]
print("Odd char counts:", present)

# Strip refs like [1], [2]...
refs = re.findall(r"\[\d+\]", text)
print("Reference markers found:", len(refs))

# Sample of sentences that include years (quick sanity peek)
print("\nSample lines:")
for line in text.splitlines()[:8]:
    print(" •", line)


Unique non-ASCII chars: ['½', 'É', 'Ö', 'Ø', 'á', 'â', 'é', 'í', 'ó', 'ö', 'ú', 'ü', 'ć', 'ę', 'ł', 'ń', 'ō', 'š', 'ș', '–']
Count non-ASCII: 243
Odd char counts: [("'–'", 193)]
Reference markers found: 0

Sample lines:
 • This is a timeline of the 20th century .
 • 1900s
 • 1901
 • January 1 : The Australian colonies federate .
 • January 22 : Edward VII became King of England and India after Queen Victoria 's death.
 • March 2 : The Platt Amendment provides for Cuban independence in exchange for the withdrawal of American troops.
 • June : Emily Hobhouse reports on the poor conditions in 45 British internment camps for Boer women and children in South Africa .
 • September 6 : The assassination of William McKinley ushered in office Vice President Theodore Roosevelt after McKinley's death on September 14.


In [4]:
import re

def clean_text(s: str) -> str:
    # Normalize unicode to NFC
    s = unicodedata.normalize("NFC", s)

    # Replace common typography with ASCII equivalents
    replacements = {
        "\u2018":"'", "\u2019":"'", "\u201C":'"', "\u201D":'"',
        "\u2013":"-",  "\u2014":"-",  "\u2212":"-",
        "\u00A0":" ",  "\u2009":" ",  "\u2010":"-", "\u2011":"-",
        "\u2022":"•",  # keep bullets but normalize
    }
    for k,v in replacements.items():
        s = s.replace(k,v)

    # Remove citation markers like [12], [a], [clarification needed]
    s = re.sub(r"\[[^\]]*?\]", "", s)

    # Collapse multiple spaces/newlines
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)

    # Trim spaces before punctuation
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)

    # Ensure year headings like "1901" stand alone (optional formatting)
    s = re.sub(r"\n(\d{4})(?!\d)", r"\n\1", s)

    return s.strip()

clean = clean_text(text)

print("Before chars:", len(text), "After chars:", len(clean))
print("Preview:\n", clean[:600])


Before chars: 96901 After chars: 95108
Preview:
 This is a timeline of the 20th century.
1900s
1901
January 1: The Australian colonies federate.
January 22: Edward VII became King of England and India after Queen Victoria 's death.
March 2: The Platt Amendment provides for Cuban independence in exchange for the withdrawal of American troops.
June: Emily Hobhouse reports on the poor conditions in 45 British internment camps for Boer women and children in South Africa.
September 6: The assassination of William McKinley ushered in office Vice President Theodore Roosevelt after McKinley's death on September 14.
September 7: The Eight-Nation Alli


After run a quick diagnosis and check for odd characters,I cleaned the text and preserved the mean.

In [5]:
# Minimal example: extend with your existing alias dict from earlier
countries = [
    "united states","united kingdom","germany","france","italy","spain",
    "russia","soviet union","china","japan","india","canada","australia",
    "poland","austria","hungary","netherlands","belgium","switzerland",
    "sweden","norway","denmark","finland","greece","portugal","ireland",
    "czechoslovakia","yugoslavia","turkey","egypt","iran","iraq","israel",
    "mexico","brazil","argentina","chile","south africa","pakistan",
    "south korea","north korea","vietnam","indonesia","philippines","thailand"
]

aliases = {
    "usa":"united states","u.s.":"united states","u.s.a.":"united states","america":"united states",
    "uk":"united kingdom","u.k.":"united kingdom","britain":"united kingdom","great britain":"united kingdom","england":"united kingdom",
    "ussr":"soviet union","u.s.s.r.":"soviet union","union of soviet socialist republics":"soviet union",
    "west germany":"germany","east germany":"germany","frg":"germany","gdr":"germany",
    "prc":"china","people's republic of china":"china","roc":"china","republic of china":"china",
    "ottoman empire":"turkey","ottoman":"turkey",
    "siam":"thailand","burma":"myanmar","ceylon":"sri lanka","holland":"netherlands",
}

# normalize once
clean_lc = clean.lower()

# replace aliases first (longest first avoids partial overlaps)
for alias, canon in sorted(aliases.items(), key=lambda kv: len(kv[0]), reverse=True):
    clean_lc = re.sub(rf"\b{re.escape(alias)}\b", canon, clean_lc, flags=re.IGNORECASE)

# show which canonicals are present at least once
present = [c for c in countries if re.search(rf"\b{re.escape(c)}\b", clean_lc)]
missing  = [c for c in countries if c not in present]

print("Countries found:", len(present))
print(sorted(present)[:20], "...")
print("Countries not found (check aliases/casing):", missing[:20])


Countries found: 44
['argentina', 'australia', 'austria', 'belgium', 'brazil', 'canada', 'chile', 'china', 'czechoslovakia', 'denmark', 'egypt', 'finland', 'france', 'germany', 'greece', 'hungary', 'india', 'indonesia', 'iran', 'iraq'] ...
Countries not found (check aliases/casing): ['yugoslavia']


In [6]:
out = Path("20th_century_events_clean.txt")
out.write_text(clean, encoding="utf-8")
print("Saved:", out.resolve(), "| size:", out.stat().st_size, "bytes")


Saved: /Users/renatabatista/Other Docs/Germany/CareerFoundry/Data Specialization/JupyterLab/20th-century/20th_century_events_clean.txt | size: 95158 bytes


Data wrangling observations

- The raw Wikipedia text contained typographic quotes (“ ” ‘ ’), en/em dashes (– —), and non-breaking spaces. It also included citation markers like [12].
- I normalized the text to ASCII-friendly forms (quotes → ' / ", dashes → -), removed citation markers, and collapsed excess whitespace.
- Country names in the article appear under multiple variants (e.g., “UK”, “Britain”, “Great Britain”, “U.S.”, “America”, “USSR”). I normalized the text by replacing aliases with canonical names (e.g., uk → united kingdom, ussr → soviet union).
- After cleaning + alias replacement, I verified which countries from my lookup were present and added a few extra aliases where needed.

Outcome: 
the cleaned file 20th_century_events_clean.txt is now consistent, free of distracting typography/refs, and aligned with the canonical country names I’ll use for NER and the network graph.

In [7]:
import spacy
from pathlib import Path

# Use the small English model (already installed earlier)
nlp = spacy.load("en_core_web_sm")

# Read your cleaned file (use the raw file if you prefer)
text_path = Path("20th_century_events_clean.txt")  # or "20th_century_events.txt"
assert text_path.exists(), f"Missing file: {text_path.resolve()}"
text = text_path.read_text(encoding="utf-8")

len(text), text[:300]


(95108,
 "This is a timeline of the 20th century.\n1900s\n1901\nJanuary 1: The Australian colonies federate.\nJanuary 22: Edward VII became King of England and India after Queen Victoria 's death.\nMarch 2: The Platt Amendment provides for Cuban independence in exchange for the withdrawal of American troops.\nJune:")

In [8]:
# Create the spaCy Doc (this runs tokenization, POS, NER, etc.)
doc = nlp(text)

type(doc), len(doc)



(spacy.tokens.doc.Doc, 19720)

In [9]:
with nlp.select_pipes(enable=["ner"]):
    doc = nlp(text)


In [10]:
# Show a few entities with labels
[(ent.text, ent.label_) for ent in list(doc.ents)[:25]]


[('the 20th century', 'DATE'),
 ('1900s\n1901\nJanuary 1', 'DATE'),
 ('Australian', 'NORP'),
 ('January 22', 'DATE'),
 ('King of England', 'ORG'),
 ('India', 'GPE'),
 ("Queen Victoria 's", 'PERSON'),
 ('March 2', 'DATE'),
 ('Cuban', 'NORP'),
 ('American', 'NORP'),
 ('June', 'DATE'),
 ('45', 'CARDINAL'),
 ('British', 'NORP'),
 ('South Africa', 'GPE'),
 ('September 6', 'DATE'),
 ('William McKinley', 'PERSON'),
 ('Theodore Roosevelt', 'PERSON'),
 ('McKinley', 'GPE'),
 ('September 14', 'DATE'),
 ('September 7', 'DATE'),
 ('Eight', 'CARDINAL'),
 ('the Boxer Rebellion', 'ORG'),
 ('China', 'GPE'),
 ('December 10', 'DATE'),
 ('December 12', 'DATE')]

In [11]:
country_like = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in {"GPE","LOC","NORP"}]
len(country_like), country_like[:25]


(1146,
 [('Australian', 'NORP'),
  ('India', 'GPE'),
  ('Cuban', 'NORP'),
  ('American', 'NORP'),
  ('British', 'NORP'),
  ('South Africa', 'GPE'),
  ('McKinley', 'GPE'),
  ('China', 'GPE'),
  ('Cuba', 'GPE'),
  ('the United States', 'GPE'),
  ('British', 'NORP'),
  ('the United Kingdom', 'GPE'),
  ('Venezuelan', 'NORP'),
  ('Britain', 'GPE'),
  ('Germany', 'GPE'),
  ('Italy', 'GPE'),
  ('Venezuela', 'GPE'),
  ('Serbia', 'GPE'),
  ('Russia', 'GPE'),
  ('Bolsheviks', 'GPE'),
  ('Mensheviks', 'NORP'),
  ('Panama', 'GPE'),
  ('the United States', 'GPE'),
  ('Panama', 'GPE'),
  ('The Ottoman Empire', 'GPE')])

Focus on countries like entities. spaCy marks country/place names mostly as:
- GPE (countries/cities)
- LOC (locations)
- NORP (nationalities/religious/political groups: “French”, “Soviets”)

In [13]:
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Add a sentencizer if not already in the pipeline
if "senter" not in nlp.pipe_names and "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

# Re-run NER on your text
with open("20th_century_events.txt", "r", encoding="utf-8") as f:
    text = f.read()

doc = nlp(text)

# Ensure sentences are now available
sents = list(doc.sents)
print("Number of sentences:", len(sents))
print("Example sentence:", sents[0].text)

# Extract GPE / LOC / NORP mentions
sent_entities = []
for i, s in enumerate(sents):
    ents = {ent.text for ent in s.ents if ent.label_ in {"GPE", "LOC", "NORP"}}
    if ents:
        sent_entities.append({
            "sent_id": i,
            "sentence": s.text,
            "entities": list(ents)
        })

sent_entities[:3]  # preview first 3


Number of sentences: 1003
Example sentence: This is a timeline of the 20th century .



[{'sent_id': 1,
  'sentence': '1900s\n1901\nJanuary 1 : The Australian colonies federate .\n',
  'entities': ['Australian']},
 {'sent_id': 2,
  'sentence': "January 22 : Edward VII became King of England and India after Queen Victoria 's death.\n",
  'entities': ['India']},
 {'sent_id': 3,
  'sentence': 'March 2 : The Platt Amendment provides for Cuban independence in exchange for the withdrawal of American troops.\n',
  'entities': ['Cuban', 'American']}]

## Entities into a DF

In [14]:
import pandas as pd

ner_df = pd.DataFrame(sent_entities)  # list of dicts: sent_id, sentence, entities(list)
ner_df.head(10)
print("Rows with at least one entity:", len(ner_df))


Rows with at least one entity: 633


In [15]:
ner_long = (
    ner_df
    .explode("entities", ignore_index=True)
    .rename(columns={"entities": "entity"})
)
ner_long.head(10)
print("Total entity mentions (sentence-level):", len(ner_long))


Total entity mentions (sentence-level): 1144


In [16]:
from itertools import combinations

# helper: unique, case-normalized entities per sentence (avoid duplicates like "UK" and "United Kingdom" if you normalized earlier)
def norm(x): 
    return x.strip()

pairs = []
for row in ner_df.itertuples(index=False):
    ents = sorted({norm(e) for e in row.entities})
    if len(ents) >= 2:
        pairs.extend(list(combinations(ents, 2)))  # unordered pairs within the sentence

edges_df = (
    pd.DataFrame(pairs, columns=["src", "dst"])
    .value_counts()
    .reset_index(name="weight")
    .sort_values("weight", ascending=False)
)
edges_df.head(15)
print("Unique edges:", len(edges_df))


Unique edges: 827


In [17]:
# Example: keep only entities that match your canonical set
canonical = set([
    "united states","united kingdom","germany","france","italy","spain",
    "russia","soviet union","china","japan","india","canada","australia",
    "poland","austria","hungary","netherlands","belgium","switzerland",
    "sweden","norway","denmark","finland","greece","portugal","ireland",
    "czechoslovakia","yugoslavia","turkey","egypt","iran","iraq","israel",
    "mexico","brazil","argentina","chile","south africa","pakistan",
    "south korea","north korea","vietnam","indonesia","philippines","thailand"
])

# Lowercase columns and filter
edges_df_lc = edges_df.assign(
    src=edges_df["src"].str.lower(),
    dst=edges_df["dst"].str.lower()
)
edges_countries = edges_df_lc[
    edges_df_lc["src"].isin(canonical) & edges_df_lc["dst"].isin(canonical)
].copy()

edges_countries.sort_values("weight", ascending=False).head(15)
print("Country-only edges:", len(edges_countries))


Country-only edges: 46


In [18]:
ner_df.to_csv("ner_sentences.csv", index=False)
ner_long.to_csv("ner_mentions_long.csv", index=False)
edges_df.to_csv("entity_cooccurrence_edges.csv", index=False)
edges_countries.to_csv("country_cooccurrence_edges.csv", index=False)

print("Saved: ner_sentences.csv, ner_mentions_long.csv, entity_cooccurrence_edges.csv, country_cooccurrence_edges.csv")


Saved: ner_sentences.csv, ner_mentions_long.csv, entity_cooccurrence_edges.csv, country_cooccurrence_edges.csv


In [19]:
import re
import spacy
from itertools import combinations
import pandas as pd

# Ensure sentencizer exists so doc.sents works
if "senter" not in nlp.pipe_names and "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

# ---- canonical list (lowercase) ----
countries = [
    "united states","united kingdom","germany","france","italy","spain",
    "russia","soviet union","china","japan","india","canada","australia",
    "poland","austria","hungary","netherlands","belgium","switzerland",
    "sweden","norway","denmark","finland","greece","portugal","ireland",
    "czechoslovakia","yugoslavia","turkey","egypt","iran","iraq","israel",
    "mexico","brazil","argentina","chile","south africa","pakistan",
    "south korea","north korea","vietnam","indonesia","philippines","thailand"
]
countries_set = set(countries)

# ---- aliases -> canonical (all lowercase) ----
aliases = {
    "usa":"united states","u.s.":"united states","u.s.a.":"united states","america":"united states",
    "uk":"united kingdom","u.k.":"united kingdom","britain":"united kingdom","great britain":"united kingdom","england":"united kingdom",
    "ussr":"soviet union","u.s.s.r.":"soviet union","union of soviet socialist republics":"soviet union",
    "west germany":"germany","east germany":"germany","frg":"germany","gdr":"germany",
    "prc":"china","people's republic of china":"china","roc":"china","republic of china":"china",
    "ottoman empire":"turkey","ottoman":"turkey",
    "siam":"thailand","holland":"netherlands",
}

# Normalize an entity string to its canonical country (or None if not a country)
def canon_country(s: str) -> str | None:
    x = s.strip().lower()
    # replace aliases first (longest aliases first to avoid partial overlaps)
    for alias, canon in sorted(aliases.items(), key=lambda kv: len(kv[0]), reverse=True):
        if re.fullmatch(rf"{re.escape(alias)}", x):
            x = aliases[alias]
            break
    return x if x in countries_set else None


In [20]:
# Sentence list
sents = list(doc.sents)

# For each sentence, keep only named entities of type GPE/LOC/NORP (place-like)
sent_entities = []
for i, s in enumerate(sents):
    ents = [ent.text for ent in s.ents if ent.label_ in {"GPE","LOC","NORP"}]
    if ents:
        sent_entities.append({"sent_id": i, "sentence": s.text, "raw_entities": ents})

len(sent_entities), sent_entities[:2]


(633,
 [{'sent_id': 1,
   'sentence': '1900s\n1901\nJanuary 1 : The Australian colonies federate .\n',
   'raw_entities': ['Australian']},
  {'sent_id': 2,
   'sentence': "January 22 : Edward VII became King of England and India after Queen Victoria 's death.\n",
   'raw_entities': ['India']}])

In [21]:
# Map sentence entities -> canonical countries; drop non-countries
for row in sent_entities:
    canon_ents = {c for e in row["raw_entities"] if (c := canon_country(e))}
    row["countries"] = sorted(canon_ents)

# Keep sentences that actually contain >= 2 distinct countries (useful for relationships)
sent_with_countries = [r for r in sent_entities if len(r["countries"]) >= 1]
len(sent_with_countries), sent_with_countries[:2]


(267,
 [{'sent_id': 2,
   'sentence': "January 22 : Edward VII became King of England and India after Queen Victoria 's death.\n",
   'raw_entities': ['India'],
   'countries': ['india']},
  {'sent_id': 4,
   'sentence': "June : Emily Hobhouse reports on the poor conditions in 45 British internment camps for Boer women and children in South Africa .\nSeptember 6 : The assassination of William McKinley ushered in office Vice President Theodore Roosevelt after McKinley's death on September 14.\nSeptember 7 : The Eight-Nation Alliance defeats the Boxer Rebellion , and imposes heavy financial penalties on China .\n",
   'raw_entities': ['British', 'South Africa', 'McKinley', 'China'],
   'countries': ['china', 'south africa']}])

## Create the relationships dataframe (country co-occurrences)
- Within each sentence, we connect every unique pair of countries.
- Then we aggregate across all sentences to get an edge weight.

In [22]:
pairs = []
for r in sent_with_countries:
    # unique per sentence
    cs = sorted(set(r["countries"]))
    # choose 2 for undirected pairs; if only one country, skip
    if len(cs) >= 2:
        pairs += list(combinations(cs, 2))

# Build edge table with weights
edges_df = (
    pd.DataFrame(pairs, columns=["source","target"])
      .value_counts()
      .reset_index(name="weight")
      .sort_values("weight", ascending=False)
      .reset_index(drop=True)
)

edges_df.head(15), len(edges_df)


(     source          target  weight
 0   germany  united kingdom       4
 1      iran   united states       3
 2    france  united kingdom       3
 3     china           japan       2
 4   austria          sweden       2
 5   finland          sweden       2
 6   austria         hungary       2
 7   denmark          norway       2
 8   austria         finland       2
 9    france         germany       2
 10   poland          russia       2
 11   france     netherlands       1
 12  austria          russia       1
 13  belgium         denmark       1
 14  belgium          france       1,
 57)

In [23]:
# Node degrees (weighted)
node_weights = (
    edges_df
      .assign(pair_weight=lambda d: d["weight"])
      .melt(id_vars="weight", value_vars=["source","target"], value_name="country")
      .groupby("country", as_index=False)["weight"].sum()
      .rename(columns={"weight":"degree_weight"})
      .sort_values("degree_weight", ascending=False)
)
node_weights.head(10)


Unnamed: 0,country,degree_weight
9,france,12
31,united kingdom,12
10,germany,12
0,austria,10
4,china,9
32,united states,8
6,denmark,7
21,poland,6
28,sweden,6
19,norway,6


## Save & export your dataframe(s)

In [24]:
edges_path = "country_relationships_edges.csv"
nodes_path = "country_relationships_nodes.csv"
sent_path  = "sentences_with_countries.csv"

pd.DataFrame(sent_with_countries).to_csv(sent_path, index=False)
edges_df.to_csv(edges_path, index=False)
node_weights.to_csv(nodes_path, index=False)

print("Saved:")
print(" •", sent_path)
print(" •", edges_path)
print(" •", nodes_path)


Saved:
 • sentences_with_countries.csv
 • country_relationships_edges.csv
 • country_relationships_nodes.csv


- sentences_with_countries.csv – each sentence + the countries found there
- country_relationships_edges.csv – (source, target, weight) for co-occurrences
- country_relationships_nodes.csv – node list with a simple degree_weight (sum of connected edge weights)