# Import Libraries

In [33]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [34]:
NER = spacy.load("en_core_web_sm")

# Define Folder Directory for Path

In [4]:
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent   # because we're inside /notebooks
DATA_DIR = PROJECT_ROOT / "data"
OUTPUTS_DIR = PROJECT_ROOT / "outputs"

DATA_DIR, OUTPUTS_DIR

(PosixPath('/Users/stephenhelvig/Documents/Python Projects/20th-century/data'),
 PosixPath('/Users/stephenhelvig/Documents/Python Projects/20th-century/outputs'))

In [6]:
text_path = DATA_DIR / "20th_century_key_events_clean.txt"
countries_path = DATA_DIR / "simplewiki_countries.csv"
events_csv_path = DATA_DIR / "20th_century_key_events.csv"

# Load Files

In [8]:
# Load cleaned text
with open(text_path, "r", encoding="utf-8", errors="ignore") as f:
    data = f.read()

# Normalize whitespace so sentence splitting is more consistent
data = re.sub(r"\s+", " ", data).strip()

# Load countries list
import pandas as pd
countries_df = pd.read_csv(countries_path)

print("Text length:", len(data))
countries_df.head()

Text length: 64959


Unnamed: 0,letter,country,url
0,A,Afghanistan,https://simple.wikipedia.org/wiki/Afghanistan
1,A,Albania,https://simple.wikipedia.org/wiki/Albania
2,A,Algeria,https://simple.wikipedia.org/wiki/Algeria
3,A,Andorra,https://simple.wikipedia.org/wiki/Andorra
4,A,Angola,https://simple.wikipedia.org/wiki/Angola


# Commments

**Text wrangling notes:** I normalized whitespace and used a cleaned .txt version because inconsistent spacing/punctuation can affect spaCy sentence splitting and NER entity boundaries.

**Country normalization:** Entities are filtered via a normalized string match against the SimpleWiki country list. I added an alias map for common variants (e.g., “U.S.”/“USA” → “United States”, “England/Britain” → “United Kingdom”, “USSR/Soviet Union” → “Russia”). This reduces missed matches due to naming variations.

**Relationship assumption:** I infer relationships using co-occurrence. If two countries appear within the same 5-sentence window, I treat that as an interaction and count it as an edge (then collapse A–B and B–A into an undirected pair).

In [9]:
events_doc = NER(data)

In [13]:
# Visualize identified entities

displacy.render(events_doc[273:20000], style = "ent", jupyter = True)

# Build the sentences DF

In [14]:
# Get named entities list per sentence

df_event_sentences = []

for sent in events_doc.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_event_sentences.append({"sentence": str(sent), "entities": entity_list})

df_event_sentences = pd.DataFrame(df_event_sentences)
df_event_sentences.head()

Unnamed: 0,sentence,entities
0,The 20th century changed the world in unpreced...,[The 20th century]
1,The World Wars sparked tension between countri...,"[The World Wars, the Cold War, the World Wide ..."
2,These advancements have played a significant r...,"[the 21st century, today]"
3,The new beginning of the 20th century marked s...,[the 20th century]
4,The 1900s saw the decade herald a series of in...,"[The 1900s, the decade]"


# Load Country Names for Filtering Entities

In [24]:
def norm(s: str) -> str:
    # lowercase, trim, remove most punctuation but keep spaces for multi-word names
    s = str(s).strip().lower()
    s = re.sub(r"[^a-z0-9\s\-\.'’]", "", s)  # keep letters/numbers/space/dash/dots/apostrophes
    s = re.sub(r"\s+", " ", s)
    # normalize curly apostrophe to straight
    s = s.replace("’", "'")
    return s

# Build lookup from countries list
country_names = countries_df["country"].dropna().astype(str).tolist()

# normalized -> canonical
country_map = {norm(c): c for c in country_names}

# High-impact aliases
extra_aliases = {
    "u.s.": "United States",
    "u.s": "United States",
    "us": "United States",
    "usa": "United States",
    "u.k.": "United Kingdom",
    "u.k": "United Kingdom",
    "uk": "United Kingdom",
    "england": "United Kingdom",
    "scotland": "United Kingdom",
    "wales": "United Kingdom",
    "northern ireland": "United Kingdom",
    "britain": "United Kingdom",
    "great britain": "United Kingdom",
    "ussr": "Russia",
    "u.s.s.r.": "Russia",
    "soviet union": "Russia",
}

for alias, canonical in extra_aliases.items():
    country_map[norm(alias)] = canonical

country_set = set(country_map.keys())

def filter_countries(ent_list):
    out = []
    for ent in ent_list:
        k = norm(ent)
        if k in country_set:
            out.append(country_map[k])
    return out

In [25]:
# Filter to countries

df_event_sentences["country_entities"] = df_event_sentences["entities"].apply(filter_countries)

df_event_sentences_filtered = df_event_sentences[df_event_sentences["country_entities"].map(len) > 0].copy()

# remove duplicates within a sentence (preserve order)
df_event_sentences_filtered["country_entities"] = df_event_sentences_filtered["country_entities"].apply(
    lambda x: list(dict.fromkeys(x))
)

df_event_sentences_filtered["country_entities"].explode().value_counts().head(25)

country_entities
Japan             26
Germany           24
United States     19
Russia            14
France            13
United Kingdom    11
Poland            10
Italy              8
China              7
India              5
North Korea        4
Vietnam            4
Lithuania          3
Norway             3
Libya              3
Philippines        3
Cuba               3
Austria            2
Hungary            2
Ukraine            2
Estonia            2
Latvia             2
Finland            2
Denmark            2
Greece             2
Name: count, dtype: int64

# Comments

NER-based entity counts differ from keyword counts because spaCy only counts items it tags as entities in sent.ents

# Create Relationships

In [27]:
window_size = 5
relationships = []

for i in range(len(df_event_sentences_filtered)):
    end_i = min(i + window_size, len(df_event_sentences_filtered) - 1)

    country_list = sum(
        df_event_sentences_filtered.iloc[i:end_i + 1]["country_entities"].tolist(),
        []
    )

    # remove adjacent duplicates
    country_unique = [
        country_list[j]
        for j in range(len(country_list))
        if j == 0 or country_list[j] != country_list[j - 1]
    ]

    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"source": a, "target": b})

relationship_df = pd.DataFrame(relationships)
relationship_df.head(15)

Unnamed: 0,source,target
0,France,Austria
1,Austria,Hungary
2,Hungary,Russia
3,Russia,Germany
4,Germany,Russia
5,Russia,Germany
6,Germany,Ukraine
7,Ukraine,Germany
8,Russia,Germany
9,Germany,Russia


In [28]:
# collapse A-B and B-A and count frequencies

relationships_sorted = pd.DataFrame(
    np.sort(relationship_df[["source", "target"]].values, axis=1),
    columns=["source", "target"]
)

relationships_sorted["value"] = 1

relationships_grouped = relationships_sorted.groupby(
    ["source", "target"], as_index=False, sort=False
)["value"].sum()

relationships_grouped.sort_values("value", ascending=False).head(20)

Unnamed: 0,source,target,value
41,Japan,United States,49
9,France,United Kingdom,34
43,Russia,United States,31
48,China,Japan,28
42,Germany,Japan,27
5,Germany,Italy,27
13,Germany,Poland,22
31,Germany,United Kingdom,21
3,Germany,Russia,19
14,Poland,Russia,17


In [29]:
relationships_grouped["value"].describe()

count    115.000000
mean       8.391304
std        7.045834
min        1.000000
25%        5.000000
50%        6.000000
75%       10.000000
max       49.000000
Name: value, dtype: float64

In [30]:
OUTPUTS_DIR.mkdir(exist_ok=True)
out_path = OUTPUTS_DIR / "20th_century_country_relationships.csv"
relationships_grouped.to_csv(out_path, index=False)
out_path

PosixPath('/Users/stephenhelvig/Documents/Python Projects/20th-century/outputs/20th_century_country_relationships.csv')