In [147]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re
# Download the small English model only when it is not installed yet,
# so re-running the notebook does not fetch and reinstall it every time.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Load 20th-century key events

In [148]:
# Load the key events .txt file into one long string.
# The encoding is given explicitly so the text is decoded the same way on
# every platform (without it, open() uses the locale's default encoding).
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
# errors='ignore' silently drops any bytes that cannot be decoded.
with open('Key_events_20th_century.txt', 'r', encoding='utf-8', errors='ignore') as file:
    # Replace every line break with a space so the text is one continuous string
    data = file.read().replace('\n', ' ')

In [149]:
# Load spaCy's small English pipeline and run it over the whole text.
# `book` is the parsed document; the later cells read its sentences
# (book.sents) and the entities found in each sentence from it.
NER = spacy.load("en_core_web_sm")
book = NER(data)

In [150]:
# Visualize identified entities inline in the notebook.
# Only the slice book[273:20000] of the parsed document is rendered, not the full text.
# NOTE(review): the bounds 273 and 20000 are hard-coded with no explanation —
# presumably they skip front matter and keep the rendering small; confirm.
displacy.render(book[273:20000], style = "ent", jupyter = True)

# Splitting Sentence Entities

In [151]:
# One record per sentence of the parsed document: the sentence itself and
# the text of every entity spaCy found inside it.
sentence_records = [
    {"sentence": sent, "entities": [ent.text for ent in sent.ents]}
    for sent in book.sents
]

# Turn the records into a two-column table (sentence, entities)
df_sentences = pd.DataFrame(sentence_records)

In [152]:
# Preview the first ten sentences together with their entity lists
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"( , The, 20th, century, changed, the, world,...",[The 20th century]
1,"(The, World, Wars, sparked, tension, between, ...","[the Cold War, the Space Race]"
2,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
3,"(Historic, events, in, the, 20th, century, , ...","[the 20th century World, the beginning of the..."
4,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"
5,"(1914, saw, the, completion, of, the, Panama, ...","[1914, the Panama Canal]"
6,"(From, 1914, to, 1918, ,, the, First, World, W...","[1914 to 1918, the First World War]"
7,"(The, First, World, War, (, or, simply, WWI, )...","[The First World War, WWI, The Great War, July..."
8,"(The, war, was, precipitated, by, the, Assassi...","[Erzherzog Franz Ferdinand, Gavrilo Princip, Y..."
9,"(After, a, period, of, diplomatic, and, milita...","[the July Crisis, the end of July 1914, Britis..."


In [153]:
# Load the country alias table; the first CSV column is used as the index
country_df = pd.read_csv("country_aliases.csv", index_col = 0)

In [154]:
# Preview the first rows of the country alias table
country_df.head()

Unnamed: 0_level_0,Aliases
Country,Unnamed: 1_level_1
Abkhazia,abkhazia
Afghanistan,afghanistan
Albania,albania
Algeria,algeria
Andorra,andorra


# Filtering Data

In [155]:
# Define aliases for country names
#aliases = {
#    "united states": ["usa", "america", "u.s.", "united states of america"],
#    "united kingdom": ["uk", "britain", "england", "great britain"],
#    "south korea": ["korea", "republic of korea"],
#    "north korea": ["dprk"],
#    "china": ["prc", "peoples republic of china"],
 #   "russia": ["soviet union", "ussr"],
#}

In [156]:
# Function to filter out entities not of interest
def filter_entity(ent_list, country_df, alias_col="Aliases"):
    """Keep only the entities that match a known country alias.

    Matching ignores case and surrounding whitespace on both sides: the
    aliases and the entities are stripped and lower-cased before comparison.

    Parameters
    ----------
    ent_list : list of str
        Entity texts to filter (for example the entities of one sentence).
    country_df : pandas.DataFrame
        Table holding the accepted aliases in the column named by `alias_col`.
    alias_col : str, default "Aliases"
        Name of the column in `country_df` that contains the aliases.
        Missing values in that column are skipped.

    Returns
    -------
    list of str
        The matching entities, in their original order and original spelling
        (they are not lower-cased or stripped in the result).
    """
    # Normalised lookup set of all aliases; NaN entries are dropped first
    alias_set = set(country_df[alias_col].dropna().str.strip().str.lower())
    return [ent for ent in ent_list if ent.lower().strip() in alias_set]

In [157]:
#print(country_df.columns)

In [158]:
#filtered = filter_entity(["united states", "russia", "united kingdom", ""], country_df)
#print(filtered[:5])

In [159]:
# Run each sentence's entity list through filter_entity (together with
# country_df) and store the result in a new 'country_entities' column.
def _country_matches(entities):
    return filter_entity(entities, country_df)

df_sentences['country_entities'] = df_sentences['entities'].apply(_country_matches)

In [160]:
# Keep only the sentences that have at least one country entity.
# .copy() makes the result an independent frame, so the later assignment to
# its 'country_entities' column does not write into a slice of df_sentences.
df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0].copy()

# Show the last ten remaining sentences
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
1136,"("", The, forgotten, violence, that, helped, In...",[India],[India]
1141,"("", Indian, Independence, Day, :, everything, ...","[Indian Independence Day, India, Pakistan, 70 ...","[India, Pakistan]"
1151,"("", The, Philippines, ,, 1898–1946, |, US, Hou...","[Philippines, 1898–1946, US House of Represent...",[Philippines]
1185,"("", Colonial, Cartographies, ,, Postcolonial, ...","[Colonial Cartographies, Enduring Failures of ...",[Afghanistan]
1227,"(The, Moldovans, :, Romania, ,, Russia, ,, and...","[Moldovans, Romania, Russia, the Politics of C...",[Romania]
1291,"("", Selling, ', Operation, Passage, to, Freedo...","[Thomas Dooley, the Religious Overtones of Ear...",[Vietnam]
1323,"("", Stuck, in, Endless, Preliminaries, :, Viet...","[Vietnam, the Battle of the Paris Peace Table,...",[Vietnam]
1605,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American, the Middle East, a Field Exper...",[Lebanon]
1610,"(The, Rise, of, China, and, India, :, A, New, ...","[The Rise of China, India]",[India]
1611,"(Singapore, :, World, Scientific, .)",[Singapore],[Singapore]


In [161]:
# Normalise the country names: lower-case them and strip surrounding whitespace.
# .assign() builds a new frame instead of writing through .loc into
# df_sentences_filtered, which was produced by boolean filtering and can
# raise SettingWithCopyWarning when assigned to in place.
df_sentences_filtered = df_sentences_filtered.assign(
    country_entities=df_sentences_filtered['country_entities'].apply(
        lambda names: [name.lower().strip() for name in names]
    )
)

In [162]:
# Show the last ten rows again to check the normalised country names
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
1136,"("", The, forgotten, violence, that, helped, In...",[India],[india]
1141,"("", Indian, Independence, Day, :, everything, ...","[Indian Independence Day, India, Pakistan, 70 ...","[india, pakistan]"
1151,"("", The, Philippines, ,, 1898–1946, |, US, Hou...","[Philippines, 1898–1946, US House of Represent...",[philippines]
1185,"("", Colonial, Cartographies, ,, Postcolonial, ...","[Colonial Cartographies, Enduring Failures of ...",[afghanistan]
1227,"(The, Moldovans, :, Romania, ,, Russia, ,, and...","[Moldovans, Romania, Russia, the Politics of C...",[romania]
1291,"("", Selling, ', Operation, Passage, to, Freedo...","[Thomas Dooley, the Religious Overtones of Ear...",[vietnam]
1323,"("", Stuck, in, Endless, Preliminaries, :, Viet...","[Vietnam, the Battle of the Paris Peace Table,...",[vietnam]
1605,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American, the Middle East, a Field Exper...",[lebanon]
1610,"(The, Rise, of, China, and, India, :, A, New, ...","[The Rise of China, India]",[india]
1611,"(Singapore, :, World, Scientific, .)",[Singapore],[singapore]


# Creating Relationships

In [163]:
# Number of consecutive rows of df_sentences_filtered that form one window
window_size = 5

# One list of country names per filtered sentence, in row order
entity_lists = df_sentences_filtered['country_entities'].tolist()

# Edge list: one {"source", "target"} record per pair of neighbouring countries
relationships = []

# Slide the window over the filtered rows by position. Rows without countries
# were removed earlier, so the sentences inside a window are consecutive in the
# filtered table but not necessarily adjacent in the original text.
for start in range(len(entity_lists)):
    # List slicing clamps at the end, so the last windows are simply shorter
    window = entity_lists[start:start + window_size]

    # Flatten the window into a single sequence of country mentions
    mentions = [name for names in window for name in names]

    # Collapse runs of the same country into a single mention
    collapsed = [
        name for pos, name in enumerate(mentions)
        if pos == 0 or name != mentions[pos - 1]
    ]

    # Link every mention to the one that follows it
    # (produces nothing when fewer than two mentions remain)
    for source, target in zip(collapsed, collapsed[1:]):
        relationships.append({"source": source, "target": target})


In [164]:
# Build the edge table from the collected pairs. The columns are named
# explicitly so the frame still has 'source' and 'target' when no pairs were
# found (an empty list would otherwise produce a frame without any columns,
# and the later column selection would fail with a KeyError).
relationship_df = pd.DataFrame(relationships, columns=["source", "target"])

In [165]:
# Display the collected source/target pairs
relationship_df

Unnamed: 0,source,target
0,france,germany
1,germany,italy
2,germany,italy
3,italy,germany
4,germany,italy
...,...,...
617,lebanon,india
618,india,singapore
619,lebanon,india
620,india,singapore


In [166]:
# Treat the edges as undirected: put each (source, target) pair in
# alphabetical order so that A-B and B-A become the same row.
pair_values = relationship_df[['source', 'target']].to_numpy()
ordered_pairs = np.sort(pair_values, axis=1)  # sorts within each row
relationships_df = pd.DataFrame(ordered_pairs, columns=['source', 'target'])

relationships_df.head()

Unnamed: 0,source,target
0,france,germany
1,germany,italy
2,germany,italy
3,germany,italy
4,germany,italy


In [167]:
# Save the alphabetically ordered pairs to CSV (without the index column)
relationships_df.to_csv("country_relationships_20th_century.csv", index=False)
print("country_relationships_20th_century.csv is saved")

country_relationships_20th_century.csv is saved


In [168]:
# Tally identical (source, target) rows, then turn the tally into a table
# whose count column is called 'weight'.
pair_tally = relationships_df.value_counts()
edge_counts = pair_tally.reset_index(name='weight')

edge_counts

Unnamed: 0,source,target,weight
0,germany,japan,31
1,germany,poland,28
2,france,germany,28
3,germany,italy,25
4,india,japan,18
...,...,...,...
86,finland,poland,4
87,india,singapore,4
88,india,south africa,4
89,iran,israel,4


In [169]:
# Create the edge_counts DataFrame
# NOTE(review): this repeats the value_counts() computation of the previous
# cell on the same relationships_df.
edge_counts = relationships_df.value_counts().reset_index(name='weight')

In [170]:
# Save the weighted edge list (source, target, weight) to CSV without the index column
edge_counts.to_csv("country_edge_counts.csv", index=False)
print("country_edge_counts.csv is now saved")

country_edge_counts.csv is now saved
