### Import Libraries

In [8]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [9]:
nlp = spacy.load("en_core_web_sm")

### Load Data

#### Wiki -------------------------------

In [10]:
# Load the wiki text
with open('20th_century_wiki.txt', 'r', errors='ignore') as file: 
   data = file.read().replace( '\n', ' ')

wiki = nlp(data)

In [4]:
from IPython.display import display

In [12]:
# Visualize identified entities
displacy.render([wiki[273:20000]], style="ent", jupyter=True)

# does not work 

ImportError: cannot import name 'display' from 'IPython.core.display' (C:\Users\steve\anaconda3\envs\20th_century\Lib\site-packages\IPython\core\display.py)

In [14]:
df_sentences = []

# Loop through sentences, get entity list for each sentence
for sent in wiki.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent.text, "entities": entity_list})

df_sentences = pd.DataFrame(df_sentences)

In [15]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,Key events of the 20th century - Wikipedia...,[the 20th century - Wikipedia ...
1,1 Historic events in the 20th century Togg...,"[the 20th century, the 20th century, the begin..."
2,World War II (1939â€“1945) 1.3.1 The war...,"[World War II, 1.3.1, Europe ]"
3,Blitzkrieg 1.3.3 Operation Barbarossa ...,[Blitzkrieg 1.3.3]
4,Turning tides 1.3.5 Operation Overlord...,[the Pacific 1.3.7.1 Background ...
5,Allied offensive 1.3.10 Final days ...,[The Holocaust 1.3.12]
6,The Nuclear Age begins 1.4 The post-...,[1.4]
7,The end of empires: decolonization 1.4.2,[]
8,The Cold War (1947â€“1991) 1.4.3 War b...,"[The Cold War, 1.4.3 War, 1.4.4]"
9,The space race 1.4.5 The end of the Co...,[the Cold War ]


#### Countries List -------------------------------

In [29]:
# import countries text file
path = 'C:/Users/steve/20th-century'
countries_df = pd.read_csv(os.path.join(path, 'countries_list_20th_century.csv'))

In [30]:
countries_df.head()

Unnamed: 0.1,Unnamed: 0,country_name
0,1,Afghanistan
1,2,Albania
2,3,Algeria
3,4,Andorra
4,5,Angola


In [31]:
countries_df.drop('Unnamed: 0', axis=1, inplace=True)

In [32]:
countries_df['country_name'] = countries_df['country_name'].str.strip().str.replace(r'\s+', ' ', regex=True)

In [33]:
countries_df.head()

Unnamed: 0,country_name
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola


### Filter Data

In [34]:
# Function to filter out entities not of interest
def filter_entity(ent_list, countries_df):
       return [ent for ent in ent_list
                  if ent in list(countries_df['country_name'])]

In [35]:
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, countries_df))

In [36]:
# Filter out sentences that don’t have any country entities
df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
1134,"^ ""The division of Germany â€“ The Cold War (1...","[Germany, The Cold War]",[Germany]
1166,"""The forgotten violence that helped India brea...",[India],[India]
1170,"""Indian Independence Day: everything you need ...","[Indian Independence Day, Partition, India, Pa...","[India, Pakistan]"
1179,"^ ""The Philippines, 1898â€“1946 | US House of ...","[Philippines, 1898â€“1946, US House of Represe...",[Philippines]
1249,"The Moldovans: Romania, Russia, and the Politi...","[Moldovans, Romania, Russia, the Politics of C...","[Romania, Russia]"
1314,"""Selling 'Operation Passage to Freedom': Dr. T...","[Thomas Dooley, the Religious Overtones of Ear...",[Vietnam]
1349,"""Stuck in Endless Preliminaries: Vietnam and t...","[Vietnam, the Battle of the Paris Peace Table,...",[Vietnam]
1618,"""Anti-American Behavior in the Middle East: Ev...","[Anti-American, the Middle East, Lebanon]",[Lebanon]
1623,The Rise of China and India: A New Asian Drama.,"[China, India, New Asian Drama]",[India]
1624,Singapore: World Scientific.,"[Singapore, World Scientific]",[Singapore]


### Relationship Analysis

In [44]:
# Defining relationships with a sliding window
window_size = 5
relationships = []

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i + window_size, df_sentences_filtered.index[-1])
    
    # Flatten the list of country entities in the window
    country_list = sum(df_sentences_filtered.loc[i:end_i, 'country_entities'].tolist(), [])

    # Remove duplicates that are adjacent
    country_unique = [country_list[j] for j in range(len(country_list))
                      if j == 0 or country_list[j] != country_list[j - 1]]
    
    # If more than one unique country, define edges
    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [45]:
relationship_df = pd.DataFrame(relationships)

relationship_df

Unnamed: 0,source,target
0,France,Austria
1,Austria,Russia
2,France,Austria
3,Austria,Russia
4,Russia,Germany
...,...,...
588,India,Singapore
589,India,Singapore
590,India,Singapore
591,India,Singapore


In [46]:
# Sort the cases with a- >b and b- >a
relationships_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationships_df.head(5)

Unnamed: 0,source,target
0,Austria,France
1,Austria,Russia
2,Austria,France
3,Austria,Russia
4,Germany,Russia


In [48]:
relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

relationship_df.head(10)

Unnamed: 0,source,target,value
0,France,Austria,1
1,Austria,Russia,1
2,Russia,Germany,1
3,Germany,Russia,1
4,Germany,Italy,1
5,Italy,Germany,1
6,Germany,Austria,1
7,Austria,Germany,1
8,Germany,Spain,1
9,France,Poland,1


### Export Data

In [53]:
file_path = os.path.join(path, "relationships.csv")
relationship_df.to_csv(file_path, index=False)