In [1]:
import spacy
import pandas as pd

In [4]:
with open("20th_century_events.txt", "r") as file:
    text = file.read()

In [5]:
text[:500]

"The 20th century changed the world in unprecedented ways. The World Wars sparked tension between countries and led to the creation of atomic bombs , the Cold War led to the Space Race and the creation of space-based rockets, and the World Wide Web was created. These advancements have played a significant role in citizens' lives and shaped the 21st century into what it is today. Historic events in the 20th century [ edit ] World at the beginning of the century [ edit ] Main article: Edwardian era"

In [6]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

In [7]:
[(ent.text, ent.label_) for ent in doc.ents[:20]]

[('The 20th century', 'DATE'),
 ('the Cold War', 'EVENT'),
 ('the Space Race', 'WORK_OF_ART'),
 ('the World Wide Web', 'WORK_OF_ART'),
 ('the 21st century', 'DATE'),
 ('today', 'DATE'),
 ('the 20th century', 'DATE'),
 ('the beginning of the century', 'DATE'),
 ('the 20th century', 'DATE'),
 ('The 1900s', 'DATE'),
 ('the decade', 'DATE'),
 ('the Panama Canal', 'ORG'),
 ('Scramble', 'GPE'),
 ('Africa', 'LOC'),
 ('the 1900s', 'DATE'),
 ('the Congo Free State', 'FAC'),
 ('1914 to 1918', 'DATE'),
 ('the First World War', 'EVENT'),
 ('World War I', 'EVENT'),
 ('World War I Arrest', 'EVENT')]

In [8]:
sentences = list(doc.sents)
sentences[:3]   

[The 20th century changed the world in unprecedented ways.,
 The World Wars sparked tension between countries and led to the creation of atomic bombs , the Cold War led to the Space Race and the creation of space-based rockets, and the World Wide Web was created.,
 These advancements have played a significant role in citizens' lives and shaped the 21st century into what it is today.]

In [10]:
import spacy
import pandas as pd

# Load the text file
with open("20th_century_events.txt", "r") as file:
    text = file.read()

# Create the NER object
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Split into sentences
sentences = list(doc.sents)

# Extract countries LOC and GPE
countries = [ent.text for ent in doc.ents if ent.label_ in ["GPE", "LOC"]]
countries = list(set(countries))


relationships = []

for sent in sentences:
    sent_countries = [ent.text for ent in sent.ents if ent.label_ in ["GPE", "LOC"]]
    if len(sent_countries) > 1:
        relationships.append(sent_countries)

relationships

[['Scramble', 'Africa'],
 ['the British Empire',
  'France',
  'the Russian Empire',
  'German Empire',
  'Austria'],
 ['Russia', 'the Central Powers', 'Tsar'],
 ['Bolsheviks', 'Germany', 'Russia'],
 ['Germany', 'the Ottoman Empire'],
 ['New states', 'Yugoslavia', 'Czechoslovakia'],
 ['Germany', 'Italy'],
 ['Germany', 'Germany'],
 ['Germany', 'Central and Eastern Europe'],
 ['Herrenvolk', 'Untermensch'],
 ['Western Europe', 'the United States'],
 ['Austria', 'Austria', 'Germany'],
 ['Moscow', 'Czechoslovakia', 'Britain', 'France', 'Poland'],
 ['Britain', 'France', 'Germany', 'Poland'],
 ['Poland', 'East', 'Soviet Union', 'Nazi Germany'],
 ['Poland', 'the Soviet Union', 'USSR', 'Germany'],
 ['Estonia', 'Latvia', 'Lithuania', 'Eastern Poland'],
 ['Germany', 'the Soviet Union'],
 ['Denmark', 'Norway'],
 ['Norway', 'Denmark'],
 ['Sweden', 'Germany'],
 ['Belgium', 'Netherlands', 'Luxembourg'],
 ['France', 'Paris'],
 ['France', 'Atlantic'],
 ['the Battle of Britain Hitler', 'Great Britain'],

The text contains a few issues from the scrape, such as [ edit ] and long dashes. Some extracted items are not actual countries (e.g., Scramble, the Central Powers), and some country names include extra words like “the.” These inconsistencies need light cleaning to ensure accurate analysis.

In [11]:
clean_text = text.replace("[ edit ]", "")
clean_text = clean_text.replace("–", "-")

In [12]:
with open("20th_century_events_clean.txt", "w") as file:
    file.write(clean_text)

In [13]:
import spacy


with open("20th_century_events_clean.txt", "r") as file:
    text = file.read()

# Create  NER object
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Show a few entities
[(ent.text, ent.label_) for ent in doc.ents[:15]]

[('The 20th century', 'DATE'),
 ('the Cold War', 'EVENT'),
 ('the Space Race', 'WORK_OF_ART'),
 ('the World Wide Web', 'WORK_OF_ART'),
 ('the 21st century', 'DATE'),
 ('today', 'DATE'),
 ('the 20th century', 'DATE'),
 ('the beginning of the century', 'DATE'),
 ('Main', 'PRODUCT'),
 ('the 20th century', 'DATE'),
 ('The 1900s', 'DATE'),
 ('the decade', 'DATE'),
 ('the Panama Canal', 'ORG'),
 ('Scramble', 'GPE'),
 ('Africa', 'LOC')]

In [14]:
sentences = list(doc.sents)
sentences[:5]

[The 20th century changed the world in unprecedented ways.,
 The World Wars sparked tension between countries and led to the creation of atomic bombs , the Cold War led to the Space Race and the creation of space-based rockets, and the World Wide Web was created.,
 These advancements have played a significant role in citizens' lives and shaped the 21st century into what it is today.,
 Historic events in the 20th century  World at the beginning of the century  Main article: Edwardian era The new beginning of the 20th century marked significant changes.,
 The 1900s saw the decade herald a series of inventions, including the automobile , airplane and radio broadcasting .]

In [15]:
filtered_entities = [ent for ent in doc.ents if ent.text in countries]
filtered_entities[:20]

[Scramble,
 Africa,
 Sarajevo,
 Sarajevo,
 the British Empire,
 France,
 the Russian Empire,
 the Central Powers,
 German Empire,
 Austria,
 Russia,
 the Central Powers,
 Tsar,
 Bolsheviks,
 Germany,
 Russia,
 Germany,
 the Ottoman Empire,
 Ukraine,
 Germany]

In [17]:
import pandas as pd

df_relationships = pd.DataFrame({"Countries_in_Sentence": relationships})
df_relationships.head()

Unnamed: 0,Countries_in_Sentence
0,"[Scramble, Africa]"
1,"[the British Empire, France, the Russian Empir..."
2,"[Russia, the Central Powers, Tsar]"
3,"[Bolsheviks, Germany, Russia]"
4,"[Germany, the Ottoman Empire]"


In [19]:
df_relationships.to_csv("country_relationships.csv", index=False)