In [148]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [149]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     ---- ----------------------------------- 1.3/12.8 MB 4.0 MB/s eta 0:00:03
     ------- -------------------------------- 2.4/12.8 MB 4.6 MB/s eta 0:00:03
     ------------ --------------------------- 3.9/12.8 MB 5.5 MB/s eta 0:00:02
     ------------------ --------------------- 6.0/12.8 MB 6.6 MB/s eta 0:00:02
     ----------------------------- ---------- 9.4/12.8 MB 8.3 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 9.7 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [150]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

## Loading 20th Century Events Page

In [190]:
# Importing text file of twentieth-century data scraped 

with open ('20th_century_Events_Wiki.txt','r',errors='ignore') as file:
    data=file.read().replace('\n','')

# Removing special or unnecessary characters in wiki text
data_cleaned = re.sub(r'\d+\.\d+|\W+', ' ', data)

# Exporting cleaned wiki text
path = "C:/Users/Drew/20th_century/cleaned_20th_century_Events_Wiki.txt"
with open(path, 'w', encoding='utf-8') as file:
    file.write(data_cleaned)

In [191]:
wiki=NER(data_cleaned)

In [264]:
# Visualizing certain section  of the identified entities in the text for representation

displacy.render(wiki[273:750], style = "ent", jupyter = True)

### Text & country list evaluation
     I identified unnecessary characters within the wiki text that could be removed to aid in the wrangling process. To address this, I cleaned the data, removing punctuations and symbols, and saved a "cleaned" version of the text. 
    Further evaluation showed that some country names in the text differed from their common or standardized formats. For example, the text referred to "Korea, South," whereas the standardized format in my data uses "South Korea." To maintain consistency with the wiki text, I renamed countries like this to match the exact format used in the text.
    Additionally, I observed that both "Russia" and the "Soviet Union" were referenced at different points in the text, reflecting historical transitions. As a result, I included the "Soviet Union" as a separate entity in the country list to ensure accurate representation of historical events from the 20th century. This allows the analysis to capture the references to both Russia and the Soviet Union during the transition period.
    Finally, the wiki text frequently referred to "Africa" as a whole, whereas my country list separated specific regions within Africa. To align with the text, I decided to combine all African regions into a single entry, "Africa," ensuring proper consistency with the text's format.

## Obtaining list of named entities per sentence

In [193]:
df_sentences=[]

# Looping sentences to get entity list for each sentence

for sent in wiki.sents:
    entity_list=[ent.text for ent in sent.ents]
    df_sentences.append({"sentence":sent,"entities":entity_list})

df_sentences=pd.DataFrame(df_sentences)

In [194]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(Key, events, of, the, 20th, century, Wikipedi...","[the 20th century, WikipediaJump, Navigation M..."
1,"(fileSearchSearchDonateAppearanceCreate, accou...","[1Historic, 20th, the 20th century, the beginn..."
2,"(The, war, to, end, all, wars, World, War, I, ...","[World War I, 2The, World War II, 1939â 1945, ..."
3,"(7The, war, in, the, Pacific, , Background, ...","[Pacific, 10Final days ]"
4,"(11The, Holocaust, , 12The, Nuclear, Age, beg...","[11The, Holocaust, 1The, 2The, Cold War, 1947â..."
5,"(5The, end, of, the, Cold, War, , 6Informatio...","[5The end, the Cold War, 6Information, 20th, Ù..."
6,"(What, links, hereRelated, changesUpload, file...",[]
7,"(informationCite, this, pageGet, shortened, UR...","[URLDownload, Print, Download as PDFPrintable,..."
8,"(The, Allies, known, initially, as, The, Tripl...","[The Triple Entente, the British Empire, Franc..."
9,"(The, British, first, used, the, tank, 10, Bot...","[British, 10, 11, Austria Hungary, World War I..."


## Loading Country Names

In [187]:
# Loading country list file
country_df = pd.read_csv(r"C:/Users/Drew/20th_century/country_wrangled.csv", encoding='utf-8')

In [188]:
country_df.head()

Unnamed: 0,country_name,country
0,Afghanistan,Afghanistan
1,Albania,Albania
2,Algeria,Algeria
3,Andorra,Andorra
4,Angola,Angola


In [196]:
# Add Soviet Union
new_row = pd.DataFrame([{'country_name': 'Soviet Union', 'country': 'Soviet Union'}])
country_df = pd.concat([country_df, new_row], ignore_index=True)

# Remove duplicate rows
country_df = country_df.drop_duplicates(subset=['country_name', 'country'], keep='first')

# Update Korea, North to North Korea and Korea, South to South Korea
country_df['country_name'] = country_df['country_name'].replace({
    'Korea, North': 'North Korea',
    'Korea, South': 'South Korea'})
country_df['country'] = country_df['country'].replace({
    'Korea, North': 'North Korea',
    'Korea, South': 'South Korea'})

# Remove Central African Republic and South Africa
country_df = country_df[~country_df['country'].isin(['Central African Republic', 'South Africa'])]

# Add Africa as a new entry
new_africa_row = pd.DataFrame([{'country_name': 'Africa', 'country': 'Africa'}])
country_df = pd.concat([country_df, new_africa_row], ignore_index=True)

# Remove any remaining duplicates again, if necessary
country_df = country_df.drop_duplicates(subset=['country_name', 'country'], keep='first')

# Display the updated DataFrame
print(country_df)

# Export the updated DataFrame to a CSV file
country_df.to_csv(r"C:\Users\Drew\20th_century\country_wrangled.csv", index=False)

          country_name        country
0         Afghanistan     Afghanistan
1             Albania         Albania
2             Algeria         Algeria
3             Andorra         Andorra
4              Angola          Angola
..                 ...            ...
204     South Ossetia   South Ossetia
205            Taiwan          Taiwan
206       Transnistria   Transnistria
207       Soviet Union   Soviet Union
208             Africa         Africa

[209 rows x 2 columns]


In [197]:
country_df.tail(10)

Unnamed: 0,country_name,country
199,Luhansk People's Republic,Luhansk People's Republic
200,Niue,Niue
201,Northern Cyprus,Northern Cyprus
202,Sahrawi Arab Democratic Republic,Sahrawi Arab Democratic Republic
203,Somaliland,Somaliland
204,South Ossetia,South Ossetia
205,Taiwan,Taiwan
206,Transnistria,Transnistria
207,Soviet Union,Soviet Union
208,Africa,Africa


## Filtering entities from wiki text

In [200]:
# Filtering out entities not of interest

def filter_entity(ent_list, country_df):
    return [ent for ent in ent_list 
            if ent in list(country_df['country'])]

In [201]:
# Checking entity filter

filter_entity(["Germany","Africa","Korea","South Korea"],country_df)

['Germany', 'Africa', 'South Korea']

In [217]:
# Define replacements for multi-word country names
replacements = {
    "United States": "United_States",
    "United Kingdom": "United_Kingdom",
    "North Korea": "North_Korea",
    "South Korea": "South_Korea"}

# Apply replacements to the country entities after filtering
df_sentences['country_entities'] = df_sentences['country_entities'].apply(
    lambda x: [replacements.get(ent, ent) for ent in x])

In [218]:
df_sentences['country_entities'].head(20)

0                                                    []
1                                                    []
2                                                    []
3                                                    []
4                                                    []
5                                                    []
6                                                    []
7                                                    []
8     [France, Russia, Germany, Austria, Hungary, Bu...
9                                             [Germany]
10                                                   []
11                                                   []
12                                                   []
13                                                   []
14                                                   []
15                                                   []
16                                              [Italy]
17                                              

In [219]:
# Filtering out sentences that do not have country entities

df_sentences_filtered=df_sentences[df_sentences['country_entities'].map(len)>0]

In [220]:
df_sentences_filtered.tail(20)

Unnamed: 0,sentence,entities,country_entities
147,"(Zimmerman, Dwight, Charles, de, Gaulle, and, ...","[Zimmerman Dwight Charles de Gaulle, 12 Decemb...",[Italy]
148,"(World, Dunya, News, dunyanews, tv, 14, Februa...","[14 February 2008, 12 December 2018, Italy Tri...",[Greece]
156,"(Retrieved, 12, December, 2018, Hickman, Kenne...","[12 December 2018, Hickman Kennedy, 19 Septemb...","[Africa, Germany, Germany]"
163,"(The, US, Invasion, of, Italy, The, National, ...",[The US Invasion of Italy The National WWII Mu...,[Italy]
175,"(The, Problem, of, Sovereignty, Manchukuo, 193...","[The Problem of Sovereignty Manchukuo, 1932 19...",[Japan]
177,"(Second, Sino, Japanese, War, 1937, 45, Anti, ...","[Second Sino Japanese War 1937, Anti Japanese ...","[Italy, Japan]"
180,"(The, United, States, declares, war, on, Japan...","[The United States, Japan, 16]",[Japan]
184,"(The, History, Place, Timeline, of, Pacific, W...","[The History Place Timeline of Pacific War, 13...",[Philippines]
197,"(Ojo, Marvellous, 26, August, 2018, Soviet, In...","[26 August 2018, Manchuria Finishing, the Japa...",[Japan]
205,"(The, System, Two, new, histories, show, how, ...","[Two, Nazi, The New Yorker Retrieved, 15 Decem...",[Israel]


In [221]:
# Creating full copy of the filtered DataFrame to avoid any warning
df_sentences_filtered = df_sentences_filtered.copy()

# Now apply the operation to take only the first name of the characters
df_sentences_filtered['country_entities'] = df_sentences_filtered['country_entities'].apply(lambda x: [item.split()[0] for item in x])
df_sentences_filtered.tail(20)

Unnamed: 0,sentence,entities,country_entities
147,"(Zimmerman, Dwight, Charles, de, Gaulle, and, ...","[Zimmerman Dwight Charles de Gaulle, 12 Decemb...",[Italy]
148,"(World, Dunya, News, dunyanews, tv, 14, Februa...","[14 February 2008, 12 December 2018, Italy Tri...",[Greece]
156,"(Retrieved, 12, December, 2018, Hickman, Kenne...","[12 December 2018, Hickman Kennedy, 19 Septemb...","[Africa, Germany, Germany]"
163,"(The, US, Invasion, of, Italy, The, National, ...",[The US Invasion of Italy The National WWII Mu...,[Italy]
175,"(The, Problem, of, Sovereignty, Manchukuo, 193...","[The Problem of Sovereignty Manchukuo, 1932 19...",[Japan]
177,"(Second, Sino, Japanese, War, 1937, 45, Anti, ...","[Second Sino Japanese War 1937, Anti Japanese ...","[Italy, Japan]"
180,"(The, United, States, declares, war, on, Japan...","[The United States, Japan, 16]",[Japan]
184,"(The, History, Place, Timeline, of, Pacific, W...","[The History Place Timeline of Pacific War, 13...",[Philippines]
197,"(Ojo, Marvellous, 26, August, 2018, Soviet, In...","[26 August 2018, Manchuria Finishing, the Japa...",[Japan]
205,"(The, System, Two, new, histories, show, how, ...","[Two, Nazi, The New Yorker Retrieved, 15 Decem...",[Israel]


## Creating relationships between countries in text

In [256]:
relationships = []
for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i + 5, df_sentences_filtered.index[-1])
    country_list = sum(df_sentences_filtered.loc[i:end_i, 'country_entities'], [])
    
    # Remove duplicate country names that are next to each other
    country_unique = [country_list[i] for i in range(len(country_list))
                      if (i == 0) or country_list[i] != country_list[i - 1]]
    
    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [257]:
relationship_df=pd.DataFrame(relationships)

In [258]:
relationship_df.head()

Unnamed: 0,source,target
0,France,Russia
1,Russia,Germany
2,Germany,Austria
3,Austria,Hungary
4,Hungary,Bulgaria


In [259]:
# Define replacements for multi-word country names
replacements = {
    "United States": "United_States",
    "United Kingdom": "United_Kingdom",
    "North Korea": "North_Korea",
    "South Korea": "South_Korea",
    # Add other replacements as needed
}

# Apply replacements to both source and target columns
relationship_df['source'] = relationship_df['source'].replace(replacements)
relationship_df['target'] = relationship_df['target'].replace(replacements)

# Remove underscores from the source and target columns
relationship_df['source'] = relationship_df['source'].str.replace('_', ' ')
relationship_df['target'] = relationship_df['target'].str.replace('_', ' ')

# Sorting the relationships
relationship_df=pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(10)

Unnamed: 0,source,target
0,France,Russia
1,Germany,Russia
2,Austria,Germany
3,Austria,Hungary
4,Bulgaria,Hungary
5,Bulgaria,Russia
6,Germany,Russia
7,Germany,Russia
8,Germany,Russia
9,France,Russia


In [260]:
# Count sum of relationships between country occurances
relationship_df['interaction_count'] = 1
relationship_df = relationship_df.groupby(['source', 'target'], sort=False, as_index=False).sum()

In [261]:
relationship_df.tail(20)

Unnamed: 0,source,target,interaction_count
93,North Korea,South Korea,11
94,Australia,South Korea,6
95,Australia,Vietnam,4
96,Cuba,United States,6
97,Russia,United States,4
98,Canada,Japan,6
99,Canada,Lithuania,4
100,Lithuania,Russia,6
101,Africa,China,7
102,India,North Korea,3


### Exporting relationship dataframe

In [262]:
# exporting relationship_df

relationship_df.to_csv(r"C:\Users\Drew\20th_century\country_relationship_count.csv", index=False)