In [1]:
!pip install pandas
!pip install numpy



In [2]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [3]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m46.8 kB/s[0m eta [36m0:00:00[0m00:05[0m00:11[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

## Load Key events of 20th century

In [5]:
# Load the text

with open('20th_century_article_Wiki.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

In [6]:
text = NER(data)

In [7]:
print(len(text))

20324


In [8]:
# Visualize identified entities

displacy.render(text[273:20000], style = "ent", jupyter = True)

## Get named entity list per sentence

In [9]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in text.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [10]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(Key, events, of, the, 20th, century, -, Wikip...","[the 20th century - WikipediaJump, Contribute,..."
1,"(accountLog, inPersonal, toolsDonate, Create, ...","[20th, the 20th century, World War I]"
2,"(depression1.2.2The, rise, of, dictatorship1.3...","[World War II, Pacific1.3.7.1Background1.3.8Ja..."
3,"(begins1.4The, post, -, war, world1.4.1The, en...","[Cold War, 1947–1991)1.4.3War]"
4,"(race1.4.5The, end, of, the, Cold, War1.4.6Inf...","[the Cold War1.4.6Information, 20th, pageGet, ..."
5,"(The, World, Wars, sparked, tension, between, ...","[the Cold War, the Space Race]"
6,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
7,"(Historic, events, in, the, 20th, century[edit...",[the 20th]
8,"(Edwardian, eraThe, new, beginning, of, the, 2...","[Edwardian, the 20th century]"
9,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"


## Load country list

In [11]:
# Import countries

country_df = pd.read_csv("countries_&_aliases.csv", index_col = 0)

In [12]:
country_df.head()

Unnamed: 0_level_0,aliases
Country,Unnamed: 1_level_1
Afghanistan,Afghanistan
Albania,Albania
Algeria,Algeria
Andorra,Andorra
Angola,Angola


In [13]:
country_df["new_alias"] = country_df['aliases'].apply(lambda x: x.split(',')[0].strip('[]').strip("''") if '[' in x else x)

In [14]:
country_df.drop(columns=['aliases'], inplace = True)
country_df.rename(columns={'new_alias':'aliases'}, inplace = True)

In [15]:
country_df

Unnamed: 0_level_0,aliases
Country,Unnamed: 1_level_1
Afghanistan,Afghanistan
Albania,Albania
Algeria,Algeria
Andorra,Andorra
Angola,Angola
...,...
Venezuela,Venezuela
Vietnam,Vietnam
Yemen,Yemen
Zambia,Zambia


## Filtering entities from the text

In [16]:
# Function to filter out entities not of interest

def filter_entity(ent_list, country_df):
    return [ent for ent in ent_list 
            if ent in list(country_df['aliases'])]

In [17]:
# Check

filter_entity(["Afghanistan", "CF", "2"], country_df)

['Afghanistan']

In [18]:
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, country_df))

In [19]:
df_sentences['country_entities'].head(20)

0                    []
1                    []
2                    []
3                    []
4                    []
5                    []
6                    []
7                    []
8                    []
9                    []
10                   []
11                   []
12                   []
13                   []
14                   []
15     [France, Russia]
16    [Germany, Russia]
17            [Germany]
18            [Germany]
19                   []
Name: country_entities, dtype: object

In [20]:
# Filter out sentences that don't have any country entities

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [21]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
840,"("", The, forgotten, violence, that, helped, In...",[India],[India]
843,"("", Indian, Independence, Day, :, everything, ...","[Indian Independence Day, India, Pakistan, 70 ...","[India, Pakistan]"
851,"("", The, Philippines, ,, 1898–1946, |, US, Hou...","[Philippines, 1898–1946, US House of Represent...",[Philippines]
872,"("", Colonial, Cartographies, ,, Postcolonial, ...","[Colonial Cartographies, Enduring Failures of ...",[Afghanistan]
898,"(The, Moldovans, :, Romania, ,, Russia, ,, and...","[Moldovans, Romania, Russia, the Politics of C...","[Romania, Russia]"
942,"("", Selling, ', Operation, Passage, to, Freedo...","[Thomas Dooley, the Religious Overtones of Ear...",[Vietnam]
964,"("", Stuck, in, Endless, Preliminaries, :, Viet...","[Vietnam, the Battle of the Paris Peace Table,...",[Vietnam]
1151,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American, the Middle East, a Field Exper...",[Lebanon]
1155,"(The, Rise, of, China, and, India, :, A, New, ...","[The Rise of China, India]",[India]
1156,"(Singapore, :, World, Scientific, .)",[Singapore],[Singapore]


In [22]:
# Take only the first name of the countries

df_sentences_filtered['country_entities'] = df_sentences_filtered['country_entities'].apply(lambda x: [item.split()[0]
                                                                                                    for item in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sentences_filtered['country_entities'] = df_sentences_filtered['country_entities'].apply(lambda x: [item.split()[0]


## Create Relationships

In [23]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    country_list = sum((df_sentences_filtered.loc[i: end_i].country_entities), [])
    
    # Remove duplicated characters that are next to each other
    country_unique = [country_list[i] for i in range(len(country_list)) 
                   if (i==0) or country_list[i] != country_list[i-1]]
    
    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [24]:
relationship_df = pd.DataFrame(relationships)

In [25]:
# Set option to display all rows
pd.set_option('display.max_rows', None)

In [26]:
relationship_df

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Russia,Germany
3,Germany,Russia
4,France,Russia
5,Russia,Germany
6,Germany,Russia
7,Russia,Germany
8,France,Russia
9,Russia,Germany


In [27]:
# Sort the cases with a->b and b->a

relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Germany,Russia
3,Germany,Russia
4,France,Russia


In [28]:
# Summarize the interactions

relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [29]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,France,Russia,12
1,Germany,Russia,21
2,Germany,Italy,26
3,Austria,Germany,11
4,Germany,Spain,2
5,France,Spain,1
6,France,Poland,11
7,France,Germany,30
8,Germany,Poland,29
9,Estonia,Germany,5


In [30]:
relationship_df.to_csv('20th_century_relationship.csv')