# 1.6 Intro to NLP and Network Analysis

## 2. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [2]:
# Download English module
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.1/12.8 MB 1.9 MB/s eta 0:00:07
      --------------------------------------- 0.2/12.8 MB 1.6 MB/s eta 0:00:08
     - -------------------------------------- 0.5/12.8 MB 3.2 MB/s eta 0:00:04
     -- ------------------------------------- 0.8/12.8 MB 4.0 MB/s eta 0:00:04
     --- ------------------------------------ 1.3/12.8 MB 5.1 MB/s eta 0:00:03
     ----- ---------------------------------- 1.8/12.8 MB 6.0 MB/s eta 0:00:02
     ------- -------------------------------- 2.3/12.8 MB 6.7 MB/s eta 0:00:02
     --------- ------------------------------ 2.9/12.8 MB 7.4 MB/s eta 0:00:02
     ----------- ---------------------------- 3.

In [3]:
# Load spacy English module
NER = spacy.load("en_core_web_sm")

## 3. Load the twentieth-century text file

In [5]:
# Load the file
with open('key_events_of_20th_century.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

## 4. Evaluate whether the text needs wrangling

#### I decided that the text file needed to be cleaned since there are stop wors and unnecessary puncation in the data. Cleaning the data file will give a cleaner version of text to analyze.

In [6]:
# Sentence tokenization

from nltk.tokenize import sent_tokenize
tokenized_sent = sent_tokenize(data)

In [7]:
# Word tokenization

from nltk.tokenize import word_tokenize
tokenized_word = word_tokenize(data)

In [8]:
# Defining stopwords

from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

In [9]:
# Removing stopwords in words

filtered_words = [] # creates an empty list
for word in tokenized_word:
    if word not in stop_words:
        filtered_words.append(word)

In [10]:
# Substitute all punctuation marks with a space

sans_punc = re.sub("[^a-zA-Z]",  # Search for all non-letters
                        " ",        # Replace all non-letters with spaces
                        str(filtered_words))

In [11]:
# Word tokenization

tokenized_word_2 = word_tokenize(sans_punc)

In [12]:
new_stopwords = ["And", "Then", 'n', 't', 's', 'The', 'In', 'would', 'S', 'II', 'would', 'p']

In [13]:
filtered = []
for word in tokenized_word_2:
    if word not in new_stopwords:
        filtered.append(word)

In [14]:
#Combine the words
listToStr = ' '.join([str(elem) for elem in filtered])

In [15]:
#Exporting Text File
with open('KeyEvents_20thCentury_WrangledData.txt', 'w') as textfile:
    textfile.write(listToStr)

## 5. Create NER objects

In [16]:
# Creating the NER object
book = NER(listToStr)

In [17]:
# Visualize identified entities
displacy.render(book[27:200], style = "ent", jupyter = True)

## 6. Split the sentence entities from the NER object

In [18]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [19]:
df_sentences.head(20)

Unnamed: 0,sentence,entities
0,"(Key, events, th, century, WikipediaJump, cont...","[Navigation Main, WikipediaContact, Contribute..."
1,"(Create, account, Log, Pages, logged, editors,...","[Log Pages, World War I Russian Revolution Com..."
2,"(Operation, Overlord, Final, days, war, Pacifi...","[days, Pacific Background Japanese Expansion, ..."
3,"(See, also, References, Sources, External, lin...",[References Sources External]
4,"(history, General)",[]
5,"(What, links, hereRelated, changesUpload, file...","[URLDownload, Download PDFPrintable]"
6,"(Wikimedia, CommonsFrom, Wikipedia, free, ency...","[Wikimedia CommonsFrom Wikipedia, World Wars, ..."
7,"(Kaiser, Wilhelm, Much, map, Europe, redrawn, ...","[Kaiser Wilhelm, Europe, Yugoslavia, Czechoslo..."
8,"(However, European, revolutions, defeated, Vla...","[European, Vladimir Lenin, Joseph Stalin, Leon..."
9,"(Many, people, saw, first, stage, end, capital...","[first, Soviet Union, Dorothea Lange, Migrant ..."


## 7. Filter the entities so that you end up only with the ones from your countries list

In [27]:
# Import countries
country_df = pd.read_csv("countries_list.csv", index_col = 0)

In [28]:
country_df.head(10)

Unnamed: 0,Country,Times mentioned
0,Afghanistan,1
1,Albania,2
2,Algeria,1
3,Andorra,0
4,Angola,1
5,Antigua and Barbuda,0
6,Argentina,0
7,Armenia,0
8,Australia,2
9,Austria,5


In [29]:
# Function to filter out entities not of interest

def filter_entity(ent_list, country_df):
    return [ent for ent in ent_list 
            if ent in list(country_df['Country'])]

In [30]:
# Check

filter_entity(["Australia"], country_df)

['Australia']

In [31]:
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, country_df))

In [32]:
df_sentences['country_entities'].head(20)

0                                                    []
1                                                    []
2                                                    []
3                                                    []
4                                                    []
5                                                    []
6     [France, Italy, Russia, Germany, Austria, Hung...
7                                                    []
8                                                    []
9     [Germany, Italy, Germany, Germany, Germany, Un...
10                                                   []
11    [Spain, France, Poland, Poland, France, German...
12                                                   []
13                         [Estonia, Latvia, Lithuania]
14    [Finland, Germany, Poland, Luxembourg, Norway,...
15            [Denmark, Sweden, France, France, France]
16                                             [France]
17                              [France, Italy, 

In [33]:
# Filter out sentences that don't have any character entities

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [34]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
95,"(We, Can, Still, Learn, From, It, Time, Retrie...",[Sino Japanese War Anti Japanese War Eight Yea...,"[Italy, Japan]"
96,"(Retrieved, Forgotten, Reason, Why, Japan, Att...","[Japan, Pearl Harbor National Interest, United...","[Japan, United States, Japan]"
103,"(ONLINE, Retrieved, Andrew, Glass, August, Hir...","[Andrew Glass, August, Hirohito, Japan, Aug PO...",[Japan]
109,"(Retrieved, Kirsch, Adam, April, System, Two, ...","[Two, Nazi, December, Perspectives Clinical Re...",[Israel]
118,"(Preparatory, Commission, Comprehensive, Nucle...","[BBC News, Retrieved Nuclear, Stockholm Intern...",[Iran]
119,"(Retrieved, Why, Did, League, Nations, Fail, w...","[Retrieved Why Did League Nations Fail, un, St...",[India]
120,"(Portuguese, Africa, Oxford, Research, Encyclo...","[Portuguese, Africa Oxford Research Encycloped...",[Afghanistan]
122,"(Africa, Retrieved, Yalta, Conference, Cold, W...","[Cold War, March, Roberts Geoffrey, World War ...",[Russia]
129,"(Fall, Saigon, Time, Retrieved, Woollacott, Ma...","[Saigon Time, Woollacott Martin, April, Forty ...",[Vietnam]
144,"(Guardian, ISSN, Retrieved, Corstange, Daniel,...","[Guardian ISSN Retrieved Corstange, Middle Eas...",[India]


## 8. Create the relationship dataframe

In [35]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i: end_i].country_entities), [])
    
    # Remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list)) 
                   if (i==0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [36]:
relationship_df = pd.DataFrame(relationships)

In [37]:
relationship_df

Unnamed: 0,source,target
0,France,Italy
1,Italy,Russia
2,Russia,Germany
3,Germany,Austria
4,Austria,Hungary
...,...,...
745,India,Afghanistan
746,Afghanistan,Russia
747,India,Afghanistan
748,Afghanistan,Russia


In [38]:
# Sort the cases with a- >b and b- >a
relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,France,Italy
1,Italy,Russia
2,Germany,Russia
3,Austria,Germany
4,Austria,Hungary


In [39]:
relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

relationship_df.head(10)

Unnamed: 0,source,target,value
0,France,Italy,15
1,Italy,Russia,6
2,Germany,Russia,18
3,Austria,Germany,6
4,Austria,Hungary,6
5,Bulgaria,Hungary,6
6,Bulgaria,Russia,6
7,Germany,Ukraine,12
8,Germany,Italy,18
9,Germany,United States,17


## 9. Save and expport your dataframe

In [40]:
# Export Relationship
relationship_df.to_csv('key_events_20th_century_relationship.csv')