# 1.6 Intro to NLP and Network Analysis

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [2]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 6.7 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

## Load Alice in Wonderland book

In [4]:
# Load the book

with open('alice_in_wonderland.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

In [5]:
book = NER(data)

In [6]:
# Visualize identified entities

displacy.render(book[273:20000], style = "ent", jupyter = True)

## Get named entity list per sentence

In [7]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [8]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(Alice, 's, Adventures, in, Wonderland, ...","[Alice, Adventures, Lewis Carroll ..."
1,"(I, , Down, the, Rabbit, ...","[Alice, Alice]"
2,"(So, she, was, considering, in, her, own, mind...",[WhiteRabbit]
3,"(There, was, nothing, so, VERY, remarkable, in...",[Rabbit]
4,"(Oh, dear, !, )",[]
5,"(I, shall, be, late, !, ', )",[]
6,"((, when, she, thoughtit, over, afterwards, ,,...",[Alice]
7,"(In, another, moment, down, went, Alice, after...",[Alice]
8,"(The, rabbit, -, hole, went, straight, on, lik...",[Alice]
9,"(Either, the, well, was, very, deep, ,, or, sh...",[]


## Load character names

In [9]:
# Import characters

character_df = pd.read_csv("characters_alice.csv", index_col = 0)

In [10]:
character_df.head()

Unnamed: 0,character,character_alias
0,Alice,Alice
1,The White Rabbit,Rabbit
2,The Mouse,Mouse
3,The Dodo,Dodo
4,The Lory,Lory


## Filtering entities from the book

In [11]:
# Function to filter out entities not of interest

def filter_entity(ent_list, character_df):
    return [ent for ent in ent_list 
            if ent in list(character_df['character_alias'])]

In [12]:
# Check

filter_entity(["Alice", "CF", "2"], character_df)

['Alice']

In [13]:
df_sentences['character_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, character_df))

In [14]:
df_sentences['character_entities'].head(20)

0            [Alice]
1     [Alice, Alice]
2                 []
3           [Rabbit]
4                 []
5                 []
6            [Alice]
7            [Alice]
8            [Alice]
9                 []
10                []
11                []
12           [Alice]
13                []
14                []
15                []
16                []
17                []
18                []
19                []
Name: character_entities, dtype: object

In [15]:
# Filter out sentences that don't have any character entities

df_sentences_filtered = df_sentences[df_sentences['character_entities'].map(len) > 0]

In [16]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,character_entities
1483,"(`, The, idea, of, havingthe, sentence, first,...","[first, Queen]",[Queen]
1484,"(`, I, wo, n't, !, ', said, Alice, ., )",[Alice],[Alice]
1486,"(the, Queen, shouted, at, the, top, of, her, v...",[Queen],[Queen]
1488,"(`, Who, cares, for, you, ?, ', said, Alice, ,...",[Alice],[Alice]
1491,"(`, Wake, up, ,, Alice, dear, !, ')",[Alice],[Alice]
1493,"(`, Oh, ,, I, 've, had, such, a, curious, drea...",[Alice],[Alice]
1494,"(So, Alice, got, up, and, ran, off, ,, thinkin...",[Alice],[Alice]
1495,"(But, her, sister, sat, still, just, as, she, ...","[Alice, Adventures, First, Alice]","[Alice, Alice]"
1496,"(The, long, grass, rustled, at, her, feet, as,...","[Mouse, March, Queen, Gryphon]","[Mouse, Queen, Gryphon]"
1497,"(So, she, sat, on, ,, with, closed, eyes, ,, a...","[half, Queen, Gryphon]","[Queen, Gryphon]"


## Create relationships

In [17]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i: end_i].character_entities), [])
    
    # Remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list)) 
                   if (i==0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [18]:
relationship_df = pd.DataFrame(relationships)

In [19]:
relationship_df

Unnamed: 0,source,target
0,Alice,Rabbit
1,Alice,Rabbit
2,Rabbit,Alice
3,Rabbit,Alice
4,Rabbit,Alice
...,...,...
1896,Queen,Gryphon
1897,Mouse,Queen
1898,Queen,Gryphon
1899,Gryphon,Queen


In [20]:
# Sort the cases with a->b and b->a

relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,Alice,Rabbit
1,Alice,Rabbit
2,Alice,Rabbit
3,Alice,Rabbit
4,Alice,Rabbit


In [21]:
# Summarize the interactions

relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [22]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,Alice,Rabbit,90
1,Alice,Duchess,229
2,Alice,Mouse,110
3,Dodo,Mouse,10
4,Dodo,Lory,6
5,Eaglet,Lory,6
6,Alice,Eaglet,5
7,Alice,Lory,28
8,Lory,Mouse,29
9,Duck,Mouse,9


In [23]:
relationship_df.to_csv('alice_relationship.csv')