In [108]:
# import libraries
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re
spacy.cli.download("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [110]:
NER = spacy.load("en_core_web_sm")

# Load Alice in Wonderland book

In [111]:
# Load the book
with open('Alice_article_Wiki.txt', 'r', errors='ignore') as file: 
   data = file.read().replace( '\n', ' ')

book = NER(data)

In [112]:
# Visualize identified entities
displacy.render(book[273:20000], style = "ent", jupyter = True)

# Splitting Sentence Entities

In [113]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [114]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"( , Alice, 's, Adventures, in, Wonderland, ...",[Wonderland - Wikipedia ...
1,"(Edit, links, , ArticleTalk, ,...","[English, Tools Tools]"
2,"(informationCite, this, pageGet, shortened, UR...","[URLDownload, Download, PDFPrintable]"
3,"(Wikimedia, CommonsWikiquoteWikisourceWikidata...","[Wikimedia, Wikipedia, 1865, Lewis Carroll ""Al..."
4,"(For, other, uses, ,, see, Alice, in, Wonderla...","[Alice, Wonderland]"
5,"(Alice, 's, Adventures, in, Wonderland, First,...","[Alice's Adventures, Wonderland, 1865)AuthorLe..."
6,"(It, details, the, story, of, a, girl, named, ...",[Alice]
7,"(It, is, seen, as, an, example, of, the, liter...",[]
8,"(The, artist, Sir, John, Tenniel, provided, 42...","[John Tenniel, 42]"
9,"(It, received, positive, reviews, upon, releas...",[Victorian]


In [115]:
# Import characters
character_df = pd.read_csv("alice_characters_with_aliases.csv", index_col = 0)

In [116]:
character_df.head()

Unnamed: 0_level_0,character_alias
character,Unnamed: 1_level_1
Alice,Alice
The White Rabbit,Rabbit
The Mouse,Mouse
The Dodo,Dodo
The Lory,Lory


# Filtering Data

In [117]:
# Function to filter out entities not of interest
def filter_entity(ent_list, character_df):
       return [ent for ent in ent_list
                  if ent in list(character_df['character_alias'])]

In [118]:
# Check
filter_entity(["Alice", "CF", "2"], character_df)

['Alice']

In [119]:
# Apply lambda function
df_sentences['character_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, character_df))

In [120]:
# Filter out sentences that don’t have any character entities
df_sentences_filtered = df_sentences[df_sentences['character_entities'].map(len) > 0]

df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,character_entities
791,"(Alice, and, the, Land, that, Wonders, (, 2020...","[Alice, 2020, Alice, 2021, Wonderland, 1962]","[Alice, Alice]"
792,"(Alice, in, Wonderland, or, What, 's, a, Nice,...","[Alice, Wonderland, 1966]",[Alice]
793,"(Alice, in, Wonderland, (, 1966, ), Alice, Thr...","[Alice, Wonderland, 1966, Alice Through, 1983]",[Alice]
796,"(Alice, through, the, Looking, Glass, (, 1998, ))","[Alice, the Looking Glass, 1998]",[Alice]
797,"(Alice, in, Wonderland, (, 1999, ), Alice, (, ...","[Alice, Wonderland, 1999, Alice, 2009]","[Alice, Alice]"
798,"(Once, Upon, a, Time, in, Wonderland, (, 2013,...","[Wonderland, 2013, Alice, 2022, Artwork Alice,...",[Alice]
800,"(Alice, in, Wonderland, (, 2010, ))","[Alice, Wonderland, 2010]",[Alice]
801,"(Almost, Alice, (, 2010, ), "", Alice, "", "", Fo...","[Alice, 2010, Alice, Follow Me Down, Tea Party...","[Alice, Alice]"
802,"("", Just, Like, Fire, "", "", Alice, "", (, 2020,...","[Just Like Fire, Alice, 2020, the Looking Glas...","[Alice, Alice]"
804,"(Alice, no, Paint, Adventure, (, 1995, ), Alic...","[Alice no Paint Adventure, 1995, Alice, Wonder...","[Alice, Alice, Alice, Alice]"


In [121]:
# Make an explicit copy to avoid chained assignment warning
df_sentences_filtered = df_sentences[df_sentences['character_entities'].map(len) > 0].copy()

# Modify the column safely using .loc
df_sentences_filtered.loc[:, 'character_entities'] = df_sentences_filtered['character_entities'].apply(
    lambda x: [item.split()[0] for item in x]
)

In [122]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,character_entities
791,"(Alice, and, the, Land, that, Wonders, (, 2020...","[Alice, 2020, Alice, 2021, Wonderland, 1962]","[Alice, Alice]"
792,"(Alice, in, Wonderland, or, What, 's, a, Nice,...","[Alice, Wonderland, 1966]",[Alice]
793,"(Alice, in, Wonderland, (, 1966, ), Alice, Thr...","[Alice, Wonderland, 1966, Alice Through, 1983]",[Alice]
796,"(Alice, through, the, Looking, Glass, (, 1998, ))","[Alice, the Looking Glass, 1998]",[Alice]
797,"(Alice, in, Wonderland, (, 1999, ), Alice, (, ...","[Alice, Wonderland, 1999, Alice, 2009]","[Alice, Alice]"
798,"(Once, Upon, a, Time, in, Wonderland, (, 2013,...","[Wonderland, 2013, Alice, 2022, Artwork Alice,...",[Alice]
800,"(Alice, in, Wonderland, (, 2010, ))","[Alice, Wonderland, 2010]",[Alice]
801,"(Almost, Alice, (, 2010, ), "", Alice, "", "", Fo...","[Alice, 2010, Alice, Follow Me Down, Tea Party...","[Alice, Alice]"
802,"("", Just, Like, Fire, "", "", Alice, "", (, 2020,...","[Just Like Fire, Alice, 2020, the Looking Glas...","[Alice, Alice]"
804,"(Alice, no, Paint, Adventure, (, 1995, ), Alic...","[Alice no Paint Adventure, 1995, Alice, Wonder...","[Alice, Alice, Alice, Alice]"


# Creating Relationships

In [123]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i: end_i].character_entities), [])
    
    # Remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list)) 
                   if (i==0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [124]:
relationship_df = pd.DataFrame(relationships)

In [125]:
relationship_df

Unnamed: 0,source,target
0,Alice,Caterpillar
1,Alice,Caterpillar
2,Caterpillar,Alice
3,Caterpillar,Alice
4,Alice,Caterpillar
...,...,...
201,Alice,Cheshire
202,Cheshire,Alice
203,Alice,Cheshire
204,Alice,Cheshire


In [126]:
# Sort the cases with a- >b and b- >a
relationships_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationships_df.head(5)

Unnamed: 0,source,target
0,Alice,Caterpillar
1,Alice,Caterpillar
2,Alice,Caterpillar
3,Alice,Caterpillar
4,Alice,Caterpillar


In [127]:
# Summarize the interactions

relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [129]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,Alice,Caterpillar,7
1,Caterpillar,Alice,11
2,Alice,Duchess,11
3,Duchess,Alice,12
4,Alice,Hatter,17
5,Hatter,Dormouse,12
6,Dormouse,Hatter,5
7,Hatter,Alice,9
8,Alice,Queen,19
9,Queen,Duchess,4


In [130]:
relationship_df.to_csv('alice_relationship.csv')