# Named Entity Recognition

In [89]:
import spacy
from nltk import sent_tokenize


In [90]:
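# One-time environment setup (uncomment to run). `%pip` installs into the running kernel's environment.
# %pip install "spacy[transformers]" spacy-curated-transformers
# !python -m spacy download en_core_web_trf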

# Load Model

In [91]:
def load_model():
    """Load and return the spaCy ``en_core_web_trf`` pipeline."""
    return spacy.load("en_core_web_trf")

In [92]:
# Load the pipeline once; get_ners_inference reads this module-level `nlp_model`.
nlp_model = load_model()

# Load Dataset

In [93]:
import os
import sys
import pathlib

# Make the parent of the working directory importable so the project-level
# `utils` module resolves. `Path().parent` is still ".", so resolving the
# working directory directly gives the same `folder_path` value; taking
# `.parent` afterwards yields a normalized path instead of "<cwd>/../".
folder_path = pathlib.Path().resolve()
project_root = str(folder_path.parent)
if project_root not in sys.path:  # re-running the cell must not add duplicates
    sys.path.append(project_root)
from utils import load_subtitles_dataset

In [94]:
# Relative path to the subtitles folder, handed to the project helper from `utils`.
# The preview below shows the resulting frame has `episode` and `script` columns.
dataset_path = "../data/Subtitles/"
df = load_subtitles_dataset(dataset_path)

In [95]:
# Preview the first rows to check what was loaded.
df.head()

Unnamed: 0,episode,script
0,163,I want to try and gatherthe unrestrained winds...
1,48,"Press down hard on the gas\n That’s right, the..."
2,60,"Yeah, turn your sadness into kindness,your uni..."
3,107,Connecting old wordsthat have been used up\n I...
4,66,"Yeah, turn your sadness into kindness,your uni..."


In [96]:
# Take the script text of the first row as a small sample to experiment on.
sample_script = df.iloc[0]['script']
sample_script

"I want to try and gatherthe unrestrained winds\n I’ll run toward the horizon,alongside the wave crests\n I’ve made my decision,\n even if there’s a long way to goon that road.\n I’ll continue towards the future I’ve planned\n Time rushes usThe heartbeat speeds up\n When I woke in the middle of a dreamI kept searching for that same light\n Under the shining star-lit sky withcountless constellations and shadows\n There was something beyond that…What were you gazing at?\n What were you gazing at?\n Is this really Moso’s house?\n Here he’s a superintendant officer,\n so I thought it’d be a more luxuriousbuilding with gold folding screens.\n Your voices are too loud.\n Lord Moso is a magnificent personwith no self-interests.\n So he lives frugally.\n But you know, he’s an old man\n who’s the second most important guyin the land next to the feudal lord, right?\n He smacks of being really poor.\n Naruto!\n Well well, how shameful of meto present such a poor residence.\n Lord Moso.\n What?!\n

In [97]:
# Split the sample script into sentences with NLTK's sent_tokenize.
sentences = sent_tokenize(sample_script)

In [98]:
# Work on a 30-sentence excerpt. Slicing a fresh tokenization (instead of
# re-slicing `sentences`) keeps this cell idempotent: re-running it no longer
# slices the already-shortened list down to an empty one.
sentences = sent_tokenize(sample_script)[90:120]

In [99]:
# Re-assemble the excerpt into one text. The tokenized sentences keep their own
# terminal punctuation, so they are joined with a space; joining with "." would
# add a stray period and leave no whitespace between sentences.
sentence = " ".join(sentences)

# Run Model

In [100]:
# Run the loaded spaCy pipeline over the joined excerpt.
doc = nlp_model(sentence)

  with torch.cuda.amp.autocast(self._mixed_precision):


In [101]:
# Entity spans the pipeline found in the excerpt.
doc.ents

(Sagi, three, Moso, Sagi, the Hidden Leaf Village, Naruto)

In [102]:
# Show each detected entity next to its predicted label.
for entity in doc.ents:
    print(entity, entity.label_)

Sagi PERSON
three CARDINAL
Moso PERSON
Sagi PERSON
the Hidden Leaf Village ORG
Naruto PERSON


In [103]:
def get_ners_inference(script):
    """Extract the first names of PERSON entities from each sentence of a script.

    The script is split with ``sent_tokenize`` and every sentence is run
    through the module-level ``nlp_model`` spaCy pipeline.

    Args:
        script: Full text of one script.

    Returns:
        A list with one ``set`` of names per sentence, in sentence order.
        A sentence without any PERSON entity contributes an empty set.
    """
    script_sentences = sent_tokenize(script)

    ner_output = []

    # nlp.pipe processes the sentences as a batched stream and yields one Doc
    # per input text, in order, instead of invoking the pipeline per sentence.
    for doc in nlp_model.pipe(script_sentences):
        ners = set()
        for entity in doc.ents:
            if entity.label_ != "PERSON":
                continue
            # Keep only the first whitespace-separated token
            # ("Naruto Uzumaki" -> "Naruto"). split() without a separator
            # ignores leading/repeated whitespace, so an empty string is
            # never stored as a name.
            name_parts = entity.text.split()
            if name_parts:
                ners.add(name_parts[0])
        ner_output.append(ners)

    return ner_output

In [104]:
# Keep only the first 10 rows (presumably to keep the NER run short).
# .copy() makes the subset an independent frame, so the later
# `df['ners'] = ...` assignment does not write into a slice of the loaded
# data (which can raise SettingWithCopyWarning).
df = df.head(10).copy()

In [105]:
# Display the 10-row subset that NER will be run on.
df

Unnamed: 0,episode,script
0,163,I want to try and gatherthe unrestrained winds...
1,48,"Press down hard on the gas\n That’s right, the..."
2,60,"Yeah, turn your sadness into kindness,your uni..."
3,107,Connecting old wordsthat have been used up\n I...
4,66,"Yeah, turn your sadness into kindness,your uni..."
5,205,"Fly into the wavy and twisted sky,into your ha..."
6,47,"Press down hard on the gas\n That’s right, the..."
7,88,We are Fighting Dreamers aiming high\n Fightin...
8,2,"C'mon!\n Running like a fugitive,\n Being chas..."
9,108,Connecting old wordsthat have been used up\n I...


In [106]:
# Run NER over every script; each `ners` cell holds the list of per-sentence
# name sets returned by get_ners_inference.
df['ners'] = df['script'].apply(get_ners_inference)

  with torch.cuda.amp.autocast(self._mixed_precision):


In [107]:
# Inspect the frame again, now including the `ners` column.
df

Unnamed: 0,episode,script,ners
0,163,I want to try and gatherthe unrestrained winds...,"[{}, {}, {}, {Moso}, {}, {}, {Moso}, {}, {}, {..."
1,48,"Press down hard on the gas\n That’s right, the...","[{}, {}, {}, {}, {}, {}, {}, {}, {Naruto}, {},..."
2,60,"Yeah, turn your sadness into kindness,your uni...","[{}, {}, {}, {Naruto}, {}, {}, {Sasuke}, {Dosu..."
3,107,Connecting old wordsthat have been used up\n I...,"[{}, {}, {Naruto, Sasuke}, {}, {Naruto, Sasuke..."
4,66,"Yeah, turn your sadness into kindness,your uni...","[{}, {}, {}, {}, {}, {}, {}, {Bushy}, {Guy}, {..."
5,205,"Fly into the wavy and twisted sky,into your ha...","[{}, {}, {}, {Yakumo}, {}, {}, {Yakumo}, {}, {..."
6,47,"Press down hard on the gas\n That’s right, the...","[{}, {Hinata, Neji}, {}, {}, {Hinata}, {Naruto..."
7,88,We are Fighting Dreamers aiming high\n Fightin...,"[{Oli}, {}, {}, {}, {}, {Oli}, {}, {}, {}, {},..."
8,2,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {Konohamaru},..."
9,108,Connecting old wordsthat have been used up\n I...,"[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."


# Character Network

In [108]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [109]:
def generate_character_network(df, window_size=10):
    """Count how often pairs of entities appear within a sliding sentence window.

    Args:
        df: DataFrame with a ``ners`` column. Each cell is an iterable of
            per-sentence entity collections (e.g. a list of sets of names).
        window_size: Number of most recent sentences, including the current
            one, that an entity is paired against. Defaults to 10, the value
            that was previously hard-coded.

    Returns:
        DataFrame with columns ``source``, ``target`` and ``value``, where
        ``source``/``target`` are the two names of a pair in sorted order and
        ``value`` is the number of recorded co-occurrences, sorted by
        ``value`` in descending order.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        # A slice of [-0:] would keep the whole history instead of nothing.
        raise ValueError("window_size must be at least 1")

    entity_relationship = []

    for row in df['ners']:
        # The window restarts for every row, so pairs never span two rows.
        recent_sentences = []

        for sentence in row:
            recent_sentences.append(list(sentence))
            recent_sentences = recent_sentences[-window_size:]

            # Flatten the windowed sentences into one list of names.
            entities_in_window = [
                name for names in recent_sentences for name in names
            ]

            # NOTE: the window includes the current sentence, so two names in
            # the same sentence are recorded once from each side (adding 2 to
            # that pair's count). Kept as-is to preserve existing counts.
            for entity in sentence:
                entity_relationship.extend(
                    sorted([entity, other])
                    for other in entities_in_window
                    if other != entity
                )

    relationship_df = pd.DataFrame({
        'value': entity_relationship,
        'source': [pair[0] for pair in entity_relationship],
        'target': [pair[1] for pair in entity_relationship],
    })
    # count() tallies the rows of each (source, target) group into `value`.
    relationship_df = relationship_df.groupby(['source', 'target']).count().reset_index()
    relationship_df = relationship_df.sort_values('value', ascending=False)

    return relationship_df


In [110]:
# Build the pair-count table from the per-sentence names in `df['ners']`.
relationship_df = generate_character_network(df)

In [111]:
# Inspect the pair counts (sorted by `value`, highest first).
relationship_df

Unnamed: 0,source,target,value
305,Naruto,Sasuke,62
263,Kurenai,Yakumo,38
282,Lee,Sasuke,31
145,Hinata,Neji,30
295,Naruto,Neji,26
...,...,...,...
210,Jutsu,takeSasuke,1
208,Jutsu,Tayuya,1
46,Byakugan,the,1
203,Jutsu,Rasengan,1


In [112]:
# Keep only the 200 most frequent pairs for plotting.
relationship_df = relationship_df.sort_values('value', ascending=False).head(200)

In [113]:
# Build an undirected graph from the pair table; each edge carries the pair's
# count as its `value` attribute.
G = nx.from_pandas_edgelist(
    relationship_df,
    source='source',
    target='target',
    edge_attr='value',
    create_using=nx.Graph()
)

# pyvis network configured for notebook rendering with a dark theme.
net = Network(notebook=True, width='1000px', height='700px', bgcolor='#222222', font_color='white', cdn_resources='remote')
# Map every node to its degree in G.
node_degree = dict(G.degree)

# Store the degree as the node attribute 'size' before handing the graph to
# pyvis -- presumably so better-connected characters render larger; confirm.
nx.set_node_attributes(G, node_degree, 'size')
net.from_nx(G)
# Write the interactive graph to naruto.html.
net.show("naruto.html")

naruto.html
