Named Entity Recognition

In [27]:
import spacy
import nltk
from nltk import sent_tokenize
import spacy_transformers
import curated_transformers
import transformers

In [28]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
     ---------------------------------------- 0.0/457.4 MB ? eta -:--:--
     ---------------------------------------- 0.5/457.4 MB 3.3 MB/s eta 0:02:18
     ---------------------------------------- 1.3/457.4 MB 3.5 MB/s eta 0:02:10
     ---------------------------------------- 2.1/457.4 MB 3.4 MB/s eta 0:02:13
     ---------------------------------------- 2.9/457.4 MB 3.4 MB/s eta 0:02:13
     ---------------------------------------- 3.4/457.4 MB 3.4 MB/s eta 0:02:14
     ---------------------------------------- 4.2/457.4 MB 3.3 MB/s eta 0:02:17
     ---------------------------------------- 5.0/457.4 MB 3.3 MB/s eta 0:02:17
     ---------------------------------------- 5.5/457.4 MB 3.4 MB/s eta 0:02:15
      --------------------------------------- 6.3/457.4 MB 3.4 MB/s eta 0:02:15
      ---------------------

  _torch_pytree._register_pytree_node(


In [32]:
from spacy.lang.en import English
nlp = English()

# Construction from scratch
from spacy.vocab import Vocab
from spacy.language import Language
nlp = Language(Vocab())

In [37]:
# Load Model
def load_model():
    nlp = spacy.load('en_core_web_trf')
    return nlp

In [50]:
nlp_model = load_model()

ValueError: [E002] Can't find factory for 'curated_transformer' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, entity_ruler, tagger, morphologizer, ner, beam_ner, senter, sentencizer, spancat, spancat_singlelabel, span_finder, future_entity_ruler, span_ruler, textcat, textcat_multilabel, en.lemmatizer

In [40]:
# Load dataset
import os
import sys
import pathlib
folder_path = pathlib.Path().parent.resolve()
sys.path.append(os.path.join(folder_path, '../'))
from utils import load_subtitles_dataset

In [41]:
dataset_path = 'data/Subtitles'
df = load_subtitles_dataset(dataset_path)

In [42]:
df.head()

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas..."


In [43]:
sample_script = df.iloc[0]['script']
sample_script

'A long time ago, a powerful demon fox appeared with nine tails.\n With its powerful tails,\n it could smash mountains and create tidal waves.\n A band of Ninjas rose to defend their village from attack.\n We have to wait until the Fourth Hokage gets here!\n We can\'t let it get any closer to our village!\n One great Ninja was able to imprison the monster,\n but died in the process.\n This Ninja was known as… the Fourth Hokage.\n Naruto!\n Why did you do such a thing?!\n You\'re really gonna get it this time!\n I don\'t care!\n You know your problem?\n You can\'t do the things I do!\n Only I can do this!\n I\'m better than all of you! Believe it!\n There\'s a problem, sir!\n Lord Hokage!\n What is it?\n Did that Naruto do something again?\n Yes. He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them!\n Wait!\n Ha ha…\n Why should I?\n Hey, Naruto!\n How did you suddenly get here, lruka Sensei?\n The question is what are you doing here when you should 

In [44]:
sentences = sent_tokenize(sample_script)
sentences

['A long time ago, a powerful demon fox appeared with nine tails.',
 'With its powerful tails,\n it could smash mountains and create tidal waves.',
 'A band of Ninjas rose to defend their village from attack.',
 'We have to wait until the Fourth Hokage gets here!',
 "We can't let it get any closer to our village!",
 'One great Ninja was able to imprison the monster,\n but died in the process.',
 'This Ninja was known as… the Fourth Hokage.',
 'Naruto!',
 'Why did you do such a thing?!',
 "You're really gonna get it this time!",
 "I don't care!",
 'You know your problem?',
 "You can't do the things I do!",
 'Only I can do this!',
 "I'm better than all of you!",
 'Believe it!',
 "There's a problem, sir!",
 'Lord Hokage!',
 'What is it?',
 'Did that Naruto do something again?',
 'Yes.',
 'He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them!',
 'Wait!',
 'Ha ha…\n Why should I?',
 'Hey, Naruto!',
 'How did you suddenly get here, lruka Sensei?',
 'The q

In [45]:
sentences[60:90]

["Don't you know who the Hokage leaders are?",
 'Of course, I do!',
 'I know they earned the title Lord Hokage\n because they were the best Ninja of their time, right?',
 'Especially the Fourth Hokage was a hero\n who saved the village from the nine-tail demon fox.',
 'Then why did you do that?',
 "Because I'll become a Hokage myself.",
 "And I'll be the greatest Hokage of all time!",
 'So that everyone will finally learn to accept me!',
 'By the way, Sensei, I have a favor to ask.',
 'You want another bowl?',
 'Mmmm…No…\n Can I borrow that Leaf headband for a while?',
 'This?',
 'No no!',
 'This is worn only by those who have graduated from Ninja Academy.',
 "Tomorrow, you will…\n You're so mean!",
 "So that's why you took off your goggles…\n Humph... One more bowl please!",
 'We are now about to begin the graduation test.',
 'When your name is called, proceed to the next classroom.',
 'The test is on the Clone Jutsu.',
 'Oh no…\n Of all the…!',
 'That is my weakest Jutsu!',
 'But sti

In [46]:
sentence = " ".join(sentences)
sentence

'A long time ago, a powerful demon fox appeared with nine tails. With its powerful tails,\n it could smash mountains and create tidal waves. A band of Ninjas rose to defend their village from attack. We have to wait until the Fourth Hokage gets here! We can\'t let it get any closer to our village! One great Ninja was able to imprison the monster,\n but died in the process. This Ninja was known as… the Fourth Hokage. Naruto! Why did you do such a thing?! You\'re really gonna get it this time! I don\'t care! You know your problem? You can\'t do the things I do! Only I can do this! I\'m better than all of you! Believe it! There\'s a problem, sir! Lord Hokage! What is it? Did that Naruto do something again? Yes. He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them! Wait! Ha ha…\n Why should I? Hey, Naruto! How did you suddenly get here, lruka Sensei? The question is what are you doing here when you should be in class now? Now listen, Naruto. You failed 

In [None]:
doc = nlp_model(sentence)

In [None]:
doc.ents

In [None]:
for entity in doc.ents:
    print(entity, entity.label_) # Named Entity Recognition

Leaf Person
Ninja Academy ORG
Tomorrow DATE

In [None]:
def get_ners_inference(script):
    script_sentences = sent_tokenize(script)

    ners_output = []
    for sentence in script_sentences:
        doc = nlp_model(sentence)
        ners = set()
        for entity in doc.ents:
            if entity.label_ == 'PERSON':
                full_name = entity.text
                first_name = entity.text.split(" ")[0]
                first_name=first_name.strip()
                ners.add(first_name)
            ners_output.append(ners)
    return ners_output         


In [None]:
df['ners'] = df['script'].apply(get_ners_inference)
df

episode        script                     ners
0   1        A longtime ago, a powerful..  [{},{},{Naruto},{}]

In [51]:
# Character Network
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [52]:
def generate_char_network(df):
    windows = 10
    entity_relationship = []
    for row in df['ners']:
        prev_entities_in_window = []

        for sentence in row:
            prev_entities_in_window.append(list(sentence))
            prev_entities_in_window = prev_entities_in_window[-windows:]

            #Flatten 2D List into 1D List
            prev_entities_flattened = sum(prev_entities_in_window, [])

            for entity in sentence:
                for entity_in_window in prev_entities_flattened:
                    if entity != entity_in_window:
                        entity_relationship.append(sorted([entity, entity_in_window]))
    relationship_df = pd.DataFrame({'value': entity_relationship})
    relationship_df['source'] = relationship_df['value'].apply(lambda x: x[0])
    relationship_df['target'] = relationship_df['value'].apply(lambda x: x[1])   
    relationship_df = relationship_df.groupby(['source','target']).count().reset_index()
    relationship_df = relationship_df.sort_values('value', ascending=False)

    return relationship_df


In [None]:
relationship_df = generate_char_network(df)
relationship_df

source   target   value
Naruto   Sasuke    117
Sakura   Sasuke    65
Iruka    Naruto    43

In [None]:
relationship_df = relationship_df.head(200)
G = nx.from_pandas_edgelist(relationship_df, source='source',target='target',edge_attr='value',create_using=nx.Graph())

In [None]:
net = Network(notebook=True, width='1000px', height='700px', bgcolor='#222222', font_color='white',cdn_resources='remote')

In [None]:
node_degree = dict(G.degree)
nx.set_node_attributes(G, node_degree, 'size')
net.from_nx(G)
net.show('nauto.html')