# Named Entit recognition

In [3]:
import spacy
from nltk import sent_tokenize

In [4]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.7.3
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


You should consider upgrading via the 'C:\Users\prana\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


# Load model

In [5]:
import spacy
spacy.cli.download("en_core_web_trf")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
def load_model():
    nlp = spacy.load("en_core_web_trf")
    return nlp

In [7]:
nlp_model = load_model()

  from .autonotebook import tqdm as notebook_tqdm
  model.load_state_dict(torch.load(filelike, map_location=device))


# load Dataset

In [8]:
import os 
import sys
import pathlib
import pandas as pd
folder_path = pathlib.Path().parent.resolve()
sys.path.append(os.path.join(folder_path, '../'))
from utils import load_subtitles_dataset

In [9]:
import os
import pandas as pd
from glob import glob

def load_subtitles_dataset(dataset_path):
    # Get all subtitle files in the directory
    subtitles_paths = glob(dataset_path + '/*.ass')

    scripts = []
    episode_num = []

    for path in subtitles_paths:
        # Read lines
        with open(path, 'r', encoding='utf-8', errors='ignore') as file:
            lines = file.readlines()
            lines = lines[27:]  # Skip the first 27 lines if needed
            lines = [" ".join(line.split(',')[9:]) for line in lines]
            
            # Combine all lines into a single script
            lines = [line.replace('\\N', ' ') for line in lines]
            script = " ".join(lines)

        # Extract episode number from the file path/name
        episode = int(path.split('-')[-1].split('.')[0].strip())
        scripts.append(script)
        episode_num.append(episode)
    
    # Create the DataFrame with two columns: 'episode' and 'script'
    df = pd.DataFrame.from_dict({"episode": episode_num, "script": scripts})
    return df

In [10]:
dataset_path = "../data/Subtitles/"
df = load_subtitles_dataset(dataset_path)

In [11]:
df.head()

Unnamed: 0,episode,script
0,1,A long time ago a powerful demon fox appeared...
1,2,C'mon!\n Running like a fugitive \n Being chas...
2,3,C'mon!\n Running like a fugitive \n Being chas...
3,4,C'mon!\n Running like a fugitive \n Being chas...
4,5,C'mon!\n Running like a fugitive \n Being chas...


In [12]:
sample_script = df.iloc[0]['script']
sample_script

'A long time ago  a powerful demon fox appeared with nine tails.\n With its powerful tails \n it could smash mountains and create tidal waves.\n A band of Ninjas rose to defend their village from attack.\n We have to wait until the Fourth Hokage gets here!\n We can\'t let it get any closer to our village!\n One great Ninja was able to imprison the monster \n but died in the process.\n This Ninja was known as… the Fourth Hokage.\n Naruto!\n Why did you do such a thing?!\n You\'re really gonna get it this time!\n I don\'t care!\n You know your problem?\n You can\'t do the things I do!\n Only I can do this!\n I\'m better than all of you! Believe it!\n There\'s a problem  sir!\n Lord Hokage!\n What is it?\n Did that Naruto do something again?\n Yes. He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them!\n Wait!\n Ha ha…\n Why should I?\n Hey  Naruto!\n How did you suddenly get here  lruka Sensei?\n The question is what are you doing here when you should 

In [13]:
sentences = sent_tokenize(sample_script)

In [14]:
sentences

['A long time ago  a powerful demon fox appeared with nine tails.',
 'With its powerful tails \n it could smash mountains and create tidal waves.',
 'A band of Ninjas rose to defend their village from attack.',
 'We have to wait until the Fourth Hokage gets here!',
 "We can't let it get any closer to our village!",
 'One great Ninja was able to imprison the monster \n but died in the process.',
 'This Ninja was known as… the Fourth Hokage.',
 'Naruto!',
 'Why did you do such a thing?!',
 "You're really gonna get it this time!",
 "I don't care!",
 'You know your problem?',
 "You can't do the things I do!",
 'Only I can do this!',
 "I'm better than all of you!",
 'Believe it!',
 "There's a problem  sir!",
 'Lord Hokage!',
 'What is it?',
 'Did that Naruto do something again?',
 'Yes.',
 'He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them!',
 'Wait!',
 'Ha ha…\n Why should I?',
 'Hey  Naruto!',
 'How did you suddenly get here  lruka Sensei?',
 'The q

In [15]:
sentences = sentences[60:90]

In [16]:
sentence = ".".join(sentences)

In [17]:
sentence

"Don't you know who the Hokage leaders are?.Of course  I do!.I know they earned the title Lord Hokage\n because they were the best Ninja of their time  right?.Especially the Fourth Hokage was a hero\n who saved the village from the nine-tail demon fox..Then why did you do that?.Because I'll become a Hokage myself..And I'll be the greatest Hokage of all time!.So that everyone will finally learn to accept me!.By the way  Sensei  I have a favor to ask..You want another bowl?.Mmmm…No…\n Can I borrow that Leaf headband for a while?.This?.No no!.This is worn only by those who have graduated from Ninja Academy..Tomorrow  you will…\n You're so mean!.So that's why you took off your goggles…\n Humph... One more bowl please!.We are now about to begin the graduation test..When your name is called  proceed to the next classroom..The test is on the Clone Jutsu..Oh no…\n Of all the…!.That is my weakest Jutsu!.But still… I will do it no matter what!.Clone Jutsu!.Disqualified!.Iruka Sensei..His physica

# Run Model

In [18]:
doc = nlp_model(sentence)

  with torch.cuda.amp.autocast(self._mixed_precision):


In [19]:
doc.ents

(Ninja, Fourth, nine, Leaf, Ninja Academy, Tomorrow, One, three, Naruto, one)

In [20]:
for entity in doc.ents:
    print(entity, entity.label_)

Ninja NORP
Fourth ORDINAL
nine CARDINAL
Leaf PRODUCT
Ninja Academy ORG
Tomorrow DATE
One CARDINAL
three CARDINAL
Naruto PERSON
one CARDINAL


In [21]:
def get_ners_inference(script):
    script_sentences = sent_tokenize(script)

    ner_output = []

    for sentence in script_sentences:
        doc = nlp_model(sentence)
        ners = set()
        for entity in doc.ents:
            if entity.label_ =="PERSON":
                full_name = entity.text
                first_name = entity.text.split(" ")[0]
                first_name = first_name.strip()
                ners.add(first_name)
        ner_output.append(ners)

    return ner_output

In [22]:
df = df.head(10)

In [23]:
df

Unnamed: 0,episode,script
0,1,A long time ago a powerful demon fox appeared...
1,2,C'mon!\n Running like a fugitive \n Being chas...
2,3,C'mon!\n Running like a fugitive \n Being chas...
3,4,C'mon!\n Running like a fugitive \n Being chas...
4,5,C'mon!\n Running like a fugitive \n Being chas...
5,6,C'mon!\n Running like a fugitive \n Being chas...
6,7,C'mon!\n Running like a fugitive \n Being chas...
7,8,C'mon!\n Running like a fugitive \n Being chas...
8,9,C'mon!\n Running like a fugitive \n Being chas...
9,12,C'mon!\n Running like a fugitive \n Being chas...


In [24]:
df['ners'] = df['script'].apply(get_ners_inference)

  with torch.cuda.amp.autocast(self._mixed_precision):


In [25]:
df

Unnamed: 0,episode,script,ners
0,1,A long time ago a powerful demon fox appeared...,"[{}, {}, {}, {}, {}, {}, {}, {Naruto}, {}, {},..."
1,2,C'mon!\n Running like a fugitive \n Being chas...,"[{}, {}, {}, {}, {}, {}, {}, {}, {}, {Konohama..."
2,3,C'mon!\n Running like a fugitive \n Being chas...,"[{}, {}, {}, {Sakura, Sasuke}, {}, {Konohamaru..."
3,4,C'mon!\n Running like a fugitive \n Being chas...,"[{}, {}, {}, {Naruto}, {}, {}, {Iruka}, {}, {N..."
4,5,C'mon!\n Running like a fugitive \n Being chas...,"[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
5,6,C'mon!\n Running like a fugitive \n Being chas...,"[{}, {}, {Sasuke}, {Sakura}, {Naruto}, {}, {Na..."
6,7,C'mon!\n Running like a fugitive \n Being chas...,"[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
7,8,C'mon!\n Running like a fugitive \n Being chas...,"[{}, {}, {}, {}, {}, {}, {}, {Sasuke}, {}, {},..."
8,9,C'mon!\n Running like a fugitive \n Being chas...,"[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
9,12,C'mon!\n Running like a fugitive \n Being chas...,"[{}, {}, {}, {}, {Zabuza}, {}, {}, {}, {Naruto..."


# Character Network

In [26]:
!pip install pyvis



You should consider upgrading via the 'C:\Users\prana\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [28]:
def generate_character_network(df):

    windows=10
    entity_relationship = []

    for row in df['ners']:
        previous_entities_in_window = []

        for sentence in row:
            previous_entities_in_window.append(list(sentence))
            previous_entities_in_window = previous_entities_in_window[-windows:]

            # Flatten 2D List into 1D List
            previous_entities_flattened = sum(previous_entities_in_window, [])

            for entity in sentence:
                for entity_in_window in previous_entities_flattened:
                    if entity != entity_in_window:
                        entity_relationship.append(sorted([entity, entity_in_window]))
    
    relationship_df = pd.DataFrame({'value': entity_relationship})
    relationship_df['source'] = relationship_df['value'].apply(lambda x: x[0])
    relationship_df['target'] = relationship_df['value'].apply(lambda x: x[1])
    relationship_df = relationship_df.groupby(['source', 'target']).count().reset_index()
    relationship_df = relationship_df.sort_values('value', ascending=False)

    return relationship_df



In [29]:
relationship_df = generate_character_network(df)

In [30]:
relationship_df

Unnamed: 0,source,target,value
118,Naruto,Sasuke,117
144,Sakura,Sasuke,65
117,Naruto,Sakura,41
66,Iruka,Naruto,37
111,Mizuki,Naruto,28
...,...,...,...
57,Inari,Sasuke,1
54,Hokage,Tazuna,1
53,Hokage,Sensei,1
48,Hokage,Mizuki,1


In [31]:
relationship_df = relationship_df.sort_values('value', ascending=False)
relationship_df = relationship_df.head(200)

In [32]:
G = nx.from_pandas_edgelist(
    relationship_df, 
    source='source', 
    target='target', 
    edge_attr='value',
    create_using=nx.Graph()
)

net = Network(notebook=True, width="1000px", height="700px", bgcolor="#222222", font_color="white", cdn_resources="remote")
node_degree = dict(G.degree)

nx.set_node_attributes(G, node_degree, 'size')
net.from_nx(G)
net.show("naruto.html")


naruto.html
