### NAMED ENTITY RECOGNITION

In [1]:
import spacy
from nltk import sent_tokenize # import sentence tokenizer

In [2]:
# Download the en_core_web_trf spaCy pipeline used for named-entity recognition
!python -m spacy download en_core_web_trf

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
     ---------------------------------------- 0.0/457.4 MB ? eta -:--:--
     ---------------------------------------- 0.3/457.4 MB ? eta -:--:--
     ---------------------------------------- 1.0/457.4 MB 3.3 MB/s eta 0:02:17
     ---------------------------------------- 2.9/457.4 MB 5.6 MB/s eta 0:01:22
     ---------------------------------------- 3.7/457.4 MB 5.7 MB/s eta 0:01:20
     ---------------------------------------- 5.2/457.4 MB 5.7 MB/s eta 0:01:20
      --------------------------------------- 6.3/457.4 MB 5.5 MB/s eta 0:01:22
      --------------------------------------- 7.3/457.4 MB 5.4 MB/s eta 0:01:24
      --------------------------------------- 8.1/457.4 MB 5.3 MB/s eta 0:01:25
      ------------------------------

  _torch_pytree._register_pytree_node(


### LOAD en_core_web_trf MODEL

In [3]:
def load_model(model_name="en_core_web_trf"):
    """Load a spaCy pipeline by name and return it.

    Args:
        model_name: Name of an installed spaCy pipeline package. Defaults to
            "en_core_web_trf", so existing ``load_model()`` calls behave as
            before.

    Returns:
        Whatever ``spacy.load`` returns for ``model_name``.
    """
    return spacy.load(model_name)


In [4]:
# Load the pipeline once and keep it in a notebook-level variable that the
# later cells reuse.
nlp_model = load_model()

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  model.load_state_dict(torch.load(filelike, map_location=device))


### LOAD DATASET


In [5]:
import os
import sys
import pathlib


# Path() is the current directory and its .parent is still ".", so this
# resolves to the absolute current working directory.
folder_path = pathlib.Path().parent.resolve()
# Put the directory one level up on the import path; presumably that is where
# the utils module lives -- TODO confirm.
sys.path.append(os.path.join(folder_path, '../'))
from utils import load_subtitles_dataset

In [6]:
# Relative location of the subtitles data that is handed to the loader.
dataset_path = "../data/Subtitles/"
# load_subtitles_dataset is imported from the project's utils module; judging
# by the preview shown below, it yields a table with 'episode' and 'script'
# columns -- verify against utils.
df = load_subtitles_dataset(dataset_path)

In [7]:
df.head() # preview the first rows of the subtitles dataset

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas..."


In [8]:
# Take the 'script' text of the first row as a single-episode working sample.
# NOTE(review): "sample_sript" is a typo for "sample_script"; the name is kept
# because the sentence-tokenizing cell refers to it.
sample_sript = df.iloc[0]['script'] # selecting 1 episode only
sample_sript

'A long time ago, a powerful demon fox appeared with nine tails.\n With its powerful tails,\n it could smash mountains and create tidal waves.\n A band of Ninjas rose to defend their village from attack.\n We have to wait until the Fourth Hokage gets here!\n We can\'t let it get any closer to our village!\n One great Ninja was able to imprison the monster,\n but died in the process.\n This Ninja was known as… the Fourth Hokage.\n Naruto!\n Why did you do such a thing?!\n You\'re really gonna get it this time!\n I don\'t care!\n You know your problem?\n You can\'t do the things I do!\n Only I can do this!\n I\'m better than all of you! Believe it!\n There\'s a problem, sir!\n Lord Hokage!\n What is it?\n Did that Naruto do something again?\n Yes. He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them!\n Wait!\n Ha ha…\n Why should I?\n Hey, Naruto!\n How did you suddenly get here, lruka Sensei?\n The question is what are you doing here when you should 

In [9]:
# Split the whole script into a list of sentences.
sentences = sent_tokenize(sample_sript)
# Keep only the sentences at positions 60-89 as a small sample.
sentences = sentences[60:90]

In [10]:
# Merge the sampled sentences back into one string to feed to the model.
# NOTE(review): each sentence keeps its own terminator, so joining with "."
# yields doubled punctuation such as "?." and ".."; " ".join(sentences) may be
# a cleaner separator -- confirm before changing, as it alters the model input.
sentence = ".".join(sentences)
sentence

"Don't you know who the Hokage leaders are?.Of course, I do!.I know they earned the title Lord Hokage\n because they were the best Ninja of their time, right?.Especially the Fourth Hokage was a hero\n who saved the village from the nine-tail demon fox..Then why did you do that?.Because I'll become a Hokage myself..And I'll be the greatest Hokage of all time!.So that everyone will finally learn to accept me!.By the way, Sensei, I have a favor to ask..You want another bowl?.Mmmm…No…\n Can I borrow that Leaf headband for a while?.This?.No no!.This is worn only by those who have graduated from Ninja Academy..Tomorrow, you will…\n You're so mean!.So that's why you took off your goggles…\n Humph... One more bowl please!.We are now about to begin the graduation test..When your name is called, proceed to the next classroom..The test is on the Clone Jutsu..Oh no…\n Of all the…!.That is my weakest Jutsu!.But still… I will do it no matter what!.Clone Jutsu!.Disqualified!.Iruka Sensei..His physica

### RUN MODEL

In [None]:
doc = nlp_model(sentence) # runs the pipeline on the text; detected entities are exposed as doc.ents

doc.ents # last expression of the cell, so the entities found by en_core_web_trf are displayed

(Hokage, Ninja, Fourth, nine, Leaf, Ninja Academy, Humph, One, three, Naruto)

In [12]:
# Print each detected entity together with the label the model assigned to it.
for entity in doc.ents:
    print(f"entity : {entity}, label : {entity.label_}")

entity : Hokage, label : PERSON
entity : Ninja, label : NORP
entity : Fourth, label : ORDINAL
entity : nine, label : CARDINAL
entity : Leaf, label : PERSON
entity : Ninja Academy, label : ORG
entity : Humph, label : PERSON
entity : One, label : CARDINAL
entity : three, label : CARDINAL
entity : Naruto, label : PERSON


#### EXTRACT ONLY THE NAME/PERSON ENTITIES

In [13]:
# Wraps the steps demonstrated in the cells above into one reusable function.
def get_ners_inference(script):
    """Collect the first names of PERSON entities for each sentence of a script.

    The script is split into sentences with ``sent_tokenize`` and every
    sentence is passed through ``nlp_model``. Only entities labelled
    "PERSON" are kept, and each one is reduced to its first
    whitespace-separated token.

    Args:
        script: Text of one script, as a single string.

    Returns:
        A list with one ``set`` of first names per sentence, in sentence
        order. Sentences without PERSON entities contribute an empty set.
    """
    ner_output = []

    for sentence in sent_tokenize(script):
        doc = nlp_model(sentence)

        # A set keeps each name at most once per sentence.
        ners = set()

        for entity in doc.ents:
            # Only person entities are of interest here.
            if entity.label_ != "PERSON":
                continue

            # The original called first_name.strip() and threw the result
            # away, so names could keep stray whitespace (e.g. "Naruto\n") or
            # be empty. split() with no argument splits on any whitespace run
            # and ignores leading/trailing whitespace.
            name_parts = entity.text.split()
            if name_parts:
                ners.add(name_parts[0])

        ner_output.append(ners)

    return ner_output
            
            
            
            
            

In [None]:
# Keep only the first 10 rows/episodes so the run stays short while testing.
# .copy() makes the result an independent frame, so adding a column to it
# later does not write into a slice of the original DataFrame.
df = df.head(10).copy()
df

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas..."
5,6,"C'mon!\n Running like a fugitive,\n Being chas..."
6,7,"C'mon!\n Running like a fugitive,\n Being chas..."
7,8,"C'mon!\n Running like a fugitive,\n Being chas..."
8,9,"C'mon!\n Running like a fugitive,\n Being chas..."
9,12,"C'mon!\n Running like a fugitive,\n Being chas..."


In [16]:
# Run get_ners_inference on every script; each row's 'ners' value is a list
# holding one set of PERSON first names per sentence of that script.
df['ners'] = df['script'].apply(get_ners_inference) # apply model on df (10 rows/episodes here)

In [17]:
df

Unnamed: 0,episode,script,ners
0,1,"A long time ago, a powerful demon fox appeared...","[{}, {}, {}, {}, {}, {}, {}, {Naruto}, {}, {},..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {Konohama..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {Sakura, Sasuke}, {}, {Konohamaru..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {Naruto}, {}, {}, {Iruka}, {}, {N..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
5,6,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {Sasuke}, {}, {Naruto}, {}, {Naruto},..."
6,7,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
7,8,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {Sasuke}, {}, {},..."
8,9,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
9,12,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {Zabuza}, {}, {}, {}, {Naruto..."


### CHARACTER NETWORK

In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

#### CONVERT TO "NUMBER OF OCCURRENCES PER CHAR PAIR"

In [None]:
def generate_character_network()