In [67]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch, nltk

In [3]:
import torch
torch.mps.empty_cache()

## Load Model

In [68]:
model_ckpt = 'dslim/bert-large-NER'
device = 'mps' if torch.backends.mps.is_available() else 'cpu'

def load_model(task, model_ckpt, device):
    
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    model = AutoModelForTokenClassification.from_pretrained(model_ckpt)
    
    task_pipeline = pipeline(task=task,model=model,tokenizer=tokenizer,device=device)
    return task_pipeline

ner_pipeline = load_model('ner',model_ckpt,device)

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [64]:
from glob import glob
import numpy as np

files = glob('../data/characterNetworkData/CloneWarsS1/*.srt')
movies = glob('../data/characterNetworkData/*.txt')

files = list(np.append(files,movies))

In [61]:
import os
import numpy as np
import pandas as pd
path = '../data/characterNetworkData/'

def load_dataset(path):
    scripts = []
    episodes_num = []
    clone_wars = glob(path + 'CloneWarsS1/*.srt')
    movies = glob(path + '/*.txt')
    files = list(np.append(clone_wars,movies))
    for path in files:
        dialogue = []
        
        with open(path, 'r') as file:
            lines = file.readlines()
            for i in range(len(lines)):
                line = lines[i].replace('\n','').replace('</i>','').replace('<i>','')
                if (line.isnumeric() == False) and (('-->' in line) == False) and (line != ''):
                    
                    dialogue.append(line)
                    
        script = " ".join(dialogue)
        scripts.append(script)
        
        if 'CloneWarsS1' in path:
            episodes_num.append(path.split('CloneWarsS1/')[1][:6])
        else:
            episodes_num.append('Star Wars ' + path[29:37].capitalize())
            
    df = pd.DataFrame.from_dict({"episode": episodes_num, "script": scripts})
    return df

df = load_dataset(path)
        

In [63]:
df.head()

Unnamed: 0,episode,script
0,S01E03,A deadly weapon unleashed! The Separatist batt...
1,S01E17,Battle droids on Naboo! As the Separatist rebe...
2,S01E15,Republic outpost overrun! The Jedi have lost a...
3,S01E02,The clone starfleet is under siege! Dozens of ...
4,S01E12,Dooku held for ransom! After escaping capture ...


In [69]:
script_sentences = nltk.sent_tokenize(df.iloc[0]['script'])

In [71]:
docs = ner_pipeline(script_sentences)
docs

[[],
 [{'entity': 'B-MISC',
   'score': 0.9698607,
   'index': 2,
   'word': 'Sep',
   'start': 4,
   'end': 7},
  {'entity': 'I-MISC',
   'score': 0.74766105,
   'index': 4,
   'word': '##tist',
   'start': 10,
   'end': 14},
  {'entity': 'B-MISC',
   'score': 0.52173,
   'index': 6,
   'word': 'Male',
   'start': 26,
   'end': 30},
  {'entity': 'I-ORG',
   'score': 0.5176527,
   'index': 8,
   'word': '##len',
   'start': 32,
   'end': 35},
  {'entity': 'I-MISC',
   'score': 0.5753543,
   'index': 9,
   'word': '##ce',
   'start': 35,
   'end': 37},
  {'entity': 'B-MISC',
   'score': 0.8194574,
   'index': 13,
   'word': 'Republic',
   'start': 60,
   'end': 68}],
 [{'entity': 'B-PER',
   'score': 0.9865578,
   'index': 9,
   'word': 'Ana',
   'start': 41,
   'end': 44},
  {'entity': 'I-PER',
   'score': 0.80660236,
   'index': 10,
   'word': '##kin',
   'start': 44,
   'end': 47},
  {'entity': 'I-PER',
   'score': 0.99745065,
   'index': 11,
   'word': 'Sky',
   'start': 48,
   'end

In [167]:
def reconstructEntities(docs):

    reconstructed_entities = {'PER': [], 'LOC': [], 'ORG': [], 'MISC': []}
    current_entity = ''
    current_type = None
    previous_end = None
    
    if docs != None:
        for entity in docs:
            word = entity['word']
            entity_tag = entity['entity']
            start = entity['start']
            end = entity['end']
            
            # Determine the type of entity (PER, LOC, ORG, MISC)
            entity_type = entity_tag.split('-')[-1]
            
            if entity_tag.startswith('B-') or (entity_tag.startswith('I-') and (previous_end is None or start > previous_end + 3)):
                # Finalize the previous entity if needed
                if current_entity and current_type:
                    if current_entity in ['Ahka','Soka','Ahka Tano']:
                        current_entity = 'Ahsoka'
                    elif current_entity in ['Skywalk','Skywalker','Ana','Anakin','An']:
                        current_entity = 'Anakin Skywalker'
                    elif current_entity in ['Grievous','Gvous','vous']:
                        current_entity = 'General Grievous'
                    elif current_entity in ['Obi - Wan','Obi','Wan','Kenobi','Obi - Wan Kenobi','Ben','Ben Kenobi']:
                        current_entity = 'Obi-Wan Kenobi'
                    elif current_entity in ['Plo']:
                        current_entity = 'Plo Koon'
                    elif current_entity in ['dooku','tyranus','darth tyranus','Doo','Dooku']:
                        current_entity = 'Count Dooku'
                    elif current_entity in ['Thi - Sen','Thi Sen']:
                        current_entity = 'Thi-Sen'
                    elif current_entity in ['Jar Jar','Binks','J Jar']:
                        current_entity = 'Jar Jar Binks'
                    elif current_entity in ['Hondo','Ohnaka']:
                        current_entity = 'Hondo Ohnaka'
                    elif current_entity in ['Vindi']:
                        current_entity = 'Nuvo Vindi'
                    elif current_entity in ['Pame','Amidala','Pa']:
                        current_entity = 'Padme Amidala'
                    elif current_entity in ['Paltine','Chancellor','Sidious','Darth Sidious']:
                        current_entity = 'Palpatine'
                    elif current_entity in ['Bane']:
                        current_entity = 'Cad Bane'
                    elif current_entity in ['Free Taa','Or Free Taa']:
                        current_entity = 'Orn Free Taa'
                    elif current_entity in ['Organa', 'Senator Organa']:
                        current_entity = 'Bail Organa'
                    elif current_entity in ['He']:
                        current_entity = 'Heavy'
                    elif current_entity in ['Windu','Master Windu']:
                        current_entity = 'Mace Windu'
                        
                    elif current_entity in ['Gunray']:
                        current_entity = 'Nute Gunray'
                    elif current_entity in ['Master Fisto','Fisto']:
                        current_entity = 'Kit Fisto'
                    elif current_entity in ['Nahdar']:
                        current_entity = 'Nahdar Vebb'
                    elif current_entity in ['Aayla','Secura']:
                        current_entity = 'Aayla Secura'
                    elif current_entity in ['Qui - Gon']:
                        current_entity = 'Qui Gon Jinn'
                    elif current_entity in ['Jango','Fett']:
                        current_entity = 'Jango Fett'
                    elif current_entity in ['Lord Vader','Vader']:
                        current_entity = 'Darth Vader'
                    elif current_entity in ['Luke']:
                        current_entity = 'Luke Skywalker'
                    elif current_entity in ['Solo','Han','Sol']:
                        current_entity = 'Han Solo'
                    elif current_entity in ['Land','Lando']:
                        current_entity = 'Lando Calrissian'
                    elif current_entity in ['Chew','Chewie']:
                        current_entity = 'Chewbacca'
                    elif current_entity in ['Yo']:
                        current_entity = 'Yoda'
                    elif current_entity in ['Le']:
                        current_entity = 'Leia'
                    elif current_entity in ['Bob Fe']:
                        current_entity = 'Boba Fett'
                    elif current_entity in ['Sep','Septist']:
                        current_entity = 'Separatist'
                    elif current_entity in ['Federation']:
                        current_entity = 'Trade Federation'
                    elif current_entity in ['Jab']:
                        current_entity = 'Jabba'
                    elif current_entity in ['Jed']:
                        current_entity = 'Jedi'
                    
                        
                    reconstructed_entities[current_type].append(current_entity)
                
                # Start a new entity
                current_entity = word if not word.startswith('##') else word[2:]
                current_type = entity_type
            
            elif entity_tag.startswith('I-') and current_type == entity_type:
                if word.startswith('##'):
                    # Continue the current entity with a subword
                    current_entity += word[2:]
                else:
                    # Continue the current entity with a new word
                    current_entity += f" {word}"
            
            # Update the previous end position
            previous_end = end
        
        # Finalize any entity left at the end of the sentence
        if current_entity and current_type:
            if current_entity in ['Ahka','Soka','Ahka Tano']:
                current_entity = 'Ahsoka'
            elif current_entity in ['Skywalk','Skywalker','Ana','Anakin','An','Sky', 'Ani']:
                current_entity = 'Anakin Skywalker'
            elif current_entity in ['Grievous','Gvous','vous']:
                current_entity = 'General Grievous'
            elif current_entity in ['Obi - Wan','Obi','Wan','Kenobi','Obi - Wan Kenobi','Ben','Ben Kenobi','Obi Wan','Kenob']:
                current_entity = 'Obi-Wan Kenobi'
            elif current_entity in ['Plo']:
                current_entity = 'Plo Koon'
            elif current_entity in ['dooku','tyranus','darth tyranus','Doo','Dooku']:
                current_entity = 'Count Dooku'
            elif current_entity in ['Thi - Sen','Thi Sen']:
                current_entity = 'Thi-Sen'
            elif current_entity in ['Jar Jar','Binks','J Jar']:
                current_entity = 'Jar Jar Binks'
            elif current_entity in ['Hondo','Ohnaka']:
                current_entity = 'Hondo Ohnaka'
            elif current_entity in ['Vindi']:
                current_entity = 'Nuvo Vindi'
            elif current_entity in ['Pame','Amidala','Pa','Padme','Pad']:
                current_entity = 'Padme Amidala'
            elif current_entity in ['Paltine','Chancellor','Sidious','Darth Sidious']:
                current_entity = 'Palpatine'
            elif current_entity in ['Bane']:
                current_entity = 'Cad Bane'
            elif current_entity in ['Free Taa','Or Free Taa']:
                current_entity = 'Orn Free Taa'
            elif current_entity in ['Organa', 'Senator Organa']:
                current_entity = 'Bail Organa'
            elif current_entity in ['He']:
                current_entity = 'Heavy'
            elif current_entity in ['Windu','Master Windu']:
                current_entity = 'Mace Windu'
            elif current_entity in ['Gunray']:
                current_entity = 'Nute Gunray'
            elif current_entity in ['Master Fisto','Fisto']:
                current_entity = 'Kit Fisto'
            elif current_entity in ['Nahdar']:
                current_entity = 'Nahdar Vebb'
            elif current_entity in ['Aayla','Secura']:
                current_entity = 'Aayla Secura'
            elif current_entity in ['Qui - Gon']:
                current_entity = 'Qui Gon Jinn'
            elif current_entity in ['Jango','Fett']:
                current_entity = 'Jango Fett'
            elif current_entity in ['Lord Vader','Vader']:
                current_entity = 'Darth Vader'
            elif current_entity in ['Luke']:
                current_entity = 'Luke Skywalker'
            elif current_entity in ['Solo','Han','Sol']:
                current_entity = 'Han Solo'
            elif current_entity in ['Land','Lando']:
                current_entity = 'Lando Calrissian'
            elif current_entity in ['Chew','Chewie']:
                current_entity = 'Chewbacca'
            elif current_entity in ['Yo']:
                current_entity = 'Yoda'
            elif current_entity in ['Le']:
                current_entity = 'Leia'
            elif current_entity in ['Bob Fe']:
                current_entity = 'Boba Fett'
            elif current_entity in ['Sep','Septist']:
                current_entity = 'Separatist'
            elif current_entity in ['Federation']:
                current_entity = 'Trade Federation'
            elif current_entity in ['Jab']:
                current_entity = 'Jabba'
            elif current_entity in ['Jed']:
                current_entity = 'Jedi'
            
            reconstructed_entities[current_type].append(current_entity)
    # Output the reconstructed entities
    return reconstructed_entities


In [164]:
def nerInference(script):
    script_sentences = nltk.sent_tokenize(script)
    
    ner_output = []
    for sentence in script_sentences:
        doc = ner_pipeline(sentence)
        ners = set()
        output = reconstructEntities(doc)
        for entity_type, entities in output.items():
            ners.update(entities)
        ner_output.append(ners)
    return ner_output

In [89]:
script = df.iloc[1]['script']
script_sentences = nltk.sent_tokenize(script)

docs = ner_pipeline(script_sentences)
ner_output = reconstructEntities(docs)


In [169]:
df['ners'] = df['script'].apply(nerInference)

In [1]:
df.iloc[0]['ners']

NameError: name 'df' is not defined

In [137]:
pip install pyvis

Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting jsonpickle>=1.4.1 (from pyvis)
  Downloading jsonpickle-3.2.2-py3-none-any.whl.metadata (7.2 kB)
Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading jsonpickle-3.2.2-py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jsonpickle, pyvis
Successfully installed jsonpickle-3.2.2 pyvis-0.3.2
Note: you may need to restart the kernel to use updated packages.


In [138]:
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [177]:
list(df.iloc[0]['ners'][10])

['Anakin Skywalker']

In [198]:
def generate_characterNetwork(df):
    
    window_size = 10
    entity_relationship = []
    # iterate over each script
    for row in df['ners']:
        previous_entities_in_window = []
        # iterate over the named entities in each sentence 
        for sentence in row:
            # append list of named entities from the current sentence 
            # (now the last element of previous entities in window) and only keep the sentences
            # in the last 'window size' sentences
            previous_entities_in_window.append(list(sentence))
            previous_entities_in_window = previous_entities_in_window[-window_size:]
            # flattens the list of lists to a single list of entries 
            previous_entities_flattened = sum(previous_entities_in_window, [])
            # loops through the entities current sentence 
            for entity in sentence:
                if len(entity) > 2:
                    for entity_in_window in previous_entities_flattened:
                        if len(entity_in_window) > 2:
                            # for every entity in the current sentence we append the pair
                            # entity, entity_in_window for all entities not equal to entity in window
                            if entity != entity_in_window:
                                # sort so A,B pairs are the same as B,A oaurs
                                entity_relationship.append(sorted([entity, entity_in_window]))
    
    relationship_df = pd.DataFrame({'value': entity_relationship})
    # relationship_df initially has value column which is just pairs [entity, entity_in_window]
    # lambda function creates new columns 'source' for entity and 'target' for entity_in_window
    relationship_df['source'] = relationship_df['value'].apply(lambda x: x[0])
    relationship_df['target'] = relationship_df['value'].apply(lambda x: x[1])
    # groupby groups the dataframe by the unique pairs of source, targets
    # the count after this replaces the value column initially containing the pairs
    # to a count of how many times each originally appeared in the data
    relationship_df = relationship_df.groupby(['source','target']).count().reset_index()
    relationship_df = relationship_df.sort_values('value',ascending=False)
    
    return relationship_df
            

In [199]:
relationship_df = generate_characterNetwork(df)

In [200]:
relationship_df = relationship_df.head(200)

In [203]:
G = nx.from_pandas_edgelist(
    relationship_df,
    source='source',
    target='target',
    edge_attr='value',
    create_using=nx.Graph()
)

node_degree = dict(G.degree())
net = Network(notebook=True, width = '1000px', height = '700px', bgcolor='#222222', font_color='white', cdn_resources='remote')

nx.set_node_attributes(G, node_degree, 'size')
net.from_nx(G)
net.show('starwars.html')

starwars.html
