In [24]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch, nltk

## Load Model

In [81]:
model_ckpt = 'dslim/bert-large-NER'
device = 'mps' if torch.backends.mps.is_available() else 'cpu'

def load_model(task, model_ckpt, device):
    
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    model = AutoModelForTokenClassification.from_pretrained(model_ckpt)
    
    task_pipeline = pipeline(task=task,model=model,tokenizer=tokenizer,device=device)
    return task_pipeline

ner_pipeline = load_model('ner',model_ckpt,device)

tokenizer_config.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [82]:
import pandas as pd
subtitle_path = '../data/subtitles.txt'

def datasetprocessing(path):
    dialogue = []
    with open(subtitle_path,'r') as file:
        lines = file.readlines()
        lines = lines[1:]
        for line in lines:
            quote = line.split(',')[2]
            dialogue.append(quote.replace('"',''))
        script = " ".join(dialogue)
    return script

script = datasetprocessing(subtitle_path)

In [83]:
script

"Did you hear that?  They've shut down the main reactor.  We'll be destroyed for sure.  This is madness!\n We're doomed!\n There'll be no escape for the Princess this time.\n What's that?\n I should have known better than to trust the logic of a half-sized thermocapsulary dehousing assister...\n Hurry up!  Come with me!  What are you waiting for?!  Get in gear!\n Artoo! Artoo-Detoo At last!  Where have you been?\n They're heading in this direction. What are we going to do?  We'll be sent to the spice mines of Kessel or smashed into who knows what!\n Wait a minute The Death Star plans are not in the main computer.\n Where are those transmissions you intercepted?\n We intercepted no transmissions. Aaah...  This is a consular ship. Were on a diplomatic mission.\n If this is a consular ship... where is the Ambassador?\n Commander There she is! Set for stun!\n She'll be all right. Inform Lord Vader we have a prisoner.\n Hey Don't call me a mindless philosopher Secret mission?  What plans?  

In [84]:
script_sentences = nltk.sent_tokenize(script)

In [85]:
docs = ner_pipeline(script_sentences)

In [86]:
docs

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [{'entity': 'B-ORG',
   'score': 0.38675198,
   'index': 1,
   'word': 'Art',
   'start': 0,
   'end': 3},
  {'entity': 'I-ORG',
   'score': 0.33464313,
   'index': 2,
   'word': '##oo',
   'start': 3,
   'end': 5}],
 [{'entity': 'B-PER',
   'score': 0.9732834,
   'index': 1,
   'word': 'Art',
   'start': 0,
   'end': 3},
  {'entity': 'I-PER',
   'score': 0.5314915,
   'index': 2,
   'word': '##oo',
   'start': 3,
   'end': 5},
  {'entity': 'B-PER',
   'score': 0.9635247,
   'index': 4,
   'word': 'Det',
   'start': 6,
   'end': 9},
  {'entity': 'I-PER',
   'score': 0.50105655,
   'index': 5,
   'word': '##oo',
   'start': 9,
   'end': 11}],
 [],
 [],
 [],
 [{'entity': 'B-LOC',
   'score': 0.9962553,
   'index': 12,
   'word': 'Ke',
   'start': 36,
   'end': 38},
  {'entity': 'I-LOC',
   'score': 0.9713103,
   'index': 13,
   'word': '##ssel',
   'start': 38,
   'end': 42}],
 [{'entity': 'B-ORG',
   'score': 0.89310354,
   'i

In [91]:
reconstructed_loc = []
current_entity = ''

for sent in docs:
    if len(sent) > 0:
        for entity in sent: 
            word = entity['word']
            entity_tag = entity['entity']
            
            if entity_tag == 'B-LOC':
                if current_entity:
                    if word.startswith('##'):
                        current_entity += word[2:]
                    else:
                        reconstructed_loc.append(current_entity)
                        current_entity = word
                else:
                    current_entity = word
            elif entity_tag == 'I-LOC':
                current_entity += word[2:] if word.startswith('##') else f" {word}"
            
if current_entity:
    reconstructed_loc.append(current_entity)


In [93]:
reconstructed_loc

['Kessel',
 'Alderaan',
 'Anchorhead',
 'Toshi Station',
 'Empire',
 'Anchorhead',
 'Jundland Wastes',
 'Old Republic',
 'Alderaan',
 'Alaan',
 'Alaan',
 'Alderaan',
 'Alderaan',
 'Alderaania',
 'Alaan',
 'Alderaan',
 'Alderaan',
 'Dantooine',
 'Alderaan',
 'Danoine',
 'Clear Bay',
 'Alderaan',
 'Mos Eisley',
 'TX',
 'Alderaan',
 'Alaan',
 'Yavin',
 "Beggar ' s Canyon We",
 'Echo Base',
 'Empire',
 'Hoth',
 'Empire',
 'Art',
 'Mu',
 'Ha',
 'Empire',
 'Empireiaia',
 'Jabba the Hutt',
 'Tatooineoo',
 'J',
 'Dune Sea',
 'Carkoon',
 'Moon of',
 'Endor',
 'Endor',
 'Sullustia',
 'Endor Moonia',
 'Endor']

In [None]:
reconstructed_entities = []
current_entity = ''

for sent in docs:
    if len(sent) > 0:
        for entity in sent: 
            word = entity['word']
            entity_tag = entity['entity']
            
            if entity_tag == 'B-PER':
                if current_entity:
                    if word.startswith('##'):
                        current_entity += word[2:]
                    else:
                        reconstructed_entities.append(current_entity)
                        current_entity = word
                else:
                    current_entity = word
            elif entity_tag == 'I-PER':
                current_entity += word[2:] if word.startswith('##') else f" {word}"
            
if current_entity:
    reconstructed_entities.append(current_entity)

In [88]:
reconstructed_entities

['Artoo',
 'Detoo Vader',
 'Wormie',
 'Big',
 'Biggs Darklighter',
 'Lord Vader',
 'Lord Vader',
 'Luke',
 'Mos Eisley',
 'Luke',
 'Artoo',
 'Det',
 'Luke',
 'Luke',
 'Owen',
 'Owen',
 'Luke',
 'See Threepio',
 'Obi - Wan Kenobi',
 'Obi - Wan Kenobi',
 'Ben Kenobi',
 'Luke',
 'Luke',
 'Ben',
 'Obi - Wan',
 'Owen',
 'Luke',
 'Luke',
 'Luke',
 'Luke',
 'Luke',
 'Luke',
 'Ben Kenobi',
 'Sir Sure Luke',
 'Obi - Wan Kenob',
 'Ben',
 'Ben Kenob',
 'Oh',
 'Obi Wan Kenobi',
 'Obi Wan',
 'Darth Vader',
 'Kenobi',
 'Vader Vader',
 'Art',
 'Threepio',
 'J Owen Be',
 'Owen',
 'Chewbacca',
 'Han Solo',
 'Chewie',
 'Jabba',
 'Jabba',
 'Jabba',
 'Jab',
 'Le Luke',
 'Han You',
 'Greedo',
 'Han',
 'Han',
 'Han Jabba',
 'Chewie Oh',
 'Tarkin Charming',
 'Leia',
 'Lord Vader',
 'Chewie',
 'Ben',
 'Leia',
 'Rich',
 'Three',
 'Han Luke',
 'Luke',
 'Luke Skywalker',
 'Ben Kenobi',
 'Ben Kenobi',
 'Obi - Wan Kenobi',
 'Tarkin',
 'Obi - Wan',
 'Three',
 'Luke',
 'Luke',
 'Luke',
 'Luke',
 'Luke',
 'Luke',
 'T

In [30]:
for sentence in script_sentences:
    doc = ner_pipeline(sentence)
    

In [40]:
ner_output = []
for sent in doc:
    if len(sent) > 0:
        if 'PER' in sent['entity']:
            ner_output.append((sent['entity'],sent['word']))

TypeError: list indices must be integers or slices, not str

In [41]:
dict_ = {'entity': 'B-ORG',
   'score': 0.7749182,
   'index': 1,
   'word': 'Art',
   'start': 0,
   'end': 3}

In [43]:
'ORG' in dict_['entity']

True

In [35]:
doc

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [{'entity': 'B-ORG',
   'score': 0.7749182,
   'index': 1,
   'word': 'Art',
   'start': 0,
   'end': 3},
  {'entity': 'I-ORG',
   'score': 0.5198912,
   'index': 2,
   'word': '##oo',
   'start': 3,
   'end': 5}],
 [{'entity': 'B-PER',
   'score': 0.61526746,
   'index': 1,
   'word': 'Art',
   'start': 0,
   'end': 3},
  {'entity': 'B-PER',
   'score': 0.9085649,
   'index': 4,
   'word': 'Det',
   'start': 6,
   'end': 9}],
 [],
 [],
 [],
 [{'entity': 'B-LOC',
   'score': 0.99421966,
   'index': 12,
   'word': 'Ke',
   'start': 36,
   'end': 38},
  {'entity': 'I-LOC',
   'score': 0.7407636,
   'index': 13,
   'word': '##ssel',
   'start': 38,
   'end': 42}],
 [{'entity': 'B-ORG',
   'score': 0.59782326,
   'index': 5,
   'word': 'Death',
   'start': 18,
   'end': 23},
  {'entity': 'I-ORG',
   'score': 0.9461144,
   'index': 6,
   'word': 'Star',
   'start': 24,
   'end': 28}],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 

In [71]:
pip install spacy

Collecting spacy
  Downloading spacy-3.7.6.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.10-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Using cached cymem-2.0.8-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Using cached preshed-3.0.9-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.2 k

In [74]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting spacy-curated-transformers<0.3.0,>=0.2.0 (from en-core-web-trf==3.7.3)
  Downloading spacy_curated_transformers-0.2.2-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_tokenizers-0.0.9-cp311-cp311-macosx_11_0_arm64.whl.metadata (1.9 kB)
Downloading spacy_curated_transformers-0.2.2-py2.py3-none-any.whl (236 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━

In [78]:
!pip install https://huggingface.co/spacy/en_core_web_trf/resolve/main/en_core_web_trf-any-py3-none-any.whl

# Using spacy.load().
import spacy
nlp_model = spacy.load("en_core_web_trf")


Collecting en-core-web-trf==any
  Downloading https://huggingface.co/spacy/en_core_web_trf/resolve/main/en_core_web_trf-any-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m


ValueError: [E002] Can't find factory for 'curated_transformer' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, entity_ruler, tagger, morphologizer, ner, beam_ner, senter, sentencizer, spancat, spancat_singlelabel, span_finder, future_entity_ruler, span_ruler, textcat, textcat_multilabel, en.lemmatizer