<a href="https://colab.research.google.com/github/satpremrath/NLP/blob/NER_DEV/spacy_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
import json

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
text = '''Mr. Dursley was the director of a firm called Grunnings, which made
drills. He was a big, beefy man with hardly any neck, although he did
have a very large mustache. Mrs. Dursley was thin and blonde and had
nearly twice the usual amount of neck, which came in very useful as she
spent so much of her time craning over garden fences, spying on the
neighbors. The Dursleys had a small son called Dudley and in their
opinion there was no finer boy anywhere.'''

In [4]:
doc = nlp(text)

In [5]:
for ent in doc.ents:
  print(f"{ent.text} \t {ent.label_}")

Dursley 	 PERSON
Grunnings 	 ORG
Dursley 	 PERSON
Dursleys 	 PERSON
Dudley 	 PERSON


In [6]:
def load_data(file):
  with open(file, 'r', encoding= 'utf-8') as f:
    data = json.load(f)
  return data

In [7]:
hp_chars = load_data('./hp_characters.json')
print(hp_chars)

['Hannah Abbott', 'Ludo Bagman', 'Bathilda Bagshot', 'Katie Bell', 'Cuthbert Binns', 'Phineas Nigellus Black', 'Sirius Black', 'Walburga Black', 'Amelia Bones', 'Susan Bones', 'Terry Boot', 'Lavender Brown', 'Millicent Bulstrode', 'Charity Burbage', 'Frank Bryce', 'Alecto Carrow', 'Amycus Carrow', 'Reginald Cattermole', 'Mary Cattermole', 'Cho Chang', 'Penelope Clearwater', 'Mrs. Cole', 'Michael Corner', 'Crabbe', 'Vincent Crabbe', 'Colin Creevey', 'Dennis Creevey', 'Dirk Cresswell', 'Barty Crouch Sr', 'Barty Crouch Jr', 'Roger Davies', 'John Dawlish', 'Fleur Delacour', 'Gabrielle Delacour', 'Dedalus Diggle', 'Amos Diggory', 'Cedric Diggory', 'Elphias Doge', 'Antonin Dolohov', 'Aberforth Dumbledore', 'Albus Dumbledore', 'Ariana Dumbledore', 'Kendra Dumbledore', 'Percival Dumbledore', 'Dudley Dursley', 'Marge Dursley', 'Petunia Dursley', 'Vernon Dursley', 'Marietta Edgecombe', 'Arabella Figg', 'Argus Filch', 'Justin Finch-Fletchley', 'Seamus Finnigan', 'Marcus Flint', 'Mundungus Fletche

In [8]:
def generate_better_characters(file):
  data = load_data(file)
  print(f"Original data: {len(data)}")
  new_characters = []
  for item in data:
    new_characters.append(item)
  for item in data:
    item = item.replace("The","").replace("the","").replace("and","").replace("And","")
    names = item.split(" ")
    for name in names:
      name = name.strip()
      new_characters.append(name)
    if "(" in item:
      names = item.split("(")
      for name in names:
        name = name.replace(")","").strip()
        new_characters.append(name)
    if "," in item:
      names = item.split(",")
      for name in names:
        name = name.replace("and","").strip()
        if " " in name:
          new_names = name.split()
          for x in new_names:
            x = x.strip()
            new_characters.append(x)
        new_characters.append(name)
  print(f"Cleaned data: {len(new_characters)}")
  final_characters = []
  titles = ["Dr.","Professor","Mr.","Mrs.","Ms.","Miss","Aunt","Uncle","Mr. and Mrs."]
  for character in new_characters:
    if "" != character:
      final_characters.append(character)
      for title in titles:
        titled_char = f"{title} {character}"
        final_characters.append(titled_char)
  print(f"Corpus after adding titles: {len(final_characters)}")
  final_characters = list(set(final_characters))
  print(f"Corpus after removing duplicates: {len(final_characters)}")
  return final_characters


In [9]:
characters = generate_better_characters('./hp_characters.json')

Original data: 207
Cleaned data: 622
Corpus after adding titles: 6160
Corpus after removing duplicates: 5119


In [10]:
def create_training_data(file, type):
  data = generate_better_characters(file)
  patterns = []
  for item in data:
    pattern = {
        "label": type,
        "pattern": item
    }
    patterns.append(pattern)
  return patterns


In [11]:
create_training_data('./hp_characters.json',"PERSON")

Original data: 207
Cleaned data: 622
Corpus after adding titles: 6160
Corpus after removing duplicates: 5119


[{'label': 'PERSON', 'pattern': 'Miss Aurora'},
 {'label': 'PERSON', 'pattern': 'Aunt Flint'},
 {'label': 'PERSON', 'pattern': 'Mr. and Mrs. Gregory Goyle'},
 {'label': 'PERSON', 'pattern': 'Aunt Peter Pettigrew'},
 {'label': 'PERSON', 'pattern': 'Mrs. Helena'},
 {'label': 'PERSON', 'pattern': 'Mr. and Mrs. Bill Weasley'},
 {'label': 'PERSON', 'pattern': 'Aunt Justin'},
 {'label': 'PERSON', 'pattern': 'Miss Griphook'},
 {'label': 'PERSON', 'pattern': 'Mrs. Sirius'},
 {'label': 'PERSON', 'pattern': 'Miss Terry Boot'},
 {'label': 'PERSON', 'pattern': 'Uncle Ginny Weasley'},
 {'label': 'PERSON', 'pattern': 'Dr. Warren'},
 {'label': 'PERSON', 'pattern': 'Professor Cormac'},
 {'label': 'PERSON', 'pattern': 'Uncle Antioch, Cadmus, and Ignotus Peverell'},
 {'label': 'PERSON', 'pattern': 'Dr. (Tom'},
 {'label': 'PERSON', 'pattern': 'Professor Colin'},
 {'label': 'PERSON', 'pattern': 'Mrs. Luna Lovegood'},
 {'label': 'PERSON', 'pattern': 'Professor Clearwater'},
 {'label': 'PERSON', 'pattern': 

In [12]:
def generate_rules(patterns):
  nlp = English()
  ruler = nlp.add_pipe('entity_ruler')
  ruler.add_patterns(patterns)
  nlp.to_disk("hp_ner")

In [13]:
patterns = create_training_data('./hp_characters.json',"PERSON")
generate_rules(patterns= patterns)

Original data: 207
Cleaned data: 622
Corpus after adding titles: 6160
Corpus after removing duplicates: 5119


In [14]:
nlp = spacy.load("hp_ner")
print(nlp)

<spacy.lang.en.English object at 0x79c69c1eff40>


In [29]:
def test_model(model, text):
  doc = model(text)
  results = []
  for ent in doc.ents:
    results.append(ent.text)
  return results

In [37]:
with open("./hp.txt","r") as f:
  text = f.read()

In [38]:
chapters = text.split("CHAPTER")[1:]
print(chapters)
ie_data = {}
for chapter in chapters:
  chapter_num, chapter_title = chapter.split("\n\n")[0:2]
  chapter_num = chapter_num.strip()
  # print(chapter_num)
  segments = chapter.split("\n\n")[2:]

  hits = []
  for segment in segments:
    segment = segment.strip()
    segment = segment.replace("\n"," ")
    results = test_model(nlp, segment)

    for result in results:
      hits.append(result)

  ie_data[chapter_num] = hits


["Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.", {'entities': [(0, 20, 'PERSON')]}]
['Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.', {'entities': [(0, 11, 'PERSON'), (166, 178, 'PERSON'), (394, 400, 'PERSON')]}]
["The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could

In [32]:
print(ie_data)

{'ONE': ['Mr. and Mrs. Dursley', 'Mr. Dursley', 'Mrs. Dursley', 'Dudley', 'Mrs. Potter', 'Mrs. Dursley', 'Mrs. Dursley', 'Dudley', 'Mr. and Mrs. Dursley', 'Mr. Dursley', 'Mrs. Dursley', 'Dudley', 'Mr. Dursley', 'Mrs. Dursley', 'Dudley', 'Dudley', 'Mr. Dursley', 'Mr. Dursley', 'Mr. Dursley', 'Mr. Dursley', 'Mr. Dursley', 'Mr. Dursley', 'Mr. Dursley', 'Mr. Dursley', 'Mr. Dursley', 'Mr. Dursley', 'Mr. Dursley', 'Harry', 'Mr. Dursley', 'Potter', 'Potter', 'Harry', 'Harry', 'Mrs. Dursley', 'Mr. Dursley', 'Mr. Dursley', 'Mr. Dursley', 'Mr. Dursley', 'Mr. Dursley', 'Mrs. Dursley', 'Mrs.', 'Dudley', 'Mr. Dursley', 'Dudley', 'Ted', 'Mr. Dursley', 'Mrs. Dursley', 'Petunia', 'Mrs. Dursley', 'Mr. Dursley', 'Mrs. Dursley', 'Mrs. Dursley', 'Mr. Dursley', 'Potter', 'Dudley', 'Mrs. Dursley', 'Harry', 'Mr. Dursley', 'Mrs. Dursley', 'Mr. Dursley', 'Mrs. Dursley', 'Mr. Dursley', 'Mrs. Dursley', 'Petunia', 'Petunia', 'Mr. Dursley', 'Albus Dumbledore', 'Albus Dumbledore', 'Mrs. Dursley', 'Dumbledore', 'Pro

In [19]:
from textwrap import indent
def save_data(file, data):
  with open(file,'w', encoding = 'utf-8') as f:
    data = json.dump(data, f, indent = 4)

In [20]:
save_data('hp_data.json', ie_data)

NER Training Set

In [22]:
# TRAIN_DATA = [(text, {"entities":[(start,end,label)]})]

In [44]:
TRAIN_DATA = []

In [45]:
nlp = spacy.load("hp_ner")
print(nlp)

<spacy.lang.en.English object at 0x79c69cf7f2b0>


In [46]:
def test_model(model, text):
  doc = model(text)
  results = []
  entities = []
  for ent in doc.ents:
    entities.append((ent.start_char, ent.end_char, ent.label_))
  if len(entities) > 0:
    results = [text, {"entities": entities}]
    return results

In [47]:
with open("./hp.txt","r") as f:
  text = f.read()

In [48]:
chapters = text.split("CHAPTER")[1:]
for chapter in chapters:
  chapter_num, chapter_title = chapter.split("\n\n")[0:2]
  chapter_num = chapter_num.strip()
  segments = chapter.split("\n\n")[2:]

  for segment in segments:
    segment = segment.strip()
    segment = segment.replace("\n"," ")
    results = test_model(nlp, segment)
    if results != None:
      TRAIN_DATA.append(results)


In [49]:
print(TRAIN_DATA[0])

["Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.", {'entities': [(0, 20, 'PERSON')]}]


In [50]:
save_data("hp_training_data.json", TRAIN_DATA)
print(f"{len(TRAIN_DATA)}")

2213


TRAIN NER MODEL