In [197]:
import pandas as pd
import numpy as np
import re

import spacy
from spacy import displacy

In [198]:
spacy_model = spacy.load("en_core_web_lg")

In [199]:
doc_raw = """
Speaker1: Hello, I’d like to introduce myself. My name is Alice Roberts, and I’m the head of regional sales.  I’ll be chairing this meeting today. The first thing I’d like to do is go round the room and have everyone introduce themselves.

Speaker2: Hi, I’m David Smith and I work in book sales alongside Sally here.

Speaker3: Thanks David. So as David said, I’m Sally, Sally Jones, I’ve been in book sales for 10 years now.

Speaker4: Nice to meet everyone, although I already know Alice. Anyway, Paul Owens, that’s me. I’m interested in buying some of your stuff, especially those new Richard Osman books. Hopefully we can do business together.

Speaker1: Thanks everyone, and yes just to bring David and Sally up to speed Paul and I had an initial meeting last week.

Speaker3: Ah, that explains why we’re here!

Speaker2: Yes.
"""
doc = spacy_model(doc_raw)

In [200]:
displacy.render(doc, style="ent", jupyter=True)

In [201]:
displacy.render(doc, style="dep", jupyter=True)

In [202]:
for ent in doc.ents:
    if ent.label_ == "PERSON":
        print(ent.text, ent.start_char, ent.end_char, ent.label_)

Alice Roberts 59 72 PERSON
David Smith 259 270 PERSON
Sally 306 311 PERSON
David 336 341 PERSON
David 349 354 PERSON
Sally 365 370 PERSON
Sally Jones 372 383 PERSON
Alice 485 490 PERSON
Paul Owens 500 510 PERSON
Richard Osman 589 602 PERSON
David 699 704 PERSON
Sally 709 714 PERSON
Paul 727 731 PERSON


In [203]:
list(doc.ents)

[Alice Roberts,
 today,
 first,
 David Smith,
 Sally,
 David,
 David,
 Sally,
 Sally Jones,
 10 years,
 Nice,
 Alice,
 Paul Owens,
 Richard Osman,
 David,
 Sally,
 Paul,
 last week]

In [204]:
p = re.compile('(Speaker\d*)')
speaker_list = p.findall(str(doc))
unique_speakers = list(set(speaker_list))
print(speaker_list)
unique_speakers

['Speaker1', 'Speaker2', 'Speaker3', 'Speaker4', 'Speaker1', 'Speaker3', 'Speaker2']


['Speaker3', 'Speaker2', 'Speaker1', 'Speaker4']

In [205]:
# for token in doc:
#     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#             token.shape_, token.is_alpha, token.is_stop)

In [206]:
pre_patterns_for_introduction = [
    "my name is",
    "i am",
    "i’m",
    "they call me",
    "i’m called",
    "i am called",
]
post_patterns_for_introduction = [
    "that is me",
    "that’s me",
    "that is me",
    "that would be me",
]

In [207]:
unknown_speakers = list(unique_speakers)

In [208]:
names = []
for ent in doc.ents:
    if ent.label_ == "PERSON":
        names.append(ent.text)
names = list(set(names))

full_names = [x.lower() for x in names if " " in x]
single_names = [x.lower() for x in names if x not in full_names]

# double cross names to use full names
filtered_single_names = []
for sn in single_names:
    found_it = False
    for fn in full_names:
        if sn in fn:
            found_it = True
            break
    if not found_it:
        filtered_single_names.append(sn)

single_names = filtered_single_names
del filtered_single_names
names = [
    *full_names,
    *single_names
]
names

['david smith', 'paul owens', 'sally jones', 'alice roberts', 'richard osman']

In [209]:
doc_raw_lines = doc_raw.split("\n")
context = {}
for ln in doc_raw_lines:
    this_name = None
    this_label = None
    if not ln:
        continue
    tokens = ln.lower()
    # check pre-patterns
    for pattern in pre_patterns_for_introduction:
        if pattern in tokens:
            # idx = tokens.index(pattern)
            # remaining = tokens[idx:]
            remaining = tokens.split(pattern)[1]
            potentiel_name = remaining.split(" ")[:4]
            potentiel_name = " ".join(potentiel_name)
            # find name 
            for n in names:
                if n in potentiel_name:
                    this_name = n
                    this_name = this_name.split(" ")
                    this_name = [(tmp_char[0].upper() + tmp_char[1:]) for tmp_char in this_name]
                    this_name = " ".join(this_name)
                    del names[names.index(n)]
                    break
            # find label
            for l in unique_speakers:
                if l.lower() in tokens:
                    this_label = l
                    this_label = this_label.replace("speaker", "Speaker")
                    del unique_speakers[unique_speakers.index(l)]
                    break
            if this_label and this_name:
                context[this_label] = this_name
                this_name = None
                this_label = None
            break
    # check post-patterns
    for pattern in post_patterns_for_introduction:
        if pattern in tokens:
            remaining = tokens.split(pattern)[0]
            potentiel_name = remaining.split(" ")[-4:-1]
            potentiel_name = " ".join(potentiel_name)
            # find name 
            for n in names:
                if n in potentiel_name:
                    this_name = n
                    this_name = this_name.split(" ")
                    this_name = [(tmp_char[0].upper() + tmp_char[1:]) for tmp_char in this_name]
                    this_name = " ".join(this_name)
                    del names[names.index(n)]
                    break
            # find label
            for l in unique_speakers:
                if l.lower() in tokens:
                    this_label = l
                    this_label = this_label.replace("speaker", "Speaker")
                    del unique_speakers[unique_speakers.index(l)]
                    break
            if this_label and this_name:
                context[this_label] = this_name
                this_name = None
                this_label = None
            break
context

{'Speaker1': 'Alice Roberts',
 'Speaker2': 'David Smith',
 'Speaker3': 'Sally Jones',
 'Speaker4': 'Paul Owens'}

In [212]:
for k, v in context.items():
    doc_raw = doc_raw.replace(k, v)
print(doc_raw)


Alice Roberts: Hello, I’d like to introduce myself. My name is Alice Roberts, and I’m the head of regional sales.  I’ll be chairing this meeting today. The first thing I’d like to do is go round the room and have everyone introduce themselves.

David Smith: Hi, I’m David Smith and I work in book sales alongside Sally here.

Sally Jones: Thanks David. So as David said, I’m Sally, Sally Jones, I’ve been in book sales for 10 years now.

Paul Owens: Nice to meet everyone, although I already know Alice. Anyway, Paul Owens, that’s me. I’m interested in buying some of your stuff, especially those new Richard Osman books. Hopefully we can do business together.

Alice Roberts: Thanks everyone, and yes just to bring David and Sally up to speed Paul and I had an initial meeting last week.

Sally Jones: Ah, that explains why we’re here!

David Smith: Yes.

