In [1]:
import pandas as pd

from functools import partial

import spacy
from spacy import displacy
from spacy.language import Language
from spacy.tokenizer import Tokenizer
from spacy.tokens import Span
from spacy.matcher import Matcher

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
df = pd.read_json('/Users/rishinigam/kaggle_competitions/PII_data_detection/datasets/train.json')
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."


In [4]:
df['full_text_lower'] = df['full_text'].str.lower()

- Span in spaCy refers to a contiguous sequence of tokens within a document

In [5]:
# Visualising
def visualize_label(df):
    start_pos = -1
    span_infos = []
    for label_idx, label in enumerate(df['labels'].iloc[0]):
        start_pos = label_idx
        end_pos = start_pos + 1
        span_dict = {'start_pos': start_pos, 'end_pos': end_pos, 'label': label}
        span_infos.append(span_dict)
    
    doc_spans = []
    nlp_blanks = spacy.blank('en')
    doc = nlp_blanks(df['full_text_lower'].to_list()[0])
    for span_info in span_infos:
        _span = Span(doc=doc, start=span_info['start_pos'], end=span_info['end_pos'], label=span_info['label'])
        doc_spans.append(_span)
    doc.spans['sc'] = doc_spans
    color_map = {'B-EMAIL': '#2fc3da',
                 'B-ID_NUM': '#1c0cfa',
                 'B-NAME_STUDENT':'#e01d82',
                 'B-PHONE_NUM': '#ebe70e',
                 'B-STREET_ADDRESS':'#f2860a',
                 'B-URL_PERSONAL': '#c9f211',
                 'B-USERNAME': '#0eebb7',
                 'I-ID_NUM': '#8e87ed',
                 'I-NAME_STUDENT':'#eb83b9',
                 'I-PHONE_NUM': '#e6e4a1',
                 'I-STREET_ADDRESS': '#f2c694',
                 'I-URL_PERSONAL':'#e5f2ac',
                 'O': '#eeff00'}
    options = {'colors': color_map}

    displacy.render(doc, style = 'span', options = options)

visualize_label(df[1:2])

In [6]:
df = df.head(10)

In [7]:
df.shape

(10, 6)

In [8]:
txt = 'this is some random text, to play around spaCy'
nlp_txt = nlp(txt)
type(nlp_txt)


spacy.tokens.doc.Doc

In [9]:
[token.text for token in nlp_txt]

['this', 'is', 'some', 'random', 'text', ',', 'to', 'play', 'around', 'spaCy']

### Sentence Detection

In [10]:
def total_sentences(x):
    para = nlp(x)
    sentences = list(para.sents)
    return len(sentences)

df['total_length_sentences'] = df['full_text_lower'].astype(str).apply(total_sentences)

In [11]:
df.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,full_text_lower,total_length_sentences
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",design thinking for innovation reflexion-avril...,32
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",diego estrada\n\ndesign thinking assignment\n\...,26
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...",reporting process\n\nby gilberto gamboa\n\ncha...,16
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT...",design thinking for innovation\n\nsindy samaca...,20
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...",assignment: visualization reflection submitt...,30


In [12]:
#df.to_csv('/Users/rishinigam/t81_588_course/datasets/pii_data.csv', index=False)

- Adding '\n' as a cutom boundary

In [13]:
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    '''
    Adding \n to use as a delimeter for a sentence detection too
    '''
    for token in doc[:-1]:
        if "\n" in token.text:
            doc[token.i + 1].is_sent_start = True
    return doc

def total_sentences_v2(x):
    para = custom_nlp(x)
    sentences = list(para.sents)
    return len(sentences)

custom_nlp = spacy.load("en_core_web_sm")
custom_nlp.add_pipe("set_custom_boundaries", before="parser")

<function __main__.set_custom_boundaries(doc)>

In [14]:
df['total_sentences'] = df['full_text_lower'].astype(str).apply(total_sentences_v2)

In [15]:
df.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,full_text_lower,total_length_sentences,total_sentences
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",design thinking for innovation reflexion-avril...,32,57
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",diego estrada\n\ndesign thinking assignment\n\...,26,36
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...",reporting process\n\nby gilberto gamboa\n\ncha...,16,66
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT...",design thinking for innovation\n\nsindy samaca...,20,111
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...",assignment: visualization reflection submitt...,30,31


### Spacy tokenization

- Generally for custom/new tokenizer provide with:
    - Vocab: storage container for special cases, which is used to handle cases like contractions and emoticons
    - prefix_search: handle preceding punctuation as opening paraentheses
    - suffix_search: handles succeeding punctuation as closing parenthese
    - infix_finditer: a function that handles non-whitespace separators, such as hyphens
    - token_match : boolean function that matches strings that should never be split. 

In [16]:
# taking xxxxx@xxxx as one token
def tokenizing_email(x):
    prefix_re = spacy.util.compile_prefix_regex(custom_nlp.Defaults.prefixes)
    suffix_re = spacy.util.compile_suffix_regex(custom_nlp.Defaults.suffixes)
    custom_infixes = [r"@"]
    infix_re = spacy.util.compile_infix_regex(list(custom_nlp.Defaults.infixes) + custom_infixes)

    custom_nlp.tokenizer = Tokenizer(
        custom_nlp.vocab,
        prefix_search = prefix_re.search,
        suffix_search = suffix_re.search,
        infix_finditer = infix_re.finditer,
        token_match=None
    )

    custom_tokenizer_doc = custom_nlp(x)
    return custom_tokenizer_doc


In [17]:
df['new_tokens'] = df['full_text_lower'].astype(str).apply(tokenizing_email)

In [18]:
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,full_text_lower,total_length_sentences,total_sentences,new_tokens
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",design thinking for innovation reflexion-avril...,32,57,"(design, thinking, for, innovation, reflexion,..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",diego estrada\n\ndesign thinking assignment\n\...,26,36,"(diego, estrada, \n\n, design, thinking, assig..."


### Stop Words

- Most common words in a language

In [19]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
print(len(spacy_stopwords))

for stop_word in list(spacy_stopwords)[:10]:
    print(stop_word)

326
might
same
how
six
enough
or
‘re
do
forty
latter


In [20]:
# removing stopwords from the new_tokens
def remove_stop_words(x, nlp):
    doc = nlp(x)
    return [token for token in doc if not token.is_stop]

df['wo_stp_wrd_tken'] = df['new_tokens'].apply(partial(remove_stop_words, nlp=nlp))

In [21]:
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,full_text_lower,total_length_sentences,total_sentences,new_tokens,wo_stp_wrd_tken
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",design thinking for innovation reflexion-avril...,32,57,"(design, thinking, for, innovation, reflexion,...","[design, thinking, innovation, reflexion, -, a..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",diego estrada\n\ndesign thinking assignment\n\...,26,36,"(diego, estrada, \n\n, design, thinking, assig...","[diego, estrada, \n\n, design, thinking, assig..."


### Lemmatization

- getting to the meaningful root word
- eg: organizes, organized, organizing have the root word as organize which is the lemma

In [22]:
def lemmatiz(col, nlp):
    doc = nlp(col)
    return [token for token in doc]

df['lemmatized'] = df['full_text_lower'].apply(partial(lemmatiz, nlp=nlp))

In [23]:
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,full_text_lower,total_length_sentences,total_sentences,new_tokens,wo_stp_wrd_tken,lemmatized
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",design thinking for innovation reflexion-avril...,32,57,"(design, thinking, for, innovation, reflexion,...","[design, thinking, innovation, reflexion, -, a...","[design, thinking, for, innovation, reflexion,..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",diego estrada\n\ndesign thinking assignment\n\...,26,36,"(diego, estrada, \n\n, design, thinking, assig...","[diego, estrada, \n\n, design, thinking, assig...","[diego, estrada, \n\n, design, thinking, assig..."


### Word Frequency

- Gives you count, most common words, etc

### POS Tagging

- Generally 8 types of Part of speech are used for tagging

In [24]:
def pos_tagger(x):
    nlp = spacy.load("en_core_web_sm")
    token = []
    pos = []
    for sent in nlp.pipe(x):
        token.append([word.text for word in sent])
        pos.append([word.pos_ for word in sent])

    return token, pos

df['tokens_tagger'], df['pos_tagger'] = pos_tagger(df['full_text_lower'])

In [25]:
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,full_text_lower,total_length_sentences,total_sentences,new_tokens,wo_stp_wrd_tken,lemmatized,tokens_tagger,pos_tagger
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",design thinking for innovation reflexion-avril...,32,57,"(design, thinking, for, innovation, reflexion,...","[design, thinking, innovation, reflexion, -, a...","[design, thinking, for, innovation, reflexion,...","[design, thinking, for, innovation, reflexion,...","[NOUN, NOUN, ADP, NOUN, NOUN, PUNCT, NOUN, NUM..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",diego estrada\n\ndesign thinking assignment\n\...,26,36,"(diego, estrada, \n\n, design, thinking, assig...","[diego, estrada, \n\n, design, thinking, assig...","[diego, estrada, \n\n, design, thinking, assig...","[diego, estrada, \n\n, design, thinking, assig...","[PROPN, PROPN, SPACE, NOUN, NOUN, NOUN, SPACE,..."


### Preprocessing as a function

In [26]:
# preprocess with stop words, punctuation symbols and lemmatized lowercase tokens
def token_allowed(token):
    return bool(token and str(token).strip() and not token.is_stop and not token.is_punct)

def preprocess_token(token):
    return token.lemma_.strip().lower()

preprocessed = []
for text in df['full_text']:
    doc = nlp(text)
    preprocessed_token = [preprocess_token(token) for token in doc if token_allowed(token)]
    preprocessed.append(preprocessed_token)

df['preprocessed'] = preprocessed

In [27]:
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,full_text_lower,total_length_sentences,total_sentences,new_tokens,wo_stp_wrd_tken,lemmatized,tokens_tagger,pos_tagger,preprocessed
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",design thinking for innovation reflexion-avril...,32,57,"(design, thinking, for, innovation, reflexion,...","[design, thinking, innovation, reflexion, -, a...","[design, thinking, for, innovation, reflexion,...","[design, thinking, for, innovation, reflexion,...","[NOUN, NOUN, ADP, NOUN, NOUN, PUNCT, NOUN, NUM...","[design, thinking, innovation, reflexion, avri..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",diego estrada\n\ndesign thinking assignment\n\...,26,36,"(diego, estrada, \n\n, design, thinking, assig...","[diego, estrada, \n\n, design, thinking, assig...","[diego, estrada, \n\n, design, thinking, assig...","[diego, estrada, \n\n, design, thinking, assig...","[PROPN, PROPN, SPACE, NOUN, NOUN, NOUN, SPACE,...","[diego, estrada, design, thinking, assignment,..."


### Rule-Based Matching

In [28]:
# checking existing labels vs pos_tagger
l1 = df.labels[:1].to_list()
l1_lemma = df.lemmatized[:1].to_list()
l1_pos = df.pos_tagger[:1].to_list()
l1_preprocess = df.tokens[:1].to_list()
print(l1[0][9], l1[0][10])
print(l1_lemma[0][9], l1_lemma[0][10])
print(l1_pos[0][9], l1_pos[0][10])
print(l1_preprocess[0][9], l1_preprocess[0][10])

B-NAME_STUDENT I-NAME_STUDENT
nathalie sylla
NOUN NOUN
Nathalie Sylla


#### Extracting full name tagged as PROPN

In [37]:
def full_name(txt, nlp):
    doc = nlp(txt)
    matcher = Matcher(nlp.vocab)
    pattern = [{
        "POS": "PROPN",
        "POS": "PROPN"
    }]
    matcher.add("Full_Name", [pattern])
    matches = matcher(doc)
    full_names = []
    for _, start, end in matches:
        span = doc[start:end]
        full_names.append(span.text)
    return full_names

df['full_names'] = df['full_text'].apply(partial(full_name, nlp=nlp))

In [38]:
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,full_text_lower,total_length_sentences,total_sentences,new_tokens,wo_stp_wrd_tken,lemmatized,tokens_tagger,pos_tagger,preprocessed,full_names
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",design thinking for innovation reflexion-avril...,32,57,"(design, thinking, for, innovation, reflexion,...","[design, thinking, innovation, reflexion, -, a...","[design, thinking, for, innovation, reflexion,...","[design, thinking, for, innovation, reflexion,...","[NOUN, NOUN, ADP, NOUN, NOUN, PUNCT, NOUN, NUM...","[design, thinking, innovation, reflexion, avri...","[Thinking, Avril, Nathalie, Sylla, Challenge, ..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",diego estrada\n\ndesign thinking assignment\n\...,26,36,"(diego, estrada, \n\n, design, thinking, assig...","[diego, estrada, \n\n, design, thinking, assig...","[diego, estrada, \n\n, design, thinking, assig...","[diego, estrada, \n\n, design, thinking, assig...","[PROPN, PROPN, SPACE, NOUN, NOUN, NOUN, SPACE,...","[diego, estrada, design, thinking, assignment,...","[Diego, Estrada, Design, Thinking, Assignment,..."


* Still nead cleaning though

#### Extracting phone numbers

In [41]:
def extract_phone_nums(text, nlp):
    doc = nlp(txt)
    matcher = Matcher(nlp.vocab)
    pattern = [
        {"ORTH": "("},
        {"SHAPE": "ddd"},
        {"ORTH": ")"},
        {"SHAPE": "ddd"},
        {"IS_SPACE": True, "OP": "?"},
        {"SHAPE": "ddd"},
        {"ORTH": "-"},
        {"SHAPE": "ddd"},
    ]
    matcher.add("PHONE_NUMBER", [pattern])
    matches = matcher(doc)
    ph_nums = []
    for match_id, start, end in matches:
        span = doc[start:end]
        ph_nums.append(span.text)
    
    return ph_nums

df['phone_nums'] = df['full_text_lower'].apply(partial(extract_phone_nums, nlp=nlp))

In [42]:
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,full_text_lower,total_length_sentences,total_sentences,new_tokens,wo_stp_wrd_tken,lemmatized,tokens_tagger,pos_tagger,preprocessed,full_names,phone_nums
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",design thinking for innovation reflexion-avril...,32,57,"(design, thinking, for, innovation, reflexion,...","[design, thinking, innovation, reflexion, -, a...","[design, thinking, for, innovation, reflexion,...","[design, thinking, for, innovation, reflexion,...","[NOUN, NOUN, ADP, NOUN, NOUN, PUNCT, NOUN, NUM...","[design, thinking, innovation, reflexion, avri...","[Thinking, Avril, Nathalie, Sylla, Challenge, ...",[]
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",diego estrada\n\ndesign thinking assignment\n\...,26,36,"(diego, estrada, \n\n, design, thinking, assig...","[diego, estrada, \n\n, design, thinking, assig...","[diego, estrada, \n\n, design, thinking, assig...","[diego, estrada, \n\n, design, thinking, assig...","[PROPN, PROPN, SPACE, NOUN, NOUN, NOUN, SPACE,...","[diego, estrada, design, thinking, assignment,...","[Diego, Estrada, Design, Thinking, Assignment,...",[]
