In [1]:
import re
from collections import defaultdict

In [2]:
# Sample sentence mixing apostrophised ("can't"), apostrophe-less ("cant") and
# fused ("cannot") spellings, plus the "won't"/"wont" pair.
example_text = "I can't, cant, cannot, won't, wont do it!"

In [3]:
import spacy

from spacy.tokens import Doc
from spacy.language import Language

%html

<h3>Control Group:</h3>

In [4]:
# Control pipeline: 'en_core_web_sm' used exactly as loaded here.
nlp_1 = spacy.load('en_core_web_sm') ## no expansion, control

In [5]:
# Run the control pipeline over the sample sentence and count how often each
# token's surface text occurs.
left_only = nlp_1(example_text)

hash_lookup_1 = defaultdict(int)
for token in left_only:
    surface = token.text
    hash_lookup_1[surface] += 1

%html

<h3>Test Group:</h3>

In [6]:
# Test pipeline: a second, separate load of the same model so it can be modified
# without touching the control pipeline.
nlp_2 = spacy.load('en_core_web_sm') ## expansion

In [7]:
def expand_contractions(text):
    """Rewrite "can't" / "cannot" / "won't" style negations as two words.

    The following whole-word forms are rewritten, case-insensitively:

    * ``can't``, ``cant``  -> ``can not``
    * ``cannot``           -> ``can not``
    * ``won't``, ``wont``  -> ``will not``

    Both the ASCII apostrophe (') and the typographic one (U+2019) are
    accepted. The capitalisation of the first word is carried over, so
    ``Can't`` becomes ``Can not`` and ``Won't`` becomes ``Will not``.

    Note that the apostrophe is optional, so the ordinary English words
    "cant" and "wont" are rewritten as well.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        ``text`` with the contractions above expanded; everything else is
        left untouched.
    """
    # No pattern uses ^ or $, so IGNORECASE is the only flag that matters.
    flags = re.IGNORECASE

    # "can" is captured so that its original casing survives the rewrite.
    text = re.sub(r"\b(can)['\u2019]?t\b", r"\g<1> not", text, flags=flags)
    text = re.sub(r"\b(can)(not)\b", r"\g<1> \g<2>", text, flags=flags)

    def _will_not(match):
        # "won" shares no spelling with "will", so the casing cannot come from
        # a backreference; mirror the capitalisation of the leading letter.
        will = "Will" if match.group(1)[0].isupper() else "will"
        return will + " not"

    text = re.sub(r"\b(won)['\u2019]?t\b", _will_not, text, flags=flags)

    return text

In [8]:
# Check the plain-regex expansion on the sample sentence, outside any spaCy pipeline.
expand_contractions(example_text)

'I can not, can not, can not, will not, will not do it!'

In [9]:
class ContractionsComponent:
    """Pipeline component that expands contractions in a Doc's text.

    The instance keeps a reference to the `Language` object it was created
    with. When called with a Doc, it runs `expand_contractions` over the
    Doc's text and returns the Doc that `nlp.make_doc` builds from the
    expanded string.
    """

    # Name under which the component shows up in `nlp.pipeline`.
    name = "expand_contractions"

    nlp: Language

    def __init__(self, nlp: Language):
        """Store the `Language` object used later to build the new Doc."""
        self.nlp = nlp

    def __call__(self, doc: Doc) -> Doc:
        """Return a Doc made from the contraction-expanded text of `doc`."""
        expanded = expand_contractions(doc.text)
        return self.nlp.make_doc(expanded)

In [10]:
# Show the test pipeline's components before the custom component is registered.
nlp_2.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x1061ed080>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x105269e28>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x105269e88>)]

In [11]:
# Register the contraction expander ahead of the tagger. The membership check
# keeps this cell re-runnable: adding a second component under an already-used
# name makes add_pipe raise.
if ContractionsComponent.name not in nlp_2.pipe_names:
    nlp_2.add_pipe(ContractionsComponent(nlp_2), before = 'tagger')

In [12]:
# Show the test pipeline again to confirm where the custom component sits.
nlp_2.pipeline

[('expand_contractions', <__main__.ContractionsComponent at 0x129844a20>),
 ('tagger', <spacy.pipeline.pipes.Tagger at 0x1061ed080>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x105269e28>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x105269e88>)]

In [13]:
# Feed the test pipeline the same `example_text` the control group processed,
# so both token tallies describe identical input.
doc = nlp_2(example_text)

In [14]:
# Count how often each token's surface text occurs in the test-group Doc.
hash_lookup_2 = defaultdict(int)
for token in doc:
    surface = token.text
    hash_lookup_2[surface] += 1

In [15]:
# Echo the raw input sentence for reference.
print(example_text)

I can't, cant, cannot, won't, wont do it!


%html

<h3>Results:</h3>

In [16]:
# Print both tallies; the extra newline after the first one leaves a blank
# line between the control and test output.
print('control group:', dict(hash_lookup_1), end='\n\n')
print('test group:', dict(hash_lookup_2))

control group: {'I': 1, 'ca': 2, "n't": 2, ',': 4, 'nt': 2, 'can': 1, 'not': 1, 'wo': 2, 'do': 1, 'it': 1, '!': 1}

test group: {'I': 1, 'can': 3, 'not': 5, ',': 4, 'will': 2, 'do': 1, 'it': 1, '!': 1}
