In [1]:
import re
from collections import defaultdict

In [2]:
## Sample sentence: the same two negations written with an apostrophe,
## without one, and (for "cannot") as a fused word.
example_text = "I can't, cant, cannot, won't, wont do it!"

In [3]:
import spacy

from spacy.tokens import Doc
from spacy.language import Language

%html

<h3>Control Group:</h3>

In [4]:
## Control pipeline: the model exactly as loaded, with nothing added to it.
nlp_1 = spacy.load('en_core_web_sm') ## no expansion, control

In [5]:
## Tokenise the sample sentence with the control pipeline.
left_only = nlp_1(example_text)

## Tally how many times each token text occurs (missing keys start at 0).
hash_lookup_1 = defaultdict(int)
for tk in left_only:
    hash_lookup_1[tk.text] += 1

%html

<h3>Test Group:</h3>

In [6]:
## Test pipeline: a second, independent load of the same model; the
## contraction-expansion component is attached to this one later on.
nlp_2 = spacy.load('en_core_web_sm') ## expansion

In [7]:
## Contractions are matched case-insensitively.
_CONTRACTION_FLAGS = re.IGNORECASE

## Ordered (compiled pattern, replacement) rewrite rules.
##
## ORDER MATTERS: compound forms ("y'all'd've", "I'd've", "couldn't've") must be
## rewritten before the shorter rules that would otherwise match a part of them.
##
## The apostrophe is optional ('?) only where the apostrophe-less spelling is
## not itself an ordinary English word. Where it is ("well", "were", "its",
## "lets", "shed", ...) the apostrophe is required, otherwise plain prose
## would be rewritten. "cant" and "wont" are deliberately still expanded.
##
## https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, _CONTRACTION_FLAGS), replacement)
    for pattern, replacement in (
        ## normalise apostrophe look-alikes (backtick, typographic U+2019)
        ("[`\u2019]", "'"),

        ## starts / ends with ' -- \b cannot anchor next to an apostrophe, so
        ## zero-width look-arounds are used; nothing around the word is consumed
        (r"(?<!\w)'aight\b", "alright"),
        (r"(?<!\w)'cause\b", "because"),
        (r"(?<!\w)'t(was|is)\b", r"it \g<1>"),
        (r"\bol'(?!\w)", "old"),

        ## expand words without '
        ## (bare "cause" is NOT expanded: it is an ordinary noun / verb)
        (r"\baight\b", "alright"),
        (r"\b(finna|gonna)\b", "going to"),
        (r"\bgimme\b", "give me"),
        (r"\bgive'n\b", "given"),
        (r"\bhowdy\b", "how do you do"),
        (r"\bgotta\b", "got to"),
        (r"\binnit\b", "is it not"),
        (r"\b(can)(not)\b", r"\g<1> \g<2>"),
        (r"\bwanna\b", "want to"),
        (r"\bmethinks\b", "me thinks"),

        ## one offs,
        (r"\bo'er\b", "over"),
        (r"\bne'er\b", "never"),
        (r"\bo'?clock\b", "of the clock"),
        (r"\bma'am\b", "madam"),
        (r"\bgiv'n\b", "given"),
        (r"\be'er\b", "ever"),
        (r"\bd'ye\b", "do you"),
        (r"\bg'?day\b", "good day"),
        (r"\b(ain|amn)'?t\b", "am not"),
        (r"\b(are|can)'?t\b", r"\g<1> not"),
        (r"\b(let)'s\b", r"\g<1> us"),  ## "lets" is a verb form: apostrophe required

        ## major expansions involving smaller, (longest form first)
        (r"\by'all'dn't've'd\b", "you all would not have had"),
        (r"\by'all're\b", "you all are"),
        (r"\by'all'd've\b", "you all would have"),
        (r"\by'all\b", "you all"),

        ## minor,
        (r"\b(won)'?t\b", "will not"),
        (r"\b(he)'d\b", r"\g<1> had"),  ## capture keeps the original capitalisation

        ## major,
        (r"\b(I|we|who)'?d'?ve\b", r"\g<1> would have"),
        (r"\b(could|would|must|should)n'?t'?ve\b", r"\g<1> not have"),
        (r"\b(he)'?dn'?t'?ve'?d\b", r"\g<1> would not have had"),
        (r"\b(daren|daresn|dasn)'?t\b", "dare not"),

        ## 'll -- "hell", "ill", "shell", "well" are words: apostrophe required
        (r"\b(he|I|she|we)'ll\b", r"\g<1> will"),
        (r"\b(how|it|that|there|these|they|what|where|which|who|you)'?ll\b", r"\g<1> will"),

        ## 's -- "its" is the possessive: apostrophe required
        (r"\b(it)'s\b", r"\g<1> is"),
        (r"\b(everybody|everyone|he|how|she|somebody|someone|something|that|there|this|what|when|where|which|who|why)'?s\b", r"\g<1> is"),

        ## 'm -- the longer "I'm'a" / "I'm'o" forms go first
        (r"\b(I)'?m'a\b", r"\g<1> am about to"),
        (r"\b(I)'?m'o\b", r"\g<1> am going to"),
        (r"\b(I)'?m\b", r"\g<1> am"),

        ## n't
        (r"\bshan't\b", "shall not"),
        (r"\b(are|could|did|does|do|go|had|has|have|is|may|might|must|need|ought|shall|should|was|were|would)n'?t\b", r"\g<1> not"),

        ## 've
        (r"\b(could|had|he|I|may|might|must|should|these|they|those|to|we|what|where|which|who|would|you)'?ve\b", r"\g<1> have"),

        ## 're -- "sore", "were", "whore" are words: apostrophe required
        (r"\b(so|we|who)'re\b", r"\g<1> are"),
        (r"\b(how|that|there|these|they|those|what|where|which|why|you)'?re\b", r"\g<1> are"),

        ## 'd -- "id", "shed", "wed" are words: apostrophe required
        (r"\b(I|she|we)'d\b", r"\g<1> had"),
        (r"\b(it|that|there|they|which|you)'?d\b", r"\g<1> had"),
        (r"\b(how|what|where|who|why)'?d\b", r"\g<1> did"),
    )
)


def expand_contractions(text: str) -> str:
    """Rewrite English contractions in ``text`` as their expanded forms.

    Every rule in ``_CONTRACTION_RULES`` is applied once, in table order, to
    the whole string; each rule sees the output of the rules before it.

    Behaviour worth knowing:

    * Backticks and the typographic apostrophe (U+2019) are first replaced
      by a plain ``'`` so that all later rules see a single apostrophe form.
    * Matching is case-insensitive. Rules with a capture group keep the
      casing of the captured stem ("He'd" -> "He had"); rules with a fixed
      replacement emit it in lower case ("Won't" -> "will not").
    * Apostrophe-less spellings are expanded only when they cannot be
      mistaken for an ordinary word, so "cant" -> "can not" and
      "dont" -> "do not", while "were", "well", "its", "lets" and "cause"
      are left untouched.

    Args:
        text: The text to rewrite. An empty string is returned unchanged.

    Returns:
        A new string with every recognised contraction expanded.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)

    return text

In [8]:
## Fixture for expand_contractions: one "contraction = expected expansion"
## pair per line, separated by " = ".
test_text = """
 'aight = alright
 ain't = am not
 amn't = am not
 aren't = are not
 can't = can not
 'cause = because
 could've = could have
 couldn't = could not
 couldn't've = could not have
 daren't = dare not
 daresn't = dare not
 dasn't = dare not
 didn't = did not
 doesn't = does not
 don't = do not
 d'ye = do you
 e'er = ever
 everybody's = everybody is
 everyone's = everyone is
 finna = going to
 g'day = good day
 gimme = give me
 giv'n = given
 gonna = going to
 gon't = go not
 gotta = got to
 hadn't = had not
 had've = had have
 hasn't = has not
 haven't = have not
 he'd = he had
 he'dn't've'd = he would not have had
 he'll = he will
 he's = he is
 he've = he have
 how'd = how did
 howdy = how do you do
 how'll = how will
 how're = how are
 how's = how is
 I'd = I had
 I'd've = I would have
 I'll = I will
 I'm = I am
 I'm'a = I am about to
 I'm'o = I am going to
 innit = is it not
 I've = I have
 isn't = is not
 it'd = it had
 it'll = it will
 it's = it is
 let's = let us
 ma'am = madam
 mayn't = may not
 may've = may have
 methinks = me thinks
 mightn't = might not
 might've = might have
 mustn't = must not
 mustn't've = must not have
 must've = must have
 needn't = need not
 ne'er = never
 o'clock = of the clock
 o'er = over
 ol' = old
 oughtn't = ought not
 shalln't = shall not
 shan't = shall not
 she'd = she had
 she'll = she will
 she's = she is
 should've = should have
 shouldn't = should not
 shouldn't've = should not have
 somebody's = somebody is
 someone's = someone is
 something's = something is
 so're = so are
 that'll = that will
 that're = that are
 that's = that is
 that'd = that had
 there'd = there had
 there'll = there will
 there're = there are
 there's = there is
 these're = these are
 these've = these have
 they'd = they had
 they'll = they will
 they're = they are
 they've = they have
 this's = this is
 those're = those are
 those've = those have
 'tis = it is
 to've = to have
 'twas = it was
 wanna = want to
 wasn't = was not
 we'd = we had
 we'd've = we would have
 we'll = we will
 we're = we are
 we've = we have
 weren't = were not
 what'd = what did
 what'll = what will
 what're = what are
 what's = what is
 what've = what have
 when's = when is
 where'd = where did
 where'll = where will
 where're = where are
 where's = where is
 where've = where have
 which'd = which had
 which'll = which will
 which're = which are
 which's = which is
 which've = which have
 who'd = who did
 who'd've = who would have
 who'll = who will
 who're = who are
 who's = who is
 who've = who have
 why'd = why did
 why're = why are
 why's = why is
 won't = will not
 would've = would have
 wouldn't = would not
 wouldn't've = would not have
 y'all = you all
 y'all'd've = you all would have
 y'all'dn't've'd = you all would not have had
 y'all're = you all are
 you'd = you had
 you'll = you will
 you're = you are
 you've = you have
"""

In [9]:
## Expand the whole fixture in a single pass, then split it back into rows.
## NOTE(review): the expected (right-hand) text is expanded together with the
## contraction, so a rule that also rewrites an expected expansion changes
## both sides equally and the assertion below still passes.
rows = expand_contractions(test_text).split('\n')

## skip the empty rows produced by the fixture's leading / trailing newline
for row in ( row for row in rows if row != ''):
    ## split on the " = " separator and trim the padding around each side
    left, right = list(map(lambda r: r.strip(), row.split(' = ')))
    
    assert left == right, f"{left} = {right}"

In [10]:
## Show the expansion of the sample sentence as the cell's output.
expand_contractions(example_text)

'I can not, can not, can not, will not, will not do it!'

In [11]:
class ExpandContractionsComponent:
    """Callable pipeline step that swaps a ``Doc`` for a contraction-free one.

    When called, the component reads the text of the ``Doc`` it receives,
    passes it through ``expand_contractions`` and returns the ``Doc`` that
    ``nlp.make_doc`` builds from the expanded text. The ``Doc`` passed in is
    not returned.
    """

    ## label under which the component is listed in ``nlp.pipeline``
    name = "expand_contractions"

    nlp: Language

    def __init__(self, nlp: Language):
        """Keep a handle on ``nlp`` so ``__call__`` can call ``make_doc``."""
        self.nlp = nlp

    def __call__(self, doc: Doc) -> Doc:
        """Return a new ``Doc`` made from the expanded text of ``doc``."""
        expanded = expand_contractions(doc.text)
        return self.nlp.make_doc(expanded)

In [12]:
## Components of the test pipeline before anything is added to it.
nlp_2.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x106e16320>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x105e8ee28>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x105e8ee88>)]

In [13]:
## Insert the expansion component ahead of the tagger.
## NOTE(review): passing a component instance looks like the spaCy v2 calling
## style -- confirm against the installed spaCy version before upgrading.
nlp_2.add_pipe(
    ExpandContractionsComponent(nlp_2),
    before = 'tagger'
)

In [14]:
## Components of the test pipeline after the add_pipe call.
nlp_2.pipeline

[('expand_contractions',
  <__main__.ExpandContractionsComponent at 0x1259eb8d0>),
 ('tagger', <spacy.pipeline.pipes.Tagger at 0x106e16320>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x105e8ee28>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x105e8ee88>)]

In [15]:
## Run the test pipeline on example_text -- the very sentence the control
## pipeline processed -- rather than on a hand-copied literal, so the two
## groups cannot drift apart if the sample sentence is ever edited.
doc = nlp_2(example_text)

In [16]:
## Tally how many times each token text occurs in the test-group Doc
## (missing keys start at 0).
hash_lookup_2 = defaultdict(int)
for tk in doc:
    hash_lookup_2[tk.text] += 1

In [17]:
## Reprint the raw input sentence for reference.
print(example_text)

I can't, cant, cannot, won't, wont do it!


%html

<h3>Results:</h3>

In [18]:
## control group: token counts produced by nlp_1 (no expansion component)
print(dict(hash_lookup_1))

{'I': 1, 'ca': 2, "n't": 2, ',': 4, 'nt': 2, 'can': 1, 'not': 1, 'wo': 2, 'do': 1, 'it': 1, '!': 1}


In [19]:
## test group: token counts produced by nlp_2 (with the expansion component)
print(dict(hash_lookup_2))

{'I': 1, 'can': 3, 'not': 5, ',': 4, 'will': 2, 'do': 1, 'it': 1, '!': 1}
