In [None]:
# default_exp nlp

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# export
import os
import spacy

# NLP

This module provides tools for working with text data. The `nlp` object is a spaCy English pipeline (`en_core_web_sm`), loaded once and used for tokenization.

In [None]:
# export
# Load the small English spaCy pipeline once, at import time; `tokenize`
# uses this shared object rather than loading the model on every call.
nlp = spacy.load('en_core_web_sm')

In [None]:
# export
def tokenize(raw, lower=True):
    """Tokenize a string of text.

    Only the spaCy tokenizer is applied (via `nlp.make_doc`); the remaining
    pipeline components are not run because only the token text is needed.

    Parameters
    ----------
    raw: str
        The text to tokenize.
    lower: bool
        If True, lowercase the input text before tokenizing.

    Returns
    -------
    list[str]
        The text of each token, in order of appearance. Punctuation is
        returned as separate tokens; an empty string yields an empty list.
    """
    if lower:
        raw = raw.lower()
    # make_doc runs just the tokenizer, so there is no need to enumerate
    # (and keep in sync) the names of pipeline components to disable.
    return [token.text for token in nlp.make_doc(raw)]

`tokenize` uses the spaCy tokenizer to split a string into tokens (words and punctuation marks).

In [None]:
# Example sentences reused by the demonstrations in the rest of this notebook.
sentence1 = 'My dog and I walked to the park after work.'
sentence2 = 'My dog and my cat walked to the park to get exercise and play.'

In [None]:
# With the default lower=True the text is lowercased before tokenizing;
# the trailing period comes back as its own token.
tokenize(sentence1)

['my', 'dog', 'and', 'i', 'walked', 'to', 'the', 'park', 'after', 'work', '.']

In [None]:
# Edge case: an empty string produces an empty token list.
tokenize('')

[]

In [None]:
# export
def lexical_density(text, lower=True):
    """Compute lexical density of a piece of text.

    The value is computed as (number of unique tokens) / (total number of
    tokens).

    https://en.wikipedia.org/wiki/Lexical_density

    NOTE: the ratio computed here is the type-token ratio; the linked article
    defines lexical density in terms of content words, which this function
    does not attempt to identify.

    Parameters
    ----------
    text: str or iterable of str
        A raw string (tokenized with `tokenize`) or an already-tokenized
        sequence. Any iterable of tokens is accepted, including tuples and
        generators.
    lower: bool
        Only used when `text` is a str (see `tokenize` docs).

    Returns
    -------
    float: Number between 0 and 1, where larger values indicate a higher
        lexical density. Returns 0.0 when there are no tokens.
    """
    if isinstance(text, str):
        tokens = tokenize(text, lower)
    else:
        # Materialize once so one-shot iterables (e.g. generators) can be
        # both de-duplicated and counted.
        tokens = list(text)
    if not tokens:
        # Avoid dividing by zero on empty input.
        return 0.0
    return len(set(tokens)) / len(tokens)

`lexical_density` works on either strings or lists of strings containing word tokens.

In [None]:
# Every token in sentence1 (including the final period) occurs exactly once,
# so unique / total is 1.0.
print(sentence1)
lexical_density(sentence1)

My dog and I walked to the park after work.


1.0

Sentence 2 has multiple occurrences of "my", "to", and "and" (after lowercasing), so its lexical density is lower: 12 unique tokens out of 15.

In [None]:
# sentence2 contains repeated tokens, so the ratio drops below 1.
print(sentence2)
lexical_density(sentence2)

My dog and my cat walked to the park to get exercise and play.


0.8

Passing in an already-tokenized sentence returns the same result.

In [None]:
# A list of tokens is used as-is: lexical_density only calls tokenize()
# when it is given a str.
tokens = tokenize(sentence2)
print(tokens)
lexical_density(tokens)

['my', 'dog', 'and', 'my', 'cat', 'walked', 'to', 'the', 'park', 'to', 'get', 'exercise', 'and', 'play', '.']


0.8