# Test Tokenizer
   
This notebook shows examples of how to use the Lexos `tokenizer` module.

## Add Lexos to the Jupyter `sys.path`

In [1]:
# Run the local setup script with the relative path to the Lexos package so that
# the `lexos` imports in the next cell resolve.
%run jupyter_local_setup.py ../../../lexos

System path set to `../../../lexos`.


## Import Lexos Modules

In [2]:
from lexos import tokenizer
from lexos.io.smart import Loader

## Load Data

In [3]:
# Relative path to the sample text file used by the examples in this notebook.
data = "../test_data/txt/Austen_Pride.txt"
# Load the file; later cells read the loaded text(s) from `loader.texts`.
loader = Loader()
loader.load(data)

## Making a Doc

In [4]:
# Tokenize the first loaded text into a doc.
# By default the language model used is "xx_sent_ud_sm", which is a multi-language model.
doc = tokenizer.make_doc(loader.texts[0])

In [5]:
# Pass `model` to choose which language model is used when tokenizing.
doc = tokenizer.make_doc(
    loader.texts[0],
    model="en_core_web_sm",
)

In [6]:
# Multiple texts can be tokenized in one call using tokenizer.make_docs,
# which returns a list of docs.
docs = tokenizer.make_docs(loader.texts)

## Disabling or Excluding Components
Tokenizing with a language model can take a relatively long time. To increase efficiency, you can disable or exclude components you do not intend to use. Disabled components are loaded but not used; excluded components are not loaded at all.

In [None]:
# `disable`: the listed components are loaded but unused.
doc = tokenizer.make_doc(
    loader.texts[0],
    model="en_core_web_sm",
    disable=["tagger", "parser"],
)
# Alternative, left commented out: `exclude` does not load the listed components.
# doc = tokenizer.make_doc(loader.texts[0], model="en_core_web_sm", exclude=["tagger", "parser"])

## Stop Words
Stop words can be added or removed with the `add_stopwords` and `remove_stopwords` arguments of `make_doc`.

In [None]:
text = "This is an example string to test the tokenizer"

# Add stop words: show each token with its stop-word flag.
txtDoc = tokenizer.make_doc(text, add_stopwords=["an", "the", "is"])
for token in txtDoc:
    print(token.text, token.is_stop)

# Language models usually come with default stop words. `remove_stopwords`
# removes unwanted entries from the list a language model marks as default.
txtDoc = tokenizer.make_doc(
    text,
    model="en_core_web_sm",
    remove_stopwords=["is", "the"],
)
print("\nRemove:")
for token in txtDoc:
    print(token.text, token.is_stop)

## Generating Word Ngrams

In [None]:
text = "This is an example string to test the tokenizer component"
doc = tokenizer.make_doc(text)
# Generate word ngrams from the doc.
# NOTE(review): size=2 presumably means two-token ngrams — confirm against the tokenizer API.
ngrams = tokenizer.ngrams_from_doc(doc, size=2)
for ngram in ngrams:
    print(ngram)

In [9]:
# An alternative method to create word ngrams is to use textacy directly. This method has additional options
# documented here: https://textacy.readthedocs.io/en/latest/api_reference/extract.html#textacy.extract.basics.ngrams
from textacy.extract.basics import ngrams as ng
text = "The end is nigh."
doc = tokenizer.make_doc(text)
# NOTE(review): this rebinds `ngrams`, which the preceding cell assigns from
# tokenizer.ngrams_from_doc, so later cells see whichever cell ran last.
ngrams = list(ng(doc, 2, min_freq=1))

## Generating Docs From Ngrams
`ngrams_from_doc` generates a list of ngrams from a doc. If you want to use the ngrams as a doc, you will need to generate a new doc from them with `doc_from_ngrams`.

In [None]:
# Build the ngrams inside this cell with ngrams_from_doc, as the section text
# describes. Earlier cells bind the global `ngrams` twice (once from
# tokenizer.ngrams_from_doc and once from textacy directly), so reading that name
# here would make the result depend on which cell happened to run last.
doc_ngrams = tokenizer.ngrams_from_doc(doc, size=2)
nDoc = tokenizer.doc_from_ngrams(doc_ngrams, strict=True, model="en_core_web_sm")
for token in nDoc:
    print(token.text)

## Generating Character Ngrams
Character ngrams are generated from untokenized text.

In [None]:
text = "This is an example string to test the tokenizer"
# Character ngrams are generated from the raw string rather than from a doc.
# NOTE(review): the positional 2 is presumably the ngram size — confirm against the tokenizer API.
chNgrams = tokenizer.generate_character_ngrams(text, 2, drop_whitespace=False)
for ngram in chNgrams:
    print(ngram)