# Maximum matching word segmentation algorithm

### Tokenization using Python Split

In [1]:
# Word tokenization with str.split(): called without a separator it cuts the
# text on runs of whitespace, so punctuation stays attached to the words.

Mystr = (
    "This is a tokenization tutorial. We are learning different tokenization methods,\n"
    "and ways, Tokenization is essential in NLP tasks."
)

Tokens = str.split(Mystr)
print(Tokens)

['This', 'is', 'a', 'tokenization', 'tutorial.', 'We', 'are', 'learning', 'different', 'tokenization', 'methods,', 'and', 'ways,', 'Tokenization', 'is', 'essential', 'in', 'NLP', 'tasks.']


In [2]:
# Sentence tokenization with str.split(): every '.' is treated as a sentence
# boundary, and nothing else is.

Mystr = (
    "This is a tokenization tutorial. We are learning different tokenization methods,\n"
    "and ways? Tokenization is essential in NLP tasks."
)

Tokens = Mystr.split(sep='.')
print(Tokens)

['This is a tokenization tutorial', ' We are learning different tokenization methods,\nand ways? Tokenization is essential in NLP tasks', '']


<strong>Problem:</strong> 
It splits after the last “.” as well, which leaves an empty string at the end of the list. It also doesn’t treat “?” as the start of a new sentence, because `split` is given a single separator, which is “.”.

### Tokenization using Regex (Regular Expression)

In [3]:
import re

Mystr = '''This is a tokenization tutorial. We are learning different tokenization methods,
and ways? Tokenization is essential in NLP tasks.'''

# r"\w+" matches each run of word characters (letters, digits, underscore),
# so punctuation and whitespace never end up in a token. The raw-string prefix
# hands the backslash to the regex engine untouched; a plain "\w" is an
# invalid string escape that newer Python versions warn about.
Tokens = re.findall(r"\w+", Mystr)
print(Tokens)

['This', 'is', 'a', 'tokenization', 'tutorial', 'We', 'are', 'learning', 'different', 'tokenization', 'methods', 'and', 'ways', 'Tokenization', 'is', 'essential', 'in', 'NLP', 'tasks']


In [4]:
# Sentence tokenization with a regex: cut the text at every '.', '!', '?'
# or newline character.

Mystr = (
    "This is a tokenization tutorial. We are learning different tokenization methods,\n"
    "and ways? Tokenization is essential in NLP tasks."
)

Tokens = re.split('[.!?\n]', Mystr)
print(Tokens)

['This is a tokenization tutorial', ' We are learning different tokenization methods,', 'and ways', ' Tokenization is essential in NLP tasks', '']


### NLTK Tokenization

In [5]:
from nltk.tokenize import word_tokenize

Mystr = (
    "This is a tokenization tutorial. We are learning different tokenization methods,\n"
    "and ways? Tokenization is essential in NLP tasks."
)

# In the recorded output, punctuation marks come back as separate tokens.
word_tokens = word_tokenize(Mystr)
print(word_tokens)

['This', 'is', 'a', 'tokenization', 'tutorial', '.', 'We', 'are', 'learning', 'different', 'tokenization', 'methods', ',', 'and', 'ways', '?', 'Tokenization', 'is', 'essential', 'in', 'NLP', 'tasks', '.']


In [6]:
from nltk.tokenize import sent_tokenize

Mystr = (
    "This is a tokenization tutorial. We are learning different tokenization methods,\n"
    "and ways? Tokenization is essential in NLP tasks."
)

# In the recorded output, each sentence keeps its closing punctuation and the
# line break inside the second sentence is preserved.
sentences = sent_tokenize(Mystr)
print(sentences)

['This is a tokenization tutorial.', 'We are learning different tokenization methods,\nand ways?', 'Tokenization is essential in NLP tasks.']


### Spacy Tokenization

In [7]:
from spacy.lang.en import English

# Build the pipeline from spaCy's English language class.
nlp = English()

Mystr = (
    "This is a tokenization tutorial. We are learning different tokenization methods,\n"
    "and ways? Tokenization is essential in NLP tasks."
)
my_doc = nlp(Mystr)

# Collect the text of every token in the processed document.
Tokens = [token.text for token in my_doc]

print(Tokens)

['This', 'is', 'a', 'tokenization', 'tutorial', '.', 'We', 'are', 'learning', 'different', 'tokenization', 'methods', ',', '\n', 'and', 'ways', '?', 'Tokenization', 'is', 'essential', 'in', 'NLP', 'tasks', '.']


In [8]:
import spacy 
  
# Load the "en_core_web_sm" English pipeline.
nlp = spacy.load("en_core_web_sm")

Mystr = (
    "This is a tokenization tutorial. We are learning different tokenization methods,\n"
    "and ways? Tokenization is essential in NLP tasks."
)

# Run the pipeline over the text.
doc = nlp(Mystr)

# Print every sentence the pipeline found, one print call per sentence.
for sentence in doc.sents:
    print(sentence)

This is a tokenization tutorial.
We are learning different tokenization methods,
and ways?
Tokenization is essential in NLP tasks.


### Keras Tokenization

In [9]:
from keras.preprocessing.text import text_to_word_sequence

Mystr = (
    "This is a tokenization tutorial. We are learning different tokenization methods,\n"
    "and ways? Tokenization is essential in NLP tasks."
)

# In the recorded output, the tokens are lower-cased and carry no punctuation.
Tokens = text_to_word_sequence(Mystr)
print(Tokens)

['this', 'is', 'a', 'tokenization', 'tutorial', 'we', 'are', 'learning', 'different', 'tokenization', 'methods', 'and', 'ways', 'tokenization', 'is', 'essential', 'in', 'nlp', 'tasks']


### Gensim Tokenizer

In [10]:
from gensim.utils import tokenize

Mystr = (
    "This is a tokenization tutorial. We are learning different tokenization methods,\n"
    "and ways? Tokenization is essential in NLP tasks."
)

# Unpack whatever tokenize() yields into a list so it can be printed in one go.
print([*tokenize(Mystr)])

['This', 'is', 'a', 'tokenization', 'tutorial', 'We', 'are', 'learning', 'different', 'tokenization', 'methods', 'and', 'ways', 'Tokenization', 'is', 'essential', 'in', 'NLP', 'tasks']


In [11]:
# NOTE(review): the gensim.summarization package was removed in gensim 4.0, so
# this import presumably needs gensim<4 — confirm the pinned version.
from gensim.summarization.textcleaner import split_sentences

Mystr = (
    "This is a tokenization tutorial. We are learning different tokenization methods,\n"
    "and ways? Tokenization is essential in NLP tasks."
)

# In the recorded output, the line break inside the second sentence also acts
# as a boundary, giving four pieces.
Tokens = split_sentences(Mystr)
print(Tokens)

['This is a tokenization tutorial.', 'We are learning different tokenization methods,', 'and ways?', 'Tokenization is essential in NLP tasks.']
