### Tokenization test

This notebook compares how different tokenization libraries split the same text.

In [3]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [5]:
import nltk

# Fetch the Punkt tokenizer data package.
# The call is left as the last expression so its return value is displayed.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/minseokoh/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#### Word Tokenization

In [11]:
# NLTK word_tokenize

In [14]:
# NLTK word_tokenize: the contraction and the possessive each come out as two tokens.
print(
    'word tokenization1: ',
    word_tokenize(
        "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is a cheery as cheery goes for a pastry shop."
    ),
)

word tokenization1:  ['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'a', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


`word_tokenize` splits `Don't` into `Do` and `n't`, and `Jone's` into `Jone` and `'s`.

In [10]:
# NLTK WordPunctTokenizer

In [15]:
# WordPunctTokenizer: punctuation is emitted as separate tokens, so the
# apostrophes and the period after "Mr" stand on their own.
print(
    'word tokenization2: ',
    WordPunctTokenizer().tokenize(
        "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is a cheery as cheery goes for a pastry shop."
    ),
)

word tokenization2:  ['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr', '.', 'Jone', "'", 's', 'Orphanage', 'is', 'a', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [9]:
## Keras text_to_word_sequence

In [16]:
# Keras text_to_word_sequence: tokens come back lowercased, commas and periods
# are dropped, and apostrophes inside words are kept.
print(
    'word tokenization3: ',
    text_to_word_sequence(
        "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is a cheery as cheery goes for a pastry shop."
    ),
)

word tokenization3:  ["don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'mr', "jone's", 'orphanage', 'is', 'a', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']


#### Standard example of tokenization

In [13]:
from nltk.tokenize import TreebankWordTokenizer

# Sample input: contains a hyphenated word and a period in the middle of the text.
text = "Starting a home-based restaurant may be an ideal. it does't have a food chain or restaurant of their own."

# Tokenize with NLTK's TreebankWordTokenizer and show the resulting token list.
tokenizer = TreebankWordTokenizer()
print('TreebankWordTokenizer : ', tokenizer.tokenize(text))

TreebankWordTokenizer :  ['Starting', 'a', 'home-based', 'restaurant', 'may', 'be', 'an', 'ideal.', 'it', "does't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']


#### Sentence tokenization

In [20]:
from nltk.tokenize import sent_tokenize

# Three plain sentences, each ending in a period.
text = (
    "His barber kept his word. "
    "But keeping such a huge secret to himself was driving him crazy. "
    "Finally, the barber went up a mountain and almost to the edge of a cliff."
)
print("Sentence tokenization1: ", sent_tokenize(text))

Sentence tokenization1:  ['His barber kept his word.', 'But keeping such a huge secret to himself was driving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of a cliff.']


In [21]:
# Harder case: the abbreviation "Ph.D." has periods that are not sentence ends.
text = (
    "I am actively looking for Ph.D. students. "
    "and you are a Ph.D. student."
)
print("Sentence tokenization2: ", sent_tokenize(text))

Sentence tokenization2:  ['I am actively looking for Ph.D. students.', 'and you are a Ph.D. student.']


#### Korean Sentence Tokenization

In [25]:
## pip install kss

In [24]:
## import kss

#### Part-of-speech tagging

In [28]:
from nltk.tag import pos_tag

# NOTE(review): no nltk.download() call for tagger data is visible in this
# notebook -- presumably it was already installed locally; confirm on a fresh setup.
text = (
    "I am actively looking for Ph.D. students. "
    "and you are a Ph.D. student."
)

# Split into word tokens first; the tagger is then applied to that token list.
tokenized_sentence = word_tokenize(text)

print("Word tokenize: ", tokenized_sentence)
print("Tagging: ", pos_tag(tokenized_sentence))

Word tokenize:  ['I', 'am', 'actively', 'looking', 'for', 'Ph.D.', 'students', '.', 'and', 'you', 'are', 'a', 'Ph.D.', 'student', '.']
Tagging:  [('I', 'PRP'), ('am', 'VBP'), ('actively', 'RB'), ('looking', 'VBG'), ('for', 'IN'), ('Ph.D.', 'NNP'), ('students', 'NNS'), ('.', '.'), ('and', 'CC'), ('you', 'PRP'), ('are', 'VBP'), ('a', 'DT'), ('Ph.D.', 'NNP'), ('student', 'NN'), ('.', '.')]


In [35]:
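## PRP personal pronoun, VBP verb (present tense, non-3rd person), RB adverb,
## VBG verb (gerund / present participle), IN preposition,
## NNP proper noun, NN singular noun, NNS plural noun, CC conjunction, DT determiner (article)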

### Morpheme tokenization

In [30]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp311-cp311-macosx_10_9_universal2.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading JPype1-1.5.0-cp311-cp311-macosx_10_9_universal2.whl (587 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.4/587.4 kB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0


In [32]:
from konlpy.tag import Kkma, Okt

# KoNLPy analyzers: Okt, Mecab, Komoran, Hannanum, Kkma -- only Okt and Kkma are used here.
# NOTE(review): creating an analyzer starts a JVM; if no Java installation can be
# located (JAVA_HOME unset or wrong) this raises JVMNotFoundException.
okt = Okt()
kkma = Kkma()

# Run the Okt analyzer three ways on the same sentence:
# morphs (morphemes), pos (part-of-speech tagged pairs), nouns (nouns only).
for analyze in (okt.morphs, okt.pos, okt.nouns):
    print(analyze("열심히 일한 당신 떠나라"))

JVMNotFoundException: No JVM shared library file (libjli.dylib) found. Try setting up the JAVA_HOME environment variable properly.

In [33]:
# Same three analyses with the Kkma analyzer.
# `kkma` is created in the preceding cell, which must have run successfully.
for analyze in (kkma.morphs, kkma.pos, kkma.nouns):
    print(analyze("열심히 일한 당신 떠나라"))

NameError: name 'kkma' is not defined