#### 텍스트 전처리

##### 1. 토큰화(Tokenization)

In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rubin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rubin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

##### 1) 단어 토큰화

In [2]:
sample = "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."

In [3]:
sample

"Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."

In [4]:
from nltk.tokenize import word_tokenize
print(word_tokenize(sample))

['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [5]:
from nltk.tokenize import WordPunctTokenizer
print(WordPunctTokenizer().tokenize(sample))

['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr', '.', 'Jone', "'", 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [6]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence
print(text_to_word_sequence(sample))

["don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'mr', "jone's", 'orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']


In [7]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize(sample))

['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


##### 2)문장 토큰화

In [None]:
from nltk.tokenize import sent_tokenize
text="His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to make sure no one was near."
print(sent_tokenize(text))

In [16]:
from nltk.tokenize import sent_tokenize
text = "His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally,\
the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. \
He looked about, to make sure no one was near."

In [17]:
text

'His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally,the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to make sure no one was near.'

In [18]:
print(text)

His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally,the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to make sure no one was near.


In [20]:
sent_tokenize(text)

['His barber kept his word.',
 'But keeping such a huge secret to himself was driving him crazy.',
 'Finally,the barber went up a mountain and almost to the edge of a cliff.',
 'He dug a hole in the midst of some reeds.',
 'He looked about, to make sure no one was near.']

In [21]:
print(sent_tokenize(text))

['His barber kept his word.', 'But keeping such a huge secret to himself was driving him crazy.', 'Finally,the barber went up a mountain and almost to the edge of a cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to make sure no one was near.']


In [22]:
text = "I'm actively looking for a Ph.D students. and you're a Ph.D student. But She isn't yet a Ph'D \
student, still in the master program."
print(sent_tokenize(text))

["I'm actively looking for a Ph.D students.", "and you're a Ph.D student.", "But She isn't yet a Ph'D student, still in the master program."]


#### 한글 문장 토큰화

In [23]:
import kss

In [24]:
text = "딥러닝 자연어 처리가 재미있기는 합니다. 그련데 문제는 영어보다 한국어로 할 때 너무 어려워요. \
농담 아니에요. 이제 해보시면 아실거에요."
print(text)

딥러닝 자연어 처리가 재미있기는 합니다. 그련데 문제는 영어보다 한국어로 할 때 너무 어려워요. 농담 아니에요. 이제 해보시면 아실거에요.


In [25]:
print(kss.split_sentences(text))

[Korean Sentence Splitter]: Initializing Pynori...


['딥러닝 자연어 처리가 재미있기는 합니다.', '그련데 문제는 영어보다 한국어로 할 때 너무 어려워요. 농담 아니에요.', '이제 해보시면 아실거에요.']


#### 3)품사(pos) 태깅

In [26]:
text = "I'm actively looking for a Ph.D students. and you're a Ph.D student. But She isn't yet a Ph'D \
student, still in the master program."
print(word_tokenize(text))

['I', "'m", 'actively', 'looking', 'for', 'a', 'Ph.D', 'students', '.', 'and', 'you', "'re", 'a', 'Ph.D', 'student', '.', 'But', 'She', 'is', "n't", 'yet', 'a', 'Ph', "'D", 'student', ',', 'still', 'in', 'the', 'master', 'program', '.']


In [27]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rubin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [28]:
from nltk.tag import pos_tag
x = word_tokenize(text)
pos_tag(x)

[('I', 'PRP'),
 ("'m", 'VBP'),
 ('actively', 'RB'),
 ('looking', 'VBG'),
 ('for', 'IN'),
 ('a', 'DT'),
 ('Ph.D', 'JJ'),
 ('students', 'NNS'),
 ('.', '.'),
 ('and', 'CC'),
 ('you', 'PRP'),
 ("'re", 'VBP'),
 ('a', 'DT'),
 ('Ph.D', 'NNP'),
 ('student', 'NN'),
 ('.', '.'),
 ('But', 'CC'),
 ('She', 'PRP'),
 ('is', 'VBZ'),
 ("n't", 'RB'),
 ('yet', 'RB'),
 ('a', 'DT'),
 ('Ph', 'NNP'),
 ("'D", 'MD'),
 ('student', 'VB'),
 (',', ','),
 ('still', 'RB'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('master', 'NN'),
 ('program', 'NN'),
 ('.', '.')]