# 텍스트 전처리

## 1. 토큰화(Tokenization)

In [6]:
import nltk
# Fetch the NLTK data packages this notebook relies on: the 'punkt' tokenizer
# data and the 'stopwords' corpus. Packages already present are not fetched again.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Sample sentence with a contraction ("Don't"), an abbreviation ("Mr.") and a
# possessive ("Jone's"); the following cells run different tokenizers on it.
sample = "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."

In [8]:
from nltk.tokenize import word_tokenize  
# NLTK's word_tokenize: on this sample it splits "Don't" into 'Do' + "n't" and
# "Jone's" into 'Jone' + "'s", while keeping the abbreviation 'Mr.' intact.
print(word_tokenize(sample))  

['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [11]:
from nltk.tokenize import WordPunctTokenizer
# WordPunctTokenizer separates punctuation into its own tokens, so on this
# sample "Don't" becomes 'Don', "'", 't' and "Mr." becomes 'Mr', '.'.
print(WordPunctTokenizer().tokenize(sample))

['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr', '.', 'Jone', "'", 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [12]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence
# Keras' text_to_word_sequence: on this sample it lower-cases every token and
# drops commas and periods, but keeps apostrophes ("don't", "jone's").
print(text_to_word_sequence(sample))

["don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'mr', "jone's", 'orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']


In [13]:
from nltk.tokenize import TreebankWordTokenizer

# Sentence with a hyphenated word ("home-based") and a contraction ("doesn't")
# to show how the Penn Treebank tokenizer handles both.
text = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."
tokenizer = TreebankWordTokenizer()

# Tokenize first, then print, so the token list is available for inspection.
treebank_tokens = tokenizer.tokenize(text)
print(treebank_tokens)

['Starting', 'a', 'home-based', 'restaurant', 'may', 'be', 'an', 'ideal.', 'it', 'does', "n't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']


## 2) 문장 토큰화

In [15]:
# Install KSS (Korean Sentence Splitter) into the notebook environment, then import it.
!pip install kss
import kss

# Paragraph of four Korean sentences to split.
# NOTE(review): the output recorded for this cell (kss 3.1.0.4) returns only
# three items; the 2nd and 3rd sentences are left merged.
text='딥 러닝 자연어 처리가 재미있기는 합니다. 그런데 문제는 영어보다 한국어로 할 때 너무 어려워요. 농담아니에요. 이제 해보면 알걸요?'
print(kss.split_sentences(text))

Collecting kss
  Downloading kss-3.1.0.4.tar.gz (42.3 MB)
[K     |████████████████████████████████| 42.3 MB 48 kB/s 
[?25hCollecting emoji
  Downloading emoji-1.4.2.tar.gz (184 kB)
[K     |████████████████████████████████| 184 kB 55.4 MB/s 
[?25hBuilding wheels for collected packages: kss, emoji
  Building wheel for kss (setup.py) ... [?25l[?25hdone
  Created wheel for kss: filename=kss-3.1.0.4-py3-none-any.whl size=42336591 sha256=99071154a7af55ee185c5740b3d2557e9f406fbab2d78fddd432eeef93ce58a0
  Stored in directory: /root/.cache/pip/wheels/94/d8/3c/b5f02f814e08c3e2f35e32ae2ac92a34c8412ed6f92ff470ce
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.4.2-py3-none-any.whl size=186469 sha256=5e5c214b172098cc7cecbaa8965b2ad1fa41c327dcb8362ba123f93e85b4bb85
  Stored in directory: /root/.cache/pip/wheels/e4/61/e7/2fc1ac8f306848fc66c6c013ab511f0a39ef4b1825b11363b2
Successfully built kss emoji
Installing collected packages: emoji, kss


[Korean Sentence Splitter]: Initializing Kss...


['딥 러닝 자연어 처리가 재미있기는 합니다.', '그런데 문제는 영어보다 한국어로 할 때 너무 어려워요. 농담아니에요.', '이제 해보면 알걸요?']


## 3) 품사(POS) 태깅

In [17]:
# Fetch the averaged-perceptron tagger data package (unpacked under nltk_data/taggers).
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [19]:
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

text = "I am actively looking for Ph.D. students. and you are a Ph.D. student."
# pos_tag() expects a sequence of tokens. Handing it the raw string makes it
# iterate over the individual characters and tag each one, so the sentence
# must be split into word tokens first.
print(pos_tag(word_tokenize(text)))

[('I', 'PRP'), ('am', 'VBP'), ('actively', 'RB'), ('looking', 'VBG'), ('for', 'IN'), ('Ph.D.', 'NNP'), ('students', 'NNS'), ('.', '.'), ('and', 'CC'), ('you', 'PRP'), ('are', 'VBP'), ('a', 'DT'), ('Ph.D.', 'NNP'), ('student', 'NN'), ('.', '.')]


- 한글 (KoNLPy): 자바로 개발된 형태소 분석기를 파이썬에서 사용할 수 있게 해 주는 패키지

In [20]:
# Install KoNLPy via pip; the shell redirect sends pip's standard output to /dev/null.
! pip install Konlpy > /dev/null

Collecting Konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.3 MB/s 
Collecting beautifulsoup4==4.6.0
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 4.6 MB/s 
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 59.6 MB/s 
Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: JPype1, colorama, beautifulsoup4, Konlpy
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulsoup4-4.6.3
Successfully installed JPype1-1.3.0 Konlpy-0.5.2 beautifulsoup4-4.6.0 colorama-0.4.4


### Okt

In [22]:
from konlpy.tag import Okt
# Create the Okt analyzer once; the following cells reuse this instance.
okt = Okt()
okt.morphs('열심히 코딩한 당신, 연휴에는 여행을 가봐요')  # morphological analysis: split the sentence into morphemes

['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가봐요']

In [23]:
# Keep the sentence in `text` so the later Okt and Kkma cells analyze the same input.
text = '열심히 코딩한 당신, 연휴에는 여행을 가봐요'
okt.pos(text)  # part-of-speech tagging: returns (morpheme, tag) pairs

[('열심히', 'Adverb'),
 ('코딩', 'Noun'),
 ('한', 'Josa'),
 ('당신', 'Noun'),
 (',', 'Punctuation'),
 ('연휴', 'Noun'),
 ('에는', 'Josa'),
 ('여행', 'Noun'),
 ('을', 'Josa'),
 ('가봐요', 'Verb')]

In [27]:
# Extract only the nouns from the sentence.
okt.nouns(text)

['코딩', '당신', '연휴', '여행']

### 꼬꼬마

In [28]:
from konlpy.tag import Kkma
# Kkma analyzer on the same sentence. Its recorded output is finer-grained than
# Okt's, e.g. Okt's single morpheme '한' comes back as '하' + 'ㄴ'.
kkma = Kkma()
kkma.morphs(text)

['열심히', '코딩', '하', 'ㄴ', '당신', ',', '연휴', '에', '는', '여행', '을', '가보', '아요']

In [29]:
# Part-of-speech tagging with Kkma: (morpheme, tag) pairs using Kkma's own tag names (e.g. 'NNG', 'JKO').
kkma.pos(text)

[('열심히', 'MAG'),
 ('코딩', 'NNG'),
 ('하', 'XSV'),
 ('ㄴ', 'ETD'),
 ('당신', 'NP'),
 (',', 'SP'),
 ('연휴', 'NNG'),
 ('에', 'JKM'),
 ('는', 'JX'),
 ('여행', 'NNG'),
 ('을', 'JKO'),
 ('가보', 'VV'),
 ('아요', 'EFN')]

In [30]:
# Noun extraction with Kkma; for this sentence the recorded result matches Okt's.
kkma.nouns(text)

['코딩', '당신', '연휴', '여행']

## 2. 정제 및 정규화

In [31]:
import re

text = "I was wondering if anyone out there could enlighten me on this car."

# Matches any whole word of one or two characters, together with the run of
# non-word characters (spaces, punctuation) immediately in front of it.
shortword = re.compile(r'\W*\b\w{1,2}\b')

# Delete every match; longer words and the text between them are left untouched.
cleaned = shortword.sub('', text)
print(cleaned)

 was wondering anyone out there could enlighten this car.


## 3. 어간 추출(Stemming) 및 표제어 추출(Lemmatization)

### 1) 표제어 추출(Lemmatization)

In [33]:
# Fetch the WordNet corpus (unpacked under nltk_data/corpora).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [34]:
from nltk.stem import WordNetLemmatizer

n = WordNetLemmatizer()
words = [
    'policy', 'doing', 'organization', 'have', 'going', 'love',
    'lives', 'fly', 'dies', 'watched', 'has', 'starting',
]

# Lemmatize every word without passing a part of speech. In the recorded
# output 'dies' and 'has' come back as 'dy' and 'ha'.
lemmas = list(map(n.lemmatize, words))
print(lemmas)

['policy', 'doing', 'organization', 'have', 'going', 'love', 'life', 'fly', 'dy', 'watched', 'ha', 'starting']


In [40]:
# Passing 'v' as the second argument changes the result: the recorded output is
# 'die', 'have', 'go', 'watch' instead of 'dy', 'ha', 'going', 'watched'.
n.lemmatize('dies', 'v'), n.lemmatize('has','v'), n.lemmatize('going','v'), n.lemmatize('watched','v')

('die', 'have', 'go', 'watch')

### 2) 어간 추출(Stemming)

In [41]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Porter stemmer instance, reused by the following cells.
s = PorterStemmer()
text="This was not the map we found in Billy Bones's chest, but an accurate copy, complete in all things--names and heights and soundings--with the single exception of the red crosses and the written notes."
# Tokenize first; the stemmer is applied to one token at a time in the next cell.
words=word_tokenize(text)
print(words)

['This', 'was', 'not', 'the', 'map', 'we', 'found', 'in', 'Billy', 'Bones', "'s", 'chest', ',', 'but', 'an', 'accurate', 'copy', ',', 'complete', 'in', 'all', 'things', '--', 'names', 'and', 'heights', 'and', 'soundings', '--', 'with', 'the', 'single', 'exception', 'of', 'the', 'red', 'crosses', 'and', 'the', 'written', 'notes', '.']


In [43]:
# Stem every token. The recorded output is lower-cased and some stems are not
# dictionary words (e.g. 'This' -> 'thi', 'copy' -> 'copi').
print([s.stem(w) for w in words])

['thi', 'wa', 'not', 'the', 'map', 'we', 'found', 'in', 'billi', 'bone', "'s", 'chest', ',', 'but', 'an', 'accur', 'copi', ',', 'complet', 'in', 'all', 'thing', '--', 'name', 'and', 'height', 'and', 'sound', '--', 'with', 'the', 'singl', 'except', 'of', 'the', 'red', 'cross', 'and', 'the', 'written', 'note', '.']


In [44]:
# Words ending in -ALIZE, -ANCE and -ICAL; the recorded output shows them
# reduced to 'formal', 'allow' and 'electric'.
words=['formalize', 'allowance', 'electricical']
print([s.stem(w) for w in words])

['formal', 'allow', 'electric']


In [45]:
# Porter stemmer on the same word list that was given to the WordNet
# lemmatizer earlier, so the two outputs can be compared side by side.
s=PorterStemmer()
words=['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']
print([s.stem(w) for w in words])

['polici', 'do', 'organ', 'have', 'go', 'love', 'live', 'fli', 'die', 'watch', 'ha', 'start']


In [46]:
from nltk.stem import LancasterStemmer
# Lancaster stemmer on the same word list. Its recorded output differs from
# Porter's, e.g. 'organization' -> 'org' (Porter: 'organ') and 'have' -> 'hav'.
l=LancasterStemmer()
words=['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']
print([l.stem(w) for w in words])

['policy', 'doing', 'org', 'hav', 'going', 'lov', 'liv', 'fly', 'die', 'watch', 'has', 'start']


## 4. 불용어(Stopwords)

In [48]:
from nltk.corpus import stopwords
# Show NLTK's English stopword list.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [53]:
example = "Family is not an important thing. It's everything."

# NLTK's English stopwords as a set, extended with the "'s" token that
# word_tokenize splits off from "It's".
stop_words = set(stopwords.words('english')) | {"'s"}

# Lower-case before tokenizing so tokens can match the lower-case stopword entries.
word_tokens = word_tokenize(example.lower())

# Keep the tokens that are not stopwords, in their original order.
result = [token for token in word_tokens if token not in stop_words]

print(word_tokens)
print(result)

['family', 'is', 'not', 'an', 'important', 'thing', '.', 'it', "'s", 'everything', '.']
['family', 'important', 'thing', '.', 'everything', '.']


In [52]:
# One-line equivalent of the filtering loop in the previous cell.
# NOTE(review): the output recorded for this cell still contains "'s". Its
# execution count (52) precedes the previous cell's (53), so it was presumably
# run before "'s" was added to stop_words. Re-run to refresh.
[w for w in word_tokens if w not in stop_words]

['family', 'important', 'thing', '.', "'s", 'everything', '.']

In [54]:
example = "고기를 아무렇게나 구우려고 하면 안 돼. 고기라고 다 같은 게 아니거든. 예컨대 삼겹살을 구울 때는 중요한 게 있지."

# The stopwords below were picked arbitrarily by the author from words that
# are not nouns; they are not a meaningful selection criterion.
stop_words = "아무거나 아무렇게나 어찌하든지 같다 비슷하다 예컨대 이럴정도로 하면 아니거든".split(' ')

word_tokens = word_tokenize(example)

# Drop every token that appears in the stopword list. This comprehension is
# equivalent to an explicit for-loop that appends each non-stopword token.
result = [token for token in word_tokens if token not in stop_words]

print(word_tokens)
print(result)

['고기를', '아무렇게나', '구우려고', '하면', '안', '돼', '.', '고기라고', '다', '같은', '게', '아니거든', '.', '예컨대', '삼겹살을', '구울', '때는', '중요한', '게', '있지', '.']
['고기를', '구우려고', '안', '돼', '.', '고기라고', '다', '같은', '게', '.', '삼겹살을', '구울', '때는', '중요한', '게', '있지', '.']


In [55]:
# str.split() with no argument splits on whitespace, giving the stopword list directly.
st = '아무거나 아무렇게나 같다'.split()
st

['아무거나', '아무렇게나', '같다']