# NLTK

In [1]:
#! pip install nltk



In [1]:
import nltk

In [None]:
#nltk.download('all')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


## Corpus
- NLTK 내에 있는 코퍼스를 다룰 수 있음.

In [3]:
from nltk.corpus import *

In [4]:
files = gutenberg.fileids()

In [5]:
print(files)

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [6]:
corpus1 = gutenberg.open("austen-emma.txt").read()

In [8]:
corpus2 = gutenberg.raw("austen-sense.txt")

In [9]:
type(corpus1)

str

## Tokenization
- 자연어 분석을 위해 우선 긴 문자열을 작은 단위(토큰, Token)로 쪼개는 작업
* 문장 단위 -> sentence tokenize
* 단어 단위 -> word tokenize

In [10]:
text = "Hello everyone! Roday we will learn Python. Shall we start?"

In [11]:
from nltk.tokenize import sent_tokenize #문장단위 분할

In [12]:
print(sent_tokenize(text))

['Hello everyone!', 'Roday we will learn Python.', 'Shall we start?']


In [13]:
from nltk.tokenize import word_tokenize #단어단위 분할

In [14]:
print(word_tokenize(text))

['Hello', 'everyone', '!', 'Roday', 'we', 'will', 'learn', 'Python', '.', 'Shall', 'we', 'start', '?']


#### 다양한 Tokenization

In [15]:
text = "He's my father-in-law"

In [17]:
from nltk.tokenize import TreebankWordTokenizer #Penn Treebank 기반. 하이픈 분리 X
tbt = TreebankWordTokenizer()
tbt.tokenize(text)

['He', "'s", 'my', 'father-in-law']

In [19]:
from nltk.tokenize import WordPunctTokenizer #모든 구두점으로 분리
tbt = WordPunctTokenizer()
tbt.tokenize(text)

['He', "'", 's', 'my', 'father', '-', 'in', '-', 'law']

In [21]:
from nltk.tokenize import RegexpTokenizer #정규표현식 이용해 분리
tbt = RegexpTokenizer("[\w']+")
tbt.tokenize(text)

["He's", 'my', 'father', 'in', 'law']

In [22]:
from nltk.tokenize import WhitespaceTokenizer #공백을 이용해 분리
tbt = WhitespaceTokenizer()
tbt.tokenize(text)

["He's", 'my', 'father-in-law']

### 형태소: 의미를 갖는 가장 작은 단위
### 형태소 분석
* 어간 추출
* 원형 복원
* 품사 태깅

## Stemming & Lemmatizing

### Stemming: 어간 추출
- 변화된 단어의 접미사나 어미를 제거하여, 같은 의미를 가지는 형태소의 기본형을 찾는 작업
* Porterstemmer
* LancasterStemmer
* RegexpStemmer
* SnowballStemmer (German)

In [23]:
words = ['wait', 'waiting', 'waited', 'waits']
from nltk.stem import PorterStemmer
ps = PorterStemmer()
for word in words:
    word_stemmed = ps.stem(word)
    print(word_stemmed)

wait
wait
wait
wait


In [25]:
sent = "Apple Watch Series 6 improves performance through redesigned hargware that packs even more features and power into the same impossively small design."
from nltk.stem import PorterStemmer
ps = PorterStemmer()
sent_stemmed = ""
words = word_tokenize(sent)
for word in words:
    word_stemmed = ps.stem(word)
    sent_stemmed += word_stemmed + " "
print(sent_stemmed)

appl watch seri 6 improv perform through redesign hargwar that pack even more featur and power into the same imposs small design . 


### Lemmatizing: 원형 복원
- 같은 의미를 가지는 여러 단어를 사전형(Lemma)으로 통일하는 작업
* WordNetLemmatizer

In [27]:
words = ['studying', 'studies', 'crying', 'cries']
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
for word in words:
    word_lemmatized = wnl.lemmatize(word)
    print(word_lemmatized)

studying
study
cry
cry


In [28]:
sent = "Apple Watch Series 6 improves performance through redesigned hargware that packs even more features and power into the same impossively small design."
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
sent_lemmatized = ""
words = word_tokenize(sent)
for word in words:
    word_lemmatized = wnl.lemmatize(word)
    sent_lemmatized += word_lemmatized + " "
print(sent_lemmatized)

Apple Watch Series 6 improves performance through redesigned hargware that pack even more feature and power into the same impossively small design . 


## POS Tagging
- 낱말을 문법적인 기능이나 형태, 뜻에 따라 구분
- 품사의 구분은 언어/연구자/분석기마다 다름(NLTK는 Penn Treebank Tagset 이용)
- 태그셋에 대한 자세한 설명 확인 가능

In [29]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [30]:
nltk.help.upenn_tagset("VB")

VB: verb, base form
    ask assemble assess assign assume atone attention avoid bake balkanize
    bank begin behold believe bend benefit bevel beware bless boil bomb
    boost brace break bring broil brush build ...


In [34]:
from nltk import pos_tag
sent = "Emma refused to permit us to obtain the permit"
words = word_tokenize(sent)
words_pos = pos_tag(words)
print(words_pos)

[('Emma', 'NNP'), ('refused', 'VBD'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'), ('to', 'TO'), ('obtain', 'VB'), ('the', 'DT'), ('permit', 'NN')]


In [37]:
from nltk import pos_tag
sent = "Emma refused to permit us to obtain the permit"
words = word_tokenize(sent)
words_pos = pos_tag(words)
for word_pos in words_pos:
    if "VB" in word_pos:
        print(word_pos)

('permit', 'VB')
('obtain', 'VB')


### Stopwords: 불용어 제거
- 자주 등장하지만 문장 분석에는 큰 도움이 되지 않는 단어들
- 불용어 리스트를 만들어 사용하는 경우가 많음

In [40]:
from nltk.corpus import stopwords
sent = "I looked in the mirror every morning and asked myself"
stop_words = stopwords.words('english')
words = word_tokenize(sent)
result = []
for word in words:
    if word not in stop_words:
        result.append(word)
print(result)

['I', 'looked', 'mirror', 'every', 'morning', 'asked']
