# _NLTK 자연어 처리 패키지_

## 말뭉치

In [1]:
import nltk

In [2]:
# Download the 'averaged_perceptron_tagger' resource into the local nltk_data directory.
# NOTE(review): presumably the model pos_tag relies on in the POS tagging section — confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/quartz/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
# Fetch the NLTK data packages one by one, in the same order as before.
for _resource in ("gutenberg", "punkt", "reuters", "stopwords", "webtext"):
    nltk.download(_resource)
# The final download stays a bare last expression so the cell still displays its result.
nltk.download("wordnet")

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/quartz/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /Users/quartz/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package reuters to /Users/quartz/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/quartz/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package webtext to /Users/quartz/nltk_data...
[nltk_data]   Unzipping corpora/webtext.zip.
[nltk_data] Downloading package wordnet to /Users/quartz/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [4]:
# List the file ids available in the Gutenberg corpus.
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [6]:
# Load the raw text of "austen-emma.txt" and preview its first 1000 characters.
emma_raw = nltk.corpus.gutenberg.raw("austen-emma.txt")
print(emma_raw[:1000])

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had died too long ago for her to have more than an indistinct
remembrance of her caresses; and her place had been supplied
by an excellent woman as governess, who had fallen little short
of a mother in affection.

Sixteen years had Miss Taylor been in Mr. Woodhouse's family,
less as a governess than a friend, very fond of both daughters,
but particularly of Emma.  Between _them_ it was more the intimacy
of sisters.  Even before Miss Taylor had ceased to hold the nominal
office of governess, the mildness o

## 토큰 생성

In [7]:
from nltk.tokenize import sent_tokenize

In [19]:
# Print the fourth sentence (index 3) that sent_tokenize finds in the first 1000 characters.
print(sent_tokenize(emma_raw[:1000])[3])

Sixteen years had Miss Taylor been in Mr. Woodhouse's family,
less as a governess than a friend, very fond of both daughters,
but particularly of Emma.


In [20]:
# Split the same 1000-character excerpt into word and punctuation tokens.
from nltk.tokenize import word_tokenize
print(word_tokenize(emma_raw[:1000]))

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER', 'I', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', ',', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence', ';', 'and', 'had', 'lived', 'nearly', 'twenty-one', 'years', 'in', 'the', 'world', 'with', 'very', 'little', 'to', 'distress', 'or', 'vex', 'her', '.', 'She', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'a', 'most', 'affectionate', ',', 'indulgent', 'father', ';', 'and', 'had', ',', 'in', 'consequence', 'of', 'her', 'sister', "'s", 'marriage', ',', 'been', 'mistress', 'of', 'his', 'house', 'from', 'a', 'very', 'early', 'period', '.', 'Her', 'mother', 'had', 'died', 'too', 'long', 'ago', 'for', 'her', 'to', 'have', 'more', 'than', 'an', 'indistinct', 'remembrance', 'of', 'her', 'caresses', ';', 'and', 'her', 'place', 'had', 'been', 'supplied', 'by', 'an'

## 형태소 분석
- - - 

- 어간 추출(stemming)
- 원형 복원(lemmatizing)

In [None]:
# Stemming

In [2]:
# Sample inflected verb forms used as input for the stemming and lemmatizing examples.
words = [
    "lives",
    "dies",
    "flies",
    "died",
]

In [3]:
# Import only the two stemmers this cell instantiates rather than using
# `from nltk.stem import *`, so it is clear where each name comes from and the
# notebook namespace is not filled with every name nltk.stem exports.
from nltk.stem import LancasterStemmer, PorterStemmer

ps = PorterStemmer()     # Porter stemmer instance
ls = LancasterStemmer()  # Lancaster stemmer instance

In [8]:
# Show the input words, then the stems each stemmer produces for them.
print("Words : ", words)
for label, stemmer in (("PorterStemmer", ps), ("LancasterStemmer", ls)):
    stems = [stemmer.stem(token) for token in words]
    print(label + " : ", stems)

Words :  ['lives', 'dies', 'flies', 'died']
PorterStemmer :  ['live', 'die', 'fli', 'die']
LancasterStemmer :  ['liv', 'die', 'fli', 'died']


In [None]:
# Lemmatization

In [9]:
# Create the WordNet lemmatizer instance used by the lemmatization example.
from nltk.stem import WordNetLemmatizer
lm = WordNetLemmatizer()

In [16]:
# Lemmatize every word, telling the lemmatizer to treat it as a verb (pos='v').
[lm.lemmatize(word, pos='v') for word in words]

['live', 'die', 'fly', 'die']

## POS tagging

In [20]:
# Tag a hand-written token list; pos_tag returns (token, tag) pairs.
# NOTE(review): "volumne" looks like a typo for "volume"; left as-is because the
# recorded output of this cell was produced with this exact token.
from nltk.tag import pos_tag
x = ["volumne", "I", "chapter", "1", "I", "am", "a", "boy", "."]
tagged_list = pos_tag(x)
tagged_list

[('volumne', 'NN'),
 ('I', 'PRP'),
 ('chapter', 'VBP'),
 ('1', 'CD'),
 ('I', 'PRP'),
 ('am', 'VBP'),
 ('a', 'DT'),
 ('boy', 'NN'),
 ('.', '.')]

In [21]:
# untag strips the tags again and gives back the plain token list.
from nltk.tag import untag
untag(tagged_list)

['volumne', 'I', 'chapter', '1', 'I', 'am', 'a', 'boy', '.']

# _KoNLPy 한국어 처리 패키지_

In [21]:
from konlpy.corpus import kolaw

In [22]:
# List the files shipped with the kolaw corpus.
kolaw.fileids()

['constitution.txt']

In [23]:
# Read the whole constitution text into `c`. The context manager closes the
# corpus file handle as soon as the text is loaded instead of leaving it open.
with kolaw.open('constitution.txt') as constitution_file:
    c = constitution_file.read()

In [24]:
# Preview the first 66 characters of the constitution text.
print(c[:66])

대한민국헌법

유구한 역사와 전통에 빛나는 우리 대한국민은 3·1운동으로 건립된 대한민국임시정부의 법통과 불의에 항거한


In [25]:
from konlpy.corpus import kobill

In [26]:
# List the files shipped with the kobill corpus.
kobill.fileids()

['1809896.txt',
 '1809897.txt',
 '1809895.txt',
 '1809894.txt',
 '1809890.txt',
 '1809891.txt',
 '1809893.txt',
 '1809892.txt',
 '1809899.txt',
 '1809898.txt']

In [28]:
# Read one bill into `d`. The context manager closes the corpus file handle
# once the text is loaded; then preview the first 100 characters.
with kobill.open('1809896.txt') as bill_file:
    d = bill_file.read()
print(d[:100])

행정절차법 일부개정법률안

(유선호의원 대표발의 )

 의 안
 번 호

9896

발의연월일 : 2010.  11.  15.

발  의  자 : 유선호․강기갑․김효석  

최문순


## 형태소 분석

In [29]:
# Import exactly the tagger classes instantiated in this notebook instead of
# `from konlpy.tag import *`, so it is clear where each name comes from.
from konlpy.tag import Hannanum, Kkma, Twitter

In [31]:
# One instance of each morphological analyzer compared in the sections that follow.
kkma = Kkma()
hannanum = Hannanum()
# NOTE(review): newer KoNLPy releases rename Twitter to Okt — confirm against the installed version.
twitter = Twitter()

### 꼬꼬마

In [32]:
# Nouns Kkma extracts from the first 66 characters of the constitution text.
kkma.nouns(c[:66])

['대한',
 '대한민국',
 '대한민국헌법',
 '민국',
 '헌법',
 '유구',
 '역사',
 '전통',
 '우리',
 '국민',
 '3',
 '1',
 '1운동',
 '운동',
 '건립',
 '대한민국임시정부',
 '임시',
 '정부',
 '법통',
 '불의',
 '항거']

In [33]:
# Morphemes (without tags) Kkma produces for the same excerpt.
kkma.morphs(c[:66])

['대한민국',
 '헌법',
 '유구',
 '하',
 'ㄴ',
 '역사',
 '와',
 '전통',
 '에',
 '빛나',
 '는',
 '우리',
 '대하',
 'ㄴ',
 '국민',
 '은',
 '3',
 '·',
 '1',
 '운동',
 '으로',
 '건립',
 '되',
 'ㄴ',
 '대한민국',
 '임시',
 '정부',
 '의',
 '법통',
 '과',
 '불의',
 '에',
 '항거',
 '하',
 'ㄴ']

In [34]:
# (morpheme, tag) pairs Kkma produces for the same excerpt.
kkma.pos(c[:66])

[('대한민국', 'NNG'),
 ('헌법', 'NNG'),
 ('유구', 'NNG'),
 ('하', 'XSV'),
 ('ㄴ', 'ETD'),
 ('역사', 'NNG'),
 ('와', 'JC'),
 ('전통', 'NNG'),
 ('에', 'JKM'),
 ('빛나', 'VV'),
 ('는', 'ETD'),
 ('우리', 'NNM'),
 ('대하', 'VV'),
 ('ㄴ', 'ETD'),
 ('국민', 'NNG'),
 ('은', 'JX'),
 ('3', 'NR'),
 ('·', 'SP'),
 ('1', 'NR'),
 ('운동', 'NNG'),
 ('으로', 'JKM'),
 ('건립', 'NNG'),
 ('되', 'XSV'),
 ('ㄴ', 'ETD'),
 ('대한민국', 'NNG'),
 ('임시', 'NNG'),
 ('정부', 'NNG'),
 ('의', 'JKG'),
 ('법통', 'NNG'),
 ('과', 'JC'),
 ('불의', 'NNG'),
 ('에', 'JKM'),
 ('항거', 'NNG'),
 ('하', 'XSV'),
 ('ㄴ', 'ETD')]

### hannanum

In [35]:
# Nouns Hannanum extracts from the first 66 characters of the constitution text.
hannanum.nouns(c[:66])

['대한민국헌법',
 '유구',
 '역사',
 '전통',
 '빛',
 '우리',
 '대한국민',
 '3·1운동',
 '건립',
 '대한민국임시정부',
 '법통',
 '불의',
 '항거한']

In [36]:
# Morphemes (without tags) Hannanum produces for the same excerpt.
hannanum.morphs(c[:66])

['대한민국헌법',
 '유구',
 '하',
 'ㄴ',
 '역사',
 '와',
 '전통',
 '에',
 '빛',
 '나는',
 '우리',
 '대한국민',
 '은',
 '3·1운동',
 '으로',
 '건립',
 '되',
 'ㄴ',
 '대한민국임시정부',
 '의',
 '법통',
 '과',
 '불의',
 '에',
 '항거한']

In [37]:
# (morpheme, tag) pairs Hannanum produces for the same excerpt.
hannanum.pos(c[:66])

[('대한민국헌법', 'N'),
 ('유구', 'N'),
 ('하', 'X'),
 ('ㄴ', 'E'),
 ('역사', 'N'),
 ('와', 'J'),
 ('전통', 'N'),
 ('에', 'J'),
 ('빛', 'N'),
 ('나는', 'J'),
 ('우리', 'N'),
 ('대한국민', 'N'),
 ('은', 'J'),
 ('3·1운동', 'N'),
 ('으로', 'J'),
 ('건립', 'N'),
 ('되', 'X'),
 ('ㄴ', 'E'),
 ('대한민국임시정부', 'N'),
 ('의', 'J'),
 ('법통', 'N'),
 ('과', 'J'),
 ('불의', 'N'),
 ('에', 'J'),
 ('항거한', 'N')]

### twitter

In [38]:
# Nouns the Twitter analyzer extracts from the first 66 characters of the constitution text.
twitter.nouns(c[:66])

['대한민국',
 '헌법',
 '유구',
 '역사',
 '전통',
 '우리',
 '대한',
 '국민',
 '운동',
 '건립',
 '대한민국',
 '임시정부',
 '법',
 '통과',
 '불의',
 '항거']

In [39]:
# Morphemes (without tags) the Twitter analyzer produces for the same excerpt.
twitter.morphs(c[:66])

['대한민국',
 '헌법',
 '유구',
 '한',
 '역사',
 '와',
 '전통',
 '에',
 '빛나는',
 '우리',
 '대한',
 '국민',
 '은',
 '3',
 '·',
 '1',
 '운동',
 '으로',
 '건립',
 '된',
 '대한민국',
 '임시정부',
 '의',
 '법',
 '통과',
 '불의',
 '에',
 '항거',
 '한']

In [40]:
# (morpheme, tag) pairs the Twitter analyzer produces for the same excerpt.
twitter.pos(c[:66])

[('대한민국', 'Noun'),
 ('헌법', 'Noun'),
 ('유구', 'Noun'),
 ('한', 'Josa'),
 ('역사', 'Noun'),
 ('와', 'Josa'),
 ('전통', 'Noun'),
 ('에', 'Josa'),
 ('빛나는', 'Verb'),
 ('우리', 'Noun'),
 ('대한', 'Noun'),
 ('국민', 'Noun'),
 ('은', 'Josa'),
 ('3', 'Number'),
 ('·', 'Foreign'),
 ('1', 'Number'),
 ('운동', 'Noun'),
 ('으로', 'Josa'),
 ('건립', 'Noun'),
 ('된', 'Verb'),
 ('대한민국', 'Noun'),
 ('임시정부', 'Noun'),
 ('의', 'Josa'),
 ('법', 'Noun'),
 ('통과', 'Noun'),
 ('불의', 'Noun'),
 ('에', 'Josa'),
 ('항거', 'Noun'),
 ('한', 'Josa')]

# _Scikit-Learn의 문서 전처리 기능_
- - -

- DictVectorizer
- CountVectorizer
- TfidfVectorizer
- HashingVectorizer

## DictVectorizer

In [23]:
# sparse=False asks for a dense array result (see the array output of fit_transform).
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)

In [24]:
# Each dict is one sample mapping feature name -> value; a feature absent from a
# sample shows up as 0 in the resulting matrix (see output).
# NOTE(review): `d` held the kobill bill text in an earlier cell; this cell rebinds it to a list of dicts.
d = [{'A' : 1, 'B' : 2}, {'B' : 3, "C" : 2}]
dv.fit_transform(d)

array([[1., 2., 0.],
       [0., 3., 2.]])

In [25]:
# Feature names learned by the fit, in column order of the matrix above.
dv.feature_names_

['A', 'B', 'C']

## CountVectorizer
- - -

- stop_words : 문자열, 리스트, None (디폴트). 문서에서 단어장을 생성할 때 무시할 수 있는 단어. 
- analyzer : 문자열 {‘word’, ‘char’, ‘char_wb’} 또는 함수. 
- token_pattern : string. 토큰 정의용 정규 표현식. 
- tokenizer : 함수 또는 None (디폴트). 토큰 생성 함수.
- ngram_range : (min_n, max_n) 튜플.

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
# Five short sentences used as the toy corpus for the vectorizer examples.
corpus = [
    "This is the first document.",
    "This is the second second document.",
    "And the third one.",
    "Is this the first document?",
    "The last document?",
]
corpus

['This is the first document.',
 'This is the second second document.',
 'And the third one.',
 'Is this the first document?',
 'The last document?']

In [29]:
# Learn the vocabulary from the toy corpus (fit returns the vectorizer itself),
# then show the term -> column index mapping.
cv = CountVectorizer().fit(corpus)
cv.vocabulary_

{'this': 9,
 'is': 3,
 'the': 7,
 'first': 2,
 'document': 1,
 'second': 6,
 'and': 0,
 'third': 8,
 'one': 5,
 'last': 4}

In [32]:
# Encode a single sentence as a dense count vector over the fitted vocabulary.
cv.transform(['This is the first document.']).toarray()

array([[0, 1, 1, 1, 0, 0, 0, 1, 0, 1]])

In [34]:
# Count matrix for the whole corpus: one row per document, one column per vocabulary term.
cv.transform(corpus).toarray()

array([[0, 1, 1, 1, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 1, 0, 0, 1, 0, 0]])

### Stop Words

In [36]:
# Exclude an explicit list of words from the vocabulary, then show what remains.
vect = CountVectorizer(stop_words=["and", "is", "the", "this"]).fit(corpus)
vect.vocabulary_

{'first': 1, 'document': 0, 'second': 4, 'third': 5, 'one': 3, 'last': 2}

In [37]:
# Use the built-in 'english' stop-word list; only two terms survive (see output).
vect = CountVectorizer(stop_words='english')
vect.fit(corpus)
vect.vocabulary_

{'document': 0, 'second': 1}

### 토큰 : analyzer, token_pattern, tokenizer

In [38]:
# analyzer='char' builds the vocabulary from individual characters instead of words.
vect = CountVectorizer(analyzer='char')
vect.fit(corpus)
vect.vocabulary_

{'t': 16,
 'h': 8,
 'i': 9,
 's': 15,
 ' ': 0,
 'e': 6,
 'f': 7,
 'r': 14,
 'd': 5,
 'o': 13,
 'c': 4,
 'u': 17,
 'm': 11,
 'n': 12,
 '.': 1,
 'a': 3,
 '?': 2,
 'l': 10}

In [39]:
# Keep only tokens that start with "t" followed by at least one word character.
# The pattern is a raw string so that `\w` reaches the regex engine as written;
# in a normal string literal `\w` is an invalid escape sequence that Python warns about.
vect = CountVectorizer(token_pattern=r't\w+').fit(corpus)
vect.vocabulary_

{'this': 2, 'the': 0, 'third': 1}

In [40]:
import nltk

# Delegate tokenization to NLTK's word_tokenize; punctuation marks appear as
# vocabulary entries in the result (see output).
vect = CountVectorizer(tokenizer=nltk.word_tokenize)
vect.fit(corpus)
vect.vocabulary_

{'this': 11,
 'is': 5,
 'the': 9,
 'first': 4,
 'document': 3,
 '.': 0,
 'second': 8,
 'and': 2,
 'third': 10,
 'one': 7,
 '?': 1,
 'last': 6}

## n-gram
- - -

- 단어장 생성에 사용할 크기 결정

In [43]:
# ngram_range=(1, 1): the vocabulary contains single words only.
vect = CountVectorizer(ngram_range=(1,1)).fit(corpus)
vect.vocabulary_

{'this': 9,
 'is': 3,
 'the': 7,
 'first': 2,
 'document': 1,
 'second': 6,
 'and': 0,
 'third': 8,
 'one': 5,
 'last': 4}

In [42]:
# ngram_range=(1, 2): the vocabulary contains single words and two-word sequences.
vect = CountVectorizer(ngram_range=(1,2)).fit(corpus)
vect.vocabulary_

{'this': 21,
 'is': 5,
 'the': 14,
 'first': 3,
 'document': 2,
 'this is': 22,
 'is the': 6,
 'the first': 15,
 'first document': 4,
 'second': 11,
 'the second': 17,
 'second second': 13,
 'second document': 12,
 'and': 0,
 'third': 19,
 'one': 10,
 'and the': 1,
 'the third': 18,
 'third one': 20,
 'is this': 7,
 'this the': 23,
 'last': 8,
 'the last': 16,
 'last document': 9}

In [44]:
# ngram_range=(2, 2): the vocabulary contains two-word sequences only.
vect = CountVectorizer(ngram_range=(2,2)).fit(corpus)
vect.vocabulary_

{'this is': 12,
 'is the': 2,
 'the first': 7,
 'first document': 1,
 'the second': 9,
 'second second': 6,
 'second document': 5,
 'and the': 0,
 'the third': 10,
 'third one': 11,
 'is this': 3,
 'this the': 13,
 'the last': 8,
 'last document': 4}

## 빈도수

In [48]:
# Keep only terms that occur in at least 2 and at most 4 documents of the corpus.
# NOTE(review): `stop_words` is the constructor argument (None here). If the goal is
# to show the terms dropped by max_df/min_df, the fitted `stop_words_` attribute may
# be what was meant — confirm against the installed scikit-learn version.
vect = CountVectorizer(min_df=2, max_df=4)
vect.fit(corpus)
(vect.vocabulary_, vect.stop_words)

({'this': 3, 'is': 2, 'first': 1, 'document': 0}, None)

## TF-IDF

In [49]:
# TF-IDF weighted document-term matrix for the toy corpus, shown as a dense array.
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer()
tv.fit_transform(corpus).toarray()

array([[0.        , 0.38947624, 0.55775063, 0.4629834 , 0.        ,
        0.        , 0.        , 0.32941651, 0.        , 0.4629834 ],
       [0.        , 0.24151532, 0.        , 0.28709733, 0.        ,
        0.        , 0.85737594, 0.20427211, 0.        , 0.28709733],
       [0.55666851, 0.        , 0.        , 0.        , 0.        ,
        0.55666851, 0.        , 0.26525553, 0.55666851, 0.        ],
       [0.        , 0.38947624, 0.55775063, 0.4629834 , 0.        ,
        0.        , 0.        , 0.32941651, 0.        , 0.4629834 ],
       [0.        , 0.45333103, 0.        , 0.        , 0.80465933,
        0.        , 0.        , 0.38342448, 0.        , 0.        ]])

## Hashing Trick

In [50]:
# Load the 20 newsgroups data and count the documents returned.
from sklearn.datasets import fetch_20newsgroups
twenty = fetch_20newsgroups()
len(twenty.data)

11314

In [51]:
# HashingVectorizer with default settings, timed against CountVectorizer in the cells that follow.
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer()

In [56]:
%%time

cv.fit_transform(twenty.data)

CPU times: user 4.37 s, sys: 123 ms, total: 4.5 s
Wall time: 4.6 s


<11314x130107 sparse matrix of type '<class 'numpy.int64'>'
	with 1787565 stored elements in Compressed Sparse Row format>

In [53]:
%%time

# Time HashingVectorizer on the same newsgroup documents for comparison.
hv.fit_transform(twenty.data)

CPU times: user 3.46 s, sys: 67.4 ms, total: 3.52 s
Wall time: 3.62 s


<11314x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 1787304 stored elements in Compressed Sparse Row format>