## 1. 형태소분석 : NLTK

In [10]:
#nltk 설치하기
#!pip install nltk

In [27]:
#nltk 실행하기
import nltk
from nltk import pos_tag
nltk.download("punkt_tab")
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\125\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\125\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [22]:
#샘플 텍스트
text = 'The little yellow dog barked at the Persian cat'

In [23]:
split_text = nltk.word_tokenize(text)
split_text

['The', 'little', 'yellow', 'dog', 'barked', 'at', 'the', 'Persian', 'cat']

In [29]:
tag_text = nltk.pos_tag(split_text)
tag_text

[('The', 'DT'),
 ('little', 'JJ'),
 ('yellow', 'JJ'),
 ('dog', 'NN'),
 ('barked', 'VBD'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('Persian', 'JJ'),
 ('cat', 'NN')]

##### ♦️1. 실습예제
  - 명사만 추출하는 define 함수를 만들어보자!

In [60]:
def define(text):
    split_text = nltk.word_tokenize(text)
    text_tag = nltk.pos_tag(split_text)
    
    return [word for word, tag in text_tag if tag in ['NN','NNS','NNP','NNPS']]  

In [62]:
define(text)

['dog', 'cat']

### 2.1. NLTK 불용어 사용하기

In [65]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\125\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [66]:
stopwords = nltk.corpus.stopwords.words('english')

In [67]:
stopwords

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

### 2.2 소문자변환

In [64]:
text.lower()

'the little yellow dog barked at the persian cat'

##### ♦️2. 실습예제
  - 영어 불용어 적용 명사 define 만들기

In [73]:
def define(text):
    split_text = nltk.word_tokenize(text)
    text_tag = nltk.pos_tag(split_text)
    
    return [word for word, tag in text_tag 
            if (tag in ['NN','NNS','NNP','NNPS']) and (word.lower() not in stopwords) ]  

In [74]:
text = 'ABOVE AFTER AGAIN CAT DOG'
define(text)

['CAT', 'DOG']

## 2. 형태소분석 : Konlpy

In [None]:
## 코랩 konlpy 실행
# !curl -s https://raw.githubusercontent.com/teddylee777/machine-learning/master/99-Misc/01-Colab/mecab-colab.sh | bash

In [75]:
from konlpy.tag import Okt
okt=Okt()

In [83]:
#문장 정규화
#okt.morphs(text, norm=True)
#okt.pos(text, norm = True)

#어간 추출
#okt.morphs(text,stem = True)
#okt.pos(text,stem=True)

#둘 다
#okt.morphs(text, norm=True, stem=True)
#okt.pos(text, norm=True, stem=True)


In [78]:
okt.tagset

{'Adjective': '형용사',
 'Adverb': '부사',
 'Alpha': '알파벳',
 'Conjunction': '접속사',
 'Determiner': '관형사',
 'Eomi': '어미',
 'Exclamation': '감탄사',
 'Foreign': '외국어, 한자 및 기타기호',
 'Hashtag': '트위터 해쉬태그',
 'Josa': '조사',
 'KoreanParticle': '(ex: ㅋㅋ)',
 'Noun': '명사',
 'Number': '숫자',
 'PreEomi': '선어말어미',
 'Punctuation': '구두점',
 'ScreenName': '트위터 아이디',
 'Suffix': '접미사',
 'Unknown': '미등록어',
 'Verb': '동사'}

In [82]:
#정규화
text = '오늘 나는 친구랑 영화를 볼 것이닼ㅋㅋ'
okt.pos(text, norm = True)

[('오늘', 'Noun'),
 ('나', 'Noun'),
 ('는', 'Josa'),
 ('친구', 'Noun'),
 ('랑', 'Josa'),
 ('영화', 'Noun'),
 ('를', 'Josa'),
 ('볼', 'Noun'),
 ('것', 'Noun'),
 ('이다', 'Josa'),
 ('ㅋㅋ', 'KoreanParticle')]

In [84]:
#어간추출
text = '맛있는 바나나를 먹었다.'
okt.morphs(text, stem= True)

['맛있다', '바나나', '를', '먹다', '.']

In [86]:
#둘다
text = '맛있는 바나나를 먹었닼ㅋㅋㅋㅋ'
okt.morphs(text, stem= True,norm = True)

['맛있다', '바나나', '를', '먹다', 'ㅋㅋㅋ']

##### ♦️3. 실습예제
  - 한국어 불용어 적용된 명사/형용사/동사를 추출하는 define 만들기

In [None]:
def define(text):

## 3. 형태소분석 : Kiwi

In [88]:
#!pip install kiwipiepy

Collecting kiwipiepy
  Downloading kiwipiepy-0.21.0-cp313-cp313-win_amd64.whl.metadata (1.3 kB)
Collecting kiwipiepy_model<0.22,>=0.21 (from kiwipiepy)
  Downloading kiwipiepy_model-0.21.0.tar.gz (35.5 MB)
     ---------------------------------------- 0.0/35.5 MB ? eta -:--:--
     ------- -------------------------------- 6.6/35.5 MB 39.5 MB/s eta 0:00:01
     -------------- ------------------------ 13.1/35.5 MB 34.7 MB/s eta 0:00:01
     ------------------------ -------------- 22.3/35.5 MB 39.0 MB/s eta 0:00:01
     ---------------------------------- ---- 31.2/35.5 MB 39.3 MB/s eta 0:00:01
     ---------------------------------------- 35.5/35.5 MB 37.8 MB/s  0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Downloading kiwipiepy-0.21.0-cp313-cp313-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ---------------------------------------- 2.4/2.4 MB 11.4 MB/s  0:00:00
Building wheels

  DEPRECATION: Building 'kiwipiepy_model' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'kiwipiepy_model'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [89]:
from kiwipiepy import Kiwi
kiwi = Kiwi()

In [90]:
kiwi.tokenize('아버지가 방에 들어가신다')

[Token(form='아버지', tag='NNG', start=0, len=3),
 Token(form='가', tag='JKS', start=3, len=1),
 Token(form='방', tag='NNG', start=5, len=1),
 Token(form='에', tag='JKB', start=6, len=1),
 Token(form='들어가', tag='VV', start=8, len=3),
 Token(form='시', tag='EP', start=11, len=1),
 Token(form='ᆫ다', tag='EF', start=11, len=2)]

In [92]:
#토큰만 뽑기
kiwi.tokenize('아버지가 방에 들어가신다')[0][0]

'아버지'

In [94]:
#형태소 뽑기
result = [i[0] for i in kiwi.tokenize('아버지가 방에 들어가신다') if i[1] in ['NNG']]

In [95]:
result

['아버지', '방']

### 3.1 kiwi 불용어 사전 다루기

In [96]:
from kiwipiepy.utils import Stopwords

In [98]:
stopwords = Stopwords()

In [100]:
kiwi.tokenize('분석결과에서 불용어만 제외하고 출력해보자', stopwords = stopwords)

[Token(form='분석', tag='NNG', start=0, len=2),
 Token(form='결과', tag='NNG', start=2, len=2),
 Token(form='불', tag='NNG', start=7, len=1),
 Token(form='용어', tag='NNG', start=8, len=2),
 Token(form='제외', tag='NNG', start=12, len=2),
 Token(form='출력', tag='NNG', start=17, len=2),
 Token(form='보', tag='VX', start=20, len=1),
 Token(form='자', tag='EF', start=21, len=1)]

In [102]:
stopwords.add('결과')

In [103]:
stopwords.stopwords

{('ᆫ', 'ETM'),
 ('ᆫ', 'JX'),
 ('ᆫ다', 'EF'),
 ('ᆯ', 'ETM'),
 ('가', 'JKS'),
 ('같', 'VA'),
 ('것', 'NNB'),
 ('게', 'EC'),
 ('겠', 'EP'),
 ('결과', 'NNP'),
 ('고', 'EC'),
 ('고', 'JKQ'),
 ('과', 'JC'),
 ('과', 'JKB'),
 ('그', 'MM'),
 ('그', 'NP'),
 ('기', 'ETN'),
 ('까지', 'JX'),
 ('나', 'NP'),
 ('년', 'NNB'),
 ('는', 'ETM'),
 ('는', 'JX'),
 ('다', 'EC'),
 ('다', 'EF'),
 ('다고', 'EC'),
 ('다는', 'ETM'),
 ('대하', 'VV'),
 ('더', 'MAG'),
 ('던', 'ETM'),
 ('도', 'JX'),
 ('되', 'VV'),
 ('되', 'XSV'),
 ('들', 'XSN'),
 ('등', 'NNB'),
 ('따르', 'VV'),
 ('때', 'NNG'),
 ('때문', 'NNB'),
 ('라', 'EC'),
 ('라는', 'ETM'),
 ('로', 'JKB'),
 ('를', 'JKO'),
 ('만', 'JX'),
 ('만', 'NR'),
 ('말', 'NNG'),
 ('며', 'EC'),
 ('면', 'EC'),
 ('면서', 'EC'),
 ('명', 'NNB'),
 ('받', 'VV'),
 ('보', 'VV'),
 ('부터', 'JX'),
 ('사람', 'NNG'),
 ('성', 'XSN'),
 ('수', 'NNB'),
 ('아니', 'VCN'),
 ('않', 'VX'),
 ('어', 'EC'),
 ('어', 'EF'),
 ('어서', 'EC'),
 ('어야', 'EC'),
 ('없', 'VA'),
 ('었', 'EP'),
 ('에', 'JKB'),
 ('에게', 'JKB'),
 ('에서', 'JKB'),
 ('와', 'JC'),
 ('와', 'JKB'),
 ('우리', 'NP'),

### 3.2 kiwi 띄어쓰기 및 문장 분리

In [104]:
kiwi.space("나는텍스트마이닝을공부하고있어요")

'나는 텍스트 마 이닝을 공부하고 있어요'

In [105]:
kiwi.split_into_sents("나는텍스트마이닝을공부하고있어요")

[Sentence(text='나는텍스트마이닝을공부하고있어요', start=0, end=16, tokens=None, subs=[])]