In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')
text = 'Wikipedia is maintained by volunteers.'
doc = nlp(text)

In [3]:
for token in doc:
  print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.head) #part of speech 품사

Wikipedia Wikipedia PROPN NNP nsubjpass maintained
is be AUX VBZ auxpass maintained
maintained maintain VERB VBN ROOT maintained
by by ADP IN agent maintained
volunteers volunteer NOUN NNS pobj by
. . PUNCT . punct maintained


In [5]:
spacy.explain('PROPN')

'proper noun'

In [6]:
from IPython.display import HTML
HTML(spacy.displacy.render(doc))

In [7]:
def extract_noun(text):
    doc = nlp(text)
    for token in doc:
        if token.pos_ in {'NOUN', 'PROPN', 'VERB'}:  # 명사, 고유명사, 동사를 선택
            yield token.text     # 문장에서 표시된 형태로 추출
            #yield token.lemma_  # 표제어 추출

In [8]:
list(extract_noun(text))

['Wikipedia', 'maintained', 'volunteers']

### **Stanza**<br>
ko-kr 대한민국<br>
ko-kp 북한 언어<br>


*   작동 속도가 느리다
*   문장이 길어지면 메모리를 많이 사용한다
*   실제 처리 시에는 kiwi를 더 추천한다(명사만 추출하겠다면)



In [None]:
!pip install stanza spacy-stanza

In [None]:
import stanza
nlp = stanza.Pipeline('ko')

In [30]:
doc = nlp('위키백과는 자원봉사자들이 관리를 하고 있다')

In [31]:
for token in doc.sentences[0].words:
  print(token.text, token.lemma, token.upos, token.xpos, token.deprel, token.head)

위키백과는 위키백+과+는 PROPN nq+jct+jxt dislocated 4
자원봉사자들이 자원+봉사자+들+이 NOUN ncn+ncn+xsn+jcs nsubj 4
관리를 관리+를 NOUN ncn+jco obj 4
하고 하+고 VERB pvg+ecx root 0
있다 있+다 AUX px+ef aux 4


In [23]:
def extract_noun(text):
    doc = nlp(text)
    for sent in doc.sentences:
        for token in sent.words:
            for lemma, pos in zip(token.lemma.split('+'), token.xpos.split('+')):
                if pos.startswith('n'):
                    yield lemma

In [24]:
list(extract_noun('오늘 점심에는 맛있는 떡볶이를 먹어야겠다.'))

['오늘', '점심', '떡볶이']

In [32]:
# 단어 성분 분석
doc = nlp('오늘 점심에는 맛있는 떡볶이를 먹어야겠다.')

In [33]:
for token in doc.sentences[0].words:
  print(token.text, token.lemma, token.upos, token.xpos, token.deprel, token.head)

오늘 오늘 NOUN ncn compound 2
점심에는 점심+에+는 NOUN ncpa+jca+jxt dislocated 5
맛있는 맛있+는 ADJ paa+etm amod 4
떡볶이를 떡볶이+를 NOUN ncn+jco obj 5
먹어야겠다 먹+어야겠+다 VERB pvg+ep+ef root 0
. . PUNCT sf punct 5


In [34]:
# 단어 성분 분석
doc = nlp('맛있는 떡볶이')
for token in doc.sentences[0].words:
  print(token.text, token.lemma, token.upos, token.xpos, token.deprel, token.head)

맛있는 맛있+는 ADJ paa+etm amod 2
떡볶이 떡볶+이 NOUN ncn root 0


### kiwi

In [None]:
!pip install kiwipiepy

In [36]:
from kiwipiepy import Kiwi

In [37]:
kiwi = Kiwi()
kiwi.analyze('너무재밓었다그래서보는것을추천한다')

[([Token(form='너무재밓었다', tag='NNG', start=0, len=6),
   Token(form='그래서', tag='MAJ', start=6, len=3),
   Token(form='보', tag='VV', start=9, len=1),
   Token(form='는', tag='ETM', start=10, len=1),
   Token(form='것', tag='NNB', start=11, len=1),
   Token(form='을', tag='JKO', start=12, len=1),
   Token(form='추천', tag='NNG', start=13, len=2),
   Token(form='하', tag='XSV', start=15, len=1),
   Token(form='ᆫ다', tag='EC', start=16, len=1)],
  -76.99913024902344)]

In [38]:
!wget https://github.com/e9t/nsmc/raw/master/ratings_train.txt

--2021-09-27 05:31:57--  https://github.com/e9t/nsmc/raw/master/ratings_train.txt
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt [following]
--2021-09-27 05:31:57--  https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14628807 (14M) [text/plain]
Saving to: ‘ratings_train.txt’


2021-09-27 05:31:58 (152 MB/s) - ‘ratings_train.txt’ saved [14628807/14628807]



In [41]:
import pandas as pd
df = pd.read_csv('ratings_train.txt', sep='\t')
df = df.fillna('')

In [42]:
df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [None]:
!pip install tokenizers

In [61]:
import unicodedata

In [44]:
with open('sample.txt', 'w', encoding='utf8') as f:
    for row in df.itertuples():
        f.write(row.document)
        f.write('\n')

In [45]:
from tokenizers import CharBPETokenizer
bpe = CharBPETokenizer(lowercase=True)
bpe.train(files='sample.txt', min_frequency=1, vocab_size=5000)

In [51]:
def tokenizer(text):
    enc = bpe.encode(text)
    for token in enc.tokens:
        token = token.replace('</w>', '')
        if len(token) > 1:
            yield token

In [52]:
enc = bpe.encode('CG가 기가 막힌다!')

enc.tokens

['cg', '가</w>', '기가</w>', '막', '힌', '다</w>', '!</w>']

In [53]:
bpe.save_model('bpe')

['bpe/vocab.json', 'bpe/merges.txt']

In [54]:
!head bpe/merges.txt

#version: 0.2 - Trained by `huggingface/tokenizers`
영 화
영 화</w>
ㅋ ㅋ
니 다</w>
는 데</w>
재 미
지 만</w>
하 고</w>
재 밌


In [55]:
CharBPETokenizer.from_file('bpe/vocab.json', 'bpe/merges.txt')

Tokenizer(vocabulary_size=5000, model=BPE, unk_token=<unk>, suffix=</w>, dropout=None, lowercase=False, unicode_normalizer=None, bert_normalizer=True, split_on_whitespace_only=False)

In [62]:
with open('decomposed.txt', 'w', encoding='utf8') as f:
    for row in df.itertuples():
        f.write(unicodedata.normalize('NFD', row.document))
        f.write('\n')

In [65]:
dbpe = CharBPETokenizer(lowercase=True)
dbpe.train(files='decomposed.txt', min_frequency=1, vocab_size=5000)

In [None]:
dbpe.encode(unicodedata.normalize('NFD', '자연어 처리는 뭈다.')).tokens

In [66]:
from kiwipiepy import Kiwi
kiwi = Kiwi()
def extract_noun(text):
    result = kiwi.analyze(text)
    for lemma, pos, _, _ in result[0][0]:
        if pos.startswith('N'):
            yield lemma

In [67]:
!pip install kiwipiepy

