<a href="https://colab.research.google.com/github/roi-mason/nlp-basics/blob/main/nlp_basics_001.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# nlp **basics**

# 자연어처리 **기초**

* 예제로 배우는 자연어 처리 기초
  - NLP 알고리즘, 텍스트 분류와 요약, 감성 분석
  - 쇼홈 고시, 드와이트 거닝 (2020)
  - 에이콘 출판


# 예제 12: 텍스트 정제와 **토큰화**
* p. 64

In [None]:
# Example 12: text cleaning and tokenization.
# The sample comes from the Project Gutenberg eBook of "Metamorphosis",
# with a few extra symbols and a number inserted on purpose.

import re

sentence = '''
One morning, when "Gregor $$ Samsa" woke from troubled dreams\\, he found
himself transformed in his bed into a horrible vermin. ;;//"" \\ He lay on his
armour-like back, and if 29857  he lifted his head #$ ~ a little he could see his
brown belly, slightly domed and divided by arches into stiff sections.
'''

# Replace every run of characters that are neither whitespace nor word
# characters (underscores are treated as noise too) with a single space,
# then split on whitespace to obtain the tokens.
# Digits are word characters, so the number is kept.
re.compile(r'([^\s\w]|_)+').sub(' ', sentence).split()


['One',
 'morning',
 'when',
 'Gregor',
 'Samsa',
 'woke',
 'from',
 'troubled',
 'dreams',
 'he',
 'found',
 'himself',
 'transformed',
 'in',
 'his',
 'bed',
 'into',
 'a',
 'horrible',
 'vermin',
 'He',
 'lay',
 'on',
 'his',
 'armour',
 'like',
 'back',
 'and',
 'if',
 '29857',
 'he',
 'lifted',
 'his',
 'head',
 'a',
 'little',
 'he',
 'could',
 'see',
 'his',
 'brown',
 'belly',
 'slightly',
 'domed',
 'and',
 'divided',
 'by',
 'arches',
 'into',
 'stiff',
 'sections']

# 예제 13: n-gram **추출**
* p. 67~70

  - 사용자 정의 함수
  - nltk
  - TextBlob


In [None]:
#import re
#사용자정의 함수

def n_gram_extractor(sentence, n):
    """Print every n-gram of ``sentence`` as a list of ``n`` tokens.

    Runs of punctuation, symbols and underscores are replaced by a space
    and the text is split on whitespace; each window of ``n`` consecutive
    tokens is then printed on its own line.

    Args:
        sentence: Text to tokenize.
        n: Number of consecutive tokens per n-gram.

    Returns:
        None. Nothing is printed when the text has fewer than ``n`` tokens.
    """
    # The class must be [^\s\w] (not whitespace, not a word character).
    # Excluding only whitespace would blank out every word and leave no
    # tokens at all.
    tokens = re.sub(r'([^\s\w]|_)+', ' ', sentence).split()
    for start in range(len(tokens) - n + 1):
        print(tokens[start:start + n])


In [None]:
# Print the bigrams (n=2) of a sample sentence with the user-defined helper.
n_gram_extractor(sentence='The cute little boy is playing with the kitten.', n=2)


# nltk 라이브러리 - **ngram**

In [None]:
# Bigrams via NLTK's ngrams helper (note the plural name: "ngrams").
from nltk import ngrams

[*ngrams('The cute little boy is playing with the kitten.'.split(), 2)]

[('The', 'cute'),
 ('cute', 'little'),
 ('little', 'boy'),
 ('boy', 'is'),
 ('is', 'playing'),
 ('playing', 'with'),
 ('with', 'the'),
 ('the', 'kitten.')]

In [None]:
# Trigrams (3-grams) via NLTK's ngrams helper.
from nltk import ngrams

[*ngrams('The cute little boy is playing with the kitten.'.split(), 3)]

[('The', 'cute', 'little'),
 ('cute', 'little', 'boy'),
 ('little', 'boy', 'is'),
 ('boy', 'is', 'playing'),
 ('is', 'playing', 'with'),
 ('playing', 'with', 'the'),
 ('with', 'the', 'kitten.')]

# **TextBlob 사용**
* Keras와 TextBlob은 다양한 NLP 작업에 많이 사용됨
* TextBlob: 간단하고, 사용하기 쉬운 인터페이스
* Keras: 딥러닝 기반 NLP 작업에 사용

In [None]:
# Bigrams with TextBlob; the NLTK "punkt" data is required.
import nltk
from textblob import TextBlob

nltk.download('punkt')
blob = TextBlob("The cute little boy is playing with the kitten.")
blob.ngrams(2)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[WordList(['The', 'cute']),
 WordList(['cute', 'little']),
 WordList(['little', 'boy']),
 WordList(['boy', 'is']),
 WordList(['is', 'playing']),
 WordList(['playing', 'with']),
 WordList(['with', 'the']),
 WordList(['the', 'kitten'])]

# 예제 14: Keras와 TextBlob

In [None]:
# Sample text that the Keras and TextBlob tokenization cells operate on

sentence = '''
One morning, when "Gregor $$ Samsa" woke from troubled dreams\\, he found
himself transformed in his bed into a horrible vermin. ;;//"" \\ He lay on his
armour-like back, and if 29857  he lifted his head #$ ~ a little he could see his
brown belly, slightly domed and divided by arches into stiff sections.
'''

In [None]:
# Keras, TextBlob 라이브러리 호출

from keras.preprocessing.text import text_to_word_sequence
from textblob import TextBlob

In [None]:
# Tokenize the sample text with Keras
# (the recorded output shows lower-cased words with punctuation dropped)

text_to_word_sequence(sentence)

['one',
 'morning',
 'when',
 'gregor',
 'samsa',
 'woke',
 'from',
 'troubled',
 'dreams',
 'he',
 'found',
 'himself',
 'transformed',
 'in',
 'his',
 'bed',
 'into',
 'a',
 'horrible',
 'vermin',
 'he',
 'lay',
 'on',
 'his',
 'armour',
 'like',
 'back',
 'and',
 'if',
 '29857',
 'he',
 'lifted',
 'his',
 'head',
 'a',
 'little',
 'he',
 'could',
 'see',
 'his',
 'brown',
 'belly',
 'slightly',
 'domed',
 'and',
 'divided',
 'by',
 'arches',
 'into',
 'stiff',
 'sections']

In [None]:
# Tokenize the sample text with the TextBlob library
# (the recorded output keeps the original casing and the hyphenated 'armour-like')

blob = TextBlob(sentence)
blob.words

WordList(['One', 'morning', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'armour-like', 'back', 'and', 'if', '29857', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections'])

# 토크나이저 **종류**
* tweet tokenizer
* MWE tokenizer
  - Multi-Word Expression tokenizer
  - the United States of America 등을 하나의 객체로 처리
* regex tokenizer
* whitespace tokenizer
  - 공백, 탭, 줄 바꿈 문자 등을 기준으로 문자열 분할
* word punct tokenizer
  - 텍스트를 알파벳문자, 숫자, 알파벳 이외의 문자 리스트로 분할

In [None]:
# Tweet tokenizer: symbols and punctuation are kept as tokens of their own.
from nltk.tokenize import TweetTokenizer

sentence = '''
One morning, when "Gregor $$ Samsa" woke from troubled dreams\\, he found
himself transformed in his bed into a horrible vermin. ;;//"" \\ He lay on his
armour-like back, and if 29857  he lifted his head #$ ~ a little he could see his
brown belly, slightly domed and divided by arches into stiff sections.
'''

tweet_tokenizer = TweetTokenizer()
tweet_tokenizer.tokenize(sentence)

['One',
 'morning',
 ',',
 'when',
 '"',
 'Gregor',
 '$',
 '$',
 'Samsa',
 '"',
 'woke',
 'from',
 'troubled',
 'dreams',
 '\\',
 ',',
 'he',
 'found',
 'himself',
 'transformed',
 'in',
 'his',
 'bed',
 'into',
 'a',
 'horrible',
 'vermin',
 '.',
 ';',
 ';/',
 '/',
 '"',
 '"',
 '\\',
 'He',
 'lay',
 'on',
 'his',
 'armour-like',
 'back',
 ',',
 'and',
 'if',
 '29857',
 'he',
 'lifted',
 'his',
 'head',
 '#',
 '$',
 '~',
 'a',
 'little',
 'he',
 'could',
 'see',
 'his',
 'brown',
 'belly',
 ',',
 'slightly',
 'domed',
 'and',
 'divided',
 'by',
 'arches',
 'into',
 'stiff',
 'sections',
 '.']

In [None]:
# MWE (multi-word expression) tokenizer.
# The input is split on whitespace only, so punctuation stays attached to
# the words. An expression whose last word is followed by punctuation
# (e.g. 'vermin.') is therefore not merged unless the punctuation is
# stripped first.

from nltk.tokenize import MWETokenizer

# Seed the tokenizer with one expression, then register more with add_mwe().
mwe_tokenizer = MWETokenizer([('stiff', 'sections')])
mwe_tokenizer.add_mwe(('horrible', 'vermin'))
mwe_tokenizer.add_mwe(('his', 'bed'))
mwe_tokenizer.tokenize(sentence.split())

['One',
 'morning,',
 'when',
 '"Gregor',
 '$$',
 'Samsa"',
 'woke',
 'from',
 'troubled',
 'dreams\\,',
 'he',
 'found',
 'himself',
 'transformed',
 'in',
 'his_bed',
 'into',
 'a',
 'horrible',
 'vermin.',
 ';;//""',
 '\\',
 'He',
 'lay',
 'on',
 'his',
 'armour-like',
 'back,',
 'and',
 'if',
 '29857',
 'he',
 'lifted',
 'his',
 'head',
 '#$',
 '~',
 'a',
 'little',
 'he',
 'could',
 'see',
 'his',
 'brown',
 'belly,',
 'slightly',
 'domed',
 'and',
 'divided',
 'by',
 'arches',
 'into',
 'stiff',
 'sections.']

In [None]:
# When attached punctuation keeps a multi-word expression from matching,
# strip it before splitting; here only periods are removed.

mwe_tokenizer.tokenize(sentence.translate(str.maketrans('', '', '.')).split())

['One',
 'morning,',
 'when',
 '"Gregor',
 '$$',
 'Samsa"',
 'woke',
 'from',
 'troubled',
 'dreams\\,',
 'he',
 'found',
 'himself',
 'transformed',
 'in',
 'his',
 'bed',
 'into',
 'a',
 'horrible_vermin',
 ';;//""',
 '\\',
 'He',
 'lay',
 'on',
 'his',
 'armour-like',
 'back,',
 'and',
 'if',
 '29857',
 'he',
 'lifted',
 'his',
 'head',
 '#$',
 '~',
 'a',
 'little',
 'he',
 'could',
 'see',
 'his',
 'brown',
 'belly,',
 'slightly',
 'domed',
 'and',
 'divided',
 'by',
 'arches',
 'into',
 'stiff_sections']

In [None]:
# Regex Tokenizer



In [None]:
# Whitespace tokenizer
# (per the recorded output, punctuation stays attached to the adjacent word,
# e.g. 'morning,' and 'sections.')

from nltk.tokenize import WhitespaceTokenizer
wh_tokenizer = WhitespaceTokenizer()
wh_tokenizer.tokenize(sentence)

['One',
 'morning,',
 'when',
 '"Gregor',
 '$$',
 'Samsa"',
 'woke',
 'from',
 'troubled',
 'dreams\\,',
 'he',
 'found',
 'himself',
 'transformed',
 'in',
 'his',
 'bed',
 'into',
 'a',
 'horrible',
 'vermin.',
 ';;//""',
 '\\',
 'He',
 'lay',
 'on',
 'his',
 'armour-like',
 'back,',
 'and',
 'if',
 '29857',
 'he',
 'lifted',
 'his',
 'head',
 '#$',
 '~',
 'a',
 'little',
 'he',
 'could',
 'see',
 'his',
 'brown',
 'belly,',
 'slightly',
 'domed',
 'and',
 'divided',
 'by',
 'arches',
 'into',
 'stiff',
 'sections.']

In [None]:
# WordPunct tokenizer
# (per the recorded output, runs of punctuation become separate tokens,
# e.g. 'armour', '-', 'like')

from nltk.tokenize import WordPunctTokenizer
wp_tokenizer = WordPunctTokenizer()
wp_tokenizer.tokenize(sentence)

['One',
 'morning',
 ',',
 'when',
 '"',
 'Gregor',
 '$$',
 'Samsa',
 '"',
 'woke',
 'from',
 'troubled',
 'dreams',
 '\\,',
 'he',
 'found',
 'himself',
 'transformed',
 'in',
 'his',
 'bed',
 'into',
 'a',
 'horrible',
 'vermin',
 '.',
 ';;//""',
 '\\',
 'He',
 'lay',
 'on',
 'his',
 'armour',
 '-',
 'like',
 'back',
 ',',
 'and',
 'if',
 '29857',
 'he',
 'lifted',
 'his',
 'head',
 '#$',
 '~',
 'a',
 'little',
 'he',
 'could',
 'see',
 'his',
 'brown',
 'belly',
 ',',
 'slightly',
 'domed',
 'and',
 'divided',
 'by',
 'arches',
 'into',
 'stiff',
 'sections',
 '.']