# 텍스트 전처리
- 패키지 설치
    - NLTK : pip install nltk
    - KoNLP

In [122]:
# ! pip install --user --upgrade pip

In [123]:
# ! pip install nltk

In [124]:
# ! pip install konlpy

In [125]:
import nltk
import konlpy

In [126]:
# nltk LookupError
# nltk.download()

##  [1] 토큰화(Tokenize)
- 문장/ 문서를 의미를 지닌 작은 단위로 나뉘는 것
- 나누어진 단어를 토큰(Token)이라 함.
- 종류
    - 문장 토큰화
    - 단어 토큰화

In [127]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [128]:
raw_text_01='Caution: when tokenizing a Unicode string,\
make sure you are not using an encoded version of the string (it may be necessary to decode it first,\
e.g. with s.decode("utf8").'

raw_text_02="""Return a tokenized copy of text, using NLTK’s recommended word tokenizer (currently an improved TreebankWordTokenizer along with PunktSentenceTokenizer for the specified language).
"""

raw_text_03="This particular tokenizer requires the Punkt sentence tokenization models to be installed.\
NLTK also provides a simpler, regular-expression based tokenizer, which splits text on whitespace and punctuation:"

In [129]:
# 단어 단위 토큰화 word_tokenize
result_01=word_tokenize(raw_text_01)
print(result_01)

['Caution', ':', 'when', 'tokenizing', 'a', 'Unicode', 'string', ',', 'make', 'sure', 'you', 'are', 'not', 'using', 'an', 'encoded', 'version', 'of', 'the', 'string', '(', 'it', 'may', 'be', 'necessary', 'to', 'decode', 'it', 'first', ',', 'e.g', '.', 'with', 's.decode', '(', '``', 'utf8', "''", ')', '.']


In [130]:
raw_text_list=[raw_text01, raw_text_02, raw_text_03]
print(raw_text_list)

['Caution: when tokenizing a Unicode string, make sure you are not using an encoded version of the string (it may be necessary to decode it first, e.g. with s.decode("utf8").', 'Return a tokenized copy of text, using NLTK’s recommended word tokenizer (currently an improved TreebankWordTokenizer along with PunktSentenceTokenizer for the specified language).\n', 'This particular tokenizer requires the Punkt sentence tokenization models to be installed.NLTK also provides a simpler, regular-expression based tokenizer, which splits text on whitespace and punctuation:']


In [131]:
# sent_tokenize
st = sent_tokenize(raw_text_01)
print(st, len(st))

['Caution: when tokenizing a Unicode string,make sure you are not using an encoded version of the string (it may be necessary to decode it first,e.g.', 'with s.decode("utf8").'] 2


### 여러 문장에 토큰 추출

In [132]:
# # 문장 단위로 추출
# for sent in raw_text_list:
#     total_token=[] # 중복 안하려면 set()
    
#     # 문장 추출
#     sentResult=sent_tokenize(sent)
    
#     # 문장에서 추출한 토큰
#     print(f'sent => {sentResult}')
        
#     for ele in sent:
#         print(f'ele => {ele}')
#         wordResult=word_tokenize(ele)
#         print(f'wordResult => {wordResult}')
#         total_token.append(wordResult)

# print(f'total_token => {total_token}')

## [2] 정제 & 정규화
- 불용어 제거 => 노이즈 제거
- 텍스트의 동일화
    - 대문자 또는 소문자로 통일
    - 문장의 길이

In [133]:
## [2-1] 불용어 (Stopword)

In [134]:
# nltk.corpus.stopwords

In [135]:
en_stopwords=nltk.corpus.stopwords.words('english')
len(en_stopwords)

179

In [136]:
en_stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

## [2-2] 어간 및 표제어 처리

In [137]:
from nltk.stem import LancasterStemmer # 어간만 자름

In [138]:
lstem=LancasterStemmer()

In [139]:
lstem.stem('happy'), lstem.stem('happiness')

('happy', 'happy')

In [140]:
lstem.stem('amuse'), lstem.stem('amused') # amuse 원래 단어의 일부가 잘리는 현상 발생

('amus', 'amus')

In [141]:
# 표제어(사전에 등록된 단어 추출)

In [142]:
from nltk.stem import WordNetLemmatizer

In [143]:
wlema=WordNetLemmatizer()

In [144]:
wlema.lemmatize('working', 'v'), wlema.lemmatize('worked', 'v')

('work', 'work')

In [145]:
wlema.lemmatize('amusing', 'v'), wlema.lemmatize('amused', 'v')

('amuse', 'amuse')

## [3] 텍스트 벡터화
- 텍스트를 수치화
- 희소벡터(OHE) : BOW 방식 --> Count 기반, TF-IDF 기반
- 밀집벡터(공간행렬) : Embedding 방식, Word2Vect

In [146]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [147]:
corpus=[raw_text_01, raw_text_02]

In [148]:
ohe=CountVectorizer()

In [149]:
result=ohe.fit_transform(corpus)
print(result)

  (0, 4)	1
  (0, 37)	1
  (0, 31)	1
  (0, 33)	1
  (0, 24)	2
  (0, 14)	1
  (0, 25)	1
  (0, 40)	1
  (0, 2)	1
  (0, 18)	1
  (0, 34)	1
  (0, 1)	1
  (0, 8)	1
  (0, 36)	1
  (0, 19)	1
  (0, 27)	1
  (0, 12)	2
  (0, 15)	1
  (0, 3)	1
  (0, 16)	1
  (0, 28)	1
  (0, 7)	2
  (0, 9)	1
  (0, 38)	1
  (0, 35)	1
  (1, 34)	1
  (1, 1)	1
  (1, 19)	1
  (1, 27)	1
  (1, 38)	1
  (1, 22)	1
  (1, 29)	1
  (1, 5)	1
  (1, 26)	1
  (1, 17)	1
  (1, 21)	1
  (1, 39)	1
  (1, 30)	1
  (1, 6)	1
  (1, 11)	1
  (1, 32)	1
  (1, 0)	1
  (1, 20)	1
  (1, 10)	1
  (1, 23)	1
  (1, 13)	1


In [150]:
result=result.toarray()
print(result.shape, result)

(2, 41) [[0 1 1 1 1 0 0 2 1 1 0 0 2 0 1 1 1 0 1 1 0 0 0 0 2 1 0 1 1 0 0 1 0 1 1 1
  1 1 1 0 1]
 [1 1 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 1 0 1 0
  0 0 1 1 0]]


In [151]:
## DF-IDF 기반
tfldf=TfidfVectorizer()

In [152]:
tf_corpus=tfldf.fit_transform(corpus)

In [153]:
type(tf_corpus)

scipy.sparse.csr.csr_matrix

In [154]:
tf_corpus=tf_corpus.toarray()

In [155]:
print(tf_corpus)

[[0.         0.12670961 0.17808593 0.17808593 0.17808593 0.
  0.         0.35617186 0.17808593 0.17808593 0.         0.
  0.35617186 0.         0.17808593 0.17808593 0.17808593 0.
  0.17808593 0.12670961 0.         0.         0.         0.
  0.35617186 0.17808593 0.         0.12670961 0.17808593 0.
  0.         0.17808593 0.         0.17808593 0.12670961 0.17808593
  0.17808593 0.17808593 0.12670961 0.         0.17808593]
 [0.23229935 0.1652829  0.         0.         0.         0.23229935
  0.23229935 0.         0.         0.         0.23229935 0.23229935
  0.         0.23229935 0.         0.         0.         0.23229935
  0.         0.1652829  0.23229935 0.23229935 0.23229935 0.23229935
  0.         0.         0.23229935 0.1652829  0.         0.23229935
  0.23229935 0.         0.23229935 0.         0.1652829  0.
  0.         0.         0.1652829  0.23229935 0.        ]]


In [157]:
# ! pip install -U spacy
# https://spacy.io/

In [158]:
## 토크나이저 객체 생성

In [159]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer

In [165]:
raw_text

array(['Caution: when tokenizing a Unicode string, make sure you are not using an encoded version of the string (it may be necessary to decode it first, e.g. with s.decode("utf8").',
       'Return a tokenized copy of text, using NLTK’s recommended word tokenizer (currently an improved TreebankWordTokenizer along with PunktSentenceTokenizer for the specified language).\n'],
      dtype='<U181')

In [166]:
# 토큰으로 나누기
tokens=text_to_word_sequence(raw_text)

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [167]:
print(len(tokens, tes))

NameError: name 'tokens' is not defined

https://codetorial.net/tensorflow/natural_language_processing_in_tensorflow_01.html
https://wikidocs.net/21698
https://datascienceschool.net/03%20machine%20learning/03.01.01%20NLTK%20%EC%9E%90%EC%97%B0%EC%96%B4%20%EC%B2%98%EB%A6%AC%20%ED%8C%A8%ED%82%A4%EC%A7%80.html

https://happygrammer.github.io/nlp/nltk/
https://mr-doosun.tistory.com/23

## 제공한 문서/문장에 대한 단어사전voca
voca에 존재하지 않는 단어 Out of Voca : oov

In [169]:
sentences = [
  'I love my dog',
  'I love my cat',
  'You love my dog!',
  'Do you think my dog is amazing?'
]

In [170]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [171]:
tokenizer=Tokenizer(num_words=100)

In [172]:
tokenizer.fit_on_texts(sentences)

In [174]:
word_index=tokenizer.word_index

In [175]:
print(word_index)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}


In [176]:
sequences=tokenizer.texts_to_sequences(sentences)

In [177]:
print(sequences)

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


## ohe 변환
- sklearn OneHotEncoding객체 생성
- keras 함수

In [180]:
from tensorflow.keras.utils import to_categorical

In [None]:
to_categorical()