# 텍스트 전처리

## 6. 정수 인코딩

### 1) Counter 사용하기

In [1]:
from collections import Counter

In [2]:
# Toy corpus: eleven pre-tokenized sentences, each a list of word strings.
sentences = [
    ['barber', 'person'],
    ['barber', 'good', 'person'],
    ['barber', 'huge', 'person'],
    ['knew', 'secret'],
    ['secret', 'kept', 'huge', 'secret'],
    ['huge', 'secret'],
    ['barber', 'kept', 'word'],
    ['barber', 'kept', 'word'],
    ['barber', 'kept', 'secret'],
    ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
    ['barber', 'went', 'huge', 'mountain'],
]

In [3]:
# Flatten the tokenized sentences into one flat list of words.
# A nested comprehension does this in linear time, whereas
# sum(sentences, []) re-copies the accumulated list on every addition.
words = [word for sentence in sentences for word in sentence]
print(words)

['barber', 'person', 'barber', 'good', 'person', 'barber', 'huge', 'person', 'knew', 'secret', 'secret', 'kept', 'huge', 'secret', 'huge', 'secret', 'barber', 'kept', 'word', 'barber', 'kept', 'word', 'barber', 'kept', 'secret', 'keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy', 'barber', 'went', 'huge', 'mountain']


In [4]:
# Count how many times each word occurs in the flattened word list.
vocab = Counter(words)
vocab # words and their frequencies

Counter({'barber': 8,
         'person': 3,
         'good': 1,
         'huge': 5,
         'knew': 1,
         'secret': 6,
         'kept': 4,
         'word': 2,
         'keeping': 2,
         'driving': 1,
         'crazy': 1,
         'went': 1,
         'mountain': 1})

In [5]:
# Keep the five most frequent (word, count) pairs, highest count first.
top5 = vocab.most_common(5)
top5

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]

In [6]:
# Assign an integer index to each of the five most frequent words,
# starting at 1 for the most frequent one.
word_to_index = {
    word: rank for rank, (word, _freq) in enumerate(top5, start=1)
}

word_to_index  # when mapping words to numbers, numbering starts at 1

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}

### 2) NLTK 의 FreqDist 사용하기

In [7]:
from nltk import FreqDist
import numpy as np

In [8]:
# Build the word-frequency table again, this time with NLTK's FreqDist.
vocab = FreqDist(words)
vocab

FreqDist({'barber': 8, 'secret': 6, 'huge': 5, 'kept': 4, 'person': 3, 'word': 2, 'keeping': 2, 'good': 1, 'knew': 1, 'driving': 1, ...})

In [9]:
top5 = vocab.most_common(5)
top5

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]

In [10]:
import sys
print(sys.version)

3.8.10 (default, May 19 2021, 13:12:57) [MSC v.1916 64 bit (AMD64)]


In [11]:
# Each item of top5 is a (word, count) pair, so unpack it instead of
# indexing with [0]; start=1 makes the most frequent word map to 1.
word_to_index = {}
for rank, (word, _count) in enumerate(top5, start=1):
    word_to_index[word] = rank
word_to_index

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}

### 3) Keras 텍스트 전처리

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [13]:
tokenizer = Tokenizer()

In [14]:
# Build the tokenizer's vocabulary from the pre-tokenized sentences
# (each sentence is passed as a list of words).
tokenizer.fit_on_texts(sentences)


In [15]:
tokenizer.word_counts

OrderedDict([('barber', 8),
             ('person', 3),
             ('good', 1),
             ('huge', 5),
             ('knew', 1),
             ('secret', 6),
             ('kept', 4),
             ('word', 2),
             ('keeping', 2),
             ('driving', 1),
             ('crazy', 1),
             ('went', 1),
             ('mountain', 1)])

In [16]:
tokenizer.texts_to_sequences(sentences)

[[1, 5],
 [1, 8, 5],
 [1, 3, 5],
 [9, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [7, 7, 3, 2, 10, 1, 11],
 [1, 12, 3, 13]]

In [17]:
vocab_size = 5
# Use only the top 5 words. The +1 is needed because word indices start
# at 1 and only indices below num_words are kept when encoding.
tokenizer = Tokenizer(num_words = vocab_size + 1)
tokenizer.fit_on_texts(sentences)

In [18]:
# word_index still lists every word seen during fitting; the num_words
# limit only takes effect when texts are converted to sequences.
print(tokenizer.word_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7, 'good': 8, 'knew': 9, 'driving': 10, 'crazy': 11, 'went': 12, 'mountain': 13}


In [19]:
tokenizer.texts_to_sequences(sentences)

[[1, 5],
 [1, 5],
 [1, 3, 5],
 [2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4],
 [1, 4],
 [1, 4, 2],
 [3, 2, 1],
 [1, 3]]

In [20]:
vocab_size = 5
# +2 on top of the 5 words: one slot for index 0 (never assigned to a word)
# and one for the out-of-vocabulary token.
tokenizer = Tokenizer(num_words = vocab_size + 2, oov_token = 'OOV')
tokenizer.fit_on_texts(sentences) # words that get dropped are stored under index 1

In [21]:
print(tokenizer.texts_to_sequences(sentences))

[[2, 6], [2, 1, 6], [2, 4, 6], [1, 3], [3, 5, 4, 3], [4, 3], [2, 5, 1], [2, 5, 1], [2, 5, 3], [1, 1, 4, 3, 1, 2, 1], [2, 1, 4, 1]]


## 7. 패딩

In [22]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [23]:
# The tokenizer is already fitted on `sentences`; fitting it again would
# add every word's count a second time, so only encode here.
encoded = tokenizer.texts_to_sequences(sentences)
encoded

[[2, 6],
 [2, 1, 6],
 [2, 4, 6],
 [1, 3],
 [3, 5, 4, 3],
 [4, 3],
 [2, 5, 1],
 [2, 5, 1],
 [2, 5, 3],
 [1, 1, 4, 3, 1, 2, 1],
 [2, 1, 4, 1]]

In [24]:
# With default arguments, shorter sequences are padded with zeros at the
# front up to the length of the longest sequence (7 here).
padded = pad_sequences(encoded)
padded

array([[0, 0, 0, 0, 0, 2, 6],
       [0, 0, 0, 0, 2, 1, 6],
       [0, 0, 0, 0, 2, 4, 6],
       [0, 0, 0, 0, 0, 1, 3],
       [0, 0, 0, 3, 5, 4, 3],
       [0, 0, 0, 0, 0, 4, 3],
       [0, 0, 0, 0, 2, 5, 1],
       [0, 0, 0, 0, 2, 5, 1],
       [0, 0, 0, 0, 2, 5, 3],
       [1, 1, 4, 3, 1, 2, 1],
       [0, 0, 0, 2, 1, 4, 1]])

In [25]:
# padding='post' appends the zeros after the tokens instead of before them.
padded = pad_sequences(encoded, padding='post')
padded

array([[2, 6, 0, 0, 0, 0, 0],
       [2, 1, 6, 0, 0, 0, 0],
       [2, 4, 6, 0, 0, 0, 0],
       [1, 3, 0, 0, 0, 0, 0],
       [3, 5, 4, 3, 0, 0, 0],
       [4, 3, 0, 0, 0, 0, 0],
       [2, 5, 1, 0, 0, 0, 0],
       [2, 5, 1, 0, 0, 0, 0],
       [2, 5, 3, 0, 0, 0, 0],
       [1, 1, 4, 3, 1, 2, 1],
       [2, 1, 4, 1, 0, 0, 0]])

In [26]:
# maxlen=5 fixes every row to length 5. The 7-token sentence is cut down
# by dropping its first two tokens (truncation happens at the front here).
padded = pad_sequences(encoded, padding='post', maxlen=5)
padded

array([[2, 6, 0, 0, 0],
       [2, 1, 6, 0, 0],
       [2, 4, 6, 0, 0],
       [1, 3, 0, 0, 0],
       [3, 5, 4, 3, 0],
       [4, 3, 0, 0, 0],
       [2, 5, 1, 0, 0],
       [2, 5, 1, 0, 0],
       [2, 5, 3, 0, 0],
       [4, 3, 1, 2, 1],
       [2, 1, 4, 1, 0]])

## 8. 원 핫 인코딩 - one-hot encoding

In [27]:
from konlpy.tag import Okt

In [28]:
# Split a Korean sentence into morphemes with the Okt analyzer.
okt = Okt()
token = okt.morphs('나는 자연어 처리를 배운다')
token

['나', '는', '자연어', '처리', '를', '배운다']

In [29]:
# Pair every token with its 0-based position in the token list.
word_to_index = dict(zip(token, range(len(token))))
word_to_index

{'나': 0, '는': 1, '자연어': 2, '처리': 3, '를': 4, '배운다': 5}

In [30]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the integer indices: each index becomes one row.
to_categorical(list(word_to_index.values())) # must be passed as a list, not as the dict itself
# (casual note: has a "hot line" feel, with exactly one hot entry per row)

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]], dtype=float32)

In [31]:
# Korean sample text with repeated words, used for the one-hot example.
text="나랑 점심 먹으러 갈래 점심 메뉴는 햄버거 갈래 갈래 햄버거 최고야"

t = Tokenizer()
t.fit_on_texts([text])
print(t.word_index) # print the integer encoding assigned to each word

{'갈래': 1, '점심': 2, '햄버거': 3, '나랑': 4, '먹으러': 5, '메뉴는': 6, '최고야': 7}


In [32]:
# texts_to_sequences returns one sequence per input text; unpack the
# single sequence produced for our one-element list.
[encoded] = t.texts_to_sequences([text])
encoded

[4, 2, 5, 1, 2, 6, 3, 1, 1, 3, 7]

In [33]:
# The indices run from 1 to 7, so every one-hot row has 8 columns and
# column 0 is never set.
to_categorical(encoded)

array([[0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

## 9. 데이터의 분리

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
from sklearn.datasets import load_iris
iris = load_iris()

In [36]:
iris.data[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [37]:
iris.target[:5]

array([0, 0, 0, 0, 0])

In [38]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [39]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [40]:
# Hold out 20% of the iris samples as a test set; the fixed random_state
# makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=2021
)


In [41]:
X_train.shape, X_test.shape, y_test.shape, y_train.shape

((120, 4), (30, 4), (30,), (120,))

In [42]:
print(y_test)

[0 0 1 0 0 0 0 0 0 0 0 1 2 2 1 2 1 1 0 1 1 2 2 0 2 1 1 1 0 0]


In [43]:
# Same 80/20 split, but stratified on the labels so that the test set
# keeps the class proportions of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=2021,
    stratify=iris.target,
)
print(y_test)

[0 1 1 2 0 1 0 1 2 0 1 1 1 2 2 0 2 0 2 0 1 2 0 2 2 0 1 1 2 0]
