In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras as tf_keras

In [2]:
# Prepare the data: load the IMDB review dataset as (train, test) pairs of inputs and labels.
(X_train, y_train), (X_test, y_test) = tf_keras.datasets.imdb.load_data(num_words=10000) # use a vocabulary of 10000 words

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [3]:
# Inspect the container type, shape, element type, two sample sentences and the label counts.
print(type(X_train))
print(X_train.shape) # (25000, ) : means 25000 sentences
print(type(X_train[0])) # each sentence is a list of numbers (word ids)
print(X_train[0])
print(X_train[1])
print(len(X_train[0]), len(X_train[1])) # sentences have different lengths: each is a variable-size collection of word ids (not one-hot encoded)
print(np.unique(y_train, return_counts=True)) # label distribution: positive vs. negative

<class 'numpy.ndarray'>
(25000,)
<class 'list'>
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
[1, 194, 115

In [4]:
# Inspect the word dictionary: fetch the mapping from each word to its integer index.

word_to_index = tf_keras.datasets.imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Peek at the word -> index mapping: its type, then the first ten words and the first ten indices.
print(type(word_to_index))
print(list(word_to_index)[:10])           # iterating a dict yields its keys
print(list(word_to_index.values())[:10])

<class 'dict'>
['fawn', 'tsukino', 'nunnery', 'sonja', 'vani', 'woods', 'spiders', 'hanging', 'woody', 'trawling']
[34701, 52006, 52007, 16816, 63951, 1408, 16115, 2345, 2289, 52008]


In [5]:
# Build the reverse map (index -> word) so that a word can be looked up by its number.
index_to_word = dict(zip(word_to_index.values(), word_to_index.keys()))

# Same peek as for word_to_index: type, first ten indices, first ten words.
print(type(index_to_word))
print(list(index_to_word)[:10])
print(list(index_to_word.values())[:10])

<class 'dict'>
[34701, 52006, 52007, 16816, 63951, 1408, 16115, 2345, 2289, 52008]
['fawn', 'tsukino', 'nunnery', 'sonja', 'vani', 'woods', 'spiders', 'hanging', 'woody', 'trawling']


In [8]:
# Size of the reverse map, then a demonstration of missing-key behaviour.
print(len(index_to_word))
print(index_to_word[88583])
# print(index_to_word[88585])  # subscripting with a key that does not exist raises an error
print(index_to_word.get(88585))       # .get returns None when the key does not exist
print(index_to_word.get(88585, '?'))  # .get returns the supplied default '?' when the key does not exist

88584
voorhees'
None
?


In [12]:
# Decode the start of the first review back into words, first naively and then with the id offset removed.
print( X_train[0][:10] ) # first 10 words (word ids) of the first sentence
# Looking the ids up as-is gives unrelated words, because the ids stored in X_train are shifted
# relative to the word dictionary.
print( [ index_to_word.get(i, '?') for i in X_train[0][:10] ] ) # look up each id directly in the word dictionary
# Subtracting 3 undoes the shift (per the Keras docs, load_data defaults to index_from=3 and reserves
# the low ids for special tokens); ids with no dictionary entry fall back to '?'.
print( [ index_to_word.get(i-3, '?') for i in X_train[0][:10] ] ) # look up each id minus the offset of 3 in the word dictionary

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]
['the', 'as', 'you', 'with', 'out', 'themselves', 'powerful', 'lets', 'loves', 'their']
['?', 'this', 'film', 'was', 'just', 'brilliant', 'casting', 'location', 'scenery', 'story']


In [15]:
# Convert each sentence into a bag-of-words row vector -> (1, dimension)
def vectorize_sentences(sentences, dimension=10000, bow=True, dtype=np.float64):
    """Encode sentences of word ids as a dense (n_sentences, dimension) matrix.

    Args:
        sentences: Sequence of sentences (rows); each sentence is an iterable
            of integer word ids in the range [0, dimension).
        dimension: Number of columns, i.e. the total number of distinct words.
        bow: If True, each cell holds the number of times the word occurs in
            the sentence (bag of words). If False, each cell is 1 when the
            word occurs at least once and 0 otherwise.
        dtype: Element type of the returned array. Defaults to float64, which
            matches the previous behaviour; a narrower type (e.g. "float32")
            reduces memory for large inputs. With bow=True the type must be
            wide enough to hold the largest count.

    Returns:
        np.ndarray of shape (len(sentences), dimension) and the given dtype.

    Raises:
        IndexError: If any word id is negative or >= dimension. Negative ids
            are rejected explicitly because NumPy would otherwise wrap them
            around and silently mark the wrong column.
    """
    results = np.zeros((len(sentences), dimension), dtype=dtype)

    for i, sentence in enumerate(sentences):
        # list() first so that any iterable (including generators) is accepted.
        word_ids = np.asarray(list(sentence), dtype=np.intp)
        if word_ids.size == 0:
            continue  # empty sentence -> row stays all zeros

        if word_ids.min() < 0 or word_ids.max() >= dimension:
            raise IndexError(
                f"sentence {i} contains a word id outside [0, {dimension})"
            )

        if bow:
            # Occurrence count per distinct word; the row is still all zeros,
            # so assigning the counts is equivalent to accumulating them.
            ids, counts = np.unique(word_ids, return_counts=True)
            results[i, ids] = counts
        else:
            # Presence flag: duplicates simply set the same cell to 1 again.
            results[i, word_ids] = 1

    return results


In [16]:
# Convert the input data
# bow=False: a word's column is set to 1 when the word occurs in the sentence (presence, not counts).
# NOTE: each result is a dense len(X) x 10000 float64 array -- about 2 GB for the 25000 training sentences.
X_train2 = vectorize_sentences(X_train, bow=False)
X_test2 = vectorize_sentences(X_test, bow=False)