In [8]:
text = "John likes to watch movies. Mary likes movies too.\
 Mary also likes to watch football games."
words = text.replace('.', '').split()
words

['John',
 'likes',
 'to',
 'watch',
 'movies',
 'Mary',
 'likes',
 'movies',
 'too',
 'Mary',
 'also',
 'likes',
 'to',
 'watch',
 'football',
 'games']

In [9]:
import numpy as np
word_count = np.unique(words, return_counts=True)
word_count

(array(['John', 'Mary', 'also', 'football', 'games', 'likes', 'movies',
        'to', 'too', 'watch'], dtype='<U8'),
 array([1, 2, 1, 1, 1, 3, 2, 2, 1, 2]))

In [10]:
#딕셔너리 TF 생성
word_to_cnt = {}
for word, cnt in zip(*word_count):
  word_to_cnt[word] = cnt
  
word_to_cnt

{np.str_('John'): np.int64(1),
 np.str_('Mary'): np.int64(2),
 np.str_('also'): np.int64(1),
 np.str_('football'): np.int64(1),
 np.str_('games'): np.int64(1),
 np.str_('likes'): np.int64(3),
 np.str_('movies'): np.int64(2),
 np.str_('to'): np.int64(2),
 np.str_('too'): np.int64(1),
 np.str_('watch'): np.int64(2)}

In [11]:
word_to_cnt['movies']

np.int64(2)

In [12]:
corpus = [
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games."
]

# TDM

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer()
dtm_array = vector.fit_transform(corpus).toarray()
dtm_array

array([[0, 0, 0, 1, 2, 1, 2, 1, 1, 1],
       [1, 1, 1, 0, 1, 1, 0, 1, 0, 1]])

In [14]:
print(vector.vocabulary_)

{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer()
tdm_array = vector.fit_transform(corpus).toarray()
tf_dic = vector.vocabulary_
print(tdm_array)
print(tf_dic)

[[0 0 0 1 2 1 2 1 1 1]
 [1 1 1 0 1 1 0 1 0 1]]
{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [18]:
import pandas as pd

tf_dic_sorted = dict(sorted(tf_dic.items(), key=lambda item : item[1]))
tf_dic_sorted

pd.DataFrame(dtm_array, columns=tf_dic_sorted.keys())

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0,0,0,1,2,1,2,1,1,1
1,1,1,1,0,1,1,0,1,0,1


# TF-IDF

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# TfidfVectorizer 객체 생성
tfidf_vec = TfidfVectorizer()

# fit_transform을 사용하여 텍스트 데이터를 TF-IDF 행렬로 변환
tfidf_array = tfidf_vec.fit_transform(corpus).toarray()
# print(tfidf_array)

# 단어 사전 (vocabulary_)을 사용하여 단어와 인덱스 매핑
tfidf_dic = tfidf_vec.vocabulary_
# print(tfidf_dic)

# 단어 사전 (tfidf_dic)을 TF-IDF 값(item[1])을 기준으로 정렬
# lambda 함수는 정렬의 기준(key)을 설정하며, item[1]은 딕셔너리의 값(value)을 의미
tfidf_dic_sorted = dict(sorted(tfidf_dic.items(), key=lambda item: item[1]))
# print(tfidf_dic_sorted)

# TF-IDF 행렬을 DataFrame으로 변환
# columns는 정렬된 단어 사전의 키(단어)를 사용
tfidf_dtm = pd.DataFrame(tfidf_array, columns=tfidf_dic_sorted.keys())
# print(tfidf_dtm)

# 결과 출력
tfidf_dtm

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0.0,0.0,0.0,0.323699,0.460629,0.230315,0.647398,0.230315,0.323699,0.230315
1,0.446101,0.446101,0.446101,0.0,0.317404,0.317404,0.0,0.317404,0.0,0.317404


In [31]:
tfidf_array

array([[0.        , 0.        , 0.        , 0.32369906, 0.46062909,
        0.23031454, 0.64739811, 0.23031454, 0.32369906, 0.23031454],
       [0.44610081, 0.44610081, 0.44610081, 0.        , 0.3174044 ,
        0.3174044 , 0.        , 0.3174044 , 0.        , 0.3174044 ]])

In [32]:
tfidf_dic

{'john': 3,
 'likes': 4,
 'to': 7,
 'watch': 9,
 'movies': 6,
 'mary': 5,
 'too': 8,
 'also': 0,
 'football': 1,
 'games': 2}

In [33]:
tfidf_dic_sorted

{'also': 0,
 'football': 1,
 'games': 2,
 'john': 3,
 'likes': 4,
 'mary': 5,
 'movies': 6,
 'to': 7,
 'too': 8,
 'watch': 9}

In [34]:
tfidf_dtm

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0.0,0.0,0.0,0.323699,0.460629,0.230315,0.647398,0.230315,0.323699,0.230315
1,0.446101,0.446101,0.446101,0.0,0.317404,0.317404,0.0,0.317404,0.0,0.317404


In [36]:
pip install --upgrade gensim

Collecting gensim
  Downloading gensim-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.2 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.3.0.post1-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m4.2 MB/s[0m  [33m0:00:06[0mm0:00:01[0m00:01[0m
[?25hDownloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m3.6 MB/s[0m  [33m0:00:05[0mm0:00:01[0m00:01[0m
[

kernal restart

In [1]:
from gensim.models import Word2Vec

In [7]:
corpus = [
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games."
]
word_list = []

In [9]:
for word in corpus:
    word_list.append(word.replace('.','').split())

word_list

[['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'too'],
 ['Mary', 'also', 'likes', 'to', 'watch', 'football', 'games'],
 ['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'too'],
 ['Mary', 'also', 'likes', 'to', 'watch', 'football', 'games']]

In [12]:
model = Word2Vec(word_list, sg=0, vector_size=100, window=3, min_count=1)
model.wv.most_similar('likes', 'movies')

[('John', 0.1716582030057907),
 ('also', 0.06598272919654846),
 ('Mary', 0.00861525908112526),
 ('watch', -0.06767924129962921),
 ('games', -0.08545631915330887),
 ('football', -0.08945832401514053),
 ('too', -0.11875486373901367),
 ('to', -0.13642998039722443)]

In [14]:
model.wv.most_similar('John','Mary')

[('likes', 0.15334244072437286),
 ('football', 0.07839599251747131),
 ('also', 0.015081927180290222),
 ('too', 0.007014104165136814),
 ('movies', -0.006439352408051491),
 ('games', -0.07737728208303452),
 ('to', -0.12004965543746948),
 ('watch', -0.16034363210201263)]

In [15]:
model.wv.most_similar('games')

[('to', 0.13887983560562134),
 ('watch', 0.13149002194404602),
 ('movies', 0.06409990042448044),
 ('too', 0.06059820577502251),
 ('football', 0.019152285531163216),
 ('Mary', 0.009408489800989628),
 ('also', -0.05774582177400589),
 ('likes', -0.05987628549337387),
 ('John', -0.10513809323310852)]

# 실습 - 영화평 분류하기( imdb 데이터셋 )

In [16]:
from tensorflow.keras.datasets import imdb
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = 10000)
(X_train, y_train), (X_test, y_test)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


((array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
         list([1, 194, 1153, 194, 8255, 78, 

In [None]:
#  train, test 데이터 전처리
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train_pad = pad_sequences(X_train, maxlen=80, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test, maxlen=80, padding='post', truncating='post')

In [None]:
# dnn
from tensorflow.keras import Sequential, layers

model_dnn=Sequential(
    [
    layers.Input(shape=(80,)),
    layers.Embedding(input_dim=10000, output_dim=32), # (80,32) 행렬
    layers.Flatten(), # 1차원으로 평탄화
    layers.Dense(64, activation='relu'),
    layers.Dense(2, activation='softmax')
    ]
)

model_dnn.summary()

I0000 00:00:1756966471.915064   11386 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


In [None]:
# dnn 
model_dnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_dnn.fit(X_train_pad, y_train, epochs=10, batch_size=200)

Epoch 1/10


2025-09-04 15:24:43.153239: I external/local_xla/xla/service/service.cc:163] XLA service 0x7fbac80063f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-09-04 15:24:43.153267: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Laptop GPU, Compute Capability 8.9
2025-09-04 15:24:43.185502: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-09-04 15:24:43.298461: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91200




[1m 41/125[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 4ms/step - accuracy: 0.5178 - loss: 0.6918

I0000 00:00:1756967088.745007   15698 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.6836 - loss: 0.5694
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8818 - loss: 0.2857
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9796 - loss: 0.0855
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9986 - loss: 0.0150
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9997 - loss: 0.0044
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0019
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0012
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 1.0000 - loss: 8.0223e-04
Epoch 9/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7fbbc3bd5ea0>

In [None]:
# dnn
dnn_loss, dnn_accuracy = model_dnn.evaluate(X_test_pad, y_test)
print(f"Loss: {dnn_loss:.4f}, Accuacy : {dnn_accuracy:.4f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.7798 - loss: 0.9074


[0.9073679447174072, 0.7797600030899048]

In [34]:
# RNN 모델
from tensorflow.keras import Sequential, layers

model_rnn=Sequential(
    [
    layers.Input(shape=(80,)),
    layers.Embedding(input_dim=10000, output_dim=32), # (80,32) 행렬
    # layers.Flatten(), # 1차원으로 평탄화
    # layers.Dense(64, activation='relu'),
    layers.SimpleRNN(64),
    layers.Dense(2, activation='softmax')
    ]
)

model_rnn.summary()

In [35]:
# rnn 모델 compile
model_rnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_rnn.fit(X_train_pad, y_train, epochs=10, batch_size=200)

Epoch 1/10


2025-09-04 15:27:39.506860: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-09-04 15:27:39.506920: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-09-04 15:27:39.506938: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.








[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 28ms/step - accuracy: 0.5166 - loss: 0.6934
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.7661 - loss: 0.4969
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.8944 - loss: 0.2702
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.9582 - loss: 0.1196
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.9898 - loss: 0.0401
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.9940 - loss: 0.0263
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.9956 - loss: 0.0186
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.9908 - loss: 0.0288
Epoch 9/10
[1m125/125[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7fbbc517b190>

In [None]:
# evaluate
rnn_loss, rnn_accuracy = model_rnn.evaluate(X_test_pad, y_test)
print(f"Loss: {rnn_loss:.4f}, Accuacy : {rnn_accuracy:.4f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.7490 - loss: 1.1472
Loss: 1.1472, Accuacy : 0.7490


In [38]:
# RNN 모델 2
from tensorflow.keras import Sequential, layers

model_rnn_2=Sequential(
    [
    layers.Input(shape=(80,)),
    layers.Embedding(input_dim=10000, output_dim=32), # (80,32) 행렬
    # layers.Flatten(), # 1차원으로 평탄화
    # layers.Dense(64, activation='relu'),
    layers.SimpleRNN(64, return_sequences=True),
    layers.SimpleRNN(128),
    layers.Dense(2, activation='softmax')
    ]
)

model_rnn_2.summary()

In [None]:
# rnn_2 모델 compile
model_rnn_2.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_rnn_2.fit(X_train_pad, y_train, epochs=10, batch_size=200)
# evaluate
rnn_2_loss, rnn_2_accuracy = model_rnn_2.evaluate(X_test_pad, y_test)
print(f"Loss: {rnn_2_loss:.4f}, Accuacy : {rnn_2_accuracy:.4f}")

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 48ms/step - accuracy: 0.9936 - loss: 0.0183
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 48ms/step - accuracy: 0.9977 - loss: 0.0086
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 48ms/step - accuracy: 0.9983 - loss: 0.0058
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 44ms/step - accuracy: 0.9943 - loss: 0.0163
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 47ms/step - accuracy: 0.9958 - loss: 0.0122
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 47ms/step - accuracy: 0.9966 - loss: 0.0106
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 44ms/step - accuracy: 0.9966 - loss: 0.0111
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 43ms/step - accuracy: 0.9977 - loss: 0.0078
Epoch 9/10
[1m125/125[0m [32

In [41]:
# RNN 모델 3
from tensorflow.keras import Sequential, layers

model_rnn_3=Sequential(
    [
    layers.Input(shape=(80,)),
    layers.Embedding(input_dim=10000, output_dim=32), # (80,32) 행렬
    # layers.Flatten(), # 1차원으로 평탄화
    # layers.Dense(64, activation='relu'),
    layers.SimpleRNN(64, return_sequences=True),
    layers.SimpleRNN(128),
    layers.Dense(2, activation='softmax')
    ]
)


# rnn 모델 compile
model_rnn_3.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_rnn_3.fit(X_train_pad, y_train, epochs=10, batch_size=200)
# evaluate
rnn_3_loss, rnn_3_accuracy = model_rnn_3.evaluate(X_test_pad, y_test)
print(f"Loss: {rnn_3_loss:.4f}, Accuacy : {rnn_3_accuracy:.4f}")

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 53ms/step - accuracy: 0.5038 - loss: 0.7001
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 49ms/step - accuracy: 0.5076 - loss: 0.6953
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - accuracy: 0.5120 - loss: 0.6945
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 43ms/step - accuracy: 0.5079 - loss: 0.6942
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 52ms/step - accuracy: 0.5170 - loss: 0.6928
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 54ms/step - accuracy: 0.5170 - loss: 0.6928
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 44ms/step - accuracy: 0.5198 - loss: 0.6916
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.5286 - loss: 0.6910
Epoch 9/10
[1m125/125[0m [32

dnn : Loss : 0.9073, Accuacy: 0.7797 # DNN   
rnn : Loss: 1.1472, Accuacy : 0.7490 # RNN

rnn_2 : Loss: 2.0934, Accuacy : 0.7184 # layers.SimpleRNN(128)  과대적합

0. 비교군 : Loss: 3.0997, Accuacy : 0.5097   
1. 옵티마지어 sgd : Loss: 0.6958, Accuacy : 0.5080 # optimizer : sgd     
2. 전체 단어의 개수 1000 : Loss: 0.9225, Accuacy : 0.7269    
3. 영화평의 길이를 200개로 바꾸기 : Loss: 0.6928, Accuacy : 0.5038    
4. pad_sequence의 truncating과 padding을 pre로 바꾸기 : Loss: 1.2778, Accuacy : 0.7986 # 과대적합   
5. RNN 층(뉴런 128개)을 하나 더 추가 : Loss: 0.5368, Accuacy : 0.7324   

# 비교군 

In [54]:
from tensorflow.keras.datasets import imdb

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = 10000)
(X_train, y_train), (X_test, y_test)

#  train, test 데이터 전처리

from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_pad = pad_sequences(X_train, maxlen=80, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test, maxlen=80, padding='post', truncating='post')

In [55]:
# RNN 모델 
from tensorflow.keras import Sequential, layers

model_rnn=Sequential(
    [
    layers.Input(shape=(80,)),
    layers.Embedding(input_dim=10000, output_dim=32), # (80,32) 행렬
    # layers.Flatten(), # 1차원으로 평탄화
    # layers.Dense(64, activation='relu'),
    layers.SimpleRNN(64, return_sequences=True),
    layers.SimpleRNN(128),
    layers.Dense(2, activation='softmax')
    ]
)
# rnn 모델 compile
model_rnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_rnn.fit(X_train_pad, y_train, epochs=10, batch_size=200)
# evaluate
rnn_loss, rnn_accuracy = model_rnn.evaluate(X_test_pad, y_test)
print(f"Loss: {rnn_loss:.4f}, Accuacy : {rnn_accuracy:.4f}")

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 44ms/step - accuracy: 0.5015 - loss: 0.6995
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 43ms/step - accuracy: 0.5930 - loss: 0.6718
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 43ms/step - accuracy: 0.7660 - loss: 0.4823
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 44ms/step - accuracy: 0.9186 - loss: 0.1982
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.9716 - loss: 0.0722
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - accuracy: 0.9870 - loss: 0.0337
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 46ms/step - accuracy: 0.9930 - loss: 0.0202
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 52ms/step - accuracy: 0.9930 - loss: 0.0188
Epoch 9/10
[1m125/125[0m [32

# 단어 1000개만 가져오기

In [56]:
from tensorflow.keras.datasets import imdb

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = 1000)
(X_train, y_train), (X_test, y_test)

#  train, test 데이터 전처리

from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_pad = pad_sequences(X_train, maxlen=80, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test, maxlen=80, padding='post', truncating='post')

In [57]:
# RNN 모델 
from tensorflow.keras import Sequential, layers

model_rnn=Sequential(
    [
    layers.Input(shape=(80,)),
    layers.Embedding(input_dim=1000, output_dim=32), # (80,32) 행렬
    # layers.Flatten(), # 1차원으로 평탄화
    # layers.Dense(64, activation='relu'),
    layers.SimpleRNN(64, return_sequences=True),
    layers.SimpleRNN(128),
    layers.Dense(2, activation='softmax')
    ]
)
# rnn 모델 compile
model_rnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_rnn.fit(X_train_pad, y_train, epochs=10, batch_size=200)
# evaluate
rnn_loss, rnn_accuracy = model_rnn.evaluate(X_test_pad, y_test)
print(f"Loss: {rnn_loss:.4f}, Accuacy : {rnn_accuracy:.4f}")

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 45ms/step - accuracy: 0.5087 - loss: 0.6992
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.6602 - loss: 0.6082
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - accuracy: 0.7759 - loss: 0.4833
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.7949 - loss: 0.4550
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.8254 - loss: 0.3948
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.8440 - loss: 0.3616
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 46ms/step - accuracy: 0.8743 - loss: 0.3058
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 43ms/step - accuracy: 0.8938 - loss: 0.2652
Epoch 9/10
[1m125/125[0m [32

# 영화평의 길이를 200개로 바꾸기

In [58]:
from tensorflow.keras.datasets import imdb

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = 10000)
(X_train, y_train), (X_test, y_test)

#  train, test 데이터 전처리

from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_pad = pad_sequences(X_train, maxlen=200, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test, maxlen=200, padding='post', truncating='post')

In [59]:
# RNN 모델 
from tensorflow.keras import Sequential, layers

model_rnn=Sequential(
    [
    layers.Input(shape=(200,)),
    layers.Embedding(input_dim=10000, output_dim=32), # (80,32) 행렬
    # layers.Flatten(), # 1차원으로 평탄화
    # layers.Dense(64, activation='relu'),
    layers.SimpleRNN(64, return_sequences=True),
    layers.SimpleRNN(128),
    layers.Dense(2, activation='softmax')
    ]
)
# rnn 모델 compile
model_rnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_rnn.fit(X_train_pad, y_train, epochs=10, batch_size=200)
# evaluate
rnn_loss, rnn_accuracy = model_rnn.evaluate(X_test_pad, y_test)
print(f"Loss: {rnn_loss:.4f}, Accuacy : {rnn_accuracy:.4f}")

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 103ms/step - accuracy: 0.5003 - loss: 0.7065
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 103ms/step - accuracy: 0.5190 - loss: 0.6951
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 96ms/step - accuracy: 0.5601 - loss: 0.6744
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 102ms/step - accuracy: 0.6660 - loss: 0.5522
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 104ms/step - accuracy: 0.7240 - loss: 0.4605
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 104ms/step - accuracy: 0.6383 - loss: 0.6506
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 87ms/step - accuracy: 0.5034 - loss: 0.7185
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 98ms/step - accuracy: 0.5065 - loss: 0.7115
Epoch 9/10
[1m125/

# pad_sequence의 truncating과 padding을 pre로 바꾸기

In [None]:
from tensorflow.keras.datasets import imdb 

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = 10000)
(X_train, y_train), (X_test, y_test)

#  train, test 데이터 전처리

from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_pad = pad_sequences(X_train, maxlen=80, padding='pre', truncating='pre')
X_test_pad = pad_sequences(X_test, maxlen=80, padding='pre', truncating='pre')

In [61]:
# RNN 모델 
from tensorflow.keras import Sequential, layers

model_rnn=Sequential(
    [
    layers.Input(shape=(80,)),
    layers.Embedding(input_dim=10000, output_dim=32), # (80,32) 행렬
    # layers.Flatten(), # 1차원으로 평탄화
    # layers.Dense(64, activation='relu'),
    layers.SimpleRNN(64, return_sequences=True),
    layers.SimpleRNN(128),
    layers.Dense(2, activation='softmax')
    ]
)
# rnn 모델 compile
model_rnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_rnn.fit(X_train_pad, y_train, epochs=10, batch_size=200)
# evaluate
rnn_loss, rnn_accuracy = model_rnn.evaluate(X_test_pad, y_test)
print(f"Loss: {rnn_loss:.4f}, Accuacy : {rnn_accuracy:.4f}")

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - accuracy: 0.6802 - loss: 0.5599
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - accuracy: 0.8639 - loss: 0.3216
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 49ms/step - accuracy: 0.9050 - loss: 0.2383
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 43ms/step - accuracy: 0.9527 - loss: 0.1273
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 44ms/step - accuracy: 0.9843 - loss: 0.0472
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.9925 - loss: 0.0226
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 47ms/step - accuracy: 0.9949 - loss: 0.0154
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 52ms/step - accuracy: 0.9927 - loss: 0.0209
Epoch 9/10
[1m125/125[0m [32

# RNN 층 (뉴런 128개)을 하나 더 추가 해보기

In [62]:
from tensorflow.keras.datasets import imdb

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = 10000)
(X_train, y_train), (X_test, y_test)

#  train, test 데이터 전처리

from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_pad = pad_sequences(X_train, maxlen=80, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test, maxlen=80, padding='post', truncating='post')

In [64]:
# RNN 모델 
from tensorflow.keras import Sequential, layers

model_rnn=Sequential(
    [
    layers.Input(shape=(80,)),
    layers.Embedding(input_dim=1000, output_dim=32), # (80,32) 행렬
    # layers.Flatten(), # 1차원으로 평탄화
    # layers.Dense(64, activation='relu'),
    layers.SimpleRNN(64, return_sequences=True),
    layers.SimpleRNN(128, return_sequences=True),
    layers.SimpleRNN(128),
    layers.Dense(2, activation='softmax')
    ]
)
# rnn 모델 compile
model_rnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_rnn.fit(X_train_pad, y_train, epochs=10, batch_size=200)
# evaluate
rnn_loss, rnn_accuracy = model_rnn.evaluate(X_test_pad, y_test)
print(f"Loss: {rnn_loss:.4f}, Accuacy : {rnn_accuracy:.4f}")

Epoch 1/10


2025-09-04 16:29:24.928913: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.



[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 71ms/step - accuracy: 0.4984 - loss: 0.7187
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 62ms/step - accuracy: 0.5009 - loss: 0.7122
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 64ms/step - accuracy: 0.4992 - loss: 0.7071
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 64ms/step - accuracy: 0.5070 - loss: 0.7000
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 66ms/step - accuracy: 0.5432 - loss: 0.6876
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 70ms/step - accuracy: 0.7014 - loss: 0.5897
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 66ms/step - accuracy: 0.7021 - loss: 0.5881
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 64ms/step - accuracy: 0.7410 - loss: 0.5323
Epoch 9/10
[1m125/125[0m [32m━━━━━━━━━━

2025-09-04 16:30:53.420446: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.



[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.7324 - loss: 0.5368
Loss: 0.5368, Accuacy : 0.7324


| 번호 | 실험 조건                                              | Loss   | Accuracy          |
| -- | -------------------------------------------------- | ------ | ----------------- |
| 0  | 비교군                                                | 3.0997 | 0.5097            |
| 1  | Optimizer: SGD                                     | 0.6958 | 0.5080            |
| 2  | 전체 단어 개수 = 1000                                    | 0.9225 | 0.7269            |
| 3  | 영화평 길이 = 200                                       | 0.6928 | 0.5038            |
| 4  | pad\_sequences `truncating='pre'`, `padding='pre'` | 1.2778 | 0.7986 (**과대적합**) |
| 5  | RNN 층 (뉴런 128) 추가                                  | 0.5368 | 0.7324            |
| 6  | CNN                                                  | 0.8909 | 0.7848              |

# cnn 으로 구성

In [66]:
from tensorflow.keras import Sequential, layers
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

# -------------------------------
# 1. 데이터 불러오기
# -------------------------------
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

# -------------------------------
# 2. 시퀀스 전처리 (패딩)
# -------------------------------
X_train_pad = pad_sequences(X_train, maxlen=80, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test, maxlen=80, padding='post', truncating='post')

# -------------------------------
# 3. CNN 모델 정의
# -------------------------------
model_cnn = Sequential([
    layers.Input(shape=(80,)),                        # 시퀀스 길이 80
    layers.Embedding(input_dim=10000, output_dim=32), # (batch, 80, 32)

    layers.Conv1D(64, 3, activation='relu'),          # 합성곱 (n-gram 특징 추출)
    layers.GlobalMaxPooling1D(),                      # 가장 중요한 특징만 추출

    layers.Dense(64, activation='relu'),              # 은닉층
    layers.Dense(2, activation='softmax')             # 최종 출력 (긍정/부정 분류)
])

# -------------------------------
# 4. 컴파일 & 학습
# -------------------------------
model_cnn.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

history = model_cnn.fit(X_train_pad, y_train,
                        epochs=10,
                        batch_size=200,
                        validation_split=0.2,
                        verbose=1)

# -------------------------------
# 5. 평가
# -------------------------------
cnn_loss, cnn_accuracy = model_cnn.evaluate(X_test_pad, y_test)
print(f"Loss: {cnn_loss:.4f}, Accuracy: {cnn_accuracy:.4f}")


Epoch 1/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.6593 - loss: 0.6353 - val_accuracy: 0.7528 - val_loss: 0.5257
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8105 - loss: 0.4268 - val_accuracy: 0.7970 - val_loss: 0.4276
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8930 - loss: 0.2693 - val_accuracy: 0.8042 - val_loss: 0.4329
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9521 - loss: 0.1458 - val_accuracy: 0.8022 - val_loss: 0.5083
Epoch 5/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9872 - loss: 0.0607 - val_accuracy: 0.8052 - val_loss: 0.5753
Epoch 6/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9977 - loss: 0.0218 - val_accuracy: 0.8064 - val_loss: 0.6473
Epoch 7/10
[1m100/100[0m 