In [1]:
#합성곱망 CNN1D를 이용한 OIIMDB감정분류
import pandas as pd
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb

from sklearn.metrics import accuracy_score,classification_report

In [2]:
# 파라미터 설정:
max_features = 6000 # 추출할 단어의 수
max_length = 400 #각 문장의 최대 길이

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

print(len(x_train), 'train observations')
print(len(x_test), 'test observations')

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
25000 train observations
25000 test observations


In [5]:
# 단어 대 숫자 매핑 생성 
wind = imdb.get_word_index()
revind = dict((v,k) for k,v in wind.items())
print (x_train[0])
print (y_train[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
1


In [6]:
# 역 매핑된 Dictionaryfmf 사용하여 디코딩 수행
def decode(sent_list):
    new_words = []
    for i in sent_list:
        new_words.append(revind[i])
    comb_words = " ".join(new_words)
    return comb_words  

print (decode(x_train[0]))

the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room and it so heart shows to years of every never going and help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but and to story wonderful that in seeing in character to of 70s musicians with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other and in of seen over landed for anyone of and br show's to whether from than out themselves history he name half some br of and odd was two most of mean for 1 any an boat she he should is thought and but of script you not while history he heart to real at barrel but when from one bit then have two

In [7]:
# 효율적인 연산을 위한 패드 배열
#각문장 길이를 일정한 MAX_length로 맞춤
x_train = sequence.pad_sequences(x_train, maxlen=max_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_length)

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

x_train shape: (25000, 400)
x_test shape: (25000, 400)


In [8]:
# 딥러닝 아키텍쳐 파라미터
batch_size = 32
embedding_dims = 60
num_kernels = 260
kernel_size = 3
hidden_dims = 300
epochs = 3

In [9]:
# 모델 구축
model = Sequential()

model.add(Embedding(max_features,embedding_dims,input_length=max_length))
model.add(Dropout(0.2))

model.add(Conv1D(num_kernels,kernel_size,padding='valid',activation='relu',strides=1))
model.add(GlobalMaxPooling1D())

model.add(Dense(hidden_dims))
model.add(Dropout(0.5))
model.add(Activation('relu'))

model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

print (model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 400, 60)           360000    
_________________________________________________________________
dropout (Dropout)            (None, 400, 60)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 398, 260)          47060     
_________________________________________________________________
global_max_pooling1d (Global (None, 260)               0         
_________________________________________________________________
dense (Dense)                (None, 300)               78300     
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
activation (Activation)      (None, 300)               0

In [10]:
model.fit(x_train, y_train,batch_size=batch_size,epochs=epochs,validation_split=0.2)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x202422747f0>

In [14]:

y_train_predclass = np.round(model.predict(x_train)).astype(int)
y_test_predclass = np.round(model.predict(x_test)).astype(int)

y_train_predclass.shape = y_train.shape
y_test_predclass.shape = y_test.shape


In [15]:
y_test_predclass

array([0, 1, 1, ..., 0, 0, 0])

In [16]:
y_test

array([0, 1, 1, ..., 0, 0, 0], dtype=int64)

In [17]:
y_train

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [18]:
y_train_predclass

array([1, 0, 0, ..., 0, 1, 0])

In [19]:
print (("\n\nCNN 1D  - Train accuracy:"),(round(accuracy_score(y_train,y_train_predclass),3)))
print ("\nCNN 1D of Training data\n",classification_report(y_train, y_train_predclass))
print ("\nCNN 1D - Train Confusion Matrix\n\n",pd.crosstab(y_train, y_train_predclass,rownames = ["Actuall"],colnames = ["Predicted"]))      

print (("\nCNN 1D  - Test accuracy:"),(round(accuracy_score(y_test,y_test_predclass),3)))
print ("\nCNN 1D of Test data\n",classification_report(y_test, y_test_predclass))
print ("\nCNN 1D - Test Confusion Matrix\n\n",pd.crosstab(y_test, y_test_predclass,rownames = ["Actuall"],colnames = ["Predicted"]))  



CNN 1D  - Train accuracy: 0.966

CNN 1D of Training data
               precision    recall  f1-score   support

           0       0.97      0.96      0.97     12500
           1       0.96      0.97      0.97     12500

    accuracy                           0.97     25000
   macro avg       0.97      0.97      0.97     25000
weighted avg       0.97      0.97      0.97     25000


CNN 1D - Train Confusion Matrix

 Predicted      0      1
Actuall                
0          11972    528
1            330  12170

CNN 1D  - Test accuracy: 0.888

CNN 1D of Test data
               precision    recall  f1-score   support

           0       0.90      0.87      0.89     12500
           1       0.87      0.91      0.89     12500

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000


CNN 1D - Test Confusion Matrix

 Predicted      0      1
Actuall                
0          10881   