<a href="https://colab.research.google.com/github/serereuk/Information_retrieval/blob/master/textCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%tensorflow_version 2.x

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
%cd '/content/drive/My Drive/information/CNN, RNN/CNN/rt-polaritydata(MR)/rt-polaritydata'

/content/drive/My Drive/information/CNN, RNN/CNN/rt-polaritydata(MR)/rt-polaritydata


In [0]:
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.layers import Embedding, Conv1D, Dense, Dropout, GlobalMaxPooling1D, ReLU

In [0]:
def Reader(text_file_path):
    data = [];
    for i in text_file_path:
        data += open(i, 'r', encoding="ISO-8859-1").readlines()
    label = [0] * int(len(data)/2) + [1] * int(len(data)/2)
    for i in range(len(data)):
        data[i] = re.sub('^\W+', "", data[i].lower().strip())
    return data, label

In [0]:
def Indexing_padding(data, tok=None):
    if tok == None:
        tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
        tokenizer.fit_on_texts(data)
    else:
        tokenizer = tok
    tensor = tokenizer.texts_to_sequences(data)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, tokenizer

In [0]:
data, y = Reader(['rt-polarity.neg.txt', 'rt-polarity.pos.txt'])
tensor, tokenizer = Indexing_padding(data, None)
x_train, x_val, y_train, y_val = train_test_split(tensor, y, test_size=0.1, stratify=y, random_state=0)

In [0]:
Buffer_size = len(x_train) + 1
vocab_size = len(tokenizer.word_index) + 1
Batch_size = 50
epoch = 10
learning_rate = 0.001
dimension = 300
tf.random.set_seed(1)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(Buffer_size)
train_dataset = train_dataset.batch(Batch_size, drop_remainder=False)

test_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val)).shuffle(Buffer_size)
test_dataset = test_dataset.batch(Batch_size, drop_remainder=False)

In [0]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

In [0]:
class textCNN(Model):

    def __init__(self, vocab_size, dimension=300):
        super(textCNN, self).__init__()
        self.embed = Embedding(vocab_size, dimension)
        self.conv1 = Conv1D(100, 3, strides=1, padding='valid', kernel_constraint=max_norm(3, axis=[0,1]))
        self.conv2 = Conv1D(100, 4, strides=1, padding='valid', kernel_constraint=max_norm(3, axis=[0,1]))
        self.conv3 = Conv1D(100, 5, strides=1, padding='valid', kernel_constraint=max_norm(3, axis=[0,1]))
        self.dense = Dense(2, activation='softmax', kernel_constraint=max_norm(3))

    def call(self, x):
        x = self.embed(x)
        filter_result1 = Dropout(0.5)(GlobalMaxPooling1D()(ReLU()(self.conv1(x))))
        filter_result2 = Dropout(0.5)(GlobalMaxPooling1D()(ReLU()(self.conv2(x))))
        filter_result3 = Dropout(0.5)(GlobalMaxPooling1D()(ReLU()(self.conv3(x))))
        x = tf.concat([filter_result1, filter_result2, filter_result3], axis=1)
        x = self.dense(x)
        return x

model = textCNN(vocab_size, dimension)

In [0]:
@tf.function
def train_step(x, label):
    with tf.GradientTape()as tape:
        predictions = model(x)
        loss = loss_object(label, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_accuracy(label, predictions)

In [0]:
@tf.function
def test_step(x, label):
    predictions = model(x, training=False)
    loss = loss_object(label, predictions)
    test_loss(loss)
    test_accuracy(label, predictions)

In [26]:
for ep in range(epoch):
    for x, label in train_dataset:
        train_step(x,label)
    for x, label in test_dataset:
        test_step(x, label)
    template = '에포크: {}, 손실: {}, 정확도: {}, 테스트 손실: {}, 테스트 정확도: {}'
    print (template.format(ep+1,
                         train_loss.result(),
                         train_accuracy.result()*100,
                         test_loss.result(),
                         test_accuracy.result()*100))

에포크: 1, 손실: 0.6140450239181519, 정확도: 67.8895263671875, 테스트 손실: 0.5566848516464233, 테스트 정확도: 73.66448211669922
에포크: 2, 손실: 0.5235644578933716, 정확도: 78.15007781982422, 테스트 손실: 0.5576726794242859, 테스트 정확도: 73.52389526367188
에포크: 3, 손실: 0.46739476919174194, 정확도: 84.16883850097656, 테스트 손실: 0.5549745559692383, 테스트 정확도: 74.03936767578125
에포크: 4, 손실: 0.43333637714385986, 정확도: 87.74882507324219, 테스트 손실: 0.555284321308136, 테스트 정확도: 74.203369140625
에포크: 5, 손실: 0.411521315574646, 정확도: 89.99896240234375, 테스트 손실: 0.555732786655426, 테스트 정확도: 74.24554443359375
에포크: 6, 손실: 0.39659932255744934, 정확도: 91.52857971191406, 테스트 손실: 0.5551154613494873, 테스트 정확도: 74.24242401123047
에포크: 7, 손실: 0.3857688903808594, 정확도: 92.63455963134766, 테스트 손실: 0.5548070073127747, 테스트 정확도: 74.30713653564453
에포크: 8, 손실: 0.37750542163848877, 정확도: 93.47837829589844, 테스트 손실: 0.5554026961326599, 테스트 정확도: 74.34395599365234
에포크: 9, 손실: 0.3710392713546753, 정확도: 94.13699340820312, 테스트 손실: 0.5560609698295593, 테스트 정확도: 74.23721313476562
에포크

In [0]:
model.summary()

Model: "text_cnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  6412200   
_________________________________________________________________
conv1d (Conv1D)              multiple                  90100     
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  120100    
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  150100    
_________________________________________________________________
dropout (Dropout)            multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  602       
Total params: 6,773,102
Trainable params: 6,773,102
Non-trainable params: 0
________________________________________________