<a href="https://colab.research.google.com/github/serereuk/Information_retrieval/blob/master/textCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
%cd '/content/drive/My Drive/information/CNN, RNN/CNN/rt-polaritydata(MR)/rt-polaritydata'

In [0]:
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Embedding, Conv1D, Dense, Dropout, GlobalMaxPooling1D, ReLU

In [0]:
def Reader(text_file_path):
    data = [];
    for i in text_file_path:
        data += open(i, 'r', encoding="ISO-8859-1").readlines()
    label = [0] * int(len(data)/2) + [1] * int(len(data)/2)
    for i in range(len(data)):
        data[i] = re.sub('^\W+', "", data[i].lower().strip())
    return data, label

In [0]:
def Indexing_padding(data, tok=None):
    if tok == None:
        tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
        tokenizer.fit_on_texts(data)
    else:
        tokenizer = tok
    tensor = tokenizer.texts_to_sequences(data)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, tokenizer

In [0]:
data, y = Reader(['rt-polarity.neg.txt', 'rt-polarity.pos.txt'])
tensor, tokenizer = Indexing_padding(data, None)
x_train, x_val, y_train, y_val = train_test_split(tensor, y, test_size=0.1, stratify=y, random_state = 0)

In [0]:
Buffer_size = len(x_train) + 1
vocab_size = len(tokenizer.word_index) + 1
Batch_size = 50
epoch = 10
learning_rate = 0.001
dimension = 300
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(Buffer_size)
train_dataset = train_dataset.batch(Batch_size, drop_remainder=True)

test_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val)).shuffle(Buffer_size)
test_dataset = test_dataset.batch(Batch_size, drop_remainder=True)

In [0]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

In [0]:
class textCNN(Model):

    def __init__(self, vocab_size, dimension=300):
        super(textCNN, self).__init__()
        self.embed = Embedding(vocab_size, dimension)
        self.conv1 = Conv1D(100, 3, strides=1, padding='valid', kernel_regularizer=l2(3))
        self.conv2 = Conv1D(100, 4, strides=1, padding='valid', kernel_regularizer=l2(3))
        self.conv3 = Conv1D(100, 5, strides=1, padding='valid', kernel_regularizer=l2(3))
        self.dropout = Dropout(0.5)
        self.dense = Dense(2, activation='softmax', kernel_regularizer=l2(3))

    def call(self, x):
        x = self.embed(x)
        filter_result1 = ReLU()(GlobalMaxPooling1D()(self.conv1(x)))
        filter_result2 = ReLU()(GlobalMaxPooling1D()(self.conv2(x)))
        filter_result3 = ReLU()(GlobalMaxPooling1D()(self.conv3(x)))
        x = tf.concat([filter_result1, filter_result2, filter_result3], axis=1)
        x = self.dense(self.dropout(x))
        return x

model = textCNN(vocab_size, dimension)

In [0]:
@tf.function
def train_step(x, label):
    with tf.GradientTape()as tape:
        predictions = model(x)
        loss = loss_object(label, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_accuracy(label, predictions)

In [0]:
@tf.function
def test_step(x, label):
    predictions = model(x)
    loss = loss_object(label, predictions)
    test_loss(loss)
    test_accuracy(label, predictions)

In [13]:
for ep in range(epoch):
    for x, label in train_dataset:
        train_step(x,label)
    for x, label in test_dataset:
        test_step(x, label)
    template = '에포크: {}, 손실: {}, 정확도: {}, 테스트 손실: {}, 테스트 정확도: {}'
    print (template.format(ep+1,
                         train_loss.result(),
                         train_accuracy.result()*100,
                         test_loss.result(),
                         test_accuracy.result()*100))

에포크: 1, 손실: 0.6205986142158508, 정확도: 66.15706634521484, 테스트 손실: 0.5748421549797058, 테스트 정확도: 71.61904907226562
에포크: 2, 손실: 0.5290416479110718, 정확도: 77.07853698730469, 테스트 손실: 0.5633213520050049, 테스트 정확도: 73.0
에포크: 3, 손실: 0.4716743528842926, 정확도: 83.37870788574219, 테스트 손실: 0.5577959418296814, 테스트 정확도: 73.74603271484375
에포크: 4, 손실: 0.4372136890888214, 정확도: 87.06282806396484, 테스트 손실: 0.556642472743988, 테스트 정확도: 73.83333587646484
에포크: 5, 손실: 0.41484883427619934, 정확도: 89.43036651611328, 테스트 손실: 0.5543820858001709, 테스트 정확도: 74.11428833007812
에포크: 6, 손실: 0.39944005012512207, 정확도: 91.04886627197266, 테스트 손실: 0.55344158411026, 테스트 정확도: 74.38095092773438
에포크: 7, 손실: 0.38819611072540283, 정확도: 92.22587585449219, 테스트 손실: 0.5528042912483215, 테스트 정확도: 74.51700592041016
에포크: 8, 손실: 0.3796839714050293, 정확도: 93.11387634277344, 테스트 손실: 0.5522848963737488, 테스트 정확도: 74.66666412353516
에포크: 9, 손실: 0.373024582862854, 정확도: 93.8068618774414, 테스트 손실: 0.5521225333213806, 테스트 정확도: 74.7513198852539
에포크: 10, 손실: 0.36