In [1]:
import os, re, random
import pandas as np
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Flatten, Embedding, Conv1D, MaxPool1D, Dropout, concatenate


2024-08-02 17:26:48.028401: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
neg_docs = [
    re.sub(r"[^a-zA-Z\s]","",open(f"./txt_sentoken/neg/{file}").read())
    for file in os.listdir('./txt_sentoken/neg/')
]
pos_docs = [
    re.sub(r"[^a-zA-Z\s]","",open(f"./txt_sentoken/pos/{file}").read())
    for file in os.listdir('./txt_sentoken/pos/')
]

In [3]:
stop_words = [
    re.sub(r"[^a-zA-Z\s]","",word) 
    for word in stopwords.words('english')
]

In [4]:
neg_words = [
    list(filter(
        lambda x: (x not in stop_words and len(x) > 1),
        re.findall(r'\S+', doc)
    ))
    for doc in neg_docs
]
pos_words = [
    list(filter(
        lambda x: (x not in stop_words and len(x) > 1),
        re.findall(r'\S+', doc)
    ))
    for doc in pos_docs
]

In [5]:
#random.shuffle(neg_words)
#random.shuffle(pos_words)
len(neg_words)

1000

In [6]:
split_point = 800
neg_len = len(neg_words)
pos_len = len(pos_words)
train_x = pos_words[:split_point] + neg_words[:split_point]
train_y = [1 for _ in range(split_point)] + [0 for _ in range(split_point)]
test_x = pos_words[split_point:] + neg_words[split_point:]
test_y = [1 for _ in range(pos_len-split_point)] + [0 for _ in range(neg_len-split_point)]

In [7]:
tokenizer_train = Tokenizer()
tokenizer_train.fit_on_texts(train_x)
encoded_train = tokenizer_train.texts_to_sequences(train_x)
tokenizer_test = Tokenizer()
tokenizer_test.fit_on_texts(test_x)
encoded_test = tokenizer_test.texts_to_sequences(test_x)

In [8]:
import pickle
with open('tokenizer_train.h5', 'wb') as f:
    pickle.dump(tokenizer_train, f)
with open('tokenizer_test.h5', 'wb') as f:
    pickle.dump(tokenizer_test, f)

In [9]:
dataset_maxlen = 0
for sen in pos_words + neg_words:
    if len(sen) > dataset_maxlen:
        dataset_maxlen = len(sen)
padded_train = pad_sequences(encoded_train, maxlen=dataset_maxlen, padding='post')
padded_test = pad_sequences(encoded_test, maxlen=dataset_maxlen, padding='post')

In [10]:
vocab_len = len(tokenizer_train.word_index)+1
vocab_len

42053

In [11]:
model = Sequential()

model.add(Embedding(vocab_len, 100, input_length=dataset_maxlen))
model.add(Conv1D(filters=32, kernel_size=4, activation='relu'))
model.add(Dropout(0.5))
model.add(MaxPool1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))



In [12]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
model.summary()

In [14]:
padded_train.shape

(1600, 1374)

In [15]:
len(padded_train)

1600

In [16]:
import tensorflow as tf

In [17]:
model.fit(
    tf.convert_to_tensor(padded_train), 
    tf.convert_to_tensor(train_y), 
    epochs=4, 
    batch_size=20, 
    validation_data=(
        tf.convert_to_tensor(padded_test),
        tf.convert_to_tensor(test_y)
    )
)

Epoch 1/4
[1m 2/80[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m7s[0m 94ms/step - accuracy: 0.4875 - loss: 0.6979

2024-08-02 17:26:54.747982: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 28516800 exceeds 10% of free system memory.
2024-08-02 17:26:54.846109: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 28516800 exceeds 10% of free system memory.
2024-08-02 17:26:54.938998: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 28516800 exceeds 10% of free system memory.


[1m 5/80[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m6s[0m 93ms/step - accuracy: 0.4832 - loss: 0.7054

2024-08-02 17:26:55.042582: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 28516800 exceeds 10% of free system memory.
2024-08-02 17:26:55.123720: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 28516800 exceeds 10% of free system memory.


[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 76ms/step - accuracy: 0.5140 - loss: 0.7029 - val_accuracy: 0.5675 - val_loss: 0.6827
Epoch 2/4
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 69ms/step - accuracy: 0.8177 - loss: 0.5216 - val_accuracy: 0.5275 - val_loss: 0.7054
Epoch 3/4
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 70ms/step - accuracy: 0.9957 - loss: 0.0541 - val_accuracy: 0.5325 - val_loss: 0.7514
Epoch 4/4
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 69ms/step - accuracy: 1.0000 - loss: 0.0050 - val_accuracy: 0.5325 - val_loss: 0.7768


<keras.src.callbacks.history.History at 0x7fe0620d9580>

In [18]:
model.save('textcnn.h5')



In [19]:
from tensorflow.keras.models import load_model

In [20]:
model = load_model('textcnn.h5')

