In [83]:
import tensorflow as tf
import pandas as pd
import numpy as np
from string import digits
from collections import Counter
from pyvi import ViTokenizer
from gensim.models.word2vec import Word2Vec
from tensorflow.keras.utils import to_categorical

In [84]:
data_train = pd.read_csv("vlsp_sentiment_train.csv", sep='\t')
data_train.columns =['Class', 'Data']
data_test = pd.read_csv("vlsp_sentiment_test.csv", sep='\t')
data_test.columns =['Class', 'Data']

In [85]:
data_train.head()

Unnamed: 0,Class,Data
0,-1,"Mình đã dùng anywhere thế hệ đầu, quả là đầy t..."
1,-1,"Quan tâm nhất là độ trễ có cao không, dùng thi..."
2,-1,"dag xài con cùi bắp 98k....pin trâu, mỗi tội đ..."
3,-1,logitech chắc hàng phải tiền triệu trở lên dùn...
4,-1,"Đang xài con m175 cùi mía , nhà xài nhiều chuộ..."


In [86]:
data_train = data_train.sample(frac=1, random_state=42)

In [87]:
data_train.head()

Unnamed: 0,Class,Data
4039,0,"tuy có sự sáng tạo , nhưng cần phải có phong c..."
3815,0,khoảng 3-4s j đó
848,-1,Chiều dài 45cm :( bỏ vào túi kiểu gì
4863,0,"không , không nên mua . mua samsung ngon hơn ."
79,-1,"thế thì quất thôi, chứ con miband 1s của e bên..."


In [88]:
labels = data_train.iloc[:, 0].values
reviews = data_train.iloc[:, 1].values

In [89]:
encoded_labels = []

for label in labels:
    if label == -1:
        encoded_labels.append([1,0,0])
    elif label == 0:
        encoded_labels.append([0,1,0])
    else:
        encoded_labels.append([0,0,1])

encoded_labels = np.array(encoded_labels)

In [90]:
for i in range(5):
    print(encoded_labels[i])

[0 1 0]
[0 1 0]
[1 0 0]
[0 1 0]
[1 0 0]


In [91]:
reviews_processed = []
unlabeled_processed = []
for review in reviews:
    review_cool_one = ''.join([char for char in review if char not in digits])
    reviews_processed.append(review_cool_one)

In [92]:
#Use PyVi for Vietnamese word tokenizer
word_reviews = []
all_words = []
for review in reviews_processed:
    review = ViTokenizer.tokenize(review.lower())
    word_reviews.append(review.split())


In [93]:
word_reviews[0]

['tuy',
 'có',
 'sự',
 'sáng_tạo',
 ',',
 'nhưng',
 'cần',
 'phải',
 'có',
 'phong_cách',
 'riêng',
 ',',
 'đừng',
 'chạy',
 'theo',
 'iphone',
 ',',
 'samsung',
 'rất',
 'cố_gắng',
 ',',
 'biết',
 'nắm_bắt',
 'nhu_cầu',
 'khách_hàng',
 '(',
 's',
 's',
 'edge',
 'người',
 'châu',
 'á',
 'rất',
 'chuộng',
 ',',
 'nhưng',
 'ko',
 'thấy',
 'phát_triển',
 'nữa',
 ')',
 's',
 'là',
 'sự',
 'hoàn_thiện',
 'của',
 's',
 ',',
 'nhưng',
 'sfan',
 'thì',
 'luôn',
 'gato',
 ',',
 'vì',
 'các',
 'bạn',
 'ấy',
 'thấy',
 'iphone',
 'quá',
 'đắt',
 'và',
 'các',
 'bạn',
 'ấy',
 'chuẩn_bị',
 'lên_tiếng',
 '.']

In [94]:
EMBEDDING_DIM = 400 # how big is each word vector
MAX_VOCAB_SIZE = 10000 # how many unique words to use (i.e num rows in embedding vector)
MAX_SEQUENCE_LENGTH = 300 # max number of words in a comment to use

In [95]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [96]:
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(word_reviews)
sequences_train = tokenizer.texts_to_sequences(word_reviews)
word_index = tokenizer.word_index


In [97]:
# word_index[1]

In [98]:
data = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
labels = encoded_labels

In [99]:
data

array([[   0,    0,    0, ...,  950, 2022,    1],
       [   0,    0,    0, ...,   60,  309,   62],
       [   0,    0,    0, ...,  726,  310,   43],
       ...,
       [   0,    0,    0, ...,   11,  434, 1036],
       [   0,    0,    0, ..., 4142,    6,  158],
       [   0,    0,    0, ...,   33,   33,    1]])

In [100]:
print('Shape of X train and X validation tensor:',data.shape)
print('Shape of label train and validation tensor:', labels.shape)

Shape of X train and X validation tensor: (5100, 300)
Shape of label train and validation tensor: (5100, 3)


In [101]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

word_vectors = KeyedVectors.load_word2vec_format('vi-model-CBOW.bin', binary=True)


vocabulary_size=min(len(word_index)+1,MAX_VOCAB_SIZE)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if i>=MAX_VOCAB_SIZE:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

del(word_vectors)

from keras.layers import Embedding
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)

In [102]:
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, BatchNormalization, AveragePooling1D
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Embedding, Dropout,concatenate
from tensorflow.keras.layers import Reshape, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers

In [103]:
sequence_length = data.shape[1]
filter_sizes = [3,4,5]
num_filters = 100
drop = 0.5

inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)

conv_0 = Conv1D(num_filters, filter_sizes[0], activation='relu', kernel_regularizer=regularizers.l2(0.01))(embedding)
maxpool_0 = GlobalMaxPooling1D()(conv_0)

conv_1 = Conv1D(num_filters, filter_sizes[1], activation='relu', kernel_regularizer=regularizers.l2(0.01))(embedding)
maxpool_1 = GlobalMaxPooling1D()(conv_1)

conv_2 = Conv1D(num_filters, filter_sizes[2], activation='relu', kernel_regularizer=regularizers.l2(0.01))(embedding)
maxpool_2 = GlobalMaxPooling1D()(conv_2)

merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)
flatten = Flatten()(merged_tensor)
dropout1 = Dropout(drop)(flatten)
output = Dense(units=3, activation='softmax', kernel_regularizer=regularizers.l2(0.01))(dropout1)

model = Model(inputs, output)
print(model)

adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=10, verbose=1)

<keras.src.engine.functional.Functional object at 0x000001CA970BBB10>
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 300)]                0         []                            
                                                                                                  
 embedding_3 (Embedding)     (None, 300, 400)             3167600   ['input_4[0][0]']             
                                                                                                  
 conv1d_9 (Conv1D)           (None, 298, 100)             120100    ['embedding_3[0][0]']         
                                                                                                  
 conv1d_10 (Conv1D)          (None, 297, 100)             160100    ['embedding_3[0][0]']         
                      

In [104]:
checkpoint = ModelCheckpoint('cnn.keras',
                             monitor='val_accuracy',
                             save_best_only=True, mode='max')
callbacks_list = [checkpoint, early_stopping]

model.fit(data, labels, validation_split=0.2,
          epochs=50, batch_size=256, callbacks=callbacks_list, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x1ca9705b690>

In [105]:
labels_test = data_test.iloc[:, 0].values
reviews_test = data_test.iloc[:, 1].values

In [106]:
encoded_labels_test = []

for label_test in labels_test:
    if label_test == -1:
        encoded_labels_test.append([1,0,0])
    elif label_test == 0:
        encoded_labels_test.append([0,1,0])
    else:
        encoded_labels_test.append([0,0,1])

encoded_labels_test = np.array(encoded_labels_test)

In [107]:
reviews_processed_test = []
unlabeled_processed_test = []
for review_test in reviews_test:
    # review_cool_one = ''.join([char for char in review_test if char not in digits])
    review_cool_one = ''.join([char for char in review_test])
    reviews_processed_test.append(review_cool_one)

In [108]:
#Use PyVi for Vietnamese word tokenizer
word_reviews_test = []
all_words = []
for review_test in reviews_processed_test:
    review_test = ViTokenizer.tokenize(review_test.lower())
    word_reviews_test.append(review_test.split())

In [109]:
sequences_test = tokenizer.texts_to_sequences(word_reviews_test)
data_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
labels_test = encoded_labels_test

In [110]:
print('Shape of X train and X validation tensor:',data_test.shape)
print('Shape of label train and validation tensor:', labels_test.shape)

Shape of X train and X validation tensor: (1050, 300)
Shape of label train and validation tensor: (1050, 3)


In [111]:
# model = load_model('best_model.keras')
score = model.evaluate(data_test, labels_test)



In [112]:
print("%s: %.2f%%" % (model.metrics_names[0], score[0]*100))
print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))


loss: 103.89%
accuracy: 69.52%
