In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import pickle

# Load the dataset CSV into a DataFrame.
csv_path = 'merged_dataset.csv'
df = pd.read_csv(csv_path)

def clean_text(text: object) -> str:
    """Normalize one raw text entry before tokenization.

    The value is converted to ``str``, lower-cased, every underscore is
    replaced by a space, and surrounding whitespace is removed.

    Args:
        text: Raw value; non-string inputs are stringified first.

    Returns:
        The normalized text, without leading or trailing whitespace.
    """
    # Strip *after* replacing underscores: a leading or trailing "_" would
    # otherwise be turned back into a stray space.
    return str(text).lower().replace("_", " ").strip()

# Normalize every text entry.
df['text'] = df['text'].astype(str).apply(clean_text)

X = df['text'].values
labels = df['sentiment'].astype(int).values
classes = np.unique(labels)
num_classes = len(classes)

# to_categorical and the class_weight dict passed to Keras both address
# classes by index 0..K-1. Fail loudly on anything else instead of silently
# mis-encoding (e.g. negative labels would wrap around in to_categorical).
if not np.array_equal(classes, np.arange(num_classes)):
    raise ValueError(
        "Sentiment labels must be the contiguous integers 0..K-1, "
        f"got {classes.tolist()}"
    )
y = to_categorical(labels, num_classes=num_classes)

# Initialize and fit the Tokenizer.
# NOTE(review): the tokenizer is fitted on the full corpus, including the
# rows that later end up in the test split, so test vocabulary leaks into
# the word index. Fit on the training texts only if that matters.
max_words = 15000  # vocabulary cap handed to the Tokenizer
max_len = 70       # every sequence is padded/truncated to this length
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
# +1 because index 0 is reserved for padding; capped at max_words because
# the tokenizer never emits an index >= num_words.
vocab_size = min(max_words, len(tokenizer.word_index) + 1)
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=max_len, padding='post', truncating='post')

# Stratify on the integer labels so the rare classes keep the same share in
# the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_pad, y, test_size=0.2, random_state=42, stratify=labels
)

# Compute class weights from the same int-cast labels that produced `y`,
# and key them by the actual class label rather than by position.
class_weights = compute_class_weight('balanced', classes=classes, y=labels)
class_weight_dict = {
    int(label): float(weight) for label, weight in zip(classes, class_weights)
}
print("Class weights:", class_weight_dict)

# Build the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Conv1D,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    Input,
    LSTM,
    MaxPooling1D,
    SpatialDropout1D,
)

# Take the sequence length from the padded training data so the model input
# always matches what pad_sequences produced.
sequence_length = X_train.shape[1]

# Embedding -> LSTM (full sequence) -> Conv1D/MaxPooling -> dense classifier.
# An explicit Input layer replaces the deprecated `input_length` argument of
# Embedding and builds the model immediately, so no model.build() is needed
# before summary().
model = Sequential([
    Input(shape=(sequence_length,)),
    Embedding(input_dim=vocab_size, output_dim=128),
    SpatialDropout1D(0.3),
    LSTM(64, return_sequences=True),
    Conv1D(64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

model.summary()

# Compile the model; targets are one-hot encoded, hence categorical
# cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Callbacks: stop when val_loss stalls (keeping the best weights) and shrink
# the learning rate after a single epoch without improvement.
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=1,
    min_lr=0.0001,
)

# Train the model.
# NOTE(review): the held-out test split doubles as validation data here, so
# early stopping and the LR schedule are tuned on it.
epochs = 10
batch_size = 32
fit_options = {
    "validation_data": (X_test, y_test),
    "epochs": epochs,
    "batch_size": batch_size,
    "callbacks": [early_stopping, reduce_lr],
    "class_weight": class_weight_dict,
    "verbose": 1,
}
model.fit(X_train, y_train, **fit_options)

print("✅ Huấn luyện hoàn tất!")

# Save the trained model as an .h5 file.
model.save("lstm_cnn_sentiment_model.h5")
print("✅ Mô hình đã được lưu thành lstm_cnn_sentiment_model.h5")

# Save the fitted tokenizer as a .pkl file so inference can reuse the same
# word index.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
print("✅ Tokenizer đã được lưu thành tokenizer.pkl")

Class weights: {0: 0.36182541231579973, 1: 1.9874698472259449, 2: 1.585810521813516, 3: 9.756907894736843}




Epoch 1/10
742/742 ━━━━━━━━━━━━━━━━━━━━ 37s 45ms/step - accuracy: 0.3446 - loss: 1.2624 - val_accuracy: 0.6661 - val_loss: 0.8745 - learning_rate: 0.0010
Epoch 2/10
742/742 ━━━━━━━━━━━━━━━━━━━━ 38s 51ms/step - accuracy: 0.6760 - loss: 0.6811 - val_accuracy: 0.6759 - val_loss: 0.7838 - learning_rate: 0.0010
Epoch 3/10
742/742 ━━━━━━━━━━━━━━━━━━━━ 38s 47ms/step - accuracy: 0.7545 - loss: 0.4919 - val_accuracy: 0.7379 - val_loss: 0.6413 - learning_rate: 0.0010
Epoch 4/10
742/742 ━━━━━━━━━━━━━━━━━━━━ 35s 47ms/step - accuracy: 0.7794 - loss: 0.4144 - val_accuracy: 0.6604 - val_loss: 0.7957 - learning_rate: 0.0010
Epoch 5/10
742/742 ━━━━━━━━━━━━━━━━━━━━ 37s 50ms/step - accuracy: 0.8023 - loss: 0.3357 - val_accuracy: 0.7227 - val_loss: 0.7402 - learning_rate: 2.0000e-04




✅ Huấn luyện hoàn tất!
✅ Mô hình đã được lưu thành lstm_cnn_sentiment_model.h5
✅ Tokenizer đã được lưu thành tokenizer.pkl
