In [47]:
import os
import pandas as pd

# Load input_data_dfs from CSV files
input_data_dfs = {}

input_data_dir = '../Gen_sen/data_pre/input_data'
for filename in os.listdir(input_data_dir):
    if filename.endswith('_input_data.csv'):
        hotel_name = filename.replace('_input_data.csv', '')
        input_data_dfs[hotel_name] = pd.read_csv(os.path.join(input_data_dir, filename))

print("Hotel data and input data loaded successfully.")

Hotel data and input data loaded successfully.


In [48]:
from sklearn.model_selection import train_test_split

train_data_per_hotel = {}
test_data_per_hotel = {}

for hotel_name, df in input_data_dfs.items():
    # Split the dataset into training and testing sets
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        df['cleaned_content'], 
        df['label'], 
        test_size=0.3,
        stratify=df['label'], 
        random_state=42
    )
    train_data_per_hotel[hotel_name] = (X_train_raw, y_train)
    test_data_per_hotel[hotel_name] = (X_test_raw, y_test)

In [49]:
from sklearn.utils import shuffle

# Combine all training data
combined_X_train = []
combined_y_train = []

for hotel_name, (X_train_raw, y_train) in train_data_per_hotel.items():
    combined_X_train.extend(X_train_raw)
    combined_y_train.extend(y_train)

# Combine all testing data
combined_X_test = []
combined_y_test = []

for hotel_name, (X_test_raw, y_test) in test_data_per_hotel.items():
    combined_X_test.extend(X_test_raw)
    combined_y_test.extend(y_test)

print("Combined training and testing data successfully.")

combined_X_train, combined_y_train = shuffle(combined_X_train, combined_y_train, random_state=42)
combined_X_test, combined_y_test = shuffle(combined_X_test, combined_y_test, random_state=42)

print("Data shuffled successfully.")

Combined training and testing data successfully.
Data shuffled successfully.


In [50]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def prepare_data(data, tokenizer, maxlen=100):
    sequences = tokenizer.texts_to_sequences(data)
    padded_sequences = pad_sequences(sequences, maxlen=maxlen)
    return padded_sequences

# Constant
NUM_WORDS = 5000
MAX_LENGTH = 100

tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(combined_X_train)

# Prepare training data
padded_sequences = prepare_data(combined_X_train, tokenizer, MAX_LENGTH)
train_data = (padded_sequences, np.array(combined_y_train))

print(train_data[0].shape)
print(train_data[1].shape)
# Prepare testing data
padded_sequences_test = prepare_data(combined_X_test, tokenizer)
test_data = (padded_sequences_test, np.array(combined_y_test))

(5603, 100)
(5603,)


In [57]:
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout, BatchNormalization, GlobalMaxPooling1D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Flatten

# Build the CNN model
def build_model(vocab_size, embedding_dim, num_classes, kernel_size=3, filters=64, lstm_units=64, dropout_rate=0.5, l2_lambda=0.1):
    model = Sequential()
    # Embedding Layer
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
    # Convolutional layer
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu', kernel_regularizer=l2(l2_lambda)))
    # Batch normalization layer
    model.add(BatchNormalization())
    # Max pooling layer
    model.add(MaxPooling1D(pool_size=2))
    # Bidirectional LSTM layer
    model.add(Bidirectional(LSTM(lstm_units, kernel_regularizer=l2(l2_lambda))))
    # Fully connected layer with ReLU activation and L2 regularization
    model.add(Dense(lstm_units, activation='relu', kernel_regularizer=l2(l2_lambda)))
    # Batch normalization layer
    model.add(BatchNormalization())
    # Dropout layer for regularization
    model.add(Dropout(dropout_rate))
    # Output layer with softmax activation for classification
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])
    return model

def build_cnn_model(vocab_size, num_classes, embedding_dim=128, conv_filters=128, kernel_size=5, dense_units=64, dropout_rate=0.5):
    model = Sequential()
    # Embedding layer to represent each word with a vector
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
    # 1D Convolutional layer with ReLU activation
    model.add(Conv1D(conv_filters, kernel_size, activation='relu'))
    # Global max pooling to reduce dimensionality
    model.add(GlobalMaxPooling1D())
    # Dense layer with ReLU activation
    model.add(Dense(dense_units, activation='relu'))
    # Dropout layer to prevent overfitting
    model.add(Dropout(dropout_rate))
    # Output layer with softmax for multi-class classification
    model.add(Dense(num_classes, activation='softmax'))
    # Compile the model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

def build_bilstm_model(vocab_size, embedding_dim, num_classes, lstm_units=128, dropout_rate=0.5):
    model = Sequential()
    # Embedding Layer
    model.add(Embedding(input_dim=vocab_size, 
                        output_dim=embedding_dim))
    # BiLSTM Layer
    model.add(Bidirectional(LSTM(units=lstm_units, return_sequences=False)))
    # Dropout for regularization
    model.add(Dropout(dropout_rate))
    # Dense Layer
    model.add(Dense(128, activation='relu'))
    # Output Layer
    model.add(Dense(num_classes, activation='softmax'))
    # Compile the model
    model.compile(loss='sparse_categorical_crossentropy', 
                  optimizer='adam', 
                  metrics=['accuracy'])
    
    return model

In [52]:
# Model Constants
VOCAB_SIZE = NUM_WORDS
EMBEDDING_DIM = MAX_LENGTH
NUM_CLASSES = 3

# Train Constants
EPOCHS = 10
BATCH_SIZE = 32

# Get the training data
X_train, y_train = train_data

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

# Calculate class weights based on the training labels
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

# Convert class weights to a dictionary
class_weight_dict = dict(enumerate(class_weights))

# Early stopping
stop_early = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

models = {}
histories = {}

In [53]:
# Instantiate model
cnn_bilstm_model = build_model(vocab_size=VOCAB_SIZE, 
                    embedding_dim=EMBEDDING_DIM, 
                    num_classes=NUM_CLASSES)
# Train the model
history = cnn_bilstm_model.fit(
    X_train, 
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE, 
    validation_data=(X_val, y_val), 
)
print("Model trained successfully.")

models['cnn_bilstm'] = cnn_bilstm_model
histories['cnn_bilstm'] = history

Epoch 1/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 45ms/step - accuracy: 0.4989 - loss: 24.8123 - val_accuracy: 0.8055 - val_loss: 4.1809
Epoch 2/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.8507 - loss: 2.6568 - val_accuracy: 0.8055 - val_loss: 0.9816
Epoch 3/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.8819 - loss: 0.5810 - val_accuracy: 0.8055 - val_loss: 0.6462
Epoch 4/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.9083 - loss: 0.3370 - val_accuracy: 0.8055 - val_loss: 0.7173
Epoch 5/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 40ms/step - accuracy: 0.9186 - loss: 0.2761 - val_accuracy: 0.8234 - val_loss: 0.5285
Epoch 6/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 39ms/step - accuracy: 0.9219 - loss: 0.2606 - val_accuracy: 0.8145 - val_loss: 0.6167
Epoch 7/10
[1m141/1

In [73]:
cnn_model = build_cnn_model2(vocab_size=VOCAB_SIZE, 
                            embedding_dim=EMBEDDING_DIM, 
                            num_classes=NUM_CLASSES)

print(X_val.shape)
print(y_val.shape)
print(X_train.shape)
print(y_train.shape)
 #Train the model
cnn_history = cnn_model.fit(
    X_train, 
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE, 
    validation_data=(X_val, y_val), 
)
print("CNN model trained successfully.")

models['cnn'] = cnn_model
histories['cnn'] = cnn_history

(1121, 100)
(1121,)
(4482, 100)
(4482,)
Epoch 1/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.7769 - loss: 0.7226 - val_accuracy: 0.8055 - val_loss: 0.4914
Epoch 2/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.8207 - loss: 0.4442 - val_accuracy: 0.8323 - val_loss: 0.4214
Epoch 3/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.8807 - loss: 0.2996 - val_accuracy: 0.8260 - val_loss: 0.4390
Epoch 4/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.9238 - loss: 0.2184 - val_accuracy: 0.8260 - val_loss: 0.4907
Epoch 5/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.9440 - loss: 0.1562 - val_accuracy: 0.8100 - val_loss: 0.5680
Epoch 6/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.9641 - loss: 0.1027 - val_accuracy: 0.8082 -

In [66]:
bilstm_model = build_bilstm_model(vocab_size=VOCAB_SIZE, 
                                  embedding_dim=EMBEDDING_DIM, 
                                  num_classes=NUM_CLASSES)

# Train the model
bilstm_model = bilstm_model.fit(
    X_train, 
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE, 
    validation_data=(X_val, y_val), 
)
print("BiLSTM model trained successfully.")

models['bilstm'] = bilstm_model
histories['bilstm'] = bilstm_model

(1121, 100)
(1121,)
Epoch 1/10
[1m 79/141[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m5s[0m 88ms/step - accuracy: 0.7594 - loss: 0.7028

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

def visualize_history(history, model_name='Model'):
    # Plot training & validation accuracy values
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title(f'{model_name} accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')

    # Plot training & validation loss values
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title(f'{model_name} loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')

    plt.show()

for model, history in histories.items():
    visualize_history(history, model)


X_test, y_test = test_data
for name, model in models.items():
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
    print(f'{name} test accuracy: {test_accuracy:.4f}')
    print(f'{name} test loss: {test_loss:.4f}')

In [None]:
from sklearn.metrics import classification_report

X_test, y_test = test_data

for name, model in models.items():
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    print(f'Classification Report for {name}:')
    print(classification_report(y_test, y_pred_classes, target_names=['Negative', 'Neutral', 'Positive']))