## Cleaning and preprocessing of raw data

In [30]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.preprocessing import MinMaxScaler
import string
from nltk.corpus import stopwords
from tensorflow.keras.regularizers import l2

# for identifying english reviews
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Download necessary NLTK resources
# nltk.download('punkt')
# nltk.download('wordnet')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Base paths (for saving the cleaned and preprocessed datasets)
base_input_path = os.path.join('..', 'raw data')
base_output_path = os.path.join('..', 'datasets')

# Define directories for each dataset
directories = {
    'agoda_hotel_reviews': (1, 10),  
    'tripadvisor_hotel_reviews': (1, 5),
    'klook_hotel_reviews': (1, 5),  
    'booking_hotel_reviews': (1, 10)
}

# Hotel and folder mappings
output_folder_names = {
    'bai_hotel_reviews_2022_2024': '1_bai_hotel',
    'dusit_thani_mactan_reviews_2022_2024': '2_dusit_thani_mactan',
    'fairfield_by_marriott_cebu_reviews_2022_2024': '3_fairfield_by_marriott_cebu',
    'jpark_island_resort_and_waterpark_reviews_2022_2024': '4_jpark_island_resort_and_waterpark',
    'seda_ayala_center_cebu_reviews_2022_2024': '5_seda_ayala_center_cebu',
    'waterfront_hotel_and_casino_reviews_2022_2024': '6_waterfront_hotel_and_casino'
}

In [32]:
# # Function to load and normalize data
# def load_and_normalize_data():
#     all_reviews = []
#     for directory, (min_score, max_score) in directories.items():           
#         path = os.path.join('raw data', directory)
#         for filename in os.listdir(path):
#             if filename.endswith('.csv'):
#                 df = pd.read_csv(os.path.join(path, filename))
#                 scaler = MinMaxScaler(feature_range=(0, 1))
#                 df['normalized_score'] = scaler.fit_transform(df[['Review Score']])
#                 all_reviews.append(df)
#     return pd.concat(all_reviews, ignore_index=True)

# Function to load, normalize, and label data (utilized for saving data into datasets)
def load_and_normalize_data():
    all_reviews = []
    for directory, (min_score, max_score) in directories.items():
        path = os.path.join(base_input_path, directory)
        for filename in os.listdir(path):
            if filename.endswith('.csv'):
                df = pd.read_csv(os.path.join(path, filename))
                
                # Normalize the 'Review Score' using the specific min and max score range for the dataset
                scaler = MinMaxScaler(feature_range=(0, 1))
                df['normalized_score'] = scaler.fit_transform(df[['Review Score']])
                
                # Add a column for the original hotel directory
                df['hotel_directory'] = df['Hotel Name']
                all_reviews.append(df)
    return pd.concat(all_reviews, ignore_index=True)

# Load and normalize data
df = load_and_normalize_data()

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'base_input_path\\agoda_hotel_reviews'

In [None]:
# Function to lemmatize text
def preprocess_text(text):
    # Step 1: Convert text to lowercase
    text = text.lower()

    # Step 2: Remove punctuation
    # You can translate punctuation to None (removing it)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Step 3: Tokenization
    tokens = nltk.word_tokenize(text)

    # Step 4: Remove stop words
    tokens = [word for word in tokens if word not in stop_words]

    # Step 5: Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Return the processed text as a single string
    return ' '.join(lemmatized_tokens)

# Lemmatize the review content
df['cleaned_content'] = df['Review Content'].apply(preprocess_text)

In [None]:
# Convert normalized scores to binary labels (1 for positive, 0 for negative)
df['label'] = df['normalized_score'].apply(lambda x: 0 if x <= 0.25 else (2 if x >= 0.75 else 1))
print(df['label'])
print(df.head())

print(df[['normalized_score', 'label']].describe())


0        2
1        2
2        2
3        2
4        2
        ..
10237    2
10238    2
10239    1
10240    2
10241    2
Name: label, Length: 10242, dtype: int64
  Review Provider  Review ID  \
0           Agoda  831639368   
1           Agoda  830646554   
2           Agoda  830862563   
3           Agoda  830828036   
4           Agoda  823596890   

                                      Review Content  Review Score  \
0  The hotel's facilities were top notch, from th...           9.6   
1  I enjoyed my stay at the Bai Hotel. The prices...          10.0   
2                        My favorite hotel\r\n \r\n           10.0   
3  This hotel was so accommodating. Good staff, g...          10.0   
4                         Well recommended\r\n \r\n           10.0   

                 Review Time  normalized_score hotel_name  \
0  2024-07-16T09:26:00+07:00              0.95      hotel   
1  2024-07-15T12:05:00+07:00              1.00      hotel   
2  2024-07-15T07:56:00+07:00             

In [None]:
# Count the occurrences of each label
positive_count = (df['label'] == 1).sum()
negative_count = (df['label'] == -1).sum()
neutral_count = (df['label'] == 0).sum()

# Print the counts
print(f'Positive reviews: {positive_count}')
print(f'Negative reviews: {negative_count}')
print(f'Neutral reviews: {neutral_count}')
print(f"Total: {df['label'].count()}")

Positive reviews: 1445
Negative reviews: 0
Neutral reviews: 394
Total: 10242


In [None]:
# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['cleaned_content'])

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(df['cleaned_content'])
padded_sequences = pad_sequences(sequences, maxlen=100)

# Prepare labels
labels = df['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.3, stratify=labels, random_state=42)


Data saved successfully.


## Saving of cleaned and preprocessed data

In [None]:
# Save cleaned data to respective folders based on hotel mapping
for hotel_directory, group_df in df.groupby('hotel_directory'):
    output_folder = os.path.join(base_output_path, output_folder_names[hotel_directory])
    os.makedirs(output_folder, exist_ok=True)  # Create folder if it doesn't exist

    # Save the cleaned DataFrame to CSV
    group_df.to_csv(os.path.join(output_folder, f'cleaned_reviews_{output_folder_names[hotel_directory]}.csv'), index=False)

print("Data saved successfully.")

In [None]:
# Model creation
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),  # Corrected placement
    Dropout(0.5),
    Dense(3, activation='softmax')  # Assuming 3 classes for multi-class classification
])

# Model Compilation
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model Training
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Model Evaluation
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')  # Rounded to 4 decimal places for clarity

# Plot training & validation accuracy and loss values
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy Over Epochs')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(loc='upper left')

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss Over Epochs')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper left')

plt.tight_layout()
plt.show()

Epoch 1/10




[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 45ms/step - accuracy: 0.8049 - loss: 0.6755 - val_accuracy: 0.8347 - val_loss: 0.4538
Epoch 2/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 36ms/step - accuracy: 0.8670 - loss: 0.4048 - val_accuracy: 0.8473 - val_loss: 0.4414
Epoch 3/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 35ms/step - accuracy: 0.8999 - loss: 0.3021 - val_accuracy: 0.8410 - val_loss: 0.4540
Epoch 4/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 36ms/step - accuracy: 0.9222 - loss: 0.2435 - val_accuracy: 0.8459 - val_loss: 0.5508
Epoch 5/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 36ms/step - accuracy: 0.9444 - loss: 0.1717 - val_accuracy: 0.8361 - val_loss: 0.6061
Epoch 6/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 36ms/step - accuracy: 0.9571 - loss: 0.1455 - val_accuracy: 0.8368 - val_loss: 0.6274
Epoch 7/10
[1m 89/180[0m [32m

KeyboardInterrupt: 

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example input reviews
new_reviews = [
    {"review": "The hotel staff was incredibly helpful and the room was clean and spacious. Definitely a positive experience!", "score": 1.0},
    {"review": "The location was good, but the room had a strange smell and the service was just okay.", "score": 0.6},
    {"review": "I had a terrible stay. The room was dirty and the staff was rude. Not worth the price.", "score": 0.2},
    {"review": "Amazing experience! The view from the room was breathtaking and the food at the restaurant was top-notch.", "score": 1.0},
    {"review": "The room was decent, but the Wi-Fi was slow and unreliable. It was an average stay overall.", "score": 0.5},
    {"review": "Terrible service! We waited over an hour for our room to be ready and the staff was not apologetic.", "score": 0.3},
    {"review": "The hotel was in a perfect location, close to all the major attractions. The room was comfortable and well-maintained.", "score": 0.9},
    {"review": "The facilities were outdated, and the air conditioning barely worked. I was disappointed with my stay.", "score": 0.4},
    {"review": "Great value for money! The hotel offered a lot of amenities and the staff was very friendly.", "score": 0.8},
    {"review": "The breakfast was good, but the room was small and the bed was uncomfortable. It was an okay stay.", "score": 0.6}
]

# Define a function to preprocess new input data
def preprocess_new_data(new_data):
    # Apply the same preprocessing function
    cleaned_data = [preprocess_text(entry["review"]) for entry in new_data]
    
    # Convert to sequences using the trained tokenizer
    sequences = tokenizer.texts_to_sequences(cleaned_data)
    
    # Pad the sequences
    padded_sequences = pad_sequences(sequences, maxlen=100)  # Use the same maxlen as your training data

    return padded_sequences

# Prepare reviews for prediction
padded_sequences = preprocess_new_data(new_reviews)

# Make predictions
predictions = model.predict(padded_sequences)

# Interpreting the predictions
for i, prediction in enumerate(predictions):
    predicted_label = np.argmax(prediction)  # Get the index of the highest probability
    predicted_score = prediction[predicted_label]  # The probability of the predicted class
    actual_score = new_reviews[i]["score"]

    # Assign sentiment based on the predicted label
    sentiment_labels = ['Negative', 'Neutral', 'Positive']
    sentiment = sentiment_labels[predicted_label]

    # Print the results
    print(f"Review: {new_reviews[i]['review']}")
    print(f"Actual Score: {actual_score:.2f}")
    print(f"Predicted Label: {sentiment}")
    print(f"Negative Probability: {prediction[0]:.2f}")
    print(f"Neutral Probability: {prediction[1]:.2f}")
    print(f"Positive Probability: {prediction[2]:.2f}")
    print("-" * 50)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Review: The hotel staff was incredibly helpful and the room was clean and spacious. Definitely a positive experience!
Actual Score: 1.00
Predicted Label: Positive
Negative Probability: 0.00
Neutral Probability: 0.05
Positive Probability: 0.95
--------------------------------------------------
Review: The location was good, but the room had a strange smell and the service was just okay.
Actual Score: 0.60
Predicted Label: Neutral
Negative Probability: 0.00
Neutral Probability: 0.99
Positive Probability: 0.00
--------------------------------------------------
Review: I had a terrible stay. The room was dirty and the staff was rude. Not worth the price.
Actual Score: 0.20
Predicted Label: Negative
Negative Probability: 0.95
Neutral Probability: 0.05
Positive Probability: 0.00
--------------------------------------------------
Review: Amazing experience! The view from the room was breathtaking and the food at the resta

In [None]:
from datetime import datetime
import os

# Assuming `model` is your Keras model
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
file_name = f'saved_models/general_sentiment_softmax_{timestamp}.keras'

# Make sure the directory exists
os.makedirs(os.path.dirname(file_name), exist_ok=True)

# Save the model with the unique file name
model.save(file_name)
print(f"Model saved as {file_name}")
model.summary()

Model saved as saved_models/general_sentiment_softmax_20240828_222152.keras
