In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
import matplotlib.pyplot as plt
import nltk
from gensim.models import Word2Vec
from sklearn.preprocessing import MinMaxScaler
nltk.download('stopwords')
from nltk.corpus import stopwords

# Define directories for each dataset
directories = {
    'agoda_hotel_reviews': (1, 5),  # Example score range
    'tripadvisor_hotel_reviews': (1, 5),
    'klook_hotel_reviews': (1, 10),  # Adjust according to the actual score range
    'booking_hotel_reviews': (1, 10)
}

# Function to load and normalize data
def load_and_normalize_data():
    all_reviews = []
    for directory, (min_score, max_score) in directories.items():
        path = os.path.join('raw data', directory)
        for filename in os.listdir(path):
            if filename.endswith('.csv'):
                df = pd.read_csv(os.path.join(path, filename))
                df['normalized_score'] = ((df['Review Score'] - min_score) / (max_score - min_score)) * (1 - 0) + 0
                all_reviews.append(df)
    return pd.concat(all_reviews, ignore_index=True)

# Load and normalize data
df = load_and_normalize_data()

# Convert normalized scores to binary labels (1 for positive, 0 for negative)
df['label'] = df['normalized_score'].apply(lambda x: 1 if x >= 0.5 else 0)

# Remove stop words
stop_words = set(stopwords.words('english'))
df['cleaned_review'] = df['Review Content'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

# Tokenize and create Word2Vec model
tokenized_reviews = df['cleaned_review'].apply(lambda x: x.split())
w2v_model = Word2Vec(tokenized_reviews, vector_size=100, window=5, min_count=1, workers=4)

# Create an embedding matrix
vocab_size = len(w2v_model.wv.key_to_index) + 1
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in w2v_model.wv.key_to_index.items():
    embedding_matrix[i] = w2v_model.wv[word]

# Convert text to sequences
tokenizer = w2v_model.wv.key_to_index
sequences = [[tokenizer[word] for word in review.split() if word in tokenizer] for review in df['cleaned_review']]
padded_sequences = pad_sequences(sequences, maxlen=100)

# Splitting data into training and test sets
split_index = int(0.8 * len(padded_sequences))
X_train, X_test = padded_sequences[:split_index], padded_sequences[split_index:]
y_train, y_test = np.array(df['label'][:split_index]), np.array(df['label'][split_index:])

# Model Creation
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, weights=[embedding_matrix], input_length=100, trainable=False),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(128)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Use 'softmax' for multi-class classification
])

# Model Compilation
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model Training
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Model Evaluation
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

# Plot training & validation accuracy and loss values
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

plt.show()

# Save the model
model.save('sentiment_model.h5')

# Load the trained model for testing
model = load_model('sentiment_model.h5')

def preprocess_review(review, tokenizer, max_len=100):
    stop_words = set(stopwords.words('english'))
    cleaned_review = ' '.join([word for word in review.split() if word.lower() not in stop_words])
    sequence = [tokenizer[word] for word in cleaned_review.split() if word in tokenizer]
    padded_sequence = pad_sequences([sequence], maxlen=max_len)
    return padded_sequence

def predict_sentiment(review, model, tokenizer):
    processed_review = preprocess_review(review, tokenizer)
    prediction = model.predict(processed_review)
    return prediction[0][0]

# Example usage
review = "The hotel was fantastic with excellent service"
score = predict_sentiment(review, model, tokenizer)
print(f'Sentiment score: {score}')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Predator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyError: 'score'