In [1]:
import os
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer

# Download necessary nltk data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Set random seed for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

set_seed(42)

# Preprocessing functions
def clean_text(text):
    # Remove punctuation and convert text to lowercase
    text = re.sub(f"[{string.punctuation}]", " ", text)
    text = text.lower()

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

# Load dataset and clean 'Opinion' column
labeled_data_path = r"C:\Users\andyb\Desktop\Coding Files\pointview\datasets\labeled_dataset.csv"
df = pd.read_csv(labeled_data_path)
df['Opinion'] = df['Opinion'].apply(clean_text)

# Convert sentiment labels to integers
label_mapping = {'Positive': 1, 'Negative': 0}
df['Sentiment'] = df['Sentiment'].map(label_mapping)

# Remove rows with NaN Sentiment values
df = df.dropna(subset=['Sentiment'])

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenize the dataset
MAX_WORDS = 10000
MAX_LEN = 512
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(train_df['Opinion'])

# Convert text to sequences and pad them
train_sequences = tokenizer.texts_to_sequences(train_df['Opinion'])
train_padded = pad_sequences(train_sequences, maxlen=MAX_LEN, padding='post', truncating='post')

test_sequences = tokenizer.texts_to_sequences(test_df['Opinion'])
test_padded = pad_sequences(test_sequences, maxlen=MAX_LEN, padding='post', truncating='post')

# Prepare the labels
train_labels = np.array(train_df['Sentiment'])
test_labels = np.array(test_df['Sentiment'])

# Build BiLSTM model
model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LEN),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Binary classification (Positive/Negative)
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training the model
history = model.fit(train_padded, train_labels, epochs=5, batch_size=32, validation_data=(test_padded, test_labels))

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(test_padded, test_labels)
print(f"Test accuracy: {test_acc}")

# Predict on the test set
predictions = (model.predict(test_padded) > 0.5).astype("int32")

# Classification report
print(classification_report(test_labels, predictions, target_names=['Negative', 'Positive']))

# Save the model
model.save("bilstm_sentiment_model.h5")

# Predict sentiment for new data
def predict_sentiment_bilstm(review):
    review = clean_text(review)
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=MAX_LEN, padding='post', truncating='post')
    sentiment = model.predict(padded_sequence)
    return "Positive" if sentiment > 0.5 else "Negative"

# Example usage
review = "The staff were very helpful and friendly"
print(f"Review: {review}")
print(f"Predicted Sentiment: {predict_sentiment_bilstm(review)}")

# Aspect-Based Sentiment Analysis

# Base directory containing the hotel reviews
base_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\hotel_aspect_based_dataset"
output_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\sentiment_results_biLSTM"

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

def extract_aspects(review, aspects_list):
    return [aspect for aspect in aspects_list if aspect.lower() in review.lower()]

# Process each hotel directory
specific_kpis = ["food", "staff", "comfort & facilities", "value for money"]  # Example aspects
for hotel_dir in os.listdir(base_dir):
    hotel_path = os.path.join(base_dir, hotel_dir)
    
    if os.path.isdir(hotel_path):
        combined_df = pd.DataFrame()
        for csv_file in os.listdir(hotel_path):
            if csv_file.endswith('.csv'):
                file_path = os.path.join(hotel_path, csv_file)
                temp_df = pd.read_csv(file_path)
                combined_df = pd.concat([combined_df, temp_df], ignore_index=True)

        combined_df['Aspects'] = combined_df['Review Content'].apply(lambda x: extract_aspects(x, specific_kpis))
        combined_df['Aspects'] = combined_df['Aspects'].apply(lambda x: x if x else [])

        combined_df['Sentiment_Results'] = combined_df.apply(
            lambda row: {aspect: predict_sentiment_bilstm(row['Review Content']) for aspect in row['Aspects']},
            axis=1
        )

        # Initialize dictionary to track positive/negative counts for each KPI
        aspect_sentiments = {aspect: {'positive': 0, 'negative': 0} for aspect in specific_kpis}

        # Count the positive and negative sentiments for each aspect
        for index, row in combined_df.iterrows():
            for aspect, sentiment in row['Sentiment_Results'].items():
                if sentiment == "Positive":
                    aspect_sentiments[aspect]['positive'] += 1
                else:
                    aspect_sentiments[aspect]['negative'] += 1

        # Calculate sentiment percentages for each aspect
        total_reviews = len(combined_df)
        for aspect, counts in aspect_sentiments.items():
            counts['positive_percent'] = (counts['positive'] / total_reviews) * 100
            counts['negative_percent'] = (counts['negative'] / total_reviews) * 100

        # Save the sentiment analysis results to a CSV file
        hotel_output_dir = os.path.join(output_dir, hotel_dir)
        if not os.path.exists(hotel_output_dir):
            os.makedirs(hotel_output_dir)

        output_file_path = os.path.join(hotel_output_dir, f"{hotel_dir}_sentiment_analysis.csv")
        output_df = pd.DataFrame(aspect_sentiments).T
        output_df.to_csv(output_file_path)

        print(f"Processed {hotel_dir}, results saved to {output_file_path}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andyb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\andyb\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andyb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/5
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 354ms/step - accuracy: 0.9527 - loss: 0.2198 - val_accuracy: 0.9696 - val_loss: 0.0894
Epoch 2/5
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 354ms/step - accuracy: 0.9733 - loss: 0.0870 - val_accuracy: 0.9771 - val_loss: 0.0698
Epoch 3/5
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 362ms/step - accuracy: 0.9866 - loss: 0.0414 - val_accuracy: 0.9726 - val_loss: 0.0732
Epoch 4/5
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 362ms/step - accuracy: 0.9921 - loss: 0.0274 - val_accuracy: 0.9895 - val_loss: 0.0487
Epoch 5/5
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 372ms/step - accuracy: 0.9965 - loss: 0.0118 - val_accuracy: 0.9904 - val_loss: 0.0575
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 110ms/step - accuracy: 0.9889 - loss: 0.0553
Test accuracy: 0.9903643727302551
[1m104/104[0m [32m━━━━━━━━



              precision    recall  f1-score   support

    Negative       0.91      0.86      0.89       144
    Positive       0.99      1.00      0.99      3177

    accuracy                           0.99      3321
   macro avg       0.95      0.93      0.94      3321
weighted avg       0.99      0.99      0.99      3321

Review: The staff were very helpful and friendly
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
Predicted Sentiment: Positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━