# Preparing the data to be used for model fine tuning for consistent results

In [8]:
import pandas as pd
from textblob import TextBlob
import ast
import glob

# Load the dataset
aspect_path = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\*.csv"
files = glob.glob(aspect_path)

# Initialize an empty list to store dataframes
dataframes = []

# Iterate through each CSV file and load it into a DataFrame
for f in files:
    print(f"Processing file: {f}")
    df_temp = pd.read_csv(f)
    
    # Ensure 'Classification' is string and handle missing values
    df_temp['Classification'] = df_temp['Classification'].fillna('').astype(str)
    
    # Append the dataframe to the list
    dataframes.append(df_temp)

# Concatenate all the DataFrames into one
df = pd.concat(dataframes, ignore_index=True)

# Remove duplicates based on the 'Opinion' column (or a combination of columns if needed)
df = df.drop_duplicates(subset=['Opinion'])

# Define function to get sentiment using TextBlob
def get_sentiment(opinion):
    analysis = TextBlob(opinion)
    # Determine the polarity (-1 is very negative, +1 is very positive)
    if analysis.sentiment.polarity > 0:
        return 'Positive'
    elif analysis.sentiment.polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

# Define function to label aspects with sentiments
def label_aspects(row):
    sentiments = {}
    aspects = row['Classification'].split(':')
    for aspect in aspects:
        sentiment = get_sentiment(row['Opinion'])
        sentiments[aspect] = sentiment
    return sentiments

# Apply the labeling function to each row
df['Aspect_Sentiment'] = df.apply(label_aspects, axis=1)

# Convert 'Aspect_Sentiment' to actual dictionaries if they are strings
df['Aspect_Sentiment'] = df['Aspect_Sentiment'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Manually explode the dictionary into separate rows
rows = []

for i, row in df.iterrows():
    for aspect, sentiment in row['Aspect_Sentiment'].items():
        rows.append({
            'Opinion': row['Opinion'],
            'Aspect': aspect,
            'Sentiment': sentiment
        })

# Create a new DataFrame with the exploded aspects and sentiments
exploded_df = pd.DataFrame(rows)

# Remove duplicates in the exploded DataFrame if needed
exploded_df = exploded_df.drop_duplicates()

# Check the final output
print("Exploded DataFrame:")
print(exploded_df.head())

# Save the labeled dataset
exploded_df.to_csv(r'C:\Users\andyb\Desktop\Coding Files\PointView\datasets\labeled_dataset.csv', index=False)

Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_1.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_11.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_12.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_13.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_14.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_15.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_16.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_17.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\a

# BERT

In [9]:
import torch

torch.cuda.is_available()

True

In [10]:
# import os
# import pandas as pd
# import torch
# import numpy as np
# import random
# from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
# import torch.backends.cudnn as cudnn

# # Set random seed for reproducibility
# def set_seed(seed=42):
#     random.seed(seed)
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#     if torch.cuda.is_available():
#         torch.cuda.manual_seed_all(seed)

# set_seed(42)

# # Ensure deterministic behavior
# cudnn.deterministic = True
# cudnn.benchmark = False

# # Check if GPU is available and set the device accordingly
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(f'Using device: {device}')

# # Load pre-trained model and tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# # Move the model to the GPU
# model.to(device)

# # Set model to evaluation mode to disable dropout
# model.eval()

# # Define your specific KPIs
# specific_kpis = ['food', 'staff', 'comfort', 'facilities', 'value for money']

# # Load the labeled dataset (for fine-tuning)
# labeled_data_path = r"C:\Users\andyb\Desktop\Coding Files\pointview\datasets\labeled_dataset.csv"
# df = pd.read_csv(labeled_data_path)

# # Convert the sentiment labels to integers
# label_mapping = {'Positive': 1, 'Negative': 0}
# df['Sentiment'] = df['Sentiment'].map(label_mapping)

# # Identify rows with NaN values after mapping
# nan_rows = df[df['Sentiment'].isna()]
# print("Rows with NaN values in 'Sentiment':")
# print(nan_rows)

# # Handle NaN values
# df = df.dropna(subset=['Sentiment'])  # Option 1: Drop rows with NaN values

# # Ensure the labels are of type int
# df['Sentiment'] = df['Sentiment'].astype(int)

# # Inspect the label values and their data types
# print("Unique values in label column:", df['Sentiment'].unique())
# print("Data type of label column:", df['Sentiment'].dtype)

# # Tokenize the dataset
# train_encodings = tokenizer(df['Opinion'].tolist(), truncation=True, padding=True, max_length=512)
# train_labels = df['Sentiment'].tolist()  # These should now be integers

# # Create a custom Dataset class
# class ReviewDataset(torch.utils.data.Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}
#         item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
#         return item

#     def __len__(self):
#         return len(self.labels)

# train_dataset = ReviewDataset(train_encodings, train_labels)

# training_args = TrainingArguments(
#     output_dir='./results',          # output directory
#     num_train_epochs=3,              # total number of training epochs
#     per_device_train_batch_size=16,  # batch size per device during training
#     warmup_steps=500,                # number of warmup steps for learning rate scheduler
#     weight_decay=0.01,               # strength of weight decay
#     logging_dir='./logs',            # directory for storing logs
#     logging_steps=10,
# )

# trainer = Trainer(
#     model=model,                         # the instantiated 🤗 Transformers model to be trained
#     args=training_args,                  # training arguments, defined above
#     train_dataset=train_dataset          # training dataset
# )

# # Train the model
# trainer.train()

# # Save the fine-tuned model
# model.save_pretrained("./fine_tuned_model")
# tokenizer.save_pretrained("./fine_tuned_model")


# # Load the fine-tuned model
# model = BertForSequenceClassification.from_pretrained('./fine_tuned_model')
# tokenizer = BertTokenizer.from_pretrained('./fine_tuned_model')

# # Move the model to the GPU
# model.to(device)

# # Set model to evaluation mode
# model.eval()

# # Base directory containing the hotel reviews
# base_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\hotel_aspect_based_dataset"

# # Output directory to save the sentiment results
# output_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\sentiment_results"

# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)

# # Function to extract aspects from review content based on the KPIs
# def extract_aspects(review, aspects_list):
#     return [aspect for aspect in aspects_list if aspect.lower() in review.lower()]

# # Check if the model is on the GPU
# print(f"Model is on device: {next(model.parameters()).device}")

# # When you're processing inputs:
# def predict_sentiment(review, aspect):
#     input_text = f"{aspect}: {review}"
#     inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)  # Move inputs to GPU
#     print(f"Inputs are on device: {inputs['input_ids'].device}")  # Check if the inputs are on GPU
#     outputs = model(**inputs)
#     sentiment = torch.argmax(outputs.logits, dim=1).item()
#     return "Positive" if sentiment == 1 else "Negative"

# # Loop through each hotel directory and process the combined data
# for hotel_dir in os.listdir(base_dir):
#     hotel_path = os.path.join(base_dir, hotel_dir)
    
#     if os.path.isdir(hotel_path):  # Check if it's a directory
#         combined_df = pd.DataFrame()  # Initialize an empty DataFrame to combine all files

#         # Combine all CSV files within the hotel directory
#         for csv_file in os.listdir(hotel_path):
#             if csv_file.endswith('.csv'):
#                 file_path = os.path.join(hotel_path, csv_file)
#                 temp_df = pd.read_csv(file_path)
#                 combined_df = pd.concat([combined_df, temp_df], ignore_index=True)

#         # Perform the sentiment analysis on the combined DataFrame
#         combined_df['Aspects'] = combined_df['Review Content'].apply(lambda x: extract_aspects(x, specific_kpis))
#         combined_df['Aspects'] = combined_df['Aspects'].apply(lambda x: x if x else [])

#         combined_df['Sentiment_Results'] = combined_df.apply(
#             lambda row: {aspect: predict_sentiment(row['Review Content'], aspect) for aspect in row['Aspects']},
#             axis=1
#         )

#         # Initialize dictionary to track positive/negative counts for each KPI
#         aspect_sentiments = {aspect: {'positive': 0, 'negative': 0} for aspect in specific_kpis}

#         # Count the positive and negative sentiments for each aspect
#         for index, row in combined_df.iterrows():
#             for aspect, sentiment in row['Sentiment_Results'].items():
#                 if sentiment == "Positive":
#                     aspect_sentiments[aspect]['positive'] += 1
#                 else:
#                     aspect_sentiments[aspect]['negative'] += 1

#         # Calculate sentiment percentages for each aspect
#         total_reviews = len(combined_df)
#         for aspect, counts in aspect_sentiments.items():
#             counts['positive_percent'] = (counts['positive'] / total_reviews) * 100
#             counts['negative_percent'] = (counts['negative'] / total_reviews) * 100

#         # Create a folder for the hotel in the output directory
#         hotel_output_dir = os.path.join(output_dir, hotel_dir)
#         if not os.path.exists(hotel_output_dir):
#             os.makedirs(hotel_output_dir)

#         # Save the sentiment analysis results to a CSV file
#         output_file_path = os.path.join(hotel_output_dir, f"{hotel_dir}_sentiment_analysis.csv")
#         output_df = pd.DataFrame(aspect_sentiments).T
#         output_df.to_csv(output_file_path)

#         print(f"Processed {hotel_dir}, results saved to {output_file_path}")




In [11]:
import os
import pandas as pd
import torch
import numpy as np
import random
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch.backends.cudnn as cudnn
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re
import string

# Download necessary nltk data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Set random seed for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Ensure deterministic behavior
cudnn.deterministic = True
cudnn.benchmark = False

# Check if GPU is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Preprocessing Functions for BERT

def clean_text(text):
    """
    Clean the input text by:
    1. Removing special characters and punctuation.
    2. Converting text to lowercase.
    3. Handling contractions (optional for your use case).
    """
    # Remove special characters, numbers, and punctuation
    text = re.sub(f"[{string.punctuation}]", " ", text)
    
    # Convert to lowercase
    text = text.lower()

    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return ' '.join(lemmatized_tokens)

# Apply to labeled dataset
labeled_data_path = r"C:\Users\andyb\Desktop\Coding Files\pointview\datasets\labeled_dataset.csv"
df = pd.read_csv(labeled_data_path)

# Clean the 'Opinion' column
df['Opinion'] = df['Opinion'].apply(clean_text)

# Convert the sentiment labels to integers
label_mapping = {'Positive': 1, 'Negative': 0}
df['Sentiment'] = df['Sentiment'].map(label_mapping)

# Handle NaN values in the Sentiment column
df = df.dropna(subset=['Sentiment'])

# Ensure labels are of type int
df['Sentiment'] = df['Sentiment'].astype(int)

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenize the dataset
train_encodings = tokenizer(train_df['Opinion'].tolist(), truncation=True, padding=True, max_length=512)
train_labels = train_df['Sentiment'].tolist()

test_encodings = tokenizer(test_df['Opinion'].tolist(), truncation=True, padding=True, max_length=512)
test_labels = test_df['Sentiment'].tolist()

# Create custom Dataset class
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Create datasets for training and testing
train_dataset = ReviewDataset(train_encodings, train_labels)
test_dataset = ReviewDataset(test_encodings, test_labels)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Trainer for training and evaluation
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset            # evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the model
metrics = trainer.evaluate()
print(f"Evaluation metrics: {metrics}")

# Predict and evaluate on the test set
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)

# Classification report for precision, recall, and F1-score
print(classification_report(test_labels, preds, target_names=['Negative', 'Positive']))

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

# Load the fine-tuned model
model = BertForSequenceClassification.from_pretrained('./fine_tuned_model')
tokenizer = BertTokenizer.from_pretrained('./fine_tuned_model')

# Move the model to the GPU
model.to(device)

# Set model to evaluation mode
model.eval()

# Base directory containing the hotel reviews
base_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\hotel_aspect_based_dataset"

# Output directory to save the sentiment results
output_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\sentiment_results"

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Function to extract aspects from review content based on the KPIs
def extract_aspects(review, aspects_list):
    return [aspect for aspect in aspects_list if aspect.lower() in review.lower()]

# Check if the model is on the GPU
print(f"Model is on device: {next(model.parameters()).device}")

# When you're processing inputs:
def predict_sentiment(review, aspect):
    input_text = f"{aspect}: {review}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)  # Move inputs to GPU
    outputs = model(**inputs)
    sentiment = torch.argmax(outputs.logits, dim=1).item()
    return "Positive" if sentiment == 1 else "Negative"

# Loop through each hotel directory and process the combined data
for hotel_dir in os.listdir(base_dir):
    hotel_path = os.path.join(base_dir, hotel_dir)
    
    if os.path.isdir(hotel_path):  # Check if it's a directory
        combined_df = pd.DataFrame()  # Initialize an empty DataFrame to combine all files

        # Combine all CSV files within the hotel directory
        for csv_file in os.listdir(hotel_path):
            if csv_file.endswith('.csv'):
                file_path = os.path.join(hotel_path, csv_file)
                temp_df = pd.read_csv(file_path)
                combined_df = pd.concat([combined_df, temp_df], ignore_index=True)

        # Perform the sentiment analysis on the combined DataFrame
        combined_df['Aspects'] = combined_df['Review Content'].apply(lambda x: extract_aspects(x, specific_kpis))
        combined_df['Aspects'] = combined_df['Aspects'].apply(lambda x: x if x else [])

        combined_df['Sentiment_Results'] = combined_df.apply(
            lambda row: {aspect: predict_sentiment(row['Review Content'], aspect) for aspect in row['Aspects']},
            axis=1
        )

        # Initialize dictionary to track positive/negative counts for each KPI
        aspect_sentiments = {aspect: {'positive': 0, 'negative': 0} for aspect in specific_kpis}

        # Count the positive and negative sentiments for each aspect
        for index, row in combined_df.iterrows():
            for aspect, sentiment in row['Sentiment_Results'].items():
                if sentiment == "Positive":
                    aspect_sentiments[aspect]['positive'] += 1
                else:
                    aspect_sentiments[aspect]['negative'] += 1

        # Calculate sentiment percentages for each aspect
        total_reviews = len(combined_df)
        for aspect, counts in aspect_sentiments.items():
            counts['positive_percent'] = (counts['positive'] / total_reviews) * 100
            counts['negative_percent'] = (counts['negative'] / total_reviews) * 100

        # Create a folder for the hotel in the output directory
        hotel_output_dir = os.path.join(output_dir, hotel_dir)
        if not os.path.exists(hotel_output_dir):
            os.makedirs(hotel_output_dir)

        # Save the sentiment analysis results to a CSV file
        output_file_path = os.path.join(hotel_output_dir, f"{hotel_dir}_sentiment_analysis.csv")
        output_df = pd.DataFrame(aspect_sentiments).T
        output_df.to_csv(output_file_path)

        print(f"Processed {hotel_dir}, results saved to {output_file_path}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andyb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\andyb\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andyb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Using device: cuda


  0%|          | 0/2493 [00:00<?, ?it/s]

{'loss': 0.0411, 'grad_norm': 0.006221645046025515, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}
{'loss': 0.0979, 'grad_norm': 0.00314724282361567, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}
{'loss': 0.0482, 'grad_norm': 4.707668304443359, 'learning_rate': 3e-06, 'epoch': 0.04}
{'loss': 0.0446, 'grad_norm': 0.29841139912605286, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.05}
{'loss': 0.0471, 'grad_norm': 0.02807762287557125, 'learning_rate': 5e-06, 'epoch': 0.06}
{'loss': 0.0027, 'grad_norm': 0.02131219021975994, 'learning_rate': 6e-06, 'epoch': 0.07}
{'loss': 0.0561, 'grad_norm': 0.01366171520203352, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.08}
{'loss': 0.0033, 'grad_norm': 0.00858778040856123, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.1}
{'loss': 0.0006, 'grad_norm': 0.033091384917497635, 'learning_rate': 9e-06, 'epoch': 0.11}
{'loss': 0.0292, 'grad_norm': 0.03683312609791756, 'learning_rate': 1e-05, 'epoch': 0.12}
{'loss': 0.0473, 'g

  0%|          | 0/416 [00:00<?, ?it/s]

Evaluation metrics: {'eval_loss': 0.013381868600845337, 'eval_runtime': 59.9438, 'eval_samples_per_second': 55.402, 'eval_steps_per_second': 6.94, 'epoch': 3.0}


  0%|          | 0/416 [00:00<?, ?it/s]

              precision    recall  f1-score   support

    Negative       0.99      0.97      0.98       144
    Positive       1.00      1.00      1.00      3177

    accuracy                           1.00      3321
   macro avg       0.99      0.99      0.99      3321
weighted avg       1.00      1.00      1.00      3321

Model is on device: cuda:0
Processed 1_bai_hotel, results saved to C:\Users\andyb\Desktop\Coding Files\PointView\datasets\sentiment_results\1_bai_hotel\1_bai_hotel_sentiment_analysis.csv
Processed 2_dusit_thani_mactan, results saved to C:\Users\andyb\Desktop\Coding Files\PointView\datasets\sentiment_results\2_dusit_thani_mactan\2_dusit_thani_mactan_sentiment_analysis.csv
Processed 3_fairfield_by_marriott_cebu, results saved to C:\Users\andyb\Desktop\Coding Files\PointView\datasets\sentiment_results\3_fairfield_by_marriott_cebu\3_fairfield_by_marriott_cebu_sentiment_analysis.csv
Processed 4_jpark_island_resort_and_waterpark, results saved to C:\Users\andyb\Desktop\C

In [12]:
import pandas as pd
import os

# Define the output directory where sentiment results are saved
output_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\sentiment_results"

# Function to load and display sentiment results from all hotels
def load_and_display_sentiment_results(output_dir):
    for hotel_dir in os.listdir(output_dir):
        hotel_path = os.path.join(output_dir, hotel_dir)
        
        if os.path.isdir(hotel_path):  # Check if it's a directory
            for csv_file in os.listdir(hotel_path):
                if csv_file.endswith('_sentiment_analysis.csv'):
                    file_path = os.path.join(hotel_path, csv_file)
                    
                    # Load the CSV file into a DataFrame
                    df = pd.read_csv(file_path, index_col=0)
                    
                    # Display the DataFrame
                    print(f"Sentiment Analysis for {hotel_dir}:")
                    print(df)
                    print("\n" + "="*50 + "\n")

# Call the function to display all sentiment results
load_and_display_sentiment_results(output_dir)


Sentiment Analysis for 1_bai_hotel:
                      positive  negative  positive_percent  negative_percent
food                    1111.0      46.0         27.506809          1.138896
staff                   1679.0      82.0         41.569695          2.030205
comfort & facilities       1.0       0.0          0.024759          0.000000
value for money           53.0       1.0          1.312206          0.024759


Sentiment Analysis for 2_dusit_thani_mactan:
                      positive  negative  positive_percent  negative_percent
food                     536.0      55.0         28.389831          2.913136
staff                    882.0      63.0         46.716102          3.336864
comfort & facilities       0.0       0.0          0.000000          0.000000
value for money           12.0       1.0          0.635593          0.052966


Sentiment Analysis for 3_fairfield_by_marriott_cebu:
                      positive  negative  positive_percent  negative_percent
food           

# KPI Experiment

In [13]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# Define your specific KPIs
specific_kpis = ['food', 'staff', 'comfort & facilities', 'value for money']

# Function to extract keywords using TF-IDF
def extract_keywords_tfidf(reviews, top_n=10):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000, min_df=0.01, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(reviews)
    feature_names = tfidf_vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()
    
    top_keywords = []
    for doc in tfidf_scores:
        sorted_indices = doc.argsort()[-top_n:]
        top_keywords.extend([feature_names[i] for i in sorted_indices])
    
    return Counter(top_keywords).most_common(top_n)

# Example usage with a single hotel's reviews
def get_additional_kpis(reviews):
    keywords = extract_keywords_tfidf(reviews, top_n=10)
    additional_kpis = [keyword for keyword, _ in keywords if keyword not in specific_kpis]
    return additional_kpis

# Directory paths
base_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\hotel_aspect_based_dataset"

# Process each hotel's reviews
for hotel_dir in os.listdir(base_dir):
    hotel_path = os.path.join(base_dir, hotel_dir)
    
    if os.path.isdir(hotel_path):
        combined_df = pd.DataFrame()

        # Combine all CSV files within the hotel directory
        for csv_file in os.listdir(hotel_path):
            if csv_file.endswith('.csv'):
                file_path = os.path.join(hotel_path, csv_file)
                temp_df = pd.read_csv(file_path)
                combined_df = pd.concat([combined_df, temp_df], ignore_index=True)

        # Extract additional KPIs
        reviews = combined_df['Review Content'].tolist()
        additional_kpis = get_additional_kpis(reviews)
        
        print(f"Additional KPIs identified for {hotel_dir}: {additional_kpis}")


Additional KPIs identified for 1_bai_hotel: ['experience', 'expectations', 'extra', 'extremely', 'facilities', 'family', 'exceptional', 'worth', 'hotel', 'fantastic']
Additional KPIs identified for 2_dusit_thani_mactan: ['far', 'feel', 'felt', 'filipino', 'zee', 'fantastic', 'fine', 'hotel']
Additional KPIs identified for 3_fairfield_by_marriott_cebu: ['facilities', 'fairfield', 'face', 'extremely', 'extra', 'yes', 'fabulous', 'nice', 'room', 'great']
Additional KPIs identified for 4_jpark_island_resort_and_waterpark: ['facility', 'fact', 'facilities', 'families', 'family', 'young', 'fantastic', 'extra', 'felt', 'far']
Additional KPIs identified for 5_seda_ayala_center_cebu: ['extra', 'facilities', 'experience', 'family', 'fantastic', 'worth', 'far', 'hotel', 'fast', 'location']
Additional KPIs identified for 6_waterfront_hotel_and_casino: ['excellent', 'event', 'exceptional', 'especially', 'expensive', 'experience', 'years', 'extra', 'enjoyed', 'hotel']


In [14]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

# Function to preprocess reviews for LDA
def preprocess_for_lda(reviews):
    stop_words = set(stopwords.words('english'))
    texts = [[word for word in review.lower().split() if word not in stop_words] for review in reviews]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return corpus, dictionary

# Function to extract topics using LDA
def extract_topics_lda(corpus, dictionary, num_topics=5):
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
    topics = lda_model.print_topics(num_words=5)
    topic_keywords = []
    for topic in topics:
        words = topic[1].split(' + ')
        keywords = [word.split('*')[-1].replace('"', '').strip() for word in words]
        topic_keywords.extend(keywords)
    return list(set(topic_keywords))

# Example usage with a single hotel's reviews
def get_additional_kpis_lda(reviews):
    corpus, dictionary = preprocess_for_lda(reviews)
    topics = extract_topics_lda(corpus, dictionary, num_topics=5)
    additional_kpis = [topic for topic in topics if topic not in specific_kpis]
    return additional_kpis

# Process each hotel's reviews
for hotel_dir in os.listdir(base_dir):
    hotel_path = os.path.join(base_dir, hotel_dir)
    
    if os.path.isdir(hotel_path):
        combined_df = pd.DataFrame()

        # Combine all CSV files within the hotel directory
        for csv_file in os.listdir(hotel_path):
            if csv_file.endswith('.csv'):
                file_path = os.path.join(hotel_path, csv_file)
                temp_df = pd.read_csv(file_path)
                combined_df = pd.concat([combined_df, temp_df], ignore_index=True)

        # Extract additional KPIs using LDA
        reviews = combined_df['Review Content'].tolist()
        additional_kpis_lda = get_additional_kpis_lda(reviews)
        
        print(f"Additional KPIs identified using LDA for {hotel_dir}: {additional_kpis_lda}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andyb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Additional KPIs identified using LDA for 1_bai_hotel: ['birthday', 'hotel', 'breakfast', 'thank', 'us', 'even', 'good', 'really', 'customer', 'service', 'room', 'great', 'bai', 'buffet']
Additional KPIs identified using LDA for 2_dusit_thani_mactan: ['pool', 'hotel', 'us', 'good', '-', 'service', 'room', 'great']
Additional KPIs identified using LDA for 3_fairfield_by_marriott_cebu: ['clean', 'time', 'hotel', 'breakfast', 'incredible', 'bed', 'amazing', 'toilet', 'rooms', 'beautiful', 'nice', 'good', 'room', 'great', 'would', 'many']
Additional KPIs identified using LDA for 4_jpark_island_resort_and_waterpark: ['pool', 'hotel', 'water', '•', 'resort', 'good', 'kids', '-', 'room', 'great', 'place']
Additional KPIs identified using LDA for 5_seda_ayala_center_cebu: ['mall', 'hotel', 'breakfast', 'ayala', 'seda', 'check', 'stay', 'good', '-', 'room', 'location']
Additional KPIs identified using LDA for 6_waterfront_hotel_and_casino: ['hotel', 'breakfast', 'check', 'stay', 'good', 'nice', 

In [15]:
def combine_kpis(predefined_kpis, additional_kpis):
    return list(set(predefined_kpis + additional_kpis))

# Combine TF-IDF and LDA results with predefined KPIs
for hotel_dir in os.listdir(base_dir):
    hotel_path = os.path.join(base_dir, hotel_dir)
    
    if os.path.isdir(hotel_path):
        combined_df = pd.DataFrame()

        for csv_file in os.listdir(hotel_path):
            if csv_file.endswith('.csv'):
                file_path = os.path.join(hotel_path, csv_file)
                temp_df = pd.read_csv(file_path)
                combined_df = pd.concat([combined_df, temp_df], ignore_index=True)

        reviews = combined_df['Review Content'].tolist()

        # Extract additional KPIs using both TF-IDF and LDA
        additional_kpis_tfidf = get_additional_kpis(reviews)
        additional_kpis_lda = get_additional_kpis_lda(reviews)
        additional_kpis = list(set(additional_kpis_tfidf + additional_kpis_lda))

        # Combine with predefined KPIs
        all_kpis = combine_kpis(specific_kpis, additional_kpis)

        print(f"All KPIs for {hotel_dir}: {all_kpis}")


All KPIs for 1_bai_hotel: ['birthday', 'like', 'hotel', 'food', 'even', 'facilities', 'really', 'room', 'great', 'cant', 'exceptional', 'staff', 'value for money', 'breakfast', 'fantastic', 'extra', 'extremely', 'customer', 'would', 'expectations', 'also', 'comfort & facilities', 'us', 'experience', 'good', 'worth', 'service', 'family', 'bai']
All KPIs for 2_dusit_thani_mactan: ['hotel', 'food', 'room', 'great', 'staff', 'filipino', 'value for money', 'stay', 'fantastic', 'pool', 'comfort & facilities', 'feel', 'zee', 'felt', 'view', 'far', 'fine', 'dusit', 'good', 'service']
All KPIs for 3_fairfield_by_marriott_cebu: ['clean', 'like', 'hotel', 'bed', 'yes', 'food', 'facilities', 'really', 'room', 'great', 'fairfield', 'everything', 'staff', 'value for money', 'breakfast', 'extra', 'best', 'extremely', 'rooms', 'nice', 'comfort & facilities', 'us', 'fabulous', 'experience', 'face', 'good']
All KPIs for 4_jpark_island_resort_and_waterpark: ['young', 'hotel', 'food', 'facilities', 'famil