# Preparing the data to be used for model fine tuning for consistent results

In [5]:
import pandas as pd
from textblob import TextBlob
import ast
import glob

# Load the dataset
aspect_path = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\*.csv"
files = glob.glob(aspect_path)

# Initialize an empty list to store dataframes
dataframes = []

# Iterate through each CSV file and load it into a DataFrame
for f in files:
    print(f"Processing file: {f}")
    df_temp = pd.read_csv(f)
    
    # Ensure 'Classification' is string and handle missing values
    df_temp['Classification'] = df_temp['Classification'].fillna('').astype(str)
    
    # Append the dataframe to the list
    dataframes.append(df_temp)

# Concatenate all the DataFrames into one
df = pd.concat(dataframes, ignore_index=True)

# Remove duplicates based on the 'Opinion' column (or a combination of columns if needed)
df = df.drop_duplicates(subset=['Opinion'])

# Define function to get sentiment using TextBlob
def get_sentiment(opinion):
    analysis = TextBlob(opinion)
    # Determine the polarity (-1 is very negative, +1 is very positive)
    if analysis.sentiment.polarity > 0:
        return 'Positive'
    elif analysis.sentiment.polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

# Define function to label aspects with sentiments
def label_aspects(row):
    sentiments = {}
    aspects = row['Classification'].split(':')
    for aspect in aspects:
        sentiment = get_sentiment(row['Opinion'])
        sentiments[aspect] = sentiment
    return sentiments

# Apply the labeling function to each row
df['Aspect_Sentiment'] = df.apply(label_aspects, axis=1)

# Convert 'Aspect_Sentiment' to actual dictionaries if they are strings
df['Aspect_Sentiment'] = df['Aspect_Sentiment'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Manually explode the dictionary into separate rows
rows = []

for i, row in df.iterrows():
    for aspect, sentiment in row['Aspect_Sentiment'].items():
        rows.append({
            'Opinion': row['Opinion'],
            'Aspect': aspect,
            'Sentiment': sentiment
        })

# Create a new DataFrame with the exploded aspects and sentiments
exploded_df = pd.DataFrame(rows)

# Remove duplicates in the exploded DataFrame if needed
exploded_df = exploded_df.drop_duplicates()

# Check the final output
print("Exploded DataFrame:")
print(exploded_df.head())

# Save the labeled dataset
exploded_df.to_csv(r'C:\Users\andyb\Desktop\Coding Files\PointView\datasets\labeled_dataset.csv', index=False)

Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_1.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_11.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_12.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_13.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_14.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_15.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_16.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\processed_batch_17.csv
Processing file: C:\Users\andyb\Desktop\Coding Files\PointView\datasets\a

# BERT

In [6]:
import torch

torch.cuda.is_available()

True

In [7]:
import os
import pandas as pd
import torch
import numpy as np
import random
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch.backends.cudnn as cudnn

# Set random seed for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Ensure deterministic behavior
cudnn.deterministic = True
cudnn.benchmark = False

# Check if GPU is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load pre-trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Move the model to the GPU
model.to(device)

# Set model to evaluation mode to disable dropout
model.eval()

# Define your specific KPIs
specific_kpis = ['food', 'staff', 'comfort', 'facilities', 'value for money']

# Load the labeled dataset (for fine-tuning)
labeled_data_path = r"C:\Users\andyb\Desktop\Coding Files\pointview\datasets\labeled_dataset.csv"
df = pd.read_csv(labeled_data_path)

# Convert the sentiment labels to integers
label_mapping = {'Positive': 1, 'Negative': 0}
df['Sentiment'] = df['Sentiment'].map(label_mapping)

# Identify rows with NaN values after mapping
nan_rows = df[df['Sentiment'].isna()]
print("Rows with NaN values in 'Sentiment':")
print(nan_rows)

# Handle NaN values
df = df.dropna(subset=['Sentiment'])  # Option 1: Drop rows with NaN values

# Ensure the labels are of type int
df['Sentiment'] = df['Sentiment'].astype(int)

# Inspect the label values and their data types
print("Unique values in label column:", df['Sentiment'].unique())
print("Data type of label column:", df['Sentiment'].dtype)

# Tokenize the dataset
train_encodings = tokenizer(df['Opinion'].tolist(), truncation=True, padding=True, max_length=512)
train_labels = df['Sentiment'].tolist()  # These should now be integers

# Create a custom Dataset class
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ReviewDataset(train_encodings, train_labels)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset          # training dataset
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


# Load the fine-tuned model
model = BertForSequenceClassification.from_pretrained('./fine_tuned_model')
tokenizer = BertTokenizer.from_pretrained('./fine_tuned_model')

# Move the model to the GPU
model.to(device)

# Set model to evaluation mode
model.eval()

# Base directory containing the hotel reviews
base_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\hotel_aspect_based_dataset"

# Output directory to save the sentiment results
output_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\sentiment_results"

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Function to extract aspects from review content based on the KPIs
def extract_aspects(review, aspects_list):
    return [aspect for aspect in aspects_list if aspect.lower() in review.lower()]

# Check if the model is on the GPU
print(f"Model is on device: {next(model.parameters()).device}")

# When you're processing inputs:
def predict_sentiment(review, aspect):
    input_text = f"{aspect}: {review}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)  # Move inputs to GPU
    print(f"Inputs are on device: {inputs['input_ids'].device}")  # Check if the inputs are on GPU
    outputs = model(**inputs)
    sentiment = torch.argmax(outputs.logits, dim=1).item()
    return "Positive" if sentiment == 1 else "Negative"

# Loop through each hotel directory and process the combined data
for hotel_dir in os.listdir(base_dir):
    hotel_path = os.path.join(base_dir, hotel_dir)
    
    if os.path.isdir(hotel_path):  # Check if it's a directory
        combined_df = pd.DataFrame()  # Initialize an empty DataFrame to combine all files

        # Combine all CSV files within the hotel directory
        for csv_file in os.listdir(hotel_path):
            if csv_file.endswith('.csv'):
                file_path = os.path.join(hotel_path, csv_file)
                temp_df = pd.read_csv(file_path)
                combined_df = pd.concat([combined_df, temp_df], ignore_index=True)

        # Perform the sentiment analysis on the combined DataFrame
        combined_df['Aspects'] = combined_df['Review Content'].apply(lambda x: extract_aspects(x, specific_kpis))
        combined_df['Aspects'] = combined_df['Aspects'].apply(lambda x: x if x else [])

        combined_df['Sentiment_Results'] = combined_df.apply(
            lambda row: {aspect: predict_sentiment(row['Review Content'], aspect) for aspect in row['Aspects']},
            axis=1
        )

        # Initialize dictionary to track positive/negative counts for each KPI
        aspect_sentiments = {aspect: {'positive': 0, 'negative': 0} for aspect in specific_kpis}

        # Count the positive and negative sentiments for each aspect
        for index, row in combined_df.iterrows():
            for aspect, sentiment in row['Sentiment_Results'].items():
                if sentiment == "Positive":
                    aspect_sentiments[aspect]['positive'] += 1
                else:
                    aspect_sentiments[aspect]['negative'] += 1

        # Calculate sentiment percentages for each aspect
        total_reviews = len(combined_df)
        for aspect, counts in aspect_sentiments.items():
            counts['positive_percent'] = (counts['positive'] / total_reviews) * 100
            counts['negative_percent'] = (counts['negative'] / total_reviews) * 100

        # Create a folder for the hotel in the output directory
        hotel_output_dir = os.path.join(output_dir, hotel_dir)
        if not os.path.exists(hotel_output_dir):
            os.makedirs(hotel_output_dir)

        # Save the sentiment analysis results to a CSV file
        output_file_path = os.path.join(hotel_output_dir, f"{hotel_dir}_sentiment_analysis.csv")
        output_df = pd.DataFrame(aspect_sentiments).T
        output_df.to_csv(output_file_path)

        print(f"Processed {hotel_dir}, results saved to {output_file_path}")

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Rows with NaN values in 'Sentiment':
                                                 Opinion  \
3281   Love the location, the hospitality, the rooms,...   
3282   Love the location, the hospitality, the rooms,...   
3648                                           Good,Good   
3773   denial of my seniors discount ,I would rate th...   
3774   denial of my seniors discount ,I would rate th...   
...                                                  ...   
14014  about the only place/resort in the Philippines...   
14015  about the only place/resort in the Philippines...   
14016  about the only place/resort in the Philippines...   
17103  I adored this hotel, all areas are immaculate....   
17104  I adored this hotel, all areas are immaculate....   

                     Aspect  Sentiment  
3281               Location        NaN  
3282                   Food        NaN  
3648                   Food        NaN  
3773        Value for money        NaN  
3774                  Staff        Na

  0%|          | 0/3114 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 0.6397, 'grad_norm': 12.032791137695312, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}
{'loss': 0.5689, 'grad_norm': 10.14920425415039, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}
{'loss': 0.4376, 'grad_norm': 8.031575202941895, 'learning_rate': 3e-06, 'epoch': 0.03}
{'loss': 0.3353, 'grad_norm': 6.2437543869018555, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.04}
{'loss': 0.2403, 'grad_norm': 2.8670852184295654, 'learning_rate': 5e-06, 'epoch': 0.05}
{'loss': 0.2896, 'grad_norm': 1.6874481439590454, 'learning_rate': 6e-06, 'epoch': 0.06}
{'loss': 0.1388, 'grad_norm': 1.885297179222107, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.07}
{'loss': 0.1778, 'grad_norm': 2.0060534477233887, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.08}
{'loss': 0.2262, 'grad_norm': 1.31308913230896, 'learning_rate': 9e-06, 'epoch': 0.09}
{'loss': 0.1672, 'grad_norm': 1.3743528127670288, 'learning_rate': 1e-05, 'epoch': 0.1}
{'loss': 0.229, 'grad_norm': 2.353

In [8]:
import pandas as pd
import os

# Define the output directory where sentiment results are saved
output_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\sentiment_results"

# Function to load and display sentiment results from all hotels
def load_and_display_sentiment_results(output_dir):
    for hotel_dir in os.listdir(output_dir):
        hotel_path = os.path.join(output_dir, hotel_dir)
        
        if os.path.isdir(hotel_path):  # Check if it's a directory
            for csv_file in os.listdir(hotel_path):
                if csv_file.endswith('_sentiment_analysis.csv'):
                    file_path = os.path.join(hotel_path, csv_file)
                    
                    # Load the CSV file into a DataFrame
                    df = pd.read_csv(file_path, index_col=0)
                    
                    # Display the DataFrame
                    print(f"Sentiment Analysis for {hotel_dir}:")
                    print(df)
                    print("\n" + "="*50 + "\n")

# Call the function to display all sentiment results
load_and_display_sentiment_results(output_dir)


Sentiment Analysis for 1_bai_hotel:
                 positive  negative  positive_percent  negative_percent
food               1127.0      30.0         27.902946          0.742758
staff              1708.0      53.0         42.287695          1.312206
comfort             303.0       7.0          7.501857          0.173310
facilities          180.0       3.0          4.456549          0.074276
value for money      53.0       1.0          1.312206          0.024759


Sentiment Analysis for 2_dusit_thani_mactan:
                 positive  negative  positive_percent  negative_percent
food                554.0      37.0         29.343220          1.959746
staff               905.0      40.0         47.934322          2.118644
comfort             117.0       6.0          6.197034          0.317797
facilities          143.0       6.0          7.574153          0.317797
value for money      12.0       1.0          0.635593          0.052966


Sentiment Analysis for 3_fairfield_by_marriott_cebu

# KPI Experiment

In [9]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# Define your specific KPIs
specific_kpis = ['food', 'staff', 'comfort & facilities', 'value for money']

# Function to extract keywords using TF-IDF
def extract_keywords_tfidf(reviews, top_n=10):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000, min_df=0.01, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(reviews)
    feature_names = tfidf_vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()
    
    top_keywords = []
    for doc in tfidf_scores:
        sorted_indices = doc.argsort()[-top_n:]
        top_keywords.extend([feature_names[i] for i in sorted_indices])
    
    return Counter(top_keywords).most_common(top_n)

# Example usage with a single hotel's reviews
def get_additional_kpis(reviews):
    keywords = extract_keywords_tfidf(reviews, top_n=10)
    additional_kpis = [keyword for keyword, _ in keywords if keyword not in specific_kpis]
    return additional_kpis

# Directory paths
base_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\hotel_aspect_based_dataset"

# Process each hotel's reviews
for hotel_dir in os.listdir(base_dir):
    hotel_path = os.path.join(base_dir, hotel_dir)
    
    if os.path.isdir(hotel_path):
        combined_df = pd.DataFrame()

        # Combine all CSV files within the hotel directory
        for csv_file in os.listdir(hotel_path):
            if csv_file.endswith('.csv'):
                file_path = os.path.join(hotel_path, csv_file)
                temp_df = pd.read_csv(file_path)
                combined_df = pd.concat([combined_df, temp_df], ignore_index=True)

        # Extract additional KPIs
        reviews = combined_df['Review Content'].tolist()
        additional_kpis = get_additional_kpis(reviews)
        
        print(f"Additional KPIs identified for {hotel_dir}: {additional_kpis}")


Additional KPIs identified for 1_bai_hotel: ['experience', 'expectations', 'extra', 'extremely', 'facilities', 'family', 'exceptional', 'worth', 'hotel', 'fantastic']
Additional KPIs identified for 2_dusit_thani_mactan: ['far', 'feel', 'felt', 'filipino', 'zee', 'fantastic', 'fine', 'hotel']
Additional KPIs identified for 3_fairfield_by_marriott_cebu: ['facilities', 'fairfield', 'face', 'extremely', 'extra', 'yes', 'fabulous', 'nice', 'room', 'great']
Additional KPIs identified for 4_jpark_island_resort_and_waterpark: ['facility', 'fact', 'facilities', 'families', 'family', 'young', 'fantastic', 'extra', 'felt', 'far']
Additional KPIs identified for 5_seda_ayala_center_cebu: ['extra', 'facilities', 'experience', 'family', 'fantastic', 'worth', 'far', 'hotel', 'fast', 'location']
Additional KPIs identified for 6_waterfront_hotel_and_casino: ['excellent', 'event', 'exceptional', 'especially', 'expensive', 'experience', 'years', 'extra', 'enjoyed', 'hotel']


In [10]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

# Function to preprocess reviews for LDA
def preprocess_for_lda(reviews):
    stop_words = set(stopwords.words('english'))
    texts = [[word for word in review.lower().split() if word not in stop_words] for review in reviews]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return corpus, dictionary

# Function to extract topics using LDA
def extract_topics_lda(corpus, dictionary, num_topics=5):
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
    topics = lda_model.print_topics(num_words=5)
    topic_keywords = []
    for topic in topics:
        words = topic[1].split(' + ')
        keywords = [word.split('*')[-1].replace('"', '').strip() for word in words]
        topic_keywords.extend(keywords)
    return list(set(topic_keywords))

# Example usage with a single hotel's reviews
def get_additional_kpis_lda(reviews):
    corpus, dictionary = preprocess_for_lda(reviews)
    topics = extract_topics_lda(corpus, dictionary, num_topics=5)
    additional_kpis = [topic for topic in topics if topic not in specific_kpis]
    return additional_kpis

# Process each hotel's reviews
for hotel_dir in os.listdir(base_dir):
    hotel_path = os.path.join(base_dir, hotel_dir)
    
    if os.path.isdir(hotel_path):
        combined_df = pd.DataFrame()

        # Combine all CSV files within the hotel directory
        for csv_file in os.listdir(hotel_path):
            if csv_file.endswith('.csv'):
                file_path = os.path.join(hotel_path, csv_file)
                temp_df = pd.read_csv(file_path)
                combined_df = pd.concat([combined_df, temp_df], ignore_index=True)

        # Extract additional KPIs using LDA
        reviews = combined_df['Review Content'].tolist()
        additional_kpis_lda = get_additional_kpis_lda(reviews)
        
        print(f"Additional KPIs identified using LDA for {hotel_dir}: {additional_kpis_lda}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andyb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Additional KPIs identified using LDA for 1_bai_hotel: ['us', 'bai', 'hotel', 'even', 'customer', 'great', 'birthday', 'really', 'buffet', 'service', 'breakfast', 'room', 'good', 'thank']
Additional KPIs identified using LDA for 2_dusit_thani_mactan: ['us', 'hotel', 'great', '-', 'service', 'pool', 'room', 'good']
Additional KPIs identified using LDA for 3_fairfield_by_marriott_cebu: ['good', 'many', 'hotel', 'incredible', 'great', 'nice', 'bed', 'amazing', 'toilet', 'clean', 'beautiful', 'time', 'breakfast', 'would', 'room', 'rooms']
Additional KPIs identified using LDA for 4_jpark_island_resort_and_waterpark: ['resort', 'hotel', 'great', 'kids', '-', 'place', 'pool', 'room', 'good', 'water', '•']
Additional KPIs identified using LDA for 5_seda_ayala_center_cebu: ['hotel', 'location', 'seda', 'ayala', 'check', '-', 'mall', 'breakfast', 'room', 'stay', 'good']
Additional KPIs identified using LDA for 6_waterfront_hotel_and_casino: ['hotel', 'nice', 'great', 'old', 'check', 'breakfast', 

In [11]:
def combine_kpis(predefined_kpis, additional_kpis):
    return list(set(predefined_kpis + additional_kpis))

# Combine TF-IDF and LDA results with predefined KPIs
for hotel_dir in os.listdir(base_dir):
    hotel_path = os.path.join(base_dir, hotel_dir)
    
    if os.path.isdir(hotel_path):
        combined_df = pd.DataFrame()

        for csv_file in os.listdir(hotel_path):
            if csv_file.endswith('.csv'):
                file_path = os.path.join(hotel_path, csv_file)
                temp_df = pd.read_csv(file_path)
                combined_df = pd.concat([combined_df, temp_df], ignore_index=True)

        reviews = combined_df['Review Content'].tolist()

        # Extract additional KPIs using both TF-IDF and LDA
        additional_kpis_tfidf = get_additional_kpis(reviews)
        additional_kpis_lda = get_additional_kpis_lda(reviews)
        additional_kpis = list(set(additional_kpis_tfidf + additional_kpis_lda))

        # Combine with predefined KPIs
        all_kpis = combine_kpis(specific_kpis, additional_kpis)

        print(f"All KPIs for {hotel_dir}: {all_kpis}")


All KPIs for 1_bai_hotel: ['food', 'customer', 'birthday', 'would', 'room', 'like', 'value for money', 'fantastic', 'comfort & facilities', 'experience', 'really', 'service', 'also', 'worth', 'extra', 'bai', 'staff', 'exceptional', 'extremely', 'cant', 'facilities', 'hotel', 'great', 'even', 'family', 'breakfast', 'us', 'good', 'expectations']
All KPIs for 2_dusit_thani_mactan: ['food', 'fine', 'room', 'stay', 'feel', 'filipino', 'value for money', 'fantastic', 'comfort & facilities', 'view', 'service', 'dusit', 'felt', 'staff', 'far', 'hotel', 'great', 'pool', 'good', 'zee']
All KPIs for 3_fairfield_by_marriott_cebu: ['food', 'nice', 'room', 'like', 'value for money', 'comfort & facilities', 'experience', 'really', 'everything', 'rooms', 'extra', 'fairfield', 'staff', 'fabulous', 'best', 'bed', 'extremely', 'facilities', 'yes', 'hotel', 'great', 'clean', 'face', 'breakfast', 'us', 'good']
All KPIs for 4_jpark_island_resort_and_waterpark: ['food', 'facility', 'families', 'room', 'value