# Preparing the data used for model fine-tuning, for consistent results

In [1]:
import pandas as pd
from textblob import TextBlob
import ast
import glob

# Load the dataset
aspect_path = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\aspect_based_dataset\*.csv"
# glob.glob returns matches in arbitrary, OS-dependent order; sort them so the
# concatenated rows (and the labeled dataset derived from them) come out in
# the same order on every run.
files = sorted(glob.glob(aspect_path))

# Fail early with a readable message; otherwise pd.concat raises a bare
# "No objects to concatenate" ValueError when nothing matches the pattern.
if not files:
    raise FileNotFoundError(f"No CSV files matched: {aspect_path}")

# Concatenate all CSV files into a single DataFrame
dataframes = [pd.read_csv(f) for f in files]
df = pd.concat(dataframes, ignore_index=True)

# Ensure all 'Classification' values are strings and handle missing values
df['Classification'] = df['Classification'].fillna('').astype(str)

def get_sentiment(opinion):
    """Classify the overall sentiment of *opinion* with TextBlob.

    Args:
        opinion: Review text. Anything that is not a non-blank string (for
            example the NaN pandas yields for an empty CSV cell) is treated
            as carrying no sentiment.

    Returns:
        'Positive' when the TextBlob polarity is above 0, 'Negative' when it
        is below 0, and 'Neutral' otherwise.
    """
    # TextBlob rejects non-string input, which would abort the whole
    # DataFrame.apply on the first missing opinion.
    if not isinstance(opinion, str) or not opinion.strip():
        return 'Neutral'

    # Polarity runs from -1 (very negative) to +1 (very positive); read it once
    polarity = TextBlob(opinion).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'

def label_aspects(row):
    """Map every aspect listed for a review to the review's sentiment.

    Args:
        row: Mapping-like row with a 'Classification' string of aspect names
            separated by ':' and an 'Opinion' holding the review text.

    Returns:
        dict: {aspect: sentiment} for each non-empty aspect name. Empty when
        the row lists no aspects.
    """
    # Trim stray whitespace and drop empty names: ''.split(':') yields [''],
    # which would otherwise create a record for a nameless aspect.
    aspects = [name.strip() for name in row['Classification'].split(':')]
    aspects = [name for name in aspects if name]
    if not aspects:
        return {}

    # The sentiment depends only on the opinion text, so compute it once
    # instead of re-running the analysis for every aspect.
    sentiment = get_sentiment(row['Opinion'])
    return {aspect: sentiment for aspect in aspects}

# Build the {aspect: sentiment} mapping for every review row
df['Aspect_Sentiment'] = df.apply(label_aspects, axis=1)

# Preview the column as produced by the labeling step
print("Aspect_Sentiment column before exploding:")
print(df['Aspect_Sentiment'].head())

# Entries stored as strings are parsed back into dictionaries; anything else
# passes through untouched
df['Aspect_Sentiment'] = df['Aspect_Sentiment'].apply(
    lambda entry: entry if not isinstance(entry, str) else ast.literal_eval(entry)
)

# Preview again once the conversion has run
print("Aspect_Sentiment column after conversion:")
print(df['Aspect_Sentiment'].head())

# Flatten the dictionaries: one record per (review, aspect) pair
rows = [
    {'Opinion': opinion, 'Aspect': aspect, 'Sentiment': sentiment}
    for opinion, aspect_map in zip(df['Opinion'], df['Aspect_Sentiment'])
    for aspect, sentiment in aspect_map.items()
]

exploded_df = pd.DataFrame(rows)

# Preview the flattened result
print("Exploded DataFrame:")
print(exploded_df.head())

# Write the labeled dataset to disk without the index column
exploded_df.to_csv(r'C:\Users\andyb\Desktop\Coding Files\PointView\datasets\labeled_dataset.csv', index=False)


Aspect_Sentiment column before exploding:
0        {'Staff': 'Positive', 'Location': 'Positive'}
1    {'Food': 'Positive', 'Comfort & Facilities': '...
2         {'Location': 'Positive', 'Food': 'Positive'}
3    {'Comfort & Facilities': 'Positive', 'Food': '...
4    {'Comfort & Facilities': 'Positive', 'Food': '...
Name: Aspect_Sentiment, dtype: object
Aspect_Sentiment column after conversion:
0        {'Staff': 'Positive', 'Location': 'Positive'}
1    {'Food': 'Positive', 'Comfort & Facilities': '...
2         {'Location': 'Positive', 'Food': 'Positive'}
3    {'Comfort & Facilities': 'Positive', 'Food': '...
4    {'Comfort & Facilities': 'Positive', 'Food': '...
Name: Aspect_Sentiment, dtype: object
Exploded DataFrame:
                                             Opinion                Aspect  \
0  We stayed for a week and could not fault it at...                 Staff   
1  We stayed for a week and could not fault it at...              Location   
2  This resort is beautiful. The roo

# BERT

In [2]:
import torch

# Ask PyTorch whether CUDA is usable; the recorded cell output follows below
torch.cuda.is_available()

True

In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch.backends.cudnn as cudnn

# Set random seed for reproducibility
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to every generator (default 42).
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # The CUDA generators are only seeded when CUDA is available
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed the Python, NumPy and PyTorch generators before the model is created
set_seed(42)

# Ensure deterministic behavior: request deterministic cuDNN kernels and turn
# off cuDNN's benchmark mode
cudnn.deterministic = True
cudnn.benchmark = False

# Check if GPU is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load pre-trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Move the model to the selected device (GPU when available, otherwise CPU)
model.to(device)

# Set model to evaluation mode
# NOTE(review): this model is later passed to Trainer and trained; Trainer
# presumably switches it back to training mode, in which case this call does
# not disable dropout during fine-tuning - confirm.
model.eval()

# Define your specific KPIs
specific_kpis = ['food', 'staff', 'comfort', 'facilities', 'value for money']

# Load the labeled dataset (for fine-tuning)
labeled_data_path = r"C:\Users\andyb\Desktop\Coding Files\pointview\datasets\labeled_dataset.csv"
df = pd.read_csv(labeled_data_path)

# The labeling cell writes this file with the columns 'Opinion', 'Aspect' and
# 'Sentiment', whereas the code below works on 'text' and 'label'. Derive the
# training columns when they are missing so the lookups do not fail with a
# KeyError; a file that already provides 'text'/'label' is used as-is.
if 'label' not in df.columns and 'Sentiment' in df.columns:
    df['label'] = df['Sentiment']
if 'text' not in df.columns and {'Aspect', 'Opinion'} <= set(df.columns):
    # Rows without an aspect or a review cannot form a training example
    df = df.dropna(subset=['Aspect', 'Opinion']).copy()
    # Same "<aspect>: <review>" layout that predict_sentiment feeds the model,
    # so training and inference inputs have the same shape
    df['text'] = df['Aspect'].astype(str) + ': ' + df['Opinion'].astype(str)

# Convert the sentiment labels to integers; any other value (such as the
# 'Neutral' label produced by get_sentiment) maps to NaN
label_mapping = {'Positive': 1, 'Negative': 0}
df['label'] = df['label'].map(label_mapping)

# Identify rows with NaN values after mapping
nan_rows = df[df['label'].isna()]
print("Rows with NaN values in 'label':")
print(nan_rows)

# Handle NaN values by dropping those rows; .copy() makes the result an
# independent frame so the assignment below does not write to a slice
df = df.dropna(subset=['label']).copy()

# Ensure the labels are of type int
df['label'] = df['label'].astype(int)

# Inspect the label values and their data types
print("Unique values in label column:", df['label'].unique())
print("Data type of label column:", df['label'].dtype)

# Tokenize the dataset
train_encodings = tokenizer(df['text'].tolist(), truncation=True, padding=True, max_length=512)
train_labels = df['label'].tolist()  # These should now be integers

# Create a custom Dataset class
class ReviewDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-example sequence of
            values; indexed as ``encodings[name][idx]``.
        labels: Sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is taken from the labels
        return len(self.labels)

    def __getitem__(self, idx):
        # One long tensor per encoding field, plus the label under 'labels'
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx], dtype=torch.long)
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap the tokenized reviews and their integer labels for the Trainer
train_dataset = ReviewDataset(train_encodings, train_labels)

# Fine-tuning configuration; arguments not listed here keep the library defaults
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # logging interval, in steps
)

# Only a training set is passed; no evaluation dataset is configured here
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset          # training dataset
)

# Train the model
trainer.train()

# Save the fine-tuned model and its tokenizer side by side in one directory
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


# Load the fine-tuned model and tokenizer back from the directory they were saved to
model = BertForSequenceClassification.from_pretrained('./fine_tuned_model')
tokenizer = BertTokenizer.from_pretrained('./fine_tuned_model')

# Move the model to the selected device (GPU when available, otherwise CPU)
model.to(device)

# Set model to evaluation mode
model.eval()

# Base directory containing the hotel reviews
base_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\hotel_aspect_based_dataset"

# Output directory to save the sentiment results
output_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\sentiment_results"

# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could go stale before the directory is made
os.makedirs(output_dir, exist_ok=True)

# Function to extract aspects from review content based on the KPIs
def extract_aspects(review, aspects_list):
    """Return the aspects from *aspects_list* that are mentioned in *review*.

    Matching is a case-insensitive substring test, so 'food' also matches
    inside 'seafood'.

    Args:
        review: Review text. Non-string values (for example the NaN pandas
            yields for an empty CSV cell) contain no aspects.
        aspects_list: Aspect names to look for.

    Returns:
        list: The matching aspects, in the order and spelling of
        *aspects_list*.
    """
    if not isinstance(review, str):
        return []
    review_text = review.lower()  # lowercase once rather than once per aspect
    return [aspect for aspect in aspects_list if aspect.lower() in review_text]

# Check if the model is on the GPU by reporting the device of its first parameter
print(f"Model is on device: {next(model.parameters()).device}")

# Classify a single review with respect to a single aspect
def predict_sentiment(review, aspect):
    """Predict the sentiment of *review* towards *aspect* with the fine-tuned model.

    Args:
        review: Review text.
        aspect: Aspect name; it is prepended to the review as
            "<aspect>: <review>" before tokenization.

    Returns:
        "Positive" when the highest-scoring class index is 1, otherwise
        "Negative".
    """
    input_text = f"{aspect}: {review}"
    # Tokenize (truncated to 512 tokens) and move the tensors to the model's device
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    print(f"Inputs are on device: {inputs['input_ids'].device}")  # Check if the inputs are on GPU
    # Inference only: no_grad stops autograd from recording the forward pass,
    # which would otherwise cost memory and time on every call.
    with torch.no_grad():
        outputs = model(**inputs)
    sentiment = torch.argmax(outputs.logits, dim=1).item()
    return "Positive" if sentiment == 1 else "Negative"

# Loop through each hotel directory and process the combined data
for hotel_dir in os.listdir(base_dir):
    hotel_path = os.path.join(base_dir, hotel_dir)

    if not os.path.isdir(hotel_path):  # Only directories hold hotel data
        continue

    # Read every CSV of the hotel first and concatenate once; concatenating
    # inside the loop re-copies all previously loaded rows for each file.
    hotel_frames = [
        pd.read_csv(os.path.join(hotel_path, csv_file))
        for csv_file in os.listdir(hotel_path)
        if csv_file.endswith('.csv')
    ]
    combined_df = pd.concat(hotel_frames, ignore_index=True) if hotel_frames else pd.DataFrame()

    # Without any review the column lookup and the percentage division below
    # would fail, so skip such hotels explicitly.
    if combined_df.empty:
        print(f"Skipping {hotel_dir}: no reviews found")
        continue

    # Empty CSV cells are read as NaN; treat them as empty text so the aspect
    # search receives strings only
    combined_df['Review Content'] = combined_df['Review Content'].fillna('').astype(str)

    # Perform the sentiment analysis on the combined DataFrame
    combined_df['Aspects'] = combined_df['Review Content'].apply(lambda x: extract_aspects(x, specific_kpis))

    # One {aspect: sentiment} dictionary per review, covering only the aspects it mentions
    combined_df['Sentiment_Results'] = [
        {aspect: predict_sentiment(review, aspect) for aspect in aspects}
        for review, aspects in zip(combined_df['Review Content'], combined_df['Aspects'])
    ]

    # Initialize dictionary to track positive/negative counts for each KPI
    aspect_sentiments = {aspect: {'positive': 0, 'negative': 0} for aspect in specific_kpis}

    # Count the positive and negative sentiments for each aspect
    for sentiment_results in combined_df['Sentiment_Results']:
        for aspect, sentiment in sentiment_results.items():
            bucket = 'positive' if sentiment == "Positive" else 'negative'
            aspect_sentiments[aspect][bucket] += 1

    # Calculate sentiment percentages for each aspect, relative to all of the
    # hotel's reviews (not only those mentioning the aspect)
    total_reviews = len(combined_df)
    for counts in aspect_sentiments.values():
        counts['positive_percent'] = (counts['positive'] / total_reviews) * 100
        counts['negative_percent'] = (counts['negative'] / total_reviews) * 100

    # Create a folder for the hotel in the output directory
    hotel_output_dir = os.path.join(output_dir, hotel_dir)
    os.makedirs(hotel_output_dir, exist_ok=True)

    # Save the sentiment analysis results to a CSV file (one row per aspect)
    output_file_path = os.path.join(hotel_output_dir, f"{hotel_dir}_sentiment_analysis.csv")
    output_df = pd.DataFrame(aspect_sentiments).T
    output_df.to_csv(output_file_path)

    print(f"Processed {hotel_dir}, results saved to {output_file_path}")

In [None]:
import pandas as pd
import os

# Define the output directory where sentiment results are saved
# (scanned below for one sub-directory per hotel)
output_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\sentiment_results"

# Function to load and display sentiment results from all hotels
def load_and_display_sentiment_results(output_dir):
    """Print every saved per-hotel sentiment table found under *output_dir*.

    Each sub-directory is treated as one hotel. Every file in it whose name
    ends with '_sentiment_analysis.csv' is read with its first column as the
    index and printed under a heading that names the hotel.

    Args:
        output_dir: Directory containing one sub-directory per hotel.
    """
    suffix = '_sentiment_analysis.csv'
    divider = "\n" + "="*50 + "\n"

    for hotel_name in os.listdir(output_dir):
        hotel_folder = os.path.join(output_dir, hotel_name)
        # Plain files directly under output_dir are not hotel folders
        if not os.path.isdir(hotel_folder):
            continue

        result_files = [name for name in os.listdir(hotel_folder) if name.endswith(suffix)]
        for file_name in result_files:
            results = pd.read_csv(os.path.join(hotel_folder, file_name), index_col=0)

            print(f"Sentiment Analysis for {hotel_name}:")
            print(results)
            print(divider)

# Call the function to print the saved sentiment table of every hotel folder under output_dir
load_and_display_sentiment_results(output_dir)


# KPI Experiment

In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# Define your specific KPIs
# NOTE(review): this rebinds specific_kpis with different entries than the
# BERT cell uses ('comfort & facilities' here vs. separate 'comfort' and
# 'facilities' there). The keyword filters below compare whole extracted
# keywords against these entries; the multi-word entries presumably never
# equal a single extracted keyword - confirm.
specific_kpis = ['food', 'staff', 'comfort & facilities', 'value for money']

# Function to extract keywords using TF-IDF
def extract_keywords_tfidf(reviews, top_n=10):
    """Return the terms that most often rank among a review's top TF-IDF terms.

    For every review the *top_n* highest-scoring terms are collected; the
    result counts in how many reviews each term was collected.

    Args:
        reviews: Iterable of review texts. Non-string entries (for example
            the NaN pandas yields for an empty CSV cell) are ignored.
        top_n: Number of terms taken per review and number of terms returned.

    Returns:
        list: Up to *top_n* ``(term, review_count)`` pairs, most frequent
        first. Empty when there is no text to analyse.
    """
    # The vectorizer can only process strings
    documents = [review for review in reviews if isinstance(review, str)]
    if not documents:
        return []

    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000, min_df=0.01, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    top_keywords = []
    # Densify one review at a time instead of materializing the whole
    # documents-by-terms matrix in memory.
    for row_idx in range(tfidf_matrix.shape[0]):
        doc = tfidf_matrix[row_idx].toarray().ravel()
        sorted_indices = doc.argsort()[-top_n:]
        # A review with fewer than top_n distinct terms would otherwise
        # contribute zero-score terms that it does not even contain.
        top_keywords.extend(feature_names[i] for i in sorted_indices if doc[i] > 0)

    return Counter(top_keywords).most_common(top_n)

# Derive candidate KPIs for one collection of reviews from its TF-IDF keywords
def get_additional_kpis(reviews):
    """Return the top TF-IDF keywords that are not already predefined KPIs.

    Args:
        reviews: Review texts passed on to extract_keywords_tfidf.

    Returns:
        list: Keywords from the top 10 that do not appear in specific_kpis,
        in ranking order.
    """
    ranked_keywords = extract_keywords_tfidf(reviews, top_n=10)
    fresh_kpis = []
    for term, _count in ranked_keywords:
        if term in specific_kpis:
            continue
        fresh_kpis.append(term)
    return fresh_kpis

# Directory paths
base_dir = r"C:\Users\andyb\Desktop\Coding Files\PointView\datasets\hotel_aspect_based_dataset"

# Process each hotel's reviews
for hotel_dir in os.listdir(base_dir):
    hotel_path = os.path.join(base_dir, hotel_dir)

    if not os.path.isdir(hotel_path):  # Only directories hold hotel data
        continue

    # Read every CSV of the hotel first and concatenate once; concatenating
    # inside the loop re-copies all previously loaded rows for each file.
    hotel_frames = [
        pd.read_csv(os.path.join(hotel_path, csv_file))
        for csv_file in os.listdir(hotel_path)
        if csv_file.endswith('.csv')
    ]
    if not hotel_frames:
        print(f"Skipping {hotel_dir}: no CSV files found")
        continue
    combined_df = pd.concat(hotel_frames, ignore_index=True)

    # Empty CSV cells are read as NaN, which the TF-IDF vectorizer cannot
    # process; keep only the reviews that actually contain text
    reviews = combined_df['Review Content'].dropna().astype(str).tolist()
    if not reviews:
        print(f"Skipping {hotel_dir}: no review text found")
        continue

    # Extract additional KPIs
    additional_kpis = get_additional_kpis(reviews)

    print(f"Additional KPIs identified for {hotel_dir}: {additional_kpis}")


In [None]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
import nltk

# Download NLTK's 'stopwords' resource; preprocess_for_lda below reads it via stopwords.words('english')
nltk.download('stopwords')

# Function to preprocess reviews for LDA
def preprocess_for_lda(reviews):
    """Turn raw reviews into the bag-of-words corpus and dictionary used for LDA.

    Each review is lowercased, split on whitespace and stripped of English
    stop words.

    Args:
        reviews: Iterable of review texts. Non-string entries (for example
            the NaN pandas yields for an empty CSV cell) become empty
            documents, so the corpus keeps one entry per input review.

    Returns:
        tuple: ``(corpus, dictionary)`` where *dictionary* is the gensim
        Dictionary built from the token lists and *corpus* holds the
        ``doc2bow`` result of every review.
    """
    stop_words = set(stopwords.words('english'))

    texts = []
    for review in reviews:
        if not isinstance(review, str):
            # Calling .lower() on NaN/None would raise; keep an empty document
            texts.append([])
            continue
        texts.append([word for word in review.lower().split() if word not in stop_words])

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return corpus, dictionary

# Function to extract topics using LDA
def extract_topics_lda(corpus, dictionary, num_topics=5, random_state=42):
    """Fit an LDA model and return the distinct top words of its topics.

    Args:
        corpus: Bag-of-words corpus, as produced by preprocess_for_lda.
        dictionary: Id-to-word mapping that belongs to *corpus*.
        num_topics: Number of topics to fit.
        random_state: Seed passed to the LDA model so that repeated runs on
            the same data give the same topics.

    Returns:
        list: The unique words among the 5 highest-weighted words of every
        topic, in first-seen order. Empty when there is nothing to model.
    """
    # An LDA model cannot be fitted on an empty corpus or vocabulary
    if not corpus or len(dictionary) == 0:
        return []

    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15,
        random_state=random_state,
    )

    # formatted=False yields (topic_id, [(word, weight), ...]) pairs, which
    # avoids re-parsing the '0.012*"word" + ...' display string.
    topics = lda_model.show_topics(num_topics=num_topics, num_words=5, formatted=False)
    topic_keywords = [word for _topic_id, word_weights in topics for word, _weight in word_weights]

    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would return the words in an arbitrary, run-dependent order.
    return list(dict.fromkeys(topic_keywords))

# Derive candidate KPIs for one collection of reviews from its LDA topic words
def get_additional_kpis_lda(reviews):
    """Return the LDA topic words that are not already predefined KPIs.

    Args:
        reviews: Review texts passed on to preprocess_for_lda.

    Returns:
        list: Words from the 5 fitted topics that do not appear in
        specific_kpis.
    """
    bow_corpus, vocabulary = preprocess_for_lda(reviews)
    topic_words = extract_topics_lda(bow_corpus, vocabulary, num_topics=5)
    fresh_kpis = []
    for word in topic_words:
        if word in specific_kpis:
            continue
        fresh_kpis.append(word)
    return fresh_kpis

# Process each hotel's reviews
for hotel_dir in os.listdir(base_dir):
    hotel_path = os.path.join(base_dir, hotel_dir)

    if not os.path.isdir(hotel_path):  # Only directories hold hotel data
        continue

    # Read every CSV of the hotel first and concatenate once; concatenating
    # inside the loop re-copies all previously loaded rows for each file.
    hotel_frames = [
        pd.read_csv(os.path.join(hotel_path, csv_file))
        for csv_file in os.listdir(hotel_path)
        if csv_file.endswith('.csv')
    ]
    if not hotel_frames:
        print(f"Skipping {hotel_dir}: no CSV files found")
        continue
    combined_df = pd.concat(hotel_frames, ignore_index=True)

    # Empty CSV cells are read as NaN and cannot be tokenized; keep only the
    # reviews that actually contain text
    reviews = combined_df['Review Content'].dropna().astype(str).tolist()
    if not reviews:
        print(f"Skipping {hotel_dir}: no review text found")
        continue

    # Extract additional KPIs using LDA
    additional_kpis_lda = get_additional_kpis_lda(reviews)

    print(f"Additional KPIs identified using LDA for {hotel_dir}: {additional_kpis_lda}")


In [None]:
def combine_kpis(predefined_kpis, additional_kpis):
    """Merge two KPI collections into one duplicate-free list.

    Args:
        predefined_kpis: Iterable of the fixed KPI names.
        additional_kpis: Iterable of the discovered KPI names.

    Returns:
        list: Every distinct KPI, predefined ones first, each in the position
        where it first appeared.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would hand the strings back in an arbitrary, run-dependent order.
    return list(dict.fromkeys([*predefined_kpis, *additional_kpis]))

# Combine TF-IDF and LDA results with predefined KPIs
for hotel_dir in os.listdir(base_dir):
    hotel_path = os.path.join(base_dir, hotel_dir)

    if not os.path.isdir(hotel_path):  # Only directories hold hotel data
        continue

    # Read every CSV of the hotel first and concatenate once; concatenating
    # inside the loop re-copies all previously loaded rows for each file.
    hotel_frames = [
        pd.read_csv(os.path.join(hotel_path, csv_file))
        for csv_file in os.listdir(hotel_path)
        if csv_file.endswith('.csv')
    ]
    if not hotel_frames:
        print(f"Skipping {hotel_dir}: no CSV files found")
        continue
    combined_df = pd.concat(hotel_frames, ignore_index=True)

    # Empty CSV cells are read as NaN and cannot be analysed; keep only the
    # reviews that actually contain text
    reviews = combined_df['Review Content'].dropna().astype(str).tolist()
    if not reviews:
        print(f"Skipping {hotel_dir}: no review text found")
        continue

    # Extract additional KPIs using both TF-IDF and LDA
    additional_kpis_tfidf = get_additional_kpis(reviews)
    additional_kpis_lda = get_additional_kpis_lda(reviews)
    # De-duplicate in first-seen order; a set would make the printed order
    # vary from run to run
    additional_kpis = list(dict.fromkeys(additional_kpis_tfidf + additional_kpis_lda))

    # Combine with predefined KPIs
    all_kpis = combine_kpis(specific_kpis, additional_kpis)

    print(f"All KPIs for {hotel_dir}: {all_kpis}")
