In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import re
import matplotlib.pyplot as plt
import warnings

# Silence every warning for the rest of the kernel session: with no category
# or message filter given, the "ignore" action applies to all warnings.
# NOTE(review): this also hides deprecation notices from the libraries imported
# above -- consider narrowing the filter to the specific warning being avoided.
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Function for text preprocessing
def preprocess_text(text):
    """Normalise a raw review value into lowercase, punctuation-free text.

    Missing values (anything ``pd.isna`` reports as missing) become the empty
    string. Every other value is converted with ``str()``, lowercased, and
    stripped of all characters that are neither word characters nor
    whitespace. Whitespace itself is left untouched.
    """
    if not pd.isna(text):
        lowered = str(text).lower()
        # Keep only word characters (letters, digits, underscore) and whitespace.
        return re.sub(r'[^\w\s]', '', lowered)
    return ""

In [3]:

# Read the two review workbooks (booking reviews and AI-generated reviews)
# from the working directory into pandas DataFrames.
booking_workbook = "V1BookingHotel_-_TH_SG.xlsx"
ai_reviews_workbook = "reviewsAIV1.xlsx"

df_booking = pd.read_excel(booking_workbook)
df_ai_reviews = pd.read_excel(ai_reviews_workbook)



In [4]:
# Normalise the free-text column of each dataset in place: "Text" holds the
# booking reviews, "Reviews" holds the AI-generated ones.
for frame, text_column in ((df_booking, "Text"), (df_ai_reviews, "Reviews")):
    frame[text_column] = frame[text_column].apply(preprocess_text)

In [5]:
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Tokenize all AI-generated reviews as a single padded batch. With
# padding=True, shorter reviews are filled with [PAD] tokens up to the length
# of the longest review in the batch.
ai_reviews = df_ai_reviews["Reviews"].tolist()
ai_reviews_tokenized = tokenizer(ai_reviews, padding=True, truncation=True, return_tensors="pt")

# Obtain one BERT embedding per AI-generated review by mean-pooling the token
# vectors of the last hidden layer.
with torch.no_grad():
    ai_reviews_outputs = model(**ai_reviews_tokenized)
    ai_token_vectors = ai_reviews_outputs.last_hidden_state

    # The attention mask is 1 for real tokens and 0 for padding. A plain
    # .mean(dim=1) would average the [PAD] positions as well, so a review's
    # embedding would depend on how long the longest review in the batch is.
    # Weighting by the mask averages over the real tokens only, which also
    # matches how single texts (which carry no padding) are pooled elsewhere
    # in this notebook.
    ai_token_mask = ai_reviews_tokenized["attention_mask"].unsqueeze(-1).to(ai_token_vectors.dtype)
    # clamp guards against division by zero should a row ever have no tokens.
    ai_token_counts = ai_token_mask.sum(dim=1).clamp(min=1)
    ai_reviews_embeddings = (ai_token_vectors * ai_token_mask).sum(dim=1) / ai_token_counts



In [6]:
# Function to calculate contextual similarity scores
def calculate_similarity_scores(text):
    """Score one text against every AI-generated review embedding.

    The text (coerced with ``str()`` when it is not already a string) is
    tokenized with the notebook-level ``tokenizer``, passed through the
    notebook-level ``model`` without gradient tracking, and reduced to a
    single vector by averaging the last hidden layer over the token axis.

    Returns whatever ``cosine_similarity`` produces for that vector against
    ``ai_reviews_embeddings`` -- one row, with one score per AI review.
    """
    query = text if isinstance(text, str) else str(text)

    # Encode the single text as a batch of one.
    encoded = tokenizer(query, padding=True, truncation=True, return_tensors="pt")

    # Forward pass only; no gradients are needed for similarity scoring.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        query_embedding = hidden_states.mean(dim=1)

    return cosine_similarity(query_embedding, ai_reviews_embeddings)

In [7]:

# Score every booking review against the AI-generated reviews, keeping both
# the full score array per review and its mean.
similarity_scores_list = [
    calculate_similarity_scores(review_text) for review_text in df_booking["Text"]
]
average_similarity_scores = [scores.mean() for scores in similarity_scores_list]


In [8]:
# Add similarity scores to df_booking
#df_booking["Similarity Scores"] = similarity_scores_list

In [9]:
# Add average similarity scores to df_booking
# `average_similarity_scores` was filled with one entry per df_booking row, in
# row order, by an earlier cell, so the list lines up positionally with the rows.
df_booking["Average Similarity Score"] = average_similarity_scores

In [10]:
# Write the booking reviews, now carrying the average similarity column, to a
# new workbook; the DataFrame index is not written out.
scored_workbook = "V1BookingHotel_with_AvgSimilarity_Scores.xlsx"
df_booking.to_excel(scored_workbook, index=False)