In [None]:
# Installations: packages this notebook installs explicitly before importing them
# (BERTopic for topic modelling; transformers + torch for the emotion classifier)
!pip install bertopic
!pip install transformers torch

In [None]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import AutoModelForCausalLM, pipeline
from google.colab import drive
from nltk.probability import FreqDist
from bertopic import BERTopic
from tabulate import tabulate
from collections import Counter

In [None]:
# Download NLTK resources: the punkt / punkt_tab tokenizer data and the
# stopwords corpus that stopwords.words('english') reads further down
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

In [None]:
# Mount Google Drive at /content/drive so Colab can read the CSV files stored there.
# force_remount=True re-mounts even if the drive is already mounted in this session.
drive.mount('/content/drive', force_remount=True)

In [None]:
# Locations of the two review exports on the mounted Drive
google_path = '/content/drive/My Drive/Colab Notebooks/google_reviews_synthetic.csv'
trustpilot_path = '/content/drive/My Drive/Colab Notebooks/trustpilot_reviews_synthetic.csv'

# Read each export into its own DataFrame (Google first, then Trustpilot)
google_df, trustpilot_df = (
    pd.read_csv(csv_path) for csv_path in (google_path, trustpilot_path)
)

In [None]:
# Inspect the column names of both frames; the cells below rely on
# 'Comment' / "Club's Name" (Google) and 'Review Content' / 'Location Name' (Trustpilot)
print(google_df.columns)
print(trustpilot_df.columns)

In [None]:
# Remove rows with missing review text.
# The text lives in 'Comment' for Google and in 'Review Content' for Trustpilot;
# both names are rebound to the filtered frames, so re-running this cell is harmless.
google_df = google_df.dropna(subset=['Comment'])
trustpilot_df = trustpilot_df.dropna(subset=['Review Content'])

In [None]:
# Distinct locations per source: Google stores them in "Club's Name",
# Trustpilot in "Location Name"
num_unique_google_locations = google_df["Club's Name"].nunique()
num_unique_trustpilot_locations = trustpilot_df["Location Name"].nunique()

print(f"Number of unique locations in the Google data set: {num_unique_google_locations}")
print(f"Number of unique locations in the Trustpilot data set: {num_unique_trustpilot_locations}")

In [None]:
# Normalise the names (trim whitespace, lowercase) so formatting differences
# between the two platforms do not hide a match
google_names_std = google_df["Club's Name"].str.strip().str.lower()
trustpilot_names_std = trustpilot_df["Location Name"].str.strip().str.lower()

google_locations = set(google_names_std.unique())
trustpilot_locations = set(trustpilot_names_std.unique())

# Locations that appear on both platforms
common_locations = google_locations & trustpilot_locations

# Report how many there are
print(f"Number of common locations between Google and Trustpilot: {len(common_locations)}")

In [None]:
# Domain-specific filler words to drop in addition to the NLTK list
custom_stopwords = {'get', 'one', 'always', 'also', 'dont', 'even', 'like', 'use', 'would', 'im', 'go', 'ive', 'puregym', 'pure', 'really', 'need'}

# Final stopword set = NLTK English stopwords + the custom additions
stop_words = set(stopwords.words('english')) | custom_stopwords

def preprocess_text(text):
    """Turn one raw review string into a list of cleaned tokens.

    The text is lower-cased, runs of digits and every character in
    string.punctuation are deleted, and the remainder is split with
    word_tokenize. Tokens found in the module-level stop_words set, and
    tokens of length 1, are discarded.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = without_digits.translate(punctuation_table)

    kept = []
    for token in word_tokenize(stripped):
        if len(token) > 1 and token not in stop_words:
            kept.append(token)
    return kept

In [None]:
# Tokenise every review; each row of 'clean_tokens' holds that review's token list
google_df['clean_tokens'] = google_df['Comment'].map(preprocess_text)
trustpilot_df['clean_tokens'] = trustpilot_df['Review Content'].map(preprocess_text)

In [None]:
# Collect every token of every review into one flat list per dataset
google_words = []
for review_tokens in google_df['clean_tokens']:
    google_words.extend(review_tokens)

trustpilot_words = []
for review_tokens in trustpilot_df['clean_tokens']:
    trustpilot_words.extend(review_tokens)

# Token counts per dataset
google_freq = FreqDist(google_words)
trustpilot_freq = FreqDist(trustpilot_words)

# Ten most frequent (word, count) pairs per dataset
google_top10 = google_freq.most_common(10)
trustpilot_top10 = trustpilot_freq.most_common(10)

In [None]:
# One bar chart per source (Google first, then Trustpilot) of its ten most frequent words
for chart_title, top_pairs, chart_palette in (
    ('Top 10 Words in Google Reviews', google_top10, 'Blues_d'),
    ('Top 10 Words in Trustpilot Reviews', trustpilot_top10, 'Greens_d'),
):
    plt.figure(figsize=(8,4))
    sns.barplot(
        x=[pair_word for pair_word, _pair_count in top_pairs],
        y=[pair_count for _pair_word, pair_count in top_pairs],
        palette=chart_palette,
    )
    plt.title(chart_title)
    plt.xlabel('Word')
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# WordCloud expects one string, so join each dataset's tokens with spaces
google_text = ' '.join(google_words)
trustpilot_text = ' '.join(trustpilot_words)

# Render one cloud per source (Google first, then Trustpilot)
for cloud_title, cloud_text in (
    ('Google Reviews Word Cloud', google_text),
    ('Trustpilot Reviews Word Cloud', trustpilot_text),
):
    plt.figure(figsize=(8,6))
    wc = WordCloud(width=800, height=400, background_color='white').generate(cloud_text)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(cloud_title)
    plt.show()

In [None]:
# Persist the cleaned frames (including the 'clean_tokens' column) to the
# current working directory; index=False leaves the row index out of the files
google_df.to_csv('Google_12_months_cleaned.csv', index=False)
trustpilot_df.to_csv('Trustpilot_12_months_cleaned.csv', index=False)

In [None]:
# A review counts as negative when its star rating is below 3.
# .copy() detaches each subset so columns can be added to it later.
google_is_negative = google_df['Overall Score'] < 3
google_neg = google_df.loc[google_is_negative].copy()

trustpilot_is_negative = trustpilot_df['Review Stars'] < 3
trustpilot_neg = trustpilot_df.loc[trustpilot_is_negative].copy()

In [None]:
# How many reviews are negative, in absolute terms ...
num_google_neg, num_trustpilot_neg = len(google_neg), len(trustpilot_neg)

# ... and as a share of all (non-empty) reviews of that source
percent_google_neg = (num_google_neg / len(google_df)) * 100
percent_trustpilot_neg = (num_trustpilot_neg / len(trustpilot_df)) * 100

print(f"Number of negative Google reviews: {num_google_neg} ({percent_google_neg:.2f}%)")
print(f"Number of negative Trustpilot reviews: {num_trustpilot_neg} ({percent_trustpilot_neg:.2f}%)")

In [None]:
# Collect the tokens of all negative reviews into one flat list per dataset
google_neg_words = []
for review_tokens in google_neg['clean_tokens']:
    google_neg_words.extend(review_tokens)

trustpilot_neg_words = []
for review_tokens in trustpilot_neg['clean_tokens']:
    trustpilot_neg_words.extend(review_tokens)

# Token counts within the negative reviews
google_neg_freq = FreqDist(google_neg_words)
trustpilot_neg_freq = FreqDist(trustpilot_neg_words)

# Ten most frequent (word, count) pairs
google_neg_top10 = google_neg_freq.most_common(10)
trustpilot_neg_top10 = trustpilot_neg_freq.most_common(10)

In [None]:
# One bar chart per source (Google, then Trustpilot) for the negative reviews only
for chart_title, top_pairs, chart_palette in (
    ('Top 10 Words in Negative Google Reviews', google_neg_top10, 'Reds_d'),
    ('Top 10 Words in Negative Trustpilot Reviews', trustpilot_neg_top10, 'Oranges_d'),
):
    plt.figure(figsize=(8,4))
    sns.barplot(
        x=[pair_word for pair_word, _pair_count in top_pairs],
        y=[pair_count for _pair_word, pair_count in top_pairs],
        palette=chart_palette,
    )
    plt.title(chart_title)
    plt.xlabel('Word')
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# WordCloud expects one string, so join the negative-review tokens with spaces
google_neg_text = ' '.join(google_neg_words)
trustpilot_neg_text = ' '.join(trustpilot_neg_words)

# Render one cloud per source, each with its own colormap
for cloud_title, cloud_text, cloud_colormap in (
    ('Negative Google Reviews Word Cloud', google_neg_text, 'Reds'),
    ('Negative Trustpilot Reviews Word Cloud', trustpilot_neg_text, 'Oranges'),
):
    plt.figure(figsize=(8,6))
    wc = WordCloud(width=800, height=400, background_color='white', colormap=cloud_colormap).generate(cloud_text)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(cloud_title)
    plt.show()

In [None]:
# Restrict the negative reviews to locations that have negative reviews on BOTH platforms

# Column holding the location name in each source
google_location_col = "Club's Name"
trustpilot_location_col = "Location Name"

# Add a trimmed, lower-cased copy of the location name to each negative-review frame
google_neg['location_std'] = google_neg[google_location_col].str.strip().str.lower()
trustpilot_neg['location_std'] = trustpilot_neg[trustpilot_location_col].str.strip().str.lower()

# Standardised names present in both negative subsets
google_neg_locations = set(google_neg['location_std'].unique())
trustpilot_neg_locations = set(trustpilot_neg['location_std'].unique())
common_locations = google_neg_locations.intersection(trustpilot_neg_locations)
print(f"Number of common locations: {len(common_locations)}")

# Keep only the rows whose standardised location is shared
google_in_common = google_neg['location_std'].isin(common_locations)
trustpilot_in_common = trustpilot_neg['location_std'].isin(common_locations)
google_neg_common = google_neg[google_in_common]
trustpilot_neg_common = trustpilot_neg[trustpilot_in_common]

In [None]:
# Unify the review-text column name and tag every row with its platform.
# rename()/assign() return new DataFrames, so the boolean-mask slices that
# google_neg_common / trustpilot_neg_common start out as are never written to
# in place (assigning a column on such a slice raises SettingWithCopyWarning).
google_neg_common = (
    google_neg_common
    .rename(columns={'Comment': 'Review'})
    .assign(Source='Google')
)
trustpilot_neg_common = (
    trustpilot_neg_common
    .rename(columns={'Review Content': 'Review'})
    .assign(Source='Trustpilot')
)

# Stack both sources into one frame that shares a three-column schema
shared_columns = ['location_std', 'Review', 'Source']
merged_neg_reviews = pd.concat(
    [google_neg_common[shared_columns], trustpilot_neg_common[shared_columns]],
    ignore_index=True
)

In [None]:
# Save the merged negative reviews (location_std, Review, Source) for later use;
# the file is written to the current working directory without the row index
merged_neg_reviews.to_csv('merged_neg_reviews.csv', index=False)

In [None]:
# Save the merged data for later use.
# NOTE(review): this repeats the save in the previous cell and rewrites the same file.
merged_neg_reviews.to_csv('merged_neg_reviews.csv', index=False)
Preprocess this data set for BERTopic.
# From here on only the plain NLTK English stopwords are used
# (this rebinding drops the custom additions made earlier in the notebook)
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Clean one review and return it as a single space-joined string.

    Same cleaning steps as the token-list version defined earlier (lowercase,
    delete digits and string.punctuation, word_tokenize, drop stop_words and
    one-character tokens), but the tokens are joined back into one string.
    This definition replaces the earlier one for the rest of the notebook.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)  # digits out
    stripped = without_digits.translate(str.maketrans('', '', string.punctuation))  # punctuation out
    kept = []
    for token in word_tokenize(stripped):
        if len(token) > 1 and token not in stop_words:
            kept.append(token)
    return ' '.join(kept)

# Clean every merged review; astype(str) guards against non-string cells
merged_neg_reviews['cleaned_review'] = merged_neg_reviews['Review'].astype(str).map(preprocess_text)

In [None]:
# Plain Python list of the cleaned review strings, the input format BERTopic is given below
all_neg_reviews_common_cleaned = list(merged_neg_reviews['cleaned_review'])

In [None]:
# Sanity check: print the first 10 cleaned reviews to inspect the preprocessing result
print(all_neg_reviews_common_cleaned[:10])  # Printing the first 10 elements to inspect


In [None]:
# Remove empty or whitespace-only reviews before modelling
all_neg_reviews_common_cleaned = [
    cleaned for cleaned in all_neg_reviews_common_cleaned if cleaned.strip()
]

# Create and fit the BERTopic model; `topics` holds one topic id per review
topic_model = BERTopic(language="english", verbose=True)
topics, probs = topic_model.fit_transform(all_neg_reviews_common_cleaned)

In [None]:
# Topic overview as a DataFrame, without the outlier topic (-1, "no topic")
topic_info = topic_model.get_topic_info().loc[lambda frame: frame['Topic'] != -1]

# Show the first N rows of the overview
N = 10  # or however many you want to display
print("Top topics and their document frequencies:")
print(topic_info[['Topic', 'Count', 'Name']].head(N))

In [None]:
# First two topics of the overview, again excluding -1 (the "outlier" / "no topic" bucket)
top_topics = topic_info.loc[topic_info['Topic'] != -1].head(2)

# Print up to ten representative words for each of them
for topic_num in top_topics['Topic']:
    top_words = [word_weight[0] for word_weight in topic_model.get_topic(topic_num)]
    print(f"Top words for topic {topic_num}: {', '.join(top_words[:10])}")

In [None]:
# Build the topic-overview figure for the fitted model and render it explicitly
fig = topic_model.visualize_topics()
fig.show()

In [None]:
# Bar charts of the top words for 5 topics; rendered because it is the cell's last expression
topic_model.visualize_barchart(top_n_topics=5)

In [None]:
# Topic heatmap for the fitted model; rendered because it is the cell's last expression
topic_model.visualize_heatmap()

In [None]:
# For the first 10 topics of the overview: print representative words and two example reviews
for topic_id in topic_info['Topic'].head(10):
    if topic_id != -1:  # skip the outlier/noise topic if it is present
        print(f"\nTopic {topic_id}:")
        words = topic_model.get_topic(topic_id)  # list of (word, weight) pairs
        print("Top words:", ', '.join([pair[0] for pair in words[:8]]))
        # Up to two documents that were assigned to this topic
        docs = [
            doc
            for doc, assigned_topic in zip(all_neg_reviews_common_cleaned, topics)
            if assigned_topic == topic_id
        ][:2]
        print("Example reviews:", docs)

# **4. Location Analysis**

In [None]:
# Location Analysis
# Top 20 locations with the most negative reviews, computed separately per source.

# Google: negative reviews per club, largest first
google_neg_location_counts = (
    google_neg
    .groupby("Club's Name")
    .size()
    .sort_values(ascending=False)
)
google_top20_locs = google_neg_location_counts.iloc[:20]
print("Top 20 Google locations with most negative reviews:\n", google_top20_locs)

In [None]:
# Trustpilot: negative reviews per location, largest first
trustpilot_neg_location_counts = (
    trustpilot_neg
    .groupby("Location Name")
    .size()
    .sort_values(ascending=False)
)
trustpilot_top20_locs = trustpilot_neg_location_counts.iloc[:20]
print("Top 20 Trustpilot locations with most negative reviews:\n", trustpilot_top20_locs)

In [None]:
# Location names in each source's top 20 (raw names, taken from the index)
google_top20_set = set(google_top20_locs.index)
trustpilot_top20_set = set(trustpilot_top20_locs.index)

# Names that appear in both top-20 lists
common_locations = google_top20_set & trustpilot_top20_set

# Overlap size, and its share of the smaller list (usually 20)
num_common = len(common_locations)
total_possible = min(len(google_top20_set), len(trustpilot_top20_set))
percentage_common = (num_common / total_possible) * 100

# Report
print(f"Common locations in both data sets ({num_common} out of {total_possible}, {percentage_common:.1f}%):")
for loc in common_locations:
    print(f"- {loc}")

In [None]:
# Merge the 2 data sets using Location Name and Club's Name.

# Now, list out the following:
# • Locations
# • Number of Trustpilot reviews for this location
# • Number of Google reviews for this location
# • Total number of reviews for this location (sum of Google reviews and Trustpilot reviews)
# Sort based on the total number of reviews.

# Negative-review counts per location over the FULL negative subsets
google_neg_counts = google_neg.groupby("Club's Name").size()
trustpilot_neg_counts = trustpilot_neg.groupby("Location Name").size()

# Top 20 locations of each source (same values as computed in the cells above)
google_top20_locs = google_neg_counts.sort_values(ascending=False).head(20)
trustpilot_top20_locs = trustpilot_neg_counts.sort_values(ascending=False).head(20)

# Step 1: the table covers every location that is in either top-20 list
candidate_locations = google_top20_locs.index.union(trustpilot_top20_locs.index)

# Step 2: look each candidate up in the full counts of BOTH sources.
# Merging only the two top-20 tables would report 0 for a location that is
# outside the other source's top 20 even when it does have reviews there,
# which understates its total and distorts the ranking.
# fill_value=0 now means "no negative reviews in that source at all".
merged_reviews = pd.DataFrame({
    "Location Name": candidate_locations.to_numpy(),
    "Google Reviews": google_neg_counts.reindex(candidate_locations, fill_value=0).to_numpy(),
    "Trustpilot Reviews": trustpilot_neg_counts.reindex(candidate_locations, fill_value=0).to_numpy(),
})

# Step 3: total reviews = Google + Trustpilot
merged_reviews["Total Reviews"] = merged_reviews["Google Reviews"] + merged_reviews["Trustpilot Reviews"]

# Step 4: sort by total reviews (descending)
merged_reviews_sorted = merged_reviews.sort_values(by="Total Reviews", ascending=False)

# Step 5: display the table in a grid layout using tabulate
table_columns = ['Location Name', 'Trustpilot Reviews', 'Google Reviews', 'Total Reviews']
formatted_table = tabulate(
    merged_reviews_sorted[table_columns].values,
    headers=table_columns,
    tablefmt='grid'
)

# Print the formatted table
print(formatted_table)

In [None]:
# Save the merged and sorted location table to the current working directory (row index omitted)
merged_reviews_sorted.to_csv("top_merged_location_reviews.csv", index=False)

In [None]:
# The table is already sorted by total reviews, so its first 30 rows are the top 30
top30_locations = merged_reviews_sorted.iloc[:30]

# List the location names only
print("Top 30 Locations Based on Total Reviews:\n")
for location in top30_locations['Location Name'].tolist():
    print(location)

In [None]:
# Inspect which columns the top-30 table carries before it is used for filtering
print(top30_locations.columns)

In [None]:
# Filtering the negative reviews from both Google and Trustpilot datasets for top 30 locations

def normalize_location(name):
    """Return a comparison-friendly form of a location name.

    Missing values (anything pd.isna reports as missing) become the empty
    string; every other value is converted to str, stripped of surrounding
    whitespace and lower-cased.
    """
    is_missing = pd.isna(name)
    return "" if is_missing else str(name).strip().lower()

# Add the normalised location name to the top-30 table.
# top30_locations is a slice of merged_reviews_sorted, and assigning a column
# to a slice in place raises pandas' SettingWithCopyWarning; assign() returns
# a new DataFrame instead, so the name is simply rebound.
top30_locations = top30_locations.assign(
    location_std=top30_locations['Location Name'].apply(normalize_location)
)

# Normalize location names in google_neg and trustpilot_neg
# (both were created with .copy(), so in-place column assignment is safe here)
google_neg['location_std'] = google_neg["Club's Name"].apply(normalize_location)
trustpilot_neg['location_std'] = trustpilot_neg["Location Name"].apply(normalize_location)

# Get list of normalized top 30 locations
top30_locs_std = top30_locations['location_std'].unique().tolist()

# Filter negative reviews for top 30 locations using normalized column
google_neg_top30 = google_neg[google_neg['location_std'].isin(top30_locs_std)].copy()
trustpilot_neg_top30 = trustpilot_neg[trustpilot_neg['location_std'].isin(top30_locs_std)].copy()

# Rename columns for consistency
google_neg_top30 = google_neg_top30.rename(columns={"Club's Name": "Location", "Comment": "Review"})
trustpilot_neg_top30 = trustpilot_neg_top30.rename(columns={"Location Name": "Location", "Review Content": "Review"})

# Add source columns
google_neg_top30['Source'] = 'Google'
trustpilot_neg_top30['Source'] = 'Trustpilot'

# Select only needed columns
google_subset = google_neg_top30[['Location', 'Review', 'Source']]
trustpilot_subset = trustpilot_neg_top30[['Location', 'Review', 'Source']]

# Combine datasets
combined_neg_reviews = pd.concat([google_subset, trustpilot_subset], ignore_index=True)

print(f"Combined negative reviews count: {combined_neg_reviews.shape[0]}")
print(combined_neg_reviews.head())

In [None]:
# Number of combined negative reviews contributed by each source ('Google' / 'Trustpilot')
print(combined_neg_reviews['Source'].value_counts())

In [None]:
# For the top 30 locations, redo the word frequency and word cloud. Comment on the results, and highlight
# if the results are different from the first run.

# Combine all reviews into a single lower-cased string
all_reviews = " ".join(combined_neg_reviews['Review'].dropna().astype(str)).lower()

# Remove punctuation and digits (everything except a-z and whitespace)
all_reviews_clean = re.sub(r'[^a-z\s]', '', all_reviews)

# Define the stopwords (from NLTK)
stop_words = set(stopwords.words('english'))

# Custom stopwords to remove manually.
# This is the same list the first word-frequency run used, including 'really'
# and 'need'; with a shorter list, those two words could enter the top 10 here
# purely because of the different filter, which would make the requested
# comparison with the first run misleading.
custom_stopwords = {'get', 'one', 'always', 'also', 'dont', 'even', 'like', 'use', 'would', 'im', 'go', 'ive', 'puregym', 'pure', 'really', 'need'}

# Stopword set used for this run: NLTK + custom
combined_stopwords = stop_words.union(custom_stopwords)

# Tokenize on whitespace and remove stopwords
tokens = [word for word in all_reviews_clean.split() if word not in combined_stopwords]

# Count word frequencies
word_freq = Counter(tokens)

# Display the top 10 words
top_10_words = word_freq.most_common(10)
print("Top 10 most common words in negative reviews (top 30 locations):")
for word, freq in top_10_words:
    print(f"{word}: {freq}")

In [None]:
# Split the (word, count) pairs of top_10_words into two parallel lists
words = [pair_word for pair_word, _pair_count in top_10_words]
freqs = [pair_count for _pair_word, pair_count in top_10_words]

# Bar chart of the ten words, with rotated tick labels so they stay readable
plt.figure(figsize=(12,6))
sns.barplot(x=words, y=freqs, palette='Purples_d')
plt.title('Top 10 Words in Negative Reviews (Top 30 Locations)')
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Word cloud built directly from the ten (word, count) pairs rather than from raw text
top_10_dict = {pair_word: pair_count for pair_word, pair_count in top_10_words}
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate_from_frequencies(top_10_dict)

# Render the cloud without axes
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Top 10 Words in Negative Reviews')
plt.show()

In [None]:
# Redo Topic Modelling with BERTopic for top 30

# Cleaned reviews, one string per review
cleaned_reviews = []

# Define the stopwords (from NLTK)
stop_words = set(stopwords.words('english'))

# Custom stopwords you want to remove manually
custom_stopwords = {'get', 'one', 'always', 'also', 'dont', 'even', 'like', 'use', 'would', 'im', 'go', 'ive', 'puregym', 'pure'}

# Update stopwords list to include custom stopwords
combined_stopwords = stop_words.union(custom_stopwords)

# Iterate over each review and clean it
for review in combined_neg_reviews['Review'].dropna():
    # Convert to lowercase and remove non-alphabetic characters;
    # str() guards against a non-string cell, which has no .lower()
    review_clean = re.sub(r'[^a-z\s]', '', str(review).lower())

    # Tokenize and remove stopwords
    tokens = [word for word in review_clean.split() if word not in combined_stopwords]

    # Keep the review only if something is left after cleaning: the other
    # BERTopic runs in this notebook drop empty documents before fitting, and
    # an empty string carries no content for the model to cluster on
    cleaned_review = ' '.join(tokens)
    if cleaned_review:
        cleaned_reviews.append(cleaned_review)

# Initialize the BERTopic model
topic_model_neg_reviews = BERTopic(language="english", verbose=True)

# Fit the model to the cleaned reviews and get topics and probabilities
topics_neg_reviews, probs_neg_reviews = topic_model_neg_reviews.fit_transform(cleaned_reviews)

In [None]:
# Topic overview of the top-30-location model, without the outlier topic (-1, "no topic").
# Note that this rebinds topic_info, replacing the overview of the earlier model.
topic_info = topic_model_neg_reviews.get_topic_info().loc[lambda frame: frame['Topic'] != -1]

# Show the first N rows of the overview
N = 10  # or however many you want to display
print("Top topics and their document frequencies:")
print(topic_info[['Topic', 'Count', 'Name']].head(N))

In [None]:
# Bar Chart: top words per topic for the top-30-location model (5 topics);
# rendered because it is the cell's last expression
topic_model_neg_reviews.visualize_barchart(top_n_topics=5)

In [None]:
# Topic map, top-word bar charts and topic-similarity heatmap for the top-30 model.
# A notebook cell only renders the value of its LAST expression, so each figure
# is shown explicitly; otherwise the first two would be built and then discarded.
topic_model_neg_reviews.visualize_topics().show()
topic_model_neg_reviews.visualize_barchart(top_n_topics=10).show()
topic_model_neg_reviews.visualize_heatmap().show()

In [None]:
# ----------Emotion Analysis--------------
# Use a BERT emotion classifier to analyze the reviews.
# The checkpoint loaded below, for both the model and its tokenizer, is
# nateraw/bert-base-uncased-emotion.
# NOTE(review): these notes previously named bhadresh-savani/bert-base-uncased-emotion and listed the
# labels as anger, disgust, fear, joy, sadness and surprise. The code loads the nateraw checkpoint, whose
# six labels are presumably sadness, joy, love, anger, fear and surprise — TODO confirm against the model card.

# Set up the Hugging Face pipeline for emotion classification
emotion_classifier = pipeline("text-classification", model="nateraw/bert-base-uncased-emotion", tokenizer="nateraw/bert-base-uncased-emotion")

In [None]:
# Example sentence used as a quick smoke test of the classifier
text = "Every time I see my dog, my heart fills with love and happiness."

# Get emotion prediction for the sentence
# (indexed as result[0]['label'] further down, i.e. a list of dicts with a 'label' key)
emotion_result = emotion_classifier(text)

# Print the classification result
print(emotion_result)

In [None]:
# Run this model on both data sets

from transformers import pipeline

# Use the first GPU when one is available and fall back to the CPU (-1) otherwise.
# A hard-coded device=0 makes this cell fail on a CPU-only runtime.
emotion_device = 0 if torch.cuda.is_available() else -1

# Set up the Hugging Face pipeline for emotion classification
emotion_classifier = pipeline(
    "text-classification",
    model="nateraw/bert-base-uncased-emotion",
    tokenizer="nateraw/bert-base-uncased-emotion",
    device=emotion_device  # -1 = CPU, 0 = first GPU
)

def get_top_emotion(review):
    """Classify one review with emotion_classifier and return the predicted label.

    The review is passed with padding enabled and truncation to at most 512
    tokens, so long reviews are cut instead of being rejected. Only the label
    of the first returned prediction is kept.
    """
    tokenizer_options = {"padding": True, "truncation": True, "max_length": 512}
    predictions = emotion_classifier(review, **tokenizer_options)
    first_prediction = predictions[0]
    return first_prediction['label']

# Sampling a smaller portion of the data
sample_size = 1000  # To reduce runtime; can be changed

# Draw up to sample_size random rows from each dataframe (fixed seed for repeatability).
# The request is capped at the frame length because DataFrame.sample raises a
# ValueError when n is larger than the number of rows (sampling without replacement).
google_sample = google_df.sample(n=min(sample_size, len(google_df)), random_state=42)
trustpilot_sample = trustpilot_df.sample(n=min(sample_size, len(trustpilot_df)), random_state=42)

# Apply the emotion prediction to the sampled Google and Trustpilot reviews
google_sample['predicted_emotion'] = google_sample['Comment'].apply(get_top_emotion)
trustpilot_sample['predicted_emotion'] = trustpilot_sample['Review Content'].apply(get_top_emotion)

# Print the results for the sample
print("Sampled Google Reviews Emotions:")
print(google_sample[['Comment', 'predicted_emotion']])

print("\nSampled Trustpilot Reviews Emotions:")
print(trustpilot_sample[['Review Content', 'predicted_emotion']])

In [None]:
# Three most frequent predicted emotions per sample
# (value_counts is already sorted by frequency, so the first three rows are the top 3)
google_emotion_ranking = google_sample['predicted_emotion'].value_counts()
trustpilot_emotion_ranking = trustpilot_sample['predicted_emotion'].value_counts()
google_top_3_emotions = google_emotion_ranking.iloc[:3]
trustpilot_top_3_emotions = trustpilot_emotion_ranking.iloc[:3]

print("\nTop 3 Emotions in Google Reviews Sample:")
print(google_top_3_emotions)

print("\nTop 3 Emotions in Trustpilot Reviews Sample:")
print(trustpilot_top_3_emotions)

In [None]:
# Bar plot of the negative emotions predicted across both samples.

# Emotions treated as negative
negative_emotions = ['sadness', 'anger', 'fear']

# Rows of each sample whose predicted emotion is one of them
google_has_negative = google_sample['predicted_emotion'].isin(negative_emotions)
trustpilot_has_negative = trustpilot_sample['predicted_emotion'].isin(negative_emotions)
google_negative = google_sample[google_has_negative]
trustpilot_negative = trustpilot_sample[trustpilot_has_negative]

# One Series with the predicted emotions of both sources stacked together
combined_negative_emotions = pd.concat(
    [google_negative['predicted_emotion'], trustpilot_negative['predicted_emotion']]
)

# Count plot of those emotions
plt.figure(figsize=(10, 6))
sns.countplot(x=combined_negative_emotions, palette='coolwarm')

# Title, axis labels and rotated tick labels
plt.title('Distribution of Negative Emotions in Google and Trustpilot Reviews', fontsize=14)
plt.xlabel('Negative Emotions', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(rotation=45)

plt.show()

In [None]:
# Cross-check for the bar chart: full emotion counts per sample

# Occurrences of every predicted emotion, Google and Trustpilot separately
google_emotion_counts, trustpilot_emotion_counts = (
    sample_frame['predicted_emotion'].value_counts()
    for sample_frame in (google_sample, trustpilot_sample)
)

# Print both distributions
print("Emotion distribution in Google Reviews Sample:")
print(google_emotion_counts)

print("\nEmotion distribution in Trustpilot Reviews Sample:")
print(trustpilot_emotion_counts)


In [None]:
# Pull out the sampled reviews (both sources) whose predicted emotion is 'anger'.

google_is_anger = google_sample['predicted_emotion'] == 'anger'
trustpilot_is_anger = trustpilot_sample['predicted_emotion'] == 'anger'

google_anger_reviews = google_sample[google_is_anger]
trustpilot_anger_reviews = trustpilot_sample[trustpilot_is_anger]

# Show the review text next to the predicted emotion for both sources
print("Google Reviews with 'Anger' Emotion:")
print(google_anger_reviews[['Comment', 'predicted_emotion']])

print("\nTrustpilot Reviews with 'Anger' Emotion:")
print(trustpilot_anger_reviews[['Review Content', 'predicted_emotion']])

In [None]:
# Run BERTopic on the output of the previous step.

def preprocess_text(text):
    """Clean one review and return it as a single space-joined string.

    Lower-cases the text, deletes digit runs and string.punctuation
    characters, tokenizes with word_tokenize, and drops tokens that are in
    the module-level stop_words set or only one character long.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    stripped = without_digits.translate(str.maketrans('', '', string.punctuation))
    kept = []
    for token in word_tokenize(stripped):
        if len(token) > 1 and token not in stop_words:
            kept.append(token)
    return " ".join(kept)

# Apply preprocessing to the anger reviews.
# assign() returns new DataFrames: the anger frames are boolean-mask slices of
# the samples, and adding a column to such a slice in place raises pandas'
# SettingWithCopyWarning.
google_anger_reviews = google_anger_reviews.assign(
    cleaned_review=google_anger_reviews['Comment'].apply(preprocess_text)
)
trustpilot_anger_reviews = trustpilot_anger_reviews.assign(
    cleaned_review=trustpilot_anger_reviews['Review Content'].apply(preprocess_text)
)

# Combine both datasets for topic modeling
all_anger_reviews = pd.concat([google_anger_reviews['cleaned_review'], trustpilot_anger_reviews['cleaned_review']], axis=0).tolist()

# Remove any empty strings from the list
all_anger_reviews = [review for review in all_anger_reviews if review.strip() != '']

# Set up the BERTopic model; min_topic_size=5 allows small topics on this small corpus
topic_model = BERTopic(language="english", verbose=True, min_topic_size=5)

# Fit the model to the anger reviews
topics, probs = topic_model.fit_transform(all_anger_reviews)

# Refresh the overview from the model that was just fitted. Without this,
# topic_info still holds the table of the previously fitted model and the
# print below would report that model's topics instead of the anger topics.
topic_info = topic_model.get_topic_info()
print(topic_info[['Topic', 'Count', 'Name']].head(10))

In [None]:
# Visualize the top 5 topics of the most recently fitted topic_model as a bar chart;
# rendered because it is the cell's last expression
topic_model.visualize_barchart(top_n_topics=5)

In [None]:
# Number of documents the anger topic model was fitted on.
# (The common-location list belongs to the earlier model, so its length says
# nothing about the `topics` inspected in the next cell.)
print(len(all_anger_reviews))

In [None]:
# Inspect the topic assignments returned by the latest fit_transform call
print(topics[:20])  # See the first 20 topic assignments
print(set(topics))  # See all unique topics assigned

In [None]:
# Topic map, top-word bar charts and topic-similarity heatmap for the current topic_model.
# A notebook cell only renders the value of its LAST expression, so each figure
# is shown explicitly; otherwise the first two would be built and then discarded.
topic_model.visualize_topics().show()
topic_model.visualize_barchart(top_n_topics=10).show()
topic_model.visualize_heatmap().show()