In [None]:
# Change to GPU for parallel processing.
# In Colab: Runtime → Change runtime type → T4 GPU, so that the check below prints True.

import os

# Allocator settings must already be in the environment when PyTorch's CUDA
# caching allocator starts up, so set them before torch is imported or CUDA is
# touched (previously this was done last, after torch had been imported twice).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch

print(torch.cuda.is_available())  # Should be True on a GPU runtime

# Release cached GPU memory that a previous run in this kernel may still hold.
torch.cuda.empty_cache()

In [None]:
# Install packages
!pip install -U transformers accelerate bitsandbytes huggingface_hub
!pip install umap-learn
!pip install googletrans==4.0.0-rc1
!pip install gensim pyldavis
!pip uninstall -y openai httpx
!pip install httpx==0.24.1
!pip install openai==0.28
!pip install bertopic[hdbscan]
!pip uninstall -y openai
!pip install bertopic[hdbscan] sentence-transformers
!pip install bertopic[hdbscan]

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import AutoModelForCausalLM, pipeline

In [None]:
# Download the NLTK data packages needed by this notebook
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
# Load data from source

# Colab helper module for working with Google Drive
from google.colab import drive

# Mount (or re-mount) Google Drive so the files stored there can be read
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

In [None]:
# Full paths to the two review CSV files on the mounted Drive
google_path = '/content/drive/My Drive/Colab Notebooks/google_reviews_synthetic.csv'
trustpilot_path = '/content/drive/My Drive/Colab Notebooks/trustpilot_reviews_synthetic.csv'

# Read both CSV files into DataFrames (Google first, then Trustpilot)
google_df, trustpilot_df = (
    pd.read_csv(csv_path) for csv_path in (google_path, trustpilot_path)
)

In [None]:
# Print the column labels of each review DataFrame (Google, then Trustpilot)
for reviews_df in (google_df, trustpilot_df):
    print(reviews_df.columns)

In [None]:
# Data cleaning
# Keep only the rows whose review text is present
google_df = google_df[google_df['Comment'].notna()]
trustpilot_df = trustpilot_df[trustpilot_df['Review Content'].notna()]

In [None]:
# ----------------Preprocessing: lowercase, remove stopwords, remove numbers-------------

stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    """Turn a raw review string into a list of cleaned tokens.

    The text is lower-cased, runs of digits and ASCII punctuation characters
    are deleted, and the remainder is split with ``word_tokenize``. Tokens that
    are in ``stop_words`` or are a single character long are dropped.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    # Translation table that deletes every character in string.punctuation
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = without_digits.translate(punctuation_table)

    kept_tokens = []
    for token in word_tokenize(cleaned):
        if len(token) > 1 and token not in stop_words:
            kept_tokens.append(token)
    return kept_tokens

In [None]:
# Tokenise and clean the review text of both sources into a 'clean_tokens' column
google_df['clean_tokens'] = google_df['Comment'].map(preprocess_text)
trustpilot_df['clean_tokens'] = trustpilot_df['Review Content'].map(preprocess_text)

In [None]:
# Identify Negative Reviews

# Google: a review is negative when its 'Overall Score' is below 3
google_is_negative = google_df['Overall Score'].lt(3)
google_neg = google_df.loc[google_is_negative].copy()

# Trustpilot: a review is negative when its 'Review Stars' is below 3
trustpilot_is_negative = trustpilot_df['Review Stars'].lt(3)
trustpilot_neg = trustpilot_df.loc[trustpilot_is_negative].copy()

In [None]:
# Save results using pickle

import pickle

# Destination pickle files in Google Drive
google_neg_path = '/content/drive/My Drive/Colab Notebooks/google_neg_reviews.pkl'
trustpilot_neg_path = '/content/drive/My Drive/Colab Notebooks/trustpilot_neg_reviews.pkl'

# Write each negative-review DataFrame to its own file (Google, then Trustpilot)
for pickle_path, negative_reviews in (
    (google_neg_path, google_neg),
    (trustpilot_neg_path, trustpilot_neg),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(negative_reviews, f)

In [None]:
import pickle

# Pickle files in Google Drive that hold the negative reviews
google_neg_path = '/content/drive/My Drive/Colab Notebooks/google_neg_reviews.pkl'
trustpilot_neg_path = '/content/drive/My Drive/Colab Notebooks/trustpilot_neg_reviews.pkl'

# Open both files and restore (unpickle) the two DataFrames
with open(google_neg_path, 'rb') as google_file, \
        open(trustpilot_neg_path, 'rb') as trustpilot_file:
    google_neg = pickle.load(google_file)
    trustpilot_neg = pickle.load(trustpilot_file)

In [None]:
# Load the following model: tiiuae/falcon-7b-instruct. Set the pipeline for text generation and a max length of 1,000 for each review.

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

# Hugging Face Hub id of the checkpoint to load
model_name = "tiiuae/falcon-7b-instruct"

# 8-bit quantization settings, with fp32 CPU offload enabled
bnb_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Keyword arguments for loading the quantized model
model_load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",  # let the library decide where (GPU/CPU) to place the weights
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

In [None]:
# langdetect supplies detect() and LangDetectException, used below to keep only English reviews
!pip install langdetect

In [None]:
# Extract Main Topics from Each Negative Review (Prompted with Falcon-7b-Instruct)

import re
from collections import Counter
import pandas as pd
from langdetect import detect, LangDetectException
from tqdm import tqdm

# -------------------------------
# Topic normalization dictionary
# -------------------------------
# Each group pairs one canonical label with the keywords that map to it.
# Groups are listed in lookup order: the dictionary built from them preserves
# this order, and a label may appear more than once to keep that order intact.
_keyword_groups = [
    # Cleanliness & Hygiene
    ("Hospital Cleanliness", ["dirty", "hygiene", "sanitary", "filthy",
                              "infection", "sterile", "disinfection"]),

    # Parking & Access
    ("Parking Issues", ["parking", "car park", "space"]),
    ("Access Issues", ["access"]),
    ("Accessibility", ["wheelchair", "ramp"]),

    # Pricing & Billing
    ("Pricing", ["price", "charges", "expensive", "cost"]),
    ("Billing Issues", ["billing", "insurance"]),

    # Staff Behavior & Communication
    ("Nurse Behavior", ["nurse"]),
    ("Doctor Behavior", ["doctor"]),
    ("Staff Rudeness", ["rude"]),
    ("Staff Friendliness", ["friendly"]),
    ("Staff Rudeness", ["unprofessional"]),
    ("Communication Quality", ["communication", "explained", "updates"]),

    # Waiting Times
    ("Waiting Times", ["waiting", "delay", "queue", "appointment"]),

    # Treatment & Care Quality
    ("Treatment Quality", ["treatment", "procedure", "operation", "surgery"]),
    ("Doctor Specialty", ["specialist", "expert", "cardiology", "orthopedic"]),

    # Facilities & Equipment
    ("Medical Equipment", ["equipment", "machines"]),
    ("Hospital Facilities", ["facility", "room", "bed"]),

    # General Experience
    ("Overall Experience", ["experience", "service", "care"]),
]

# keyword -> canonical label
topic_normalization = {
    keyword: label
    for label, keywords in _keyword_groups
    for keyword in keywords
}

# -------------------------------
# Functions
# -------------------------------
def normalize_topic(topic):
    """Map a raw topic string to a canonical label from ``topic_normalization``.

    A leading list number such as ``"2. "`` is removed and surrounding
    whitespace is trimmed. The first keyword (in dictionary order) that occurs
    case-insensitively as a substring of the result decides the label. When no
    keyword occurs, the trimmed topic itself is returned.
    """
    stripped = re.sub(r'^\s*\d+\.\s*', '', topic).strip()
    haystack = stripped.lower()
    matching_labels = (
        label
        for keyword, label in topic_normalization.items()
        if keyword.lower() in haystack
    )
    return next(matching_labels, stripped)

def is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Args:
        text: The review text to classify.

    Returns:
        True if ``detect(text)`` returns ``'en'``; False for any other
        language code or when detection raises ``LangDetectException``.
    """
    # langdetect's algorithm is randomised; its documentation recommends fixing
    # DetectorFactory.seed so that repeated runs give the same answer and the
    # set of reviews selected as English does not change between runs.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

def extract_and_normalize_topics(main_topic_text):
    """Parse a generated topic answer into at most three normalized topics.

    Lines of the form ``"<number>. <topic>"`` are used when any are present.
    Otherwise every non-empty line is treated as a topic after a leading
    ``"Topic <n>:"``, ``"<n>."`` or ``"-"`` marker has been removed. Each
    candidate is passed through ``normalize_topic`` and empty results are
    discarded.

    Args:
        main_topic_text: The text to parse (one topic per line is expected).

    Returns:
        A list of up to three normalized topic strings, in order of appearance.
    """
    raw_extracted_topics = re.findall(r'^\s*\d+\.\s*(.+)', main_topic_text, re.MULTILINE)

    if not raw_extracted_topics:
        # Fallback for answers that are not a numbered list.
        # The pattern must use single backslashes: in a raw string a doubled
        # backslash matches a literal backslash character, which meant the
        # "Topic 1:" and "-" markers were never stripped.
        marker_pattern = re.compile(
            r'^(?:Topic\s*\d+\s*:|\d+\.\s*|\s*-\s*)*', re.IGNORECASE
        )
        lines = [line.strip() for line in main_topic_text.split('\n') if line.strip()]
        raw_extracted_topics = [marker_pattern.sub('', line).strip() for line in lines]
        raw_extracted_topics = [t for t in raw_extracted_topics if t]

    normalized_topics_list = []
    for topic_text in raw_extracted_topics:
        normalized_topic = normalize_topic(topic_text)
        if normalized_topic:
            normalized_topics_list.append(normalized_topic)

    return normalized_topics_list[:3]  # Top 3

# -------------------------------
# Load pipeline for text generation
# -------------------------------
# Reuse the 8-bit quantized `model` and its `tokenizer` that were loaded earlier
# in this notebook. Passing the checkpoint name instead makes the pipeline load
# a second, unquantized copy of Falcon-7B next to the one already in memory.
# The model was already placed with device_map="auto" when it was loaded, so no
# device argument is passed here.
text_gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

# -------------------------------
# Prepare data
# -------------------------------
# Requires the 'google_neg' DataFrame with its 'Comment' column.
# Keep the reviews detected as English, then work on the first `subset_size` of them.
subset_size = 50
english_mask = google_neg['Comment'].map(is_english)
english_reviews = google_neg[english_mask]
google_subset = english_reviews.head(subset_size).copy()

# Build one instruction prompt per review in the subset
google_prompts = []
for review in google_subset['Comment']:
    google_prompts.append(
        f"In the following customer review, pick out the main 3 topics. Return them in a numbered list format, each on a new line.\n\nReview: {review}\nMain topics:"
    )

# -------------------------------
# Generate topics using pipeline
# -------------------------------
# Greedy decoding is requested explicitly. The earlier call passed
# temperature=0.7 without do_sample=True; temperature only applies when
# sampling, so it had no effect beyond a warning (assuming the checkpoint's
# generation config does not enable sampling — TODO confirm). Deterministic
# output also keeps the extracted topics reproducible between runs.
google_results = []
for prompt in tqdm(google_prompts, desc="Generating topics"):
    result = text_gen(prompt, max_new_tokens=64, do_sample=False)
    google_results.append(result[0]['generated_text'])

# -------------------------------
# Extract and normalize topics
# -------------------------------
# Keep the text after the last "Main topics:" marker (the whole text when the
# marker is absent), trimmed of surrounding whitespace.
raw_answers = []
for text in google_results:
    _, _, answer = text.rpartition("Main topics:")
    raw_answers.append(answer.strip())
google_subset['main_topic_raw'] = raw_answers

# Parse each raw answer into a list of at most three normalized topics
google_subset['topics_list'] = google_subset['main_topic_raw'].map(extract_and_normalize_topics)

# -------------------------------
# Count topics
# -------------------------------
# Flatten the per-review topic lists into one list, then tally it
all_topics = []
for topics_for_review in google_subset['topics_list']:
    all_topics.extend(topics_for_review)
topic_counts = Counter(all_topics)
top_3_overall_topics = topic_counts.most_common(3)

# -------------------------------
# Display results
# -------------------------------
print("---")
print("## Overall Top 3 Most Common Topics")
for topic, count in top_3_overall_topics:
    print(f"- **{topic}**: {count} reviews")

print("\n---")
print("## Reviews with Extracted and Limited Topics")
separator = "-" * 40
# The review number shown is the DataFrame index label plus one
review_rows = zip(google_subset.index, google_subset['Comment'], google_subset['topics_list'])
for index, comment, review_topics in review_rows:
    print(f"**Review {index+1}:** {comment}")
    print("  **Extracted Topics (Max 3):**")
    if not review_topics:
        print("  No main topics extracted.")
    else:
        for position, topic in enumerate(review_topics, start=1):
            print(f"  {position}. {topic}")
    print(separator)

In [None]:
# The output of the model is the top 3 topics from each review. Append the topics from every review to create one comprehensive list.

all_topics = []
for topics_for_review in google_subset['topics_list']:
    all_topics += topics_for_review

# Number of topics to preview; slice with a larger number (or print all_topics) to see more
preview_count = 15
print(all_topics[:preview_count])

In [None]:
# Use this list as input to run BERTopic again.

from bertopic import BERTopic
from collections import Counter
import pandas as pd

# BERTopic configuration: English, uni- and bi-grams, at least 5 items per
# topic, automatic topic reduction
bertopic_settings = dict(
    language="english",
    verbose=True,
    min_topic_size=5,
    nr_topics="auto",
    n_gram_range=(1, 2),
)
bertopic_model = BERTopic(**bertopic_settings)

# Fit BERTopic on the 'all_topics' list.
# 'meta_topics' holds the meta-topic id assigned to each item of 'all_topics'.
meta_topics, probabilities = bertopic_model.fit_transform(all_topics)

# Summary table of the generated meta-topics
meta_topic_info = bertopic_model.get_topic_info()

print(meta_topic_info)

print("\n--- Top words for each BERTopic Meta-Topic ---")
for topic_id in meta_topic_info['Topic'].unique():
    if topic_id == -1:
        continue  # -1 usually represents outliers/noise
    print(f"\nMeta-Topic {topic_id}: {bertopic_model.get_topic(topic_id)}")

In [None]:
# Visualize the top 5 topics as a bar chart
fig = bertopic_model.visualize_barchart(top_n_topics=5)

# Layout overrides applied in one call:
# a fixed 1400x600 canvas, and x-axis tick labels rotated by -45 degrees to avoid overlap
layout_overrides = {
    "autosize": False,
    "width": 1400,
    "height": 600,
    "xaxis_tickangle": -45,
}
fig.update_layout(**layout_overrides)

# Show the plot
fig.show()

In [None]:
# Visualize topics with a heatmap (the figure is the cell's displayed result)
heatmap_fig = bertopic_model.visualize_heatmap()
heatmap_fig

In [None]:
from transformers import pipeline

# Reuse the 8-bit quantized `model` and its `tokenizer` that were loaded earlier
# in this notebook. Passing the checkpoint name instead loads yet another
# unquantized copy of Falcon-7B into memory. The model was already placed with
# device_map="auto" when it was loaded, so no device argument is passed here.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

if all_topics:
    print("\nGenerating Actionable Insights from all_topics")

    # All topics as one comma-separated string for the prompt
    topics_for_insight_prompt = ", ".join(all_topics)

    # NOTE(review): the prompt says "gym company" while the labels in
    # topic_normalization are hospital-related — confirm which business is meant.
    actionable_insight_prompt = (
        "For the following text topics obtained from negative customer reviews, "
        "can you give some actionable insights that would help this gym company?\n\n"
        f"Topics: {topics_for_insight_prompt}\n\n"
        "Actionable Insights:"  # trailing cue for the model to continue from
    )

    print(f"Prompt sent to Falcon model (truncated for display):\n{actionable_insight_prompt[:500]}...\n")

    # Generation settings: up to 300 new tokens, sampling at temperature 0.7,
    # one returned sequence
    generation_settings = dict(
        max_new_tokens=300,
        do_sample=True,
        temperature=0.7,
        num_return_sequences=1,
    )
    insight_results = generator(actionable_insight_prompt, **generation_settings)

    # Keep the text after the last "Actionable Insights:" marker
    full_generated_text = insight_results[0]['generated_text']
    generated_insights = full_generated_text.rpartition("Actionable Insights:")[2].strip()

    print("\nGenerated Actionable Insights from Falcon Model")
    print(generated_insights)
else:
    print("The 'all_topics' list is empty. Cannot generate insights.")

In [None]:
# Gensim LDA Comparison

# gensim supplies corpora.Dictionary and LdaModel, used in the cells below
!pip install gensim

In [None]:
import pickle

# Pickle files in Google Drive that hold the negative reviews
google_neg_path = '/content/drive/My Drive/Colab Notebooks/google_neg_reviews.pkl'
trustpilot_neg_path = '/content/drive/My Drive/Colab Notebooks/trustpilot_neg_reviews.pkl'

def _load_pickle(pickle_path):
    """Return the object stored in the pickle file at ``pickle_path``."""
    with open(pickle_path, 'rb') as f:
        return pickle.load(f)

# Load (unpickle) the DataFrames
google_neg = _load_pickle(google_neg_path)
trustpilot_neg = _load_pickle(trustpilot_neg_path)

In [None]:
# Print the first rows of the 'clean_tokens' column of both DataFrames
for negative_reviews in (google_neg, trustpilot_neg):
    print(negative_reviews['clean_tokens'].head())

In [None]:
# Perform the preprocessing required to run the LDA model from Gensim. Use the list of negative reviews (combined Google and Trustpilot reviews).

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora

import nltk
for nltk_resource in ('wordnet', 'punkt'):
    nltk.download(nltk_resource)

# English stopword set and WordNet lemmatizer used by preprocess() below
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocess each review (already tokenized)
def preprocess(tokens):
    """Drop stopwords from ``tokens`` and lemmatize the remaining ones.

    Tokens found in ``stop_words`` are removed first; every other token is
    passed through ``lemmatizer.lemmatize``. The original order is kept and a
    new list is returned.
    """
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

# Run preprocess() over the token list of every Google review
google_preprocessed = list(map(preprocess, google_neg['clean_tokens']))

# ... and of every Trustpilot review
trustpilot_preprocessed = list(map(preprocess, trustpilot_neg['clean_tokens']))

# One combined list of documents: Google reviews first, then Trustpilot
preprocessed_texts = [*google_preprocessed, *trustpilot_preprocessed]

# Build the token dictionary from all documents
id2word = corpora.Dictionary(preprocessed_texts)

# Drop tokens that appear in fewer than 3 documents or in more than 50% of them
id2word.filter_extremes(no_below=3, no_above=0.5)

# Bag-of-words representation of every document
corpus = list(map(id2word.doc2bow, preprocessed_texts))

In [None]:
# Using Gensim, perform LDA on the tokenised data. Specify the number of topics = 10.

from gensim.models import LdaModel
import numpy as np

# Number of topics
num_topics = 10

# Settings for the LDA model trained on the bag-of-words corpus and dictionary
lda_settings = {
    "corpus": corpus,
    "id2word": id2word,
    "num_topics": num_topics,
    "random_state": 42,
    "passes": 15,        # passes over the entire corpus
    "iterations": 400,   # iteration setting passed to LdaModel
    "eval_every": None,  # no evaluation during training, to save time
}
lda_model = LdaModel(**lda_settings)

# Print every topic with its top 10 words
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

In [None]:
# pyLDAvis is used in the next cell to visualise the LDA topics
!pip install pyLDAvis

In [None]:
# Show the visualisations of the topics, displaying the distance maps and the bar chart listing out the most salient terms.

import pyLDAvis.gensim_models
import pyLDAvis

# Prepare the pyLDAvis data from the trained LDA model, the bag-of-words corpus
# and the dictionary
lda_visualization = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)

# Display the visualization
pyLDAvis.display(lda_visualization)