# Importing

In [None]:
import pandas as pd
from google.colab import drive
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.probability import FreqDist

import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Fetch the NLTK resources used by the preprocessing cells, in the original order.
for resource in (
    'all',
    'stopwords',
    'punkt_tab',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource, quiet=True)

# Mount Google Drive so the review workbooks under /content/drive are readable.
drive.mount('/content/drive')
print('Libraries imported')

In [None]:
def extract(file_path: str) -> pd.DataFrame:
    """Read an Excel workbook into a DataFrame and print its dimensions.

    Args:
        file_path: Path handed to ``pd.read_excel``.

    Returns:
        The loaded DataFrame.
    """
    frame = pd.read_excel(file_path)
    n_rows, n_cols = frame.shape
    print(f"Loaded {file_path}: {n_rows} rows, {n_cols} columns")
    return frame

In [None]:
def rename_and_select(df, mappings):
    """Rename columns via ``mappings`` and keep only the renamed ones.

    Args:
        df: Source DataFrame.
        mappings: Dict of original column name -> new column name.

    Returns:
        A copy holding just the mapped columns, ordered as in ``mappings``.
    """
    wanted = [*mappings.values()]
    renamed = df.rename(columns=mappings)
    return renamed[wanted].copy()

In [None]:
def cast_types(df, schema):
    """Return a copy of ``df`` with columns converted as described by ``schema``.

    The conversion is done on a copy so the caller's frame is never modified
    (assigning into a frame produced by e.g. ``dropna`` would otherwise trigger
    chained-assignment warnings).

    Args:
        df: DataFrame containing every column named in ``schema``.
        schema: Dict of column name -> target dtype. The special value
            ``"datetime"`` is routed through ``pd.to_datetime``; any other
            value is passed to ``Series.astype``.

    Returns:
        A new DataFrame with the converted columns.
    """
    result = df.copy()
    for col, dtype in schema.items():
        if dtype == "datetime":
            # errors="coerce": values that cannot be parsed become NaT.
            result[col] = pd.to_datetime(result[col], errors="coerce")
        else:
            # errors="ignore": a failed cast leaves the column as it was.
            result[col] = result[col].astype(dtype, errors="ignore")
    return result

In [None]:
def filter_rows(df, max_score):
    """Return a copy of the rows whose ``score`` is at most ``max_score``."""
    within_limit = df["score"] <= max_score
    return df.loc[within_limit].copy()

In [None]:
def transform(df, mappings, max_score, type_schema=None):
    """Clean one raw review export into the shared review layout.

    Steps, in order: rename/select columns, drop rows without a review,
    cast column types, drop exact duplicates, keep rows with
    ``score <= max_score``.

    Args:
        df: Raw DataFrame as loaded from a source file.
        mappings: Dict of source column name -> shared column name.
        max_score: Highest score (inclusive) to keep.
        type_schema: Optional dict of column name -> dtype for the casting
            step. Defaults to the notebook-level ``schema`` dict, which is
            what this function used implicitly before the parameter existed.

    Returns:
        The cleaned DataFrame (an independent copy).
    """
    if type_schema is None:
        type_schema = schema

    # 1. Rename + keep expected columns
    df = rename_and_select(df, mappings)

    # 2. Remove rows without reviews
    df = df.dropna(subset=["review"])

    # 3. Type casting
    df = cast_types(df, type_schema)

    # 4. Drop duplicates
    df = df.drop_duplicates()

    # 5. Rating filtering. filter_rows already returns a copy, and no step after
    #    the dropna above introduces missing reviews, so no second null filter
    #    is needed.
    return filter_rows(df, max_score)

In [None]:
def combine_datasets(dfs):
    """Stack the given DataFrames vertically under a fresh 0..n-1 index."""
    return pd.concat(dfs, ignore_index=True)

In [None]:
# Source-specific column headers -> shared column names. Both mappings target
# the same five output names, so the cleaned frames can be concatenated.
google_rename_mappings = {
    "Social Media Source": "source",
    "Club's Name": "location",
    "Creation Date": "date_created",
    "Comment": "review",
    "Overall Score": "score",
}

trustpilot_rename_mappings = {
    "Source Of Review": "source",
    "Location Name": "location",
    "Review Created (UTC)": "date_created",
    "Review Content": "review",
    "Review Stars": "score",
}

# Target dtype per shared column; "datetime" is handled specially by cast_types.
schema = {
    "source": "string",
    "location": "string",
    "date_created": "datetime",
    "review": "string",
    "score": "int64",
}

# Reviews scoring above this value are filtered out by transform().
max_score = 3

In [None]:
# Load the Google review export from Drive and run it through transform().
google_df = extract('/content/drive/MyDrive/DS Course/Course 3/Projects/Google_12_months.xlsx')
transformed_google_df = transform(google_df, google_rename_mappings, max_score)

# Same steps for the Trustpilot export, using its own column mapping.
trustpilot_df = extract('/content/drive/MyDrive/DS Course/Course 3/Projects/Trustpilot_12_months.xlsx')
transformed_trustpilot_df = transform(trustpilot_df, trustpilot_rename_mappings, max_score)

In [None]:
# Stack the cleaned Google and Trustpilot frames into one dataset, then preview it.
final_df = combine_datasets([transformed_google_df, transformed_trustpilot_df])
final_df.head()

# Preprocessing

In [None]:
def clean_text(text: str) -> str:
    """
    Clean the input text by:
    1. Converting to lowercase
    2. Replacing punctuation (including underscores) with a space
    3. Removing numbers
    4. Collapsing runs of whitespace and trimming both ends
    """
    text = text.lower()
    # "_" is a word character for the regex engine, so [^\w\s] alone would keep
    # it; listing it explicitly stops "foo_bar" from surviving as one token.
    text = re.sub(r"[^\w\s]|_", " ", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [None]:
def tokenize_text(text: str) -> list:
    """
    Split the text with ``word_tokenize`` and keep only purely alphabetic tokens.
    """
    return [tok for tok in word_tokenize(text) if tok.isalpha()]

In [None]:
def get_stopwords(extra_words=None):
    """
    Returns a set of English stopwords with optional extra words.

    Args:
        extra_words: Optional iterable of additional words to treat as
            stopwords. A single string is treated as one word. ``None`` or an
            empty value adds nothing.

    Returns:
        A new set containing NLTK's English stopwords plus the extras.
    """
    stop_words = set(stopwords.words("english"))
    if not extra_words:
        return stop_words
    if isinstance(extra_words, str):
        # set.update() on a bare string would add its individual characters.
        extra_words = [extra_words]
    stop_words.update(extra_words)
    return stop_words

In [None]:
def remove_stopwords(tokens, stop_words):
    """
    Return the tokens that are not in ``stop_words``, preserving their order.
    """
    return list(filter(lambda tok: tok not in stop_words, tokens))

In [None]:
def _map_pos(tag):
    """
    Internal helper function.
    Picks the WordNet POS constant matching the first letter of an NLTK POS tag;
    any other first letter falls back to NOUN.
    """
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "N":
        return wordnet.NOUN
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN


def lemmatize_tokens(tokens: list, lemmatizer) -> list:
    """
    POS-tag the tokens, then lemmatize each one with its mapped WordNet POS.
    """
    lemmas = []
    for word, tag in pos_tag(tokens):
        lemmas.append(lemmatizer.lemmatize(word, _map_pos(tag)))
    return lemmas

In [None]:
class TextPreprocessor:
    """
    Bundles the stopword set and lemmatizer needed to turn a raw gym review
    into a list of lemmas.
    """

    def __init__(self, extra_stop_words):
        # Both pieces are built once and reused for every review.
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = get_stopwords(extra_stop_words)

    def preprocess(self, text: str) -> list[str]:
        """
        Run the full pipeline on one text:
        clean -> tokenize -> drop stopwords -> lemmatize.
        """
        tokens = tokenize_text(clean_text(text))
        content_tokens = remove_stopwords(tokens, self.stop_words)
        return lemmatize_tokens(content_tokens, self.lemmatizer)

In [None]:
# NOTE(review): this module-level lemmatizer is not referenced by the visible
# cells; TextPreprocessor creates its own instance in __init__.
lemmatizer = WordNetLemmatizer()

# Smoke test: run the whole pipeline on a hand-written sentence that contains
# upper case, punctuation, digits and one of the extra stopwords.
preprocessor = TextPreprocessor(extra_stop_words = ['gym', 'club', 'workout'])
text = 'This is a TEST sentence... 1, 2, 3 - Yay, it worked. I love the GYM.'
preprocessor.preprocess(text)

In [None]:
# Store the token list returned by preprocess() for every review in a new column.
final_df['clean_reviews'] = final_df['review'].apply(preprocessor.preprocess)
final_df.head()

# BERTopic

In [None]:
!pip install -q bertopic
from bertopic import BERTopic
print('BERTopic downloaded and improted')

In [None]:
# Prototype BERTopic model: English, calculate_probabilities disabled, verbose output on.
topic_model = BERTopic(language="english", calculate_probabilities=False, verbose=True)

In [None]:
# Use only 200 for prototyping/experimenting

# First 200 entries of the token-list column, as a plain Python list.
reviews = final_df['clean_reviews'][:200].tolist()
# Re-join each token list into one space-separated string per review.
reviews_list = [" ".join(tokens) for tokens in reviews]

In [None]:
# Fit the prototype model on the 200-review sample.
# NOTE(review): presumably `topics` holds one topic id per document — confirm against the BERTopic docs.
topics, probs = topic_model.fit_transform(reviews_list)

In [None]:
# Topic summary table from the fitted model; display its first ten rows.
topic_info = topic_model.get_topic_info()
topic_info[:10]

In [None]:
# Inspect the 'Representation' entry at label 0.
# NOTE(review): presumably the first row of the table — confirm which topic id that is.
topic_info['Representation'][0]

In [None]:
# Inspect the 'Representative_Docs' entry at label 0 (same row as the cell above).
topic_info['Representative_Docs'][0]

In [None]:
# 'Representative_Docs' for the first ten rows of the topic table.
topic_info['Representative_Docs'][:10]

# Formatting topics into json

In [None]:
import json

# Representative documents for the first ten rows of the topic table.
top_topics_docs = topic_info['Representative_Docs'][:10].tolist()

# Label each row's documents "cluster_1", "cluster_2", ... in table order.
clusters = {
    f"cluster_{rank}": doc_group
    for rank, doc_group in enumerate(top_topics_docs, start=1)
}

# Pretty-printed JSON text that is later embedded in the LLM prompts.
formatted_input = json.dumps(clusters, indent=2)
print(formatted_input)

# OpenAI

In [None]:
from openai import OpenAI
import os
from google.colab import drive, userdata

# Fetch the API key via google.colab's userdata and export it as OPENAI_API_KEY.
# NOTE(review): presumably OpenAI() reads the key from that environment variable,
# since no key is passed explicitly — confirm.
api_key = userdata.get('OPEN_AI_API_KEY')
os.environ["OPENAI_API_KEY"] = api_key

client = OpenAI()

In [None]:
# Quick sanity check of the API with a single request.
# The f-string embeds the 'Representative_Docs' entry at label 0 using its default
# string form (NOTE(review): presumably a list of documents rather than one sentence — confirm).
response = client.responses.create(
    model="gpt-5-mini",
    input=f"Summarize the sentence {topic_info['Representative_Docs'][0]}"
)

print(response.output_text)

In [None]:
# First unstructured attempt: send the JSON cluster dump in one plain prompt.
# The indentation inside the triple-quoted string is part of the text that is sent.
# NOTE(review): the cluster count (10) is hard-coded in the prompt text.
response = client.responses.create(
    model="gpt-5-mini",
    input=f"""You are a data analyst.
        You have grouped customer reviews into 10 clusters.
        Analyse the groups and categorise them.
        The topics are {formatted_input}
    """
)

print(response.output_text)

## Prompt Engineering

In [None]:
# Developer-role instructions used by the two-message request further down.
# NOTE(review): "recommmendations" is misspelled inside the prompt; left unchanged
# here because the string is sent to the model verbatim.
developer_prompt = """You are an expert data analyst specialising in customer feedback analysis.
You are providing recommmendations on how to improve operations and customer service.
You will always respond in valid JSON format.
Never include any text outside of the JSON structure."""

In [None]:
# User-role message. This is an f-string: `formatted_input` is interpolated when
# the cell runs, and the doubled braces produce literal { } in the JSON example.
# NOTE(review): "emtional" is misspelled in the prompt text and the cluster count
# (10) is hard-coded; both left unchanged because the string is sent to the model.
user_prompt = f"""I have grouped gym customer reviews into 10 clusters using topic modelling.

Analyse each cluster and return a JSON array in this exact format:
[
  {{
    "cluster": 1,
    "category": "Staff & Customer Service",
    "summary": "Customers frequently mention...",
    "sentiment": "Negative",
    "recommendation": "Improve customer service by..."
  }}
]

Analyse the emtional sentiment for each topic.

Here are the clusters:
{formatted_input}"""

In [None]:
# Structured attempt: instructions go in a "developer" message, the cluster
# data and output format in a "user" message.
response = client.responses.create(
    model="gpt-5-mini",
    input=[
        {"role": "developer", "content": developer_prompt},
        {"role": "user", "content": user_prompt}
    ]
)

In [None]:
# Show the text output of the most recent request.
print(response.output_text)

# BERTopic Model class

In [None]:
def join_tokens(text):
    """Concatenate the items of ``text`` into one string separated by single spaces."""
    return str.join(" ", text)

# Space-joined text of every review, as a Series.
# NOTE(review): `df` is not used by the later visible cells.
df = final_df['clean_reviews'].apply(join_tokens)

In [None]:
# Slice first so only the 200 prototype reviews are joined, instead of joining
# every review and discarding all but 200; the resulting list is the same.
clean_reviews = final_df['clean_reviews'][:200].apply(join_tokens).tolist()

In [None]:
# Keyword arguments that TopicModel.__init__ unpacks into BERTopic(...).
# NOTE(review): the variable name is misspelled ("parmas"); it is referenced by
# TopicModel, so renaming it needs a coordinated change.
topic_model_parmas = {
    "language": "english",
    "calculate_probabilities": False,
    "verbose": False,
    "embedding_model" : "all-MiniLM-L6-v2",
    "low_memory" : True,
    "nr_topics" : "auto",
}

In [None]:
class TopicModel:
    """Thin wrapper around a BERTopic model.

    Args:
        params: Optional dict of keyword arguments for ``BERTopic``. When
            omitted, the notebook-level ``topic_model_parmas`` dict is used,
            which matches the behaviour before this parameter existed.
    """

    def __init__(self, params=None):
        if params is None:
            params = topic_model_parmas
        self.model = BERTopic(**params)

    def fit(self, df):
        """Fit the model and return ``(topics, probs)`` from ``fit_transform``.

        ``df`` is forwarded unchanged to ``BERTopic.fit_transform``; despite
        the name, the notebook passes a list of strings.
        """
        topics, probs = self.model.fit_transform(df)
        return topics, probs

    def save_model(self, path):
        """Save the wrapped model to ``path`` via ``BERTopic.save``."""
        self.model.save(path)

    def load_model(self, path):
        """Replace the wrapped model with the one loaded from ``path``."""
        self.model = BERTopic.load(path)

    def get_topic_info(self):
        """Return the wrapped model's topic summary table."""
        return self.model.get_topic_info()

In [None]:
# Build the wrapper with its default parameters and fit it on the 200 joined reviews.
bertie = TopicModel()
topics, probs = bertie.fit(clean_reviews)

In [None]:
# Topic summary table of the fitted wrapper model; displayed in full.
topics_info_df = bertie.get_topic_info()
topics_info_df

# JSON function

In [None]:
import json

def format_json(topic_info):
    """Serialise per-topic documents as a JSON object keyed ``cluster_<n>``.

    Args:
        topic_info: The per-topic entries to serialise, in order. May be any
            object with a ``tolist()`` method (e.g. a pandas Series) or any
            plain iterable such as a list.

    Returns:
        JSON text (2-space indent) mapping ``"cluster_1"``, ``"cluster_2"``,
        ... to the corresponding entry.
    """
    # Prefer tolist() when available, but do not require it, so a plain list
    # of documents works too.
    to_list = getattr(topic_info, "tolist", None)
    top_topics_docs = to_list() if callable(to_list) else list(topic_info)

    clusters = {
        f"cluster_{position}": docs
        for position, docs in enumerate(top_topics_docs, start=1)
    }

    return json.dumps(clusters, indent=2)

In [None]:
# JSON dump of the representative documents for every row of the topic table.
formatted_input = format_json(topics_info_df['Representative_Docs'])
print(formatted_input)

# OpenAI Class

In [None]:
# Same key lookup as in the earlier OpenAI setup cell; `api_key` is later passed
# to InsightGenerator.
api_key = userdata.get('OPEN_AI_API_KEY')
os.environ["OPENAI_API_KEY"] = api_key

class InsightGenerator:
    """Sends topic clusters to an OpenAI model and returns its text answer.

    ``config`` must provide ``'developer_prompt'`` and ``'user_prompt'``; the
    latter is a ``str.format`` template with a ``{clusters}`` placeholder.
    """

    def __init__(self, config, api_key, model="gpt-5-mini"):
        self.model = model
        self.config = config
        self.developer_prompt = config['developer_prompt']
        self.user_prompt = config['user_prompt']
        self.client = OpenAI(api_key=api_key)

    def _build_user_prompt(self, formatted_clusters: str) -> str:
        """Fill the ``{clusters}`` placeholder of the user prompt template."""
        template = self.user_prompt
        return template.format(clusters=formatted_clusters)

    def generate_insights(self, formatted_clusters: str) -> str:
        """Make one request with the developer and user messages; return the output text."""
        messages = [
            {"role": "developer", "content": self.developer_prompt},
            {"role": "user", "content": self._build_user_prompt(formatted_clusters)},
        ]
        reply = self.client.responses.create(model=self.model, input=messages)
        return reply.output_text



In [None]:
# developer_prompt = """You are an expert data analyst specialising in customer feedback analysis for a global gym company.
# You are providing insights and recommendations on how to improve operations and customer service to key stakeholders.
# You will always respond in valid JSON format.
# Never include any text outside of the JSON structure."""

In [None]:
# user_prompt = """I have grouped gym customer reviews into clusters using topic modelling.

# Analyse each cluster and return a JSON array. Below is an example of the expected format:
# [
#   {{
#     "cluster": 1,
#     "category": "Staff & Customer Service",
#     "brief_summary": "Customers frequently mention...",
#     "recommendation": "Improve customer service by..."
#   }}
# ]

# Here are the clusters:
# {clusters}"""

In [None]:
# Developer-role instructions passed to InsightGenerator through `config`.
# This rebinds `developer_prompt`, replacing the shorter version defined in the
# Prompt Engineering section. It is a plain string (never passed through
# str.format in the visible code), so the single braces of the JSON example are literal.
developer_prompt = """
You are an expert data analyst specializing in customer feedback analysis for a global gym company.
You provide concise insights and actionable recommendations to improve operations and customer service.
You will always respond in valid JSON format with no text outside the JSON structure.
Keep key_insights to 2 sentences maximum and each recommendation to 1 sentence maximum.
Return a JSON array, one object per cluster, in this exact structure:
[
  {
    "cluster": 1,
    "category": "Staff & Customer Service",
    "key_insights": "Customers frequently report...",
    "recommendation": "Implement monthly staff training...",
    "justification": "Addresses the most common complaint..."
  }
]
"""

In [None]:
# User-role template for InsightGenerator, which renders it with
# str.format(clusters=...). It must be an assignment ("="): with ":" the string
# is only an annotation and the name keeps its previous value. The braces of the
# JSON example are doubled so str.format emits them literally; only {clusters}
# is a placeholder.
user_prompt = """
I have grouped gym customer reviews into clusters using topic modelling.
Analyze each cluster and return a JSON array, one object per cluster.
Below is an example of the expected format:
[
  {{
    "cluster": 1,
    "category": "Staff & Customer Service",
    "key_insights": "Customers frequently report...",
    "recommendation": "Implement monthly staff training...",
    "justification": "Addresses the most common complaint..."
  }}
]
Here are the clusters:
{clusters}"""

In [None]:
# Prompt bundle read by InsightGenerator.__init__ (keys 'developer_prompt' and 'user_prompt').
config = {
    "developer_prompt": developer_prompt,
    "user_prompt": user_prompt
    }

In [None]:
# Re-check the cluster JSON that is about to be sent.
print(formatted_input)

In [None]:
# Uses the class's default model ("gpt-5-mini").
insight_generator = InsightGenerator(config, api_key)

In [None]:
# One API request; `insights` is the model's raw text output.
insights = insight_generator.generate_insights(formatted_input)

In [None]:
# Inspect the raw response before parsing it.
print(insights)

In [None]:
def parse_insights(raw_response: str) -> pd.DataFrame:
    """
    Parses the response from the OpenAI API and converts it into a DataFrame.

    Accepts bare JSON as well as JSON wrapped in a Markdown code fence
    (with or without a ``json`` tag), with optional text around the fence.

    Args:
        raw_response: Text returned by the model.

    Returns:
        A DataFrame built from the decoded JSON. A single JSON object whose
        values are all scalars is treated as one row.

    Raises:
        json.JSONDecodeError: If the extracted text is not valid JSON.
    """
    # str.strip("```json") removes any of the characters `, j, s, o, n from both
    # ends rather than a literal fence, so the fence is matched explicitly.
    fenced = re.search(
        r"```(?:json)?\s*(.*?)\s*```",
        raw_response,
        flags=re.DOTALL | re.IGNORECASE,
    )
    payload = fenced.group(1) if fenced else raw_response.strip()
    data = json.loads(payload)

    # pd.DataFrame() rejects a dict of scalars; wrap a lone record in a list.
    # Dicts holding lists or dicts are passed through unchanged.
    if isinstance(data, dict) and not any(
        isinstance(value, (list, dict)) for value in data.values()
    ):
        data = [data]
    return pd.DataFrame(data)


In [None]:
# Convert the raw model response into a DataFrame and display it.
insights_df = parse_insights(insights)
insights_df