[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sandy-lee29/musicapp-review-analysis/blob/main/top_sub_issue_grouping.ipynb)


In [1]:
!pip install -q sentence-transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import ast

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Step 1. Read the raw reviews and collect the distinct (topic, aspect) pairs
df = pd.read_csv('Music_1000.csv')
# Rows missing either field are dropped first, then repeated pairs are collapsed.
df_aspects = (
    df[['topic', 'aspect']]
    .dropna()
    .drop_duplicates()
)

In [4]:
# Step 2. Load sentence embedding model
# `model` is the module-level encoder that cluster_similar_topics() uses via model.encode().
model = SentenceTransformer('all-MiniLM-L6-v2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Step 3. Define greedy semantic clustering function
def cluster_similar_topics(topics, threshold=0.6):
    """Greedily group labels whose embeddings are similar to a representative.

    Labels are visited in input order. The first label not yet assigned to a
    cluster becomes a representative, and every later unassigned label whose
    cosine similarity to that representative is at least ``threshold`` joins
    its cluster. Similarity is only ever measured against the representative,
    so membership is not transitive.

    Relies on the module-level ``model`` (anything exposing ``encode``) and on
    ``cosine_similarity`` imported at the top of the notebook.

    Args:
        topics: Iterable of label strings. It is materialised as a list so
            that integer indexing below is always positional.
        threshold: Minimum cosine similarity to the representative required
            for a label to join its cluster.

    Returns:
        A ``(mapping, clusters)`` tuple. ``clusters`` maps each representative
        to the list of its members (the representative first); ``mapping``
        maps every member back to its representative. Both are empty dicts
        when ``topics`` is empty.
    """
    topics = list(topics)
    if not topics:
        # Nothing to embed: skip the encoder and the similarity computation.
        return {}, {}

    embeddings = model.encode(topics)
    sim_matrix = cosine_similarity(embeddings)

    clusters = {}
    visited = set()

    for i, topic in enumerate(topics):
        if topic in visited:
            continue
        clusters[topic] = [topic]
        visited.add(topic)
        for j in range(i + 1, len(topics)):
            candidate = topics[j]
            if candidate not in visited and sim_matrix[i][j] >= threshold:
                clusters[topic].append(candidate)
                visited.add(candidate)

    # Invert the clusters: every member points at its representative.
    mapping = {
        member: rep
        for rep, members in clusters.items()
        for member in members
    }

    return mapping, clusters

In [6]:
# Step 4. Apply greedy clustering to each topic group
# Each entry of clustered_results describes one cluster:
#   {'topic': ..., 'top_issue': <representative aspect>, 'sub_issues': [<other members>]}
clustered_results = []

for topic, group in df_aspects.groupby('topic'):
    aspects = group['aspect'].tolist()
    if len(aspects) == 1:
        # A lone aspect is its own top issue; no embedding call is needed.
        clustered_results.append({
            'topic': topic,
            'top_issue': aspects[0],
            'sub_issues': []
        })
        continue

    mapping, clusters = cluster_similar_topics(aspects, threshold=0.6)

    # Named `members` (not `group`) so the groupby frame of the outer loop
    # is not rebound while it is still being iterated.
    for top_issue, members in clusters.items():
        sub_issues = [asp for asp in members if asp != top_issue]
        clustered_results.append({
            'topic': topic,
            'top_issue': top_issue,
            'sub_issues': sub_issues
        })


In [7]:
# Step 5. Clean sub-issue labels (if identical to top issue)
# One row per cluster, built from the dicts collected in clustered_results
# (keys: 'topic', 'top_issue', 'sub_issues').
result_df = pd.DataFrame(clustered_results)

def clean_sub_issues(row):
    """Relabel sub-issues that duplicate the top issue apart from case/whitespace.

    Args:
        row: Mapping-like object with a ``'top_issue'`` string and a
            ``'sub_issues'`` iterable of strings.

    Returns:
        A new list in the same order. A sub-issue whose stripped, lower-cased
        text equals the stripped, lower-cased top issue gets the suffix
        `` - general issue`` appended to its original text; every other
        sub-issue is passed through unchanged.
    """
    reference = row['top_issue'].strip().lower()
    return [
        f"{label} - general issue" if label.strip().lower() == reference else label
        for label in row['sub_issues']
    ]

# Relabel the sub-issues row by row, then persist the clusters to disk.
# The 'sub_issues' lists are stored in the CSV as text; Step 6 parses them back.
result_df['sub_issues'] = result_df.apply(clean_sub_issues, axis=1)
result_df.to_csv("clustered_issues_by_topic.csv", index=False)

In [8]:
# Step 6. Map top issue and sub-issue index back to original review data
df_clustered = pd.read_csv("clustered_issues_by_topic.csv")
# The column holds the text form of a Python list; literal_eval parses it
# without executing arbitrary code from the file (unlike eval).
df_clustered['sub_issues'] = df_clustered['sub_issues'].apply(ast.literal_eval)

# Suffix that clean_sub_issues() appends to sub-issues duplicating the top issue.
GENERAL_ISSUE_SUFFIX = " - general issue"


def _original_aspect(label, top_issue):
    """Return the raw aspect text behind a (possibly relabelled) sub-issue label.

    The suffix is stripped only when the remaining text matches ``top_issue``
    ignoring case and surrounding whitespace, i.e. exactly the condition under
    which clean_sub_issues() added it. Any other label is returned unchanged.
    """
    if label.endswith(GENERAL_ISSUE_SUFFIX):
        candidate = label[:-len(GENERAL_ISSUE_SUFFIX)]
        if candidate.strip().lower() == top_issue.strip().lower():
            return candidate
    return label


def _lookup_issue(by_topic_aspect, by_aspect, topic, aspect):
    """Prefer the per-topic entry; fall back to the aspect-only entry."""
    key = (topic, aspect)
    if key in by_topic_aspect:
        return by_topic_aspect[key]
    return by_aspect.get(aspect)


# Aspect-only lookups (kept for rows without a usable topic and for any
# downstream use): a later cluster overwrites an earlier one on collision.
aspect_to_top = {}
sub_issue_index_map = {}
# Clustering was done per topic, so the same aspect text may belong to
# different clusters under different topics; these are keyed by (topic, aspect).
top_issue_by_topic_aspect = {}
sub_index_by_topic_aspect = {}

for _, row in df_clustered.iterrows():
    topic = row['topic']
    top_issue = row['top_issue']
    sub_issues = row['sub_issues']
    aspect_to_top[top_issue] = top_issue
    sub_issue_index_map[top_issue] = None
    top_issue_by_topic_aspect[(topic, top_issue)] = top_issue
    sub_index_by_topic_aspect[(topic, top_issue)] = None
    for i, sub in enumerate(sub_issues):
        # Register both the stored label and the raw aspect it came from, so
        # reviews whose aspect was relabelled in Step 5 still find their cluster.
        for aspect in (sub, _original_aspect(sub, top_issue)):
            aspect_to_top[aspect] = top_issue
            sub_issue_index_map[aspect] = i + 1
            top_issue_by_topic_aspect[(topic, aspect)] = top_issue
            sub_index_by_topic_aspect[(topic, aspect)] = i + 1

review_keys = list(zip(df["topic"], df["aspect"]))
df["top_issue"] = [
    _lookup_issue(top_issue_by_topic_aspect, aspect_to_top, review_topic, review_aspect)
    for review_topic, review_aspect in review_keys
]
df["sub_issue_index"] = [
    _lookup_issue(sub_index_by_topic_aspect, sub_issue_index_map, review_topic, review_aspect)
    for review_topic, review_aspect in review_keys
]
df.to_csv("Music_1000_subissues.csv", index=False)

In [9]:
# Reload the enriched reviews from the CSV written at the end of Step 6;
# this rebinds `df` to the on-disk version.
df = pd.read_csv("Music_1000_subissues.csv")