In [1]:
import hdbscan
import pandas as pd

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [2]:
# Load the merged article data and keep only the Health Promotion Board (HPB)
# "live-healthy" articles that have not been flagged for removal.
df = pd.read_parquet('../data/merged_data.parquet')

# pr_name may be missing, so nulls are treated as "not HPB".
is_hpb = df['pr_name'].fillna('').str.contains('Health Promotion Board')
# Explicit comparison with False: rows where to_remove is missing are excluded.
is_not_removed = df['to_remove'] == False
is_live_healthy = df['content_category'] == 'live-healthy-articles'

# .copy() makes hpb_df an independent frame. A later cell adds a column to it;
# without the copy that assignment targets a slice of `df` and pandas emits
# SettingWithCopyWarning.
hpb_df = df[is_hpb & is_not_removed & is_live_healthy].copy()

print(hpb_df.columns)
hpb_df.shape

Index(['id', 'content_name', 'title', 'article_category_names',
       'cover_image_url', 'full_url', 'full_url2', 'friendly_url',
       'category_description', 'content_body', 'keywords', 'feature_title',
       'pr_name', 'alternate_image_text', 'date_modified', 'number_of_views',
       'last_month_view_count', 'last_two_months_view', 'page_views',
       'engagement_rate', 'bounce_rate', 'exit_rate', 'scroll_percentage',
       'percentage_total_views', 'cumulative_percentage_total_views',
       'content_category', 'to_remove', 'has_table', 'has_image',
       'related_sections', 'extracted_links', 'extracted_headers',
       'extracted_content_body'],
      dtype='object')


(623, 33)

### Using an extractive summary
- Note: In this notebook, instead of mean-pooling the chunk embeddings, each article is reduced to an extractive summary of its top 20 sentences, and that summary is what gets embedded.

In [3]:
def are_similar(sent1, sent2, overlap_threshold=10):
    """Tell whether two sentences share more than ``overlap_threshold`` distinct words.

    Each sentence is split on whitespace and reduced to its set of tokens, so
    repeated words count once and tokens are compared exactly as split
    (case and attached punctuation are significant).

    Args:
        sent1: First sentence.
        sent2: Second sentence.
        overlap_threshold: Number of shared distinct tokens that must be
            strictly exceeded for the sentences to count as similar.

    Returns:
        True if the number of shared distinct tokens is greater than
        ``overlap_threshold``, otherwise False.
    """
    shared_words = set(sent1.split()) & set(sent2.split())
    return len(shared_words) > overlap_threshold

def extractive_summary(text, sentences_count=15):
    """Build an extractive summary of ``text`` with sumy's TextRank summarizer.

    The text is parsed with the "english" tokenizer and the summarizer is asked
    for ``sentences_count`` sentences. Sentences are then taken in the order
    the summarizer yields them, and any sentence that ``are_similar`` judges
    similar to one already kept is dropped, so the result may contain fewer
    than ``sentences_count`` sentences.

    Args:
        text: Plain text to summarise.
        sentences_count: Number of sentences requested from the summarizer.

    Returns:
        The kept sentences joined by single spaces, as one string.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    ranked_sentences = TextRankSummarizer()(document, sentences_count)

    kept = []
    for candidate in map(str, ranked_sentences):
        # Skip a sentence that overlaps heavily with one already selected.
        if any(are_similar(candidate, earlier) for earlier in kept):
            continue
        kept.append(candidate)
    return ' '.join(kept)

In [4]:
# Summarise every article body, requesting 20 sentences per article; the
# keyword argument is forwarded by Series.apply to extractive_summary.
hpb_df['extractive_summary'] = hpb_df['extracted_content_body'].apply(
    extractive_summary, sentences_count=20
)

In [5]:
# Modular BERTopic pipeline, laid out step by step as in the official documentation.

# Step 1 - Extract embeddings
# (alternative model tried: "all-MiniLM-L6-v2")
embedding_model = SentenceTransformer("BAAI/bge-large-en-v1.5")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=24,  # fixed seed for UMAP
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_cluster_size=4,
    min_samples=2,
    metric='manhattan',
    cluster_selection_method='eom',
    prediction_data=True,
    gen_min_span_tree=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with a
# `bertopic.representation` model (alternative: KeyBERTInspired())
representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together. nr_topics is left at its default; the original note says
# nr_topics="auto" would automatically reduce the number of topics.
pipeline_components = {
    'embedding_model': embedding_model,            # Step 1 - Extract embeddings
    'umap_model': umap_model,                      # Step 2 - Reduce dimensionality
    'hdbscan_model': hdbscan_model,                # Step 3 - Cluster reduced embeddings
    'vectorizer_model': vectorizer_model,          # Step 4 - Tokenize topics
    'ctfidf_model': ctfidf_model,                  # Step 5 - Extract topic words
    'representation_model': representation_model,  # Step 6 - (Optional) Fine-tune topic representations
}
topic_model = BERTopic(**pipeline_components)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Model inputs: one extractive summary per article, plus the matching titles
# (same row order) for labelling documents in later plots.
docs = hpb_df['extractive_summary'].tolist()
doc_titles = hpb_df['title'].tolist()

# Fit the pipeline on the summaries; only the per-document topic assignments
# are kept, the second return value is discarded.
topics, _ = topic_model.fit_transform(docs)

# Get value count per topic
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,75,-1_fat_healthy_child_stress,"[fat, healthy, child, stress, health, fruit, e...",[Nutritional Requirements for Children after t...
1,0,41,0_smoking_nicotine_quitting_cigarette,"[smoking, nicotine, quitting, cigarette, smoke...",[The harmful effects of smoking are known to a...
2,1,26,1_pregnancy_weeks_womb_pregnant,"[pregnancy, weeks, womb, pregnant, bleeding, n...",[Anaemia Anaemia is a condition in which mums-...
3,2,26,2_eye_eyes_lenses_myopia,"[eye, eyes, lenses, myopia, screen, activities...",[Myopia can be delayed or prevented by encoura...
4,3,21,3_teeth_dental_dentist_toothpaste,"[teeth, dental, dentist, toothpaste, brushing,...",[By the time your child has turned one and a h...
5,4,20,4_flu_diseases_infectious_mumps,"[flu, diseases, infectious, mumps, vaccines, i...",[What Is the Flu? Most of us have caught the f...
6,5,19,5_diabetes_glucose_type_risk,"[diabetes, glucose, type, risk, control, stres...","[Thats more than double the 440,000 Singaporea..."
7,6,18,6_exercise_fitness_activity_step,"[exercise, fitness, activity, step, tracker, g...","[By now, we're familiar with the benefits of e..."
8,7,17,7_milk_formula_nutrition_solids,"[milk, formula, nutrition, solids, little, cre...",[Hes able to pick up objects with his thumb an...
9,8,16,8_hawker_healthier_sodium_soup,"[hawker, healthier, sodium, soup, rice, food, ...",[Healthier Singapore Hawker Food The next time...


In [7]:
# Plot the documents and the top-word bar charts for the most frequent topics.
top_n = 50
top_topics = topic_model.get_topic_freq().head(top_n)['Topic'].tolist()

# Embedding produced by the fitted UMAP step (configured with n_components=10).
reduced_embeddings = topic_model.umap_model.embedding_

# visualize_documents treats `reduced_embeddings` as 2-D plot coordinates, so
# passing the 10-D array would plot only its first two columns. Project to 2-D
# first; distances in the UMAP output space are Euclidean, and the seed is
# fixed so the picture is reproducible.
if reduced_embeddings.shape[1] > 2:
    embeddings_2d = UMAP(
        n_neighbors=15,
        n_components=2,
        min_dist=0.0,
        metric='euclidean',
        random_state=24,
    ).fit_transform(reduced_embeddings)
else:
    embeddings_2d = reduced_embeddings

# Article titles (not the summaries) are passed as the documents so that the
# hover text shows "<title> - Topic <id>".
hover_data = [f"{title} - Topic {topic}" for title, topic in zip(doc_titles, topics)]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings_2d,
    topics=top_topics,
    title=f'Top {top_n} Topics',
)
visualization.show()

visualization_barchart = topic_model.visualize_barchart(top_n_topics=top_n)
visualization_barchart.show()

# To save the figures as standalone HTML:
# visualization.write_html("visualization.html")
# visualization_barchart.write_html("visualization_barchart.html")

### Search for best parameters for hdbscan
- Get the parameters that give the best DBCV score
- Update the parameters in Step 3 if needed
- Rerun BERTopic


In [8]:
# Grid-search HDBSCAN settings on the UMAP-reduced embeddings and keep the
# combination with the highest DBCV score (hdbscan's `relative_validity_`).
embeddings = topic_model.umap_model.embedding_  # reduced embeddings from umap

# Start below any attainable score so the best combination is always recorded,
# even when every score is <= 0, and so best_parameters is never left undefined.
best_score = float('-inf')
best_parameters = None

for min_cluster_size in [2, 3, 4, 5, 6]:
    for min_samples in [1, 2, 3, 4, 5, 6, 7]:
        for cluster_selection_method in ['eom', 'leaf']:
            for metric in ['euclidean', 'manhattan']:
                # Fit HDBSCAN for this parameter combination.
                # gen_min_span_tree=True is required for relative_validity_.
                hdb = hdbscan.HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    cluster_selection_method=cluster_selection_method,
                    metric=metric,
                    gen_min_span_tree=True,
                ).fit(embeddings)

                # DBCV score
                score = hdb.relative_validity_
                if score > best_score:
                    best_score = score
                    # Keys match the HDBSCAN keyword names exactly (the old
                    # ' min_samples' key had a stray leading space), so the
                    # dict can be reused as HDBSCAN(**best_parameters).
                    best_parameters = {
                        'min_cluster_size': min_cluster_size,
                        'min_samples': min_samples,
                        'cluster_selection_method': cluster_selection_method,
                        'metric': metric,
                    }

print("Best DBCV score: {:.3f}".format(best_score))
print("Best parameters: {}".format(best_parameters))

Best DBCV score: 0.362
Best parameters: {'min_cluster_size': 4, ' min_samples': 2, 'cluster_selection_method': 'eom', 'metric': 'manhattan'}


### Hierarchical clustering of topics


In [9]:
# Ask the fitted topic model for its topic hierarchy over the same documents it
# was fitted on; the result is displayed as this cell's output.
hierarchical_topics  = topic_model.hierarchical_topics(docs)
hierarchical_topics

100%|██████████| 50/50 [00:36<00:00,  1.37it/s]


Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance
49,100,food_healthy_health_sugar_body,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",98,food_healthy_sugar_rice_eat,99,baby_smoking_health_exercise_children,2.025874
48,99,baby_smoking_health_exercise_children,"[0, 1, 2, 3, 4, 6, 9, 10, 11, 12, 13, 16, 21, ...",91,exercise_fitness_intensity_exercises_activities,97,baby_smoking_quit_children_dental,1.394843
47,98,food_healthy_sugar_rice_eat,"[5, 7, 8, 14, 15, 17, 18, 19, 20, 23, 24, 26, ...",88,rice_tbsp_cooking_chicken_wholegrain,95,diabetes_healthy_fruit_eating_diet,1.332233
46,97,baby_smoking_quit_children_dental,"[0, 1, 2, 3, 4, 10, 11, 12, 16, 22, 25, 28, 30...",94,smoking_quit_alcohol_nicotine_dementia,96,teeth_children_dental_flu_babys,1.285195
45,96,teeth_children_dental_flu_babys,"[1, 2, 3, 4, 16, 22, 25, 31, 33, 41, 48]",87,teeth_dental_tooth_toddler_toothpaste,82,sleep_pregnancy_flu_doctor_babys,1.120715
44,95,diabetes_healthy_fruit_eating_diet,"[5, 7, 14, 15, 17, 19, 20, 23, 26, 27, 32, 34,...",93,food_sugar_healthy_eating_healthier,58,diabetes_glucose_type_risk_cholesterol,1.109521
43,94,smoking_quit_alcohol_nicotine_dementia,"[0, 10, 11, 12, 28, 30, 36, 40, 50]",86,smoking_nicotine_quitting_drinking_smokers,85,dementia_mental_self_positive_social,1.075867
42,93,food_sugar_healthy_eating_healthier,"[7, 14, 15, 17, 19, 20, 23, 26, 27, 32, 34, 35...",92,vegetables_healthy_eating_meals_diet,79,sugar_tea_healthier_drinks_chinese,1.056618
41,92,vegetables_healthy_eating_meals_diet,"[7, 14, 15, 17, 19, 26, 27, 32, 34, 44, 46]",84,weight_bmi_healthy_percentile_calories,81,vegetables_milk_healthy_protein_meal,1.046121
40,91,exercise_fitness_intensity_exercises_activities,"[6, 9, 13, 21, 38, 45, 47, 49]",90,falls_park_tiong_singapores_bahru,89,exercise_physical_fitness_intensity_exercises,0.979286


In [10]:
# Build the hierarchy figure from the precomputed `hierarchical_topics` table.
fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics) 

In [11]:
# Render the hierarchy figure created in the previous cell.
fig.show()

### Topics that are similar can be merged

In [12]:
# Pairwise cosine similarity between topic embeddings, labelled by topic name.
# NOTE: despite the names, `distance_matrix`, `dist_df` and the 'distance'
# column hold similarities - higher means the two topics are more alike.
topic_names = list(topic_model.topic_labels_.values())
distance_matrix = cosine_similarity(np.array(topic_model.topic_embeddings_))
dist_df = pd.DataFrame(distance_matrix, columns=topic_names, index=topic_names)

# Long format: one row per ordered (topic1, topic2) pair, in row-major order of
# the matrix (topic1 varies slowest).
n_topics = len(topic_names)
name_array = np.array(topic_names, dtype=object)
pair_dist_df = pd.DataFrame({
    'topic1': np.repeat(name_array, n_topics),
    'topic2': np.tile(name_array, n_topics),
    'distance': distance_matrix.ravel().astype(float),
})

# Drop every pair that involves the outlier topic (label starts with '-1').
involves_outliers = (
    pair_dist_df['topic1'].str.startswith('-1')
    | pair_dist_df['topic2'].str.startswith('-1')
)
pair_dist_df = pair_dist_df[~involves_outliers]

# The matrix is symmetric: keeping only topic1 < topic2 (string order) removes
# self-pairs and leaves each unordered pair exactly once.
pair_dist_df = pair_dist_df[pair_dist_df['topic1'] < pair_dist_df['topic2']]

# Ten most similar topic pairs - candidates for merging.
pair_dist_df.sort_values('distance', ascending=False).head(10)

Unnamed: 0,topic1,topic2,distance
1449,26_food_kids_healthy_bento,44_meals_eating_healthier_snacks,0.94877
735,13_exercise_workout_fitness_workouts,6_exercise_fitness_activity_step,0.943063
374,6_exercise_fitness_activity_step,9_intensity_exercise_aerobic_heart,0.933169
738,13_exercise_workout_fitness_workouts,9_intensity_exercise_aerobic_heart,0.928875
789,14_quarter_plate_healthy_vegetables,8_hawker_healthier_sodium_soup,0.919479
1013,18_tbsp_stock_chopped_sauce,24_tbsp_sliced_recipe_chicken,0.911631
798,14_quarter_plate_healthy_vegetables,17_protein_calcium_vitamin_diet,0.907685
1865,34_food_meal_eating_meals,44_meals_eating_healthier_snacks,0.906821
810,14_quarter_plate_healthy_vegetables,29_wholegrain_wholegrains_symbol_rice,0.903522
1412,26_food_kids_healthy_bento,7_milk_formula_nutrition_solids,0.902
