In [1]:
# Purpose: Import required libraries
import pandas as pd
import numpy as np

In [2]:
# Purpose: Load cleaned related queries dataset
df_related_cleaned = pd.read_csv("../data/processed/related_queries_cleaned.csv")

# Preview
print("✅ Loaded Shape:", df_related_cleaned.shape)
display(df_related_cleaned.head())

✅ Loaded Shape: (250, 4)


Unnamed: 0,keyword,related_query,query_type,popularity_score
0,meditation,sleep meditation,top,100
1,meditation,meditation music,top,99
2,meditation,guided meditation,top,74
3,meditation,meditation youtube,top,73
4,meditation,yoga,top,55


In [3]:
# Purpose: Extract top 10 related queries for each keyword
df_top_queries = (
    df_related_cleaned[df_related_cleaned["query_type"] == "top"]
    .sort_values(["keyword", "popularity_score"], ascending=[True, False])
    .groupby("keyword")
    .head(10)
    .reset_index(drop=True)
)

display(df_top_queries)

Unnamed: 0,keyword,related_query,query_type,popularity_score
0,breathwork,holotropic breathwork,top,100
1,breathwork,breathwork training,top,59
2,breathwork,breathwork meditation,top,41
3,breathwork,what is breathwork,top,36
4,breathwork,breathwork near me,top,32
5,breathwork,somatic breathwork,top,28
6,breathwork,breathwork online,top,22
7,breathwork,breathwork benefits,top,22
8,breathwork,breath work,top,16
9,breathwork,breathwork classes,top,16


In [4]:
# Purpose: Extract top 10 rising queries for each keyword
df_rising_queries = (
    df_related_cleaned[df_related_cleaned["query_type"] == "rising"]
    .sort_values(["keyword", "popularity_score"], ascending=[True, False])
    .groupby("keyword")
    .head(10)
    .reset_index(drop=True)
)

display(df_rising_queries)

Unnamed: 0,keyword,related_query,query_type,popularity_score
0,breathwork,gary brecka breathwork,rising,31500
1,breathwork,somatic release breathwork,rising,30450
2,breathwork,othership breathwork,rising,25650
3,breathwork,breathwork sessie,rising,19950
4,breathwork,9d breathwork,rising,11100
5,breathwork,owaken breathwork,rising,3250
6,breathwork,witality breathwork,rising,2850
7,breathwork,somatic breathwork near me,rising,900
8,breathwork,breathwork o que é,rising,800
9,breathwork,breathwork class london,rising,750


In [5]:
# Purpose: Count how many keywords each related query appears under
query_keyword_counts = (
    df_related_cleaned.groupby("related_query")["keyword"]
    .nunique()
    .reset_index(name="num_keywords")
    .sort_values(by="num_keywords", ascending=False)
)

# Preview queries that appear across multiple keywords
display(query_keyword_counts.head(15))

Unnamed: 0,related_query,num_keywords
133,mindfulness meditation,3
114,mindfulness,2
91,meditation,2
105,meditation music,2
99,meditation for anxiety,2
100,meditation for sleep,2
174,sleep meditation,2
150,mindfulness traducción,1
149,mindfulness therapy,1
148,mindfulness techniques,1


In [6]:
# Filter for queries that appear in 2 or more keywords
shared_queries = query_keyword_counts[query_keyword_counts["num_keywords"] >= 2]

# Merge to get full info for shared queries
df_shared_queries = (
    df_related_cleaned.merge(shared_queries, on="related_query", how="inner")
    .sort_values(by=["num_keywords", "related_query", "keyword"], ascending=[False, True, True])
    .reset_index(drop=True)
)

# Preview
display(df_shared_queries.head(15))

Unnamed: 0,keyword,related_query,query_type,popularity_score,num_keywords
0,guided meditation,mindfulness meditation,top,10,3
1,meditation,mindfulness meditation,top,33,3
2,mindfulness,mindfulness meditation,top,95,3
3,mindfulness,meditation,top,100,2
4,yoga nidra,meditation,top,63,2
5,guided meditation,meditation for anxiety,top,18,2
6,guided meditation,meditation for anxiety,rising,80,2
7,meditation,meditation for anxiety,top,17,2
8,guided meditation,meditation for sleep,top,34,2
9,meditation,meditation for sleep,top,31,2


In [7]:
# 💾 Save Top 10 related queries per keyword (type: top)
df_top_queries.to_csv("../data/processed/related_queries_top10.csv", index=False)

# 💾 Save Top 10 rising related queries per keyword (type: rising)
df_rising_queries.to_csv("../data/processed/related_queries_rising10.csv", index=False)

# 💾 Save related queries that appear across 2 or more keywords
df_shared_queries.to_csv("../data/processed/related_queries_shared.csv", index=False)

# 💾 Save summary: number of keywords each related query appears in
query_keyword_counts.to_csv("../data/processed/related_query_keyword_counts.csv", index=False)