# 📘 Notebook 6: Final Data Export for Streamlit Dashboard

This notebook prepares the cleaned and analysis-ready datasets for use in the upcoming Streamlit dashboard.

We previously saved intermediate results during the Trend, Country, and Related Queries analyses. In this notebook, we simply load those final outputs, verify their structure, and export the required subsets into a dedicated folder (`../data/streamlit/`) for use in the dashboard.

This replaces the earlier Tableau prep plan and is part of transitioning this project into a fully automated and interactive data app.

In [1]:
import os
import pandas as pd

In [3]:
# 📥 Load final trend datasets
#df_trend = pd.read_csv("../data/raw/interest_over_time.csv")
#df_trend_smoothed = pd.read_csv("../data/processed/interest_over_time_smoothed.csv")
df_pct_change = pd.read_csv("../data/processed/interest_percent_change.csv")
df_top_peaks = pd.read_csv("../data/processed/interest_top_peaks.csv")
df_heatmap = pd.read_csv("../data/processed/interest_monthly_heatmap.csv")

# ✅ Confirm load
print("✅ Trend files loaded:")
print("📊 df_trend:", df_trend.shape)
print("📊 df_trend_smoothed:", df_trend_smoothed.shape)
print("📊 df_pct_change:", df_pct_change.shape)
print("📊 df_top_peaks:", df_top_peaks.shape)
print("📊 df_heatmap:", df_heatmap.shape)

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/interest_percent_change.csv'

In [5]:
# 🌍 Load final country-level datasets
df_country_long = pd.read_csv("../data/processed/country_interest_long.csv")
df_country_pivot = pd.read_csv("../data/processed/country_interest_pivot.csv")
df_country_coverage = pd.read_csv("../data/processed/country_num_countries_with_interest.csv")
df_country_total = pd.read_csv("../data/processed/country_total_interest_by_keyword.csv")
df_country_top10 = pd.read_csv("../data/processed/country_top10_by_keyword.csv")
df_country_share = pd.read_csv("../data/processed/country_share_of_interest.csv")
df_country_top5 = pd.read_csv("../data/processed/country_top5_appearance_counts.csv")

# ✅ Confirm load
print("✅ Country files loaded:")
print("🌍 df_country_long:", df_country_long.shape)
print("🌍 df_country_pivot:", df_country_pivot.shape)
print("🌍 df_country_coverage:", df_country_coverage.shape)
print("🌍 df_country_total:", df_country_total.shape)
print("🌍 df_country_top10:", df_country_top10.shape)
print("🌍 df_country_share:", df_country_share.shape)
print("🌍 df_country_top5:", df_country_top5.shape)

✅ Country files loaded:
🌍 df_country_long: (258, 3)
🌍 df_country_pivot: (70, 6)
🌍 df_country_coverage: (5, 2)
🌍 df_country_total: (5, 2)
🌍 df_country_top10: (50, 3)
🌍 df_country_share: (258, 4)
🌍 df_country_top5: (11, 2)


In [4]:
# 🔍 Load final related query datasets
df_related_cleaned = pd.read_csv("../data/processed/related_queries_cleaned.csv")
df_related_top10 = pd.read_csv("../data/processed/related_queries_top10.csv")
df_related_rising10 = pd.read_csv("../data/processed/related_queries_rising10.csv")
df_related_shared = pd.read_csv("../data/processed/related_queries_shared.csv")
df_related_keyword_counts = pd.read_csv("../data/processed/related_query_keyword_counts.csv")

# ✅ Confirm load
print("✅ Related queries files loaded:")
print("🔍 df_related_cleaned:", df_related_cleaned.shape)
print("🔍 df_related_top10:", df_related_top10.shape)
print("🔍 df_related_rising10:", df_related_rising10.shape)
print("🔍 df_related_shared:", df_related_shared.shape)
print("🔍 df_related_keyword_counts:", df_related_keyword_counts.shape)

✅ Related queries files loaded:
🔍 df_related_cleaned: (250, 4)
🔍 df_related_top10: (50, 4)
🔍 df_related_rising10: (50, 4)
🔍 df_related_shared: (16, 5)
🔍 df_related_keyword_counts: (229, 2)


In [3]:
import pandas as pd

# Load the raw data
df_trend = pd.read_csv("../data/raw/interest_over_time.csv", index_col=0)

# Ensure index isn't included as a column
df_trend.index.name = None  # remove index name if it exists
df_trend.reset_index(drop=True, inplace=True)

# Melt into long format
df_trend["date"] = pd.date_range(start="2020-07-26", periods=len(df_trend), freq="W-SUN")  # Or load from original if available
df_trend_long = df_trend.melt(id_vars="date", var_name="keyword", value_name="search_interest")

# Save
df_trend_long.to_csv("../data/streamlit/global_trend_summary.csv", index=False)

In [5]:
# 📁 1. Flatten interest_over_time for Tableau
# Purpose: Save cleaned trend analysis outputs to processed data folder
os.makedirs("../data/streamlit", exist_ok=True)

df_trend_long = df_trend.reset_index().melt(id_vars="date", var_name="keyword", value_name="search_interest")
df_trend_long.to_csv("../data/streamlit/global_trend_summary.csv", index=False)

# 📁 2. Save percent change over time
df_pct_change.columns = ["keyword", "percent_change"]
df_pct_change.to_csv("../data/streamlit/trend_pct_change.csv", index=False)

# 📁 3. Save top peaks per keyword
df_top_peaks_out = df_top_peaks[["date", "keyword", "search_interest"]].copy()
df_top_peaks_out.to_csv("../data/streamlit/trend_top_peaks.csv", index=False)

In [6]:
# 📁 1. Save long-form country interest data
df_country_long.to_csv("../data/streamlit/country_interest_summary.csv", index=False)

# 📁 2. Save total interest by keyword
df_country_total.to_csv("../data/streamlit/country_total_interest_by_keyword.csv", index=False)

# 📁 3. Save country appearance count in top 5 lists
df_country_top5.to_csv("../data/streamlit/country_top5_appearance_counts.csv", index=False)

In [7]:
# 📁 1. Save top 10 related queries per keyword
df_related_top10.to_csv("../data/streamlit/related_queries_top10.csv", index=False)

# 📁 2. Save rising related queries per keyword
df_related_rising10.to_csv("../data/streamlit/related_queries_rising10.csv", index=False)

# 📁 3. Save shared related queries across keywords
df_related_shared.to_csv("../data/streamlit/related_queries_shared.csv", index=False)

In [10]:
# ✅ FIXED version that handles 3 columns
df_country_top5_fixed = df_country_top5.reset_index()
df_country_top5_fixed.columns = ['keyword', 'country', 'total_interest']  # Adjust to actual column names

# Save to Streamlit folder
df_country_top5_fixed.to_csv("../data/streamlit/country_top5_appearance_counts.csv", index=False)

In [3]:
# ✅ Diagnostic check for all 3 country-level datasets for Streamlit

import pandas as pd

DATA_PATH = "../data/streamlit"

# --- 1. country_interest_summary.csv ---
print("\n--- Checking: country_interest_summary.csv ---")
try:
    df1 = pd.read_csv(f"{DATA_PATH}/country_interest_summary.csv")
    print(f"✅ Loaded successfully | Shape: {df1.shape}")
    print("🧾 Columns:", df1.columns.tolist())
    print("🔍 Sample row:\n", df1.head(1))
    
    expected_cols = {'country', 'keyword', 'date', 'interest'}
    missing = expected_cols - set(df1.columns)
    if missing:
        print(f"⚠️ Missing expected columns: {list(missing)}")
    else:
        print("✅ All expected columns present.")

except Exception as e:
    print("❌ Error loading file:", e)

# --- 2. country_total_interest_by_keyword.csv ---
print("\n--- Checking: country_total_interest_by_keyword.csv ---")
try:
    df2 = pd.read_csv(f"{DATA_PATH}/country_total_interest_by_keyword.csv")
    print(f"✅ Loaded successfully | Shape: {df2.shape}")
    print("🧾 Columns:", df2.columns.tolist())
    print("🔍 Sample row:\n", df2.head(1))
    
    expected_cols = {'country', 'keyword', 'total_interest'}
    missing = expected_cols - set(df2.columns)
    if missing:
        print(f"⚠️ Missing expected columns: {list(missing)}")
    else:
        print("✅ All expected columns present.")
        
except Exception as e:
    print("❌ Error loading file:", e)

# --- 3. country_top5_appearance_counts.csv ---
print("\n--- Checking: country_top5_appearance_counts.csv ---")
try:
    df3 = pd.read_csv(f"{DATA_PATH}/country_top5_appearance_counts.csv")
    print(f"✅ Loaded successfully | Shape: {df3.shape}")
    print("🧾 Columns:", df3.columns.tolist())
    print("🔍 Sample row:\n", df3.head(1))
    
    expected_cols = {'keyword', 'country', 'top5_count'}
    missing = expected_cols - set(df3.columns)
    if missing:
        print(f"⚠️ Missing expected columns: {list(missing)}")
    else:
        print("✅ All expected columns present.")
        
except Exception as e:
    print("❌ Error loading file:", e)


--- Checking: country_interest_summary.csv ---
✅ Loaded successfully | Shape: (350, 3)
🧾 Columns: ['country', 'keyword', 'interest']
🔍 Sample row:
      country     keyword  interest
0  Argentina  breathwork       3.0
⚠️ Missing expected columns: ['date']

--- Checking: country_total_interest_by_keyword.csv ---
✅ Loaded successfully | Shape: (350, 3)
🧾 Columns: ['country', 'keyword', 'total_interest']
🔍 Sample row:
      country     keyword  total_interest
0  Argentina  breathwork             3.0
✅ All expected columns present.

--- Checking: country_top5_appearance_counts.csv ---
✅ Loaded successfully | Shape: (11, 3)
🧾 Columns: ['keyword', 'country', 'top5_count']
🔍 Sample row:
    keyword  country  top5_count
0        0  Ireland           5
✅ All expected columns present.


In [9]:
# ✅ Reload base data: long-form interest data
df_country_long = pd.read_csv("../data/streamlit/country_interest_summary.csv")

# 🧼 Clean keyword and country columns
df_country_long["keyword"] = df_country_long["keyword"].astype(str).str.strip().str.lower()
df_country_long["country"] = df_country_long["country"].astype(str).str.strip()

# ✅ Count how many times each country appeared in the Top 5 for each keyword
df_top5 = (
    df_country_long
    .sort_values("interest", ascending=False)
    .groupby("keyword")
    .head(5)  # Take top 5 rows per keyword
    .groupby(["keyword", "country"])
    .size()
    .reset_index(name="top5_count")
)

# 💾 Save cleaned version
df_top5.to_csv("../data/streamlit/country_top5_appearance_counts.csv", index=False)
print("✅ Saved: country_top5_appearance_counts.csv")

✅ Saved: country_top5_appearance_counts.csv


In [11]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
df_rising = pd.read_csv("../data/streamlit/related_queries_rising10.csv")
df_rising["keyword"].value_counts()

keyword
breathwork           10
guided meditation    10
meditation           10
mindfulness          10
yoga nidra           10
Name: count, dtype: int64

In [6]:
print(df_related_shared.columns.tolist())

['keyword', 'related_query', 'query_type', 'popularity_score', 'num_keywords']


In [9]:
df = pd.read_csv("../data/streamlit/country_top5_appearance_counts.csv")

In [11]:
df

Unnamed: 0,keyword,country,top5_count
0,breathwork,Australia,1
1,breathwork,Ireland,1
2,breathwork,Netherlands,1
3,breathwork,New Zealand,1
4,breathwork,Switzerland,1
5,guided meditation,Australia,1
6,guided meditation,Canada,1
7,guided meditation,Ireland,1
8,guided meditation,New Zealand,1
9,guided meditation,United Kingdom,1
