# 01 — Data Ingestion Prototype (⚠️ Archived)

This notebook shows the **first manual prototype** of data ingestion for the Meditation Trend Pulse project.  

- It pulls **Google Trends data** (interest over time, interest by country, related queries).  
- Originally, these outputs were saved under `../data/raw/`.  
- **Note:** This workflow has been fully replaced by the automated script [`automation/update_all_datasets.py`](../automation/update_all_datasets.py), which now runs daily under cron, handles retries, and produces production-ready datasets in `../data/streamlit/`.  

👉 This notebook is kept **for historical/portfolio purposes** to demonstrate the project’s evolution from manual ingestion → automation.

In [None]:
# 📦 Imports
from pytrends.request import TrendReq
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os

# Search terms tracked by this prototype
keywords = [
    "meditation",
    "mindfulness",
    "breathwork",
    "guided meditation",
    "yoga nidra",
]

# Open the Google Trends session used by every request in this notebook
pytrends = TrendReq(hl="en-US", tz=0)

# Register all keywords in a single payload: last five years, no geo filter
pytrends.build_payload(kw_list=keywords, timeframe="today 5-y", geo="")

# Pull interest over time and discard the 'isPartial' flag column when present
df_trends = pytrends.interest_over_time().drop(columns="isPartial", errors="ignore")

# Quick look at the first rows
df_trends.head()

# Build one long-format table of regional interest: a row per (country, keyword).
country_columns = ["country", "interest", "keyword"]

# One tidy frame per keyword, combined after the loop
country_frames = []

for kw in keywords:
    # Regional interest is requested one keyword at a time
    pytrends.build_payload([kw], timeframe="today 5-y", geo="")
    df_region = pytrends.interest_by_region()

    # Without a column for this keyword the filter below would raise KeyError
    # and abort the whole run, so skip the keyword and keep going.
    if kw not in df_region.columns:
        print(f"No regional interest data returned for '{kw}'; skipping.")
        continue

    # Keep only regions with non-zero interest. reset_index() turns the region
    # index into a regular column (expected to be named 'geoName', as the
    # selection below requires).
    df_region = df_region[df_region[kw] > 0].reset_index()

    # Reshape to the shared (country, interest, keyword) layout
    df_region = (
        df_region[["geoName", kw]]
        .rename(columns={"geoName": "country", kw: "interest"})
        .assign(keyword=kw)
    )

    country_frames.append(df_region)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# that still carries the expected columns.
if country_frames:
    df_country = pd.concat(country_frames, ignore_index=True)
else:
    df_country = pd.DataFrame(columns=country_columns)

# Preview result
df_country.head()

# Collect the related queries for every keyword into one long-format table
# with a row per (keyword, query, type).
related_columns = ["keyword", "query", "type", "value"]

# Flat list of row dicts, filled keyword by keyword
related_rows = []

for kw in keywords:
    # Related queries are requested one keyword at a time
    pytrends.build_payload([kw], timeframe="today 5-y", geo="")

    related = pytrends.related_queries()

    # A keyword may be missing from the response, or mapped to None;
    # treat both as "no related queries".
    per_keyword = related.get(kw) or {}

    for qtype in ("top", "rising"):
        df_q = per_keyword.get(qtype)
        if df_q is None:
            continue

        # Walk the two needed columns directly instead of iterrows(), which
        # builds a Series per row.
        related_rows.extend(
            {"keyword": kw, "query": query, "type": qtype, "value": value}
            for query, value in zip(df_q["query"], df_q["value"])
        )

# Passing the columns explicitly keeps the schema (and the CSV header) intact
# even when no related queries were collected at all.
df_related_queries = pd.DataFrame(related_rows, columns=related_columns)

# Preview
df_related_queries.head()

# Make sure the raw-data folder exists before anything is written
os.makedirs("../data/raw", exist_ok=True)

# Destination path -> frame to persist there
raw_outputs = {
    "../data/raw/interest_over_time.csv": df_trends,
    "../data/raw/interest_by_country.csv": df_country,
    "../data/raw/related_queries.csv": df_related_queries,
}

# Write each dataset with to_csv defaults (the frame's index is included)
for csv_path, frame in raw_outputs.items():
    frame.to_csv(csv_path)