## Search Volumes

In [38]:
# imports
import pandas as pd
import os
from dotenv import load_dotenv
from serpapi import GoogleSearch
from datetime import date
import nltk
from nltk.corpus import words


### Importing Data

Importing search terms csv as python dataframe.

In [39]:
# Getting serpapi key
load_dotenv()
api_key = os.getenv("SERPAI_KEY")

# Read the CSV file, skipping the first two rows
search_terms_df = pd.read_csv("Search_terms.csv", skiprows=2)

  search_terms_df = pd.read_csv("Search_terms.csv", skiprows=2)


In [40]:
search_terms_df.head()

Unnamed: 0,Search term,Match type,Added/Excluded,Campaign,Ad group,Currency code,Avg. CPM,Impr.,Interactions,Interaction rate,Avg. cost,Cost,Campaign type,Conv. rate,Conversions,Cost / conv.
0,mental health support line,Exact match (close variant),,Nebo - Helmsley Grant - Nonbrand - Crisis Support,Helpline / Hotline,USD,280.0,40,5,12.50%,2.24,11.2,Search,180.00%,9.0,1.24
1,depression support group,Broad match,,Nebo - Helmsley Grant - Nonbrand - Crisis Support,Helpline / Hotline,USD,187.06,17,1,5.88%,3.18,3.18,Search,100.00%,1.0,3.18
2,teen help hotline,Broad match,,Nebo - Helmsley Grant - Nonbrand - Crisis Support,Suicide Prevention,USD,456.25,16,4,25.00%,1.83,7.3,Search,100.00%,4.0,1.83
3,family acceptance project lds,Broad match,,Nebo - Helmsley Grant - Nonbrand - Crisis Support,Helpline / Hotline,USD,840.0,1,1,100.00%,0.84,0.84,Search,100.00%,1.0,0.84
4,hope line,Broad match,,Nebo - Helmsley Grant - Nonbrand - Crisis Support,Suicide Prevention,USD,605.0,6,1,16.67%,3.63,3.63,Search,200.00%,2.0,1.82


In [41]:
search_terms_df.size

2515920

### Removing Duplicates

In [42]:
len(search_terms_df["Search term"].value_counts())

125331

In [None]:
unique_search_terms = search_terms_df["Search term"].unique().tolist()

['mental health support line',
 'depression support group',
 'teen help hotline',
 'family acceptance project lds',
 'hope line',
 'mental health support for homeless',
 'helpline',
 'trevor project number',
 'where to talk to someone about depression',
 'text for mental health',
 'what is the helpline',
 '24hr mental health line',
 'suicide chat line',
 'phone numbers to call when you need someone to talk to',
 'places to call for help',
 'international mental health hotline',
 'crisis intervention',
 'crisis hotline iowa',
 'trevor text line',
 'i just need to talk to somebody',
 'washington state crisis hotline',
 'depressed chat',
 '24 7 hotline',
 'grief support number',
 'lgbtq housing programs',
 'call if you need help',
 'hotline 211',
 'what to say to someone who is mentally struggling',
 'how to help your child with suicidal thoughts',
 'gay relationship advice',
 'parents help line',
 'social support groups near me',
 'youth mental health resources',
 'how to help someone co

### Filtering Search Terms for Two Words or Less

In [65]:
short_terms = [term for term in unique_search_terms if len(term.split()) <= 2]
len(short_terms)

36847

### Removing Search Terms With Typos

In [66]:
# Download once (if you haven't already)
nltk.download('words')

# English vocabulary set
english_vocab = set(w.lower() for w in words.words())

# Filter list: keep if 1 or 2 words, and all words are real
filtered_terms = [
    term for term in short_terms
    if 1 <= len(term.split()) <= 2 and all(word.lower() in english_vocab for word in term.split())
]

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\emily\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [67]:
filtered_terms

['hope line',
 'crisis intervention',
 'depressed chat',
 'at crisis',
 'ask help',
 'teen psychologist',
 'he they',
 'pansexual meaning',
 'dominant lesbian',
 'queer news',
 'inclusive sexuality',
 'bisexual awareness',
 'gay commitment',
 'homosexuality lecture',
 'situational homosexuality',
 'bisexual erasure',
 'gay newsletter',
 'bisexual test',
 'gay men',
 'bisexual cock',
 'become female',
 'lesbian flag',
 'queer world',
 'gay stereotype',
 'queer person',
 'she they',
 'chosen donate',
 'pride store',
 'glisten organization',
 'toy donation',
 'donate canada',
 'heifer donate',
 'pride austin',
 'coastside pride',
 'pride community',
 'forced donation',
 'pride live',
 'donate california',
 'call pride',
 'uta pride',
 'pow donation',
 'charity matching',
 'resupply donation',
 'corporate pride',
 'honorary donation',
 'simple give',
 'echo charity',
 'charity good',
 'pride ride',
 'arc donation',
 'northwest donation',
 'queer day',
 'homosexual ad',
 'donate me',
 'disa

In [68]:
len(filtered_terms)

9680

In [69]:
# Convert to DataFrame
search_terms_df = pd.DataFrame(filtered_terms, columns=['Search Term'])

In [70]:
search_terms_df.size

9680

### Removing Terms Without search_interest_over_time Values

In [71]:
# Checking if keyword has interest_over_time data and removing that keyword if it doesn't
def check_interest(keyword, api_key):
    """Return True if keyword has Google Trends interest, False otherwise."""
    params = {
        "engine": "google_trends",
        "q": keyword,
        "api_key": api_key
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    # Safely get timeline_data
    timeline_data = results.get("interest_over_time", {}).get("timeline_data", [])

    return bool(timeline_data)  # True if non-empty, False if empty

# Apply to dataframe and filter rows
search_terms_df["Has Interest"] = search_terms_df["Search Term"].apply(lambda x: check_interest(x, api_key))
filtered_df = search_terms_df[search_terms_df["Has Interest"]]
filtered_df = filtered_df.drop(columns=["Has Interest"])

print("dataframe size: " + str(filtered_df.size))

KeyboardInterrupt: 

### Finding Percent Change

In [None]:
def percent_change_from_timeline(timeline_data):
    """Calculate percent change from timeline data (assumes data exists)."""
    first_value = timeline_data[0]['values'][0]['extracted_value']
    last_value = timeline_data[-1]['values'][0]['extracted_value']

    if first_value == 0:
        return None  # Avoid divide by zero

    return ((last_value - first_value) / first_value) * 100

In [None]:
# Apply percent change calculation to each search term
filtered_df["Growth Percentage"] = filtered_df["Search term"].apply(
    lambda keyword: percent_change_from_timeline(get_interest_data(keyword, api_key))
)

sorted_search_terms_df = filtered_df.sort('Growth Percentage', descending = True)

# Return top 10 keywords
top_10_keywords = sorted_search_terms_df["Search term"].head(10).tolist()
top_10_keywords

KeyboardInterrupt: 

interest_over_time in SerpApi’s Google Trends engine refers to how popular a search term is over a specified period, Google doesn't publicly release information about search volume. 

However, there is publicly released information about Google Ad search volume. If we want to be more accurate, we can try to benchmark some of our search words against that in the future.

make a hashmap, assign -10000 for terms that don't show up