In [1]:
from dotenv import load_dotenv
import os
load_dotenv()

api_key = os.environ.get('GOOGLE_API_KEY')
engine_id = os.environ.get('SEARCH_ENGINE_ID')

In [30]:
search_string = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={engine_id}&q='wojcicki'"

In [31]:
import requests

In [32]:
response = requests.get(search_string)

In [33]:
response

<Response [200]>

In [34]:
search_results =  response.json()

In [35]:
search_results

{'kind': 'customsearch#search',
 'url': {'type': 'application/json',
  'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'},
 'queries': {'request': [{'title': "Google Custom Search - 'wojcicki'",
    'totalResults': '3560000',
    'searchTerms': "'wojcicki'",
    'count': 10,
    'startIndex': 1,
    'inputEncoding': 'utf8',
    'outputEncoding': 'utf8',
    'safe': 'off',
    

In [36]:
search_results['items'][0]

{'kind': 'customsearch#result',
 'title': 'Susan Wojcicki - Wikipedia',
 'htmlTitle': 'Susan <b>Wojcicki</b> - Wikipedia',
 'link': 'https://en.wikipedia.org/wiki/Susan_Wojcicki',
 'displayLink': 'en.wikipedia.org',
 'snippet': 'Susan Wojcicki ... Susan Diane Wojcicki (/wʊˈtʃɪtski/ woo-CHITS-kee; born July 5, 1968) is an American business executive who was the chief executive officer (CEO)\xa0...',
 'htmlSnippet': 'Susan <b>Wojcicki</b> ... Susan Diane <b>Wojcicki</b> (/wʊˈtʃɪtski/ woo-CHITS-kee; born July 5, 1968) is an American business executive who was the chief executive officer (CEO)&nbsp;...',
 'cacheId': 'AVY79RRMlbEJ',
 'formattedUrl': 'https://en.wikipedia.org/wiki/Susan_Wojcicki',
 'htmlFormattedUrl': 'https://en.wikipedia.org/wiki/Susan_<b>Wojcicki</b>',
 'pagemap': {'hcard': [{'role': 'Business manager and executive advisor',
    'bday': '1968-07-05',
    'fn': 'Susan Wojcicki',
    'title': 'Former CEO of YouTube'}],
  'person': [{'role': 'Former CEO of YouTube'}],
  'met

In [37]:
import json

In [38]:
for res in search_results['items'][:10]:
    result = {}
    result['Title'] = res['title']
    result['Link'] = res['link']
    result['Snippet'] = res['snippet']
    with open('wojcicki.json', 'a+') as file:
        json.dump(result, file)
        file.write('\n')

In [39]:
result_list = []
with open('cases.json', 'r') as file:
    for line in file:
        result = json.loads(line)
        result_list.append(result)
print(result_list)

[{'Title': 'CASES: Home', 'Link': 'https://www.cases.org/', 'Snippet': 'From court and program offices in the Bronx, Brooklyn, and Manhattan, CASES provides community-based alternatives to jail and prison. These programs address\xa0...'}, {'Title': 'cases | International Criminal Court', 'Link': 'https://www.icc-cpi.int/cases', 'Snippet': 'Banda. At large. Case also involved Saleh Mohammed Jerbo Jamus but proceedings against him were terminated on 4 October 2013 following his passing. On 7 March\xa0...'}, {'Title': '2023-2024 Term | Oyez', 'Link': 'https://www.oyez.org/cases', 'Snippet': 'US Supreme Court cases from the 2023-2024 term.'}, {'Title': 'CASETiFY: Show Your Colors', 'Link': 'https://www.casetify.com/', 'Snippet': 'CASETiFY - We make the most sustainable yet protective phone cases for iPhone 15 / iPhone 15 Pro / iPhone 15 Plus / iPhone 15 Pro Max and tech accessories.'}, {'Title': 'COVID - Coronavirus Statistics - Worldometer', 'Link': 'https://www.worldometers.info/coronavi

In [58]:
import regex as re
pattern = re.compile(r'''
’s|’t|’re|’ve|’m|’ll|’d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+”
''')

def extract_words(result_list):
    word_lists = []
    vocab = set()
    for result in result_list:
        title = result.get('Title', '')
        snippet = result.get('Snippet', '')
        title_words = re.findall(pattern, title.lower())
        snippet_words = re.findall(pattern, snippet.lower())
        vocab.update(title_words + snippet_words)
        word_lists.append({'title': title_words, 'snippet': snippet_words})
    return word_lists, vocab

In [60]:
word_list, vocab = extract_words(result_list)

In [63]:
word_list

[{'title': ['cases', ':', ' home'],
  'snippet': ['from',
   ' court',
   ' and',
   ' program',
   ' offices',
   ' in',
   ' the',
   ' bronx',
   ',',
   ' brooklyn',
   ',',
   ' and',
   ' manhattan',
   ',',
   ' cases',
   ' provides',
   ' community',
   '-',
   'based',
   ' alternatives',
   ' to',
   ' jail',
   ' and',
   ' prison',
   '.',
   ' these',
   ' programs',
   ' address',
   '...']},
 {'title': ['cases', ' |', ' international', ' criminal', ' court'],
  'snippet': ['banda',
   '.',
   ' at',
   ' large',
   '.',
   ' case',
   ' also',
   ' involved',
   ' saleh',
   ' mohammed',
   ' jerbo',
   ' jamus',
   ' but',
   ' proceedings',
   ' against',
   ' him',
   ' were',
   ' terminated',
   ' on',
   ' 4',
   ' october',
   ' 2013',
   ' following',
   ' his',
   ' passing',
   '.',
   ' on',
   ' 7',
   ' march',
   '...']},
 {'title': ['2023', '-', '2024', ' term', ' |', ' oyez'],
  'snippet': ['us',
   ' supreme',
   ' court',
   ' cases',
   ' from',
   ' 