In [1]:
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment
# (python-dotenv) before any credentials are read from os.environ.
load_dotenv()

# Credentials used to build the Custom Search request URL. Each value is
# None when the corresponding environment variable is not set.
api_key = os.getenv('GOOGLE_API_KEY')
engine_id = os.getenv('SEARCH_ENGINE_ID')

In [2]:
# Keys are the fields expected on every search result; values are the
# labels chosen for those fields.
feature_mapping = dict(link='URL', title='Title', snippet='Summary')

In [3]:
# The result fields named in feature_mapping, as a set (iterating a dict
# yields its keys).
set(feature_mapping)

{'link', 'snippet', 'title'}

In [4]:
from urllib.parse import urlencode

# Build the query string with urlencode so every value is percent-encoded.
# Splicing raw values into the URL breaks the request as soon as a key or a
# search term contains '&', '#', '+' or a space.
# The search term deliberately keeps its literal single quotes.
search_params = {'key': api_key, 'cx': engine_id, 'q': "'cases'"}
search_string = f"https://www.googleapis.com/customsearch/v1?{urlencode(search_params)}"

In [5]:
import requests

In [6]:
# requests applies no timeout by default, so an unresponsive server would
# block this cell forever. Fail after 30 seconds instead.
response = requests.get(search_string, timeout=30)

In [7]:
# Echo the Response object so its HTTP status shows in the cell output.
response

<Response [200]>

In [8]:
# Decode the JSON body of the response into Python objects.
search_results = response.json()

In [9]:
# Show the whole decoded payload to inspect its top-level structure; the
# individual results are read from its 'items' list in later cells.
search_results

{'kind': 'customsearch#search',
 'url': {'type': 'application/json',
  'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'},
 'queries': {'request': [{'title': "Google Custom Search - 'cases'",
    'totalResults': '17620000000',
    'searchTerms': "'cases'",
    'count': 10,
    'startIndex': 1,
    'inputEncoding': 'utf8',
    'outputEncoding': 'utf8',
    'safe': 'off',
    'c

In [10]:
# Inspect the first result to see which fields a single item carries.
search_results['items'][0]

{'kind': 'customsearch#result',
 'title': 'CASES: Home',
 'htmlTitle': '<b>CASES</b>: Home',
 'link': 'https://www.cases.org/',
 'displayLink': 'www.cases.org',
 'snippet': 'From court and program offices in the Bronx, Brooklyn, and Manhattan, CASES provides community-based alternatives to jail and prison. These programs address\xa0...',
 'htmlSnippet': 'From court and program offices in the Bronx, Brooklyn, and Manhattan, <b>CASES</b> provides community-based alternatives to jail and prison. These programs address&nbsp;...',
 'cacheId': 'oGi_vvQHU1cJ',
 'formattedUrl': 'https://www.cases.org/',
 'htmlFormattedUrl': 'https://www.<b>cases</b>.org/',
 'pagemap': {'cse_thumbnail': [{'src': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQRF25kiadwOBckOt3dy-eOzuOJIAJjzu_ePaHVSK4PJ83rhMLzA-5S0g',
    'width': '551',
    'height': '59'}],
  'metatags': [{'og:image': 'https://www.cases.org/wp-content/uploads/2016/08/CASES-Logo-Color.png',
    'og:type': 'article',
    'article:publishe

In [11]:
# Field names present on the first result (iterating a dict yields its keys).
set(search_results['items'][0])

{'cacheId',
 'displayLink',
 'formattedUrl',
 'htmlFormattedUrl',
 'htmlSnippet',
 'htmlTitle',
 'kind',
 'link',
 'pagemap',
 'snippet',
 'title'}

In [13]:
# True when every field named in feature_mapping is present on the first
# result. This has to be a subset test: `a not in b` with two sets asks
# whether `a` is an *element* of `b`, which is True whatever the keys are.
set(feature_mapping).issubset(search_results['items'][0])

True

In [16]:
# Check that every returned result carries all fields named in
# feature_mapping. Iterate over the items actually returned (at most the
# first 10) instead of indexing 0..9, which raises IndexError when fewer
# than 10 results come back. A payload with no 'items' key has nothing to
# validate, so it is treated as an empty list.
required_features = set(feature_mapping)
for item in search_results.get('items', [])[:10]:
    if not required_features.issubset(item):
        print("QUERY ERROR: Features missing in the response. Check feature_mapping.")

In [37]:
import json

In [38]:
# Persist the first 10 results as JSON Lines (one JSON object per line).
# - The file is 'cases.json', the same file the loading cell reads back.
# - Mode 'w' makes the cell idempotent: re-running it rewrites the file
#   instead of appending duplicate records.
# - The file is opened once for the whole loop rather than once per record.
with open('cases.json', 'w', encoding='utf-8') as file:
    for res in search_results['items'][:10]:
        result = {
            'Title': res['title'],
            'Link': res['link'],
            'Snippet': res['snippet'],
        }
        json.dump(result, file)
        file.write('\n')

In [39]:
# Read the saved results back from the JSON Lines file, one record per line.
result_list = []
with open('cases.json', 'r', encoding='utf-8') as file:
    for line in file:
        # A blank line (e.g. a trailing newline added by hand) is not a
        # record; json.loads would raise on it, so skip it.
        if not line.strip():
            continue
        result = json.loads(line)
        result_list.append(result)
print(result_list)

[{'Title': 'CASES: Home', 'Link': 'https://www.cases.org/', 'Snippet': 'From court and program offices in the Bronx, Brooklyn, and Manhattan, CASES provides community-based alternatives to jail and prison. These programs address\xa0...'}, {'Title': 'cases | International Criminal Court', 'Link': 'https://www.icc-cpi.int/cases', 'Snippet': 'Banda. At large. Case also involved Saleh Mohammed Jerbo Jamus but proceedings against him were terminated on 4 October 2013 following his passing. On 7 March\xa0...'}, {'Title': '2023-2024 Term | Oyez', 'Link': 'https://www.oyez.org/cases', 'Snippet': 'US Supreme Court cases from the 2023-2024 term.'}, {'Title': 'CASETiFY: Show Your Colors', 'Link': 'https://www.casetify.com/', 'Snippet': 'CASETiFY - We make the most sustainable yet protective phone cases for iPhone 15 / iPhone 15 Pro / iPhone 15 Plus / iPhone 15 Pro Max and tech accessories.'}, {'Title': 'COVID - Coronavirus Statistics - Worldometer', 'Link': 'https://www.worldometers.info/coronavi

In [58]:
# The third-party `regex` module is required here: the pattern uses Unicode
# property classes (\p{L}, \p{N}) that the standard-library `re` rejects.
import regex as re

# Tokenizer pattern. Alternatives, in order:
#   1. contraction suffixes after a straight or curly apostrophe ('s, ’t, ...)
#   2. a run of letters, optionally preceded by one space
#   3. a run of digits, optionally preceded by one space
#   4. a run of other non-space characters, optionally preceded by one space
#   5. whitespace that is not directly followed by a non-space character
# Whitespace matched by none of these (e.g. a lone tab or non-breaking space
# right before a token) is skipped by findall and yields no token.
# The literal is kept on a single line on purpose: the pattern is compiled
# without re.VERBOSE, so a line break inside it would become part of the
# first and last alternatives and stop them from matching.
pattern = re.compile(
    r"['’](?:s|t|re|ve|m|ll|d)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)"
)

def extract_words(result_list, token_pattern=None):
    """Tokenize the title and snippet of every search result.

    Args:
        result_list: Iterable of dicts that may carry 'Title' and 'Snippet'
            string fields. A missing or None field is treated as an empty
            string.
        token_pattern: Optional compiled pattern (any object with a
            ``findall(str)`` method) used to split the text. Defaults to the
            module-level ``pattern``.

    Returns:
        A ``(word_lists, vocab)`` tuple. ``word_lists`` has one
        ``{'title': [...], 'snippet': [...]}`` dict per input record, in
        input order; ``vocab`` is the set of all distinct tokens seen.
        Text is lower-cased before tokenizing.
    """
    if token_pattern is None:
        # Looked up at call time so the notebook-level pattern can be redefined.
        token_pattern = pattern

    word_lists = []
    vocab = set()
    for result in result_list:
        # `or ''` also covers fields that are present but set to None.
        title_words = token_pattern.findall((result.get('Title') or '').lower())
        snippet_words = token_pattern.findall((result.get('Snippet') or '').lower())
        vocab.update(title_words, snippet_words)
        word_lists.append({'title': title_words, 'snippet': snippet_words})
    return word_lists, vocab

In [60]:
# Tokenize every loaded result: word_list holds the per-result title and
# snippet tokens, vocab the set of distinct tokens across all results.
word_list, vocab = extract_words(result_list)

In [63]:
# Display the per-result token lists.
word_list

[{'title': ['cases', ':', ' home'],
  'snippet': ['from',
   ' court',
   ' and',
   ' program',
   ' offices',
   ' in',
   ' the',
   ' bronx',
   ',',
   ' brooklyn',
   ',',
   ' and',
   ' manhattan',
   ',',
   ' cases',
   ' provides',
   ' community',
   '-',
   'based',
   ' alternatives',
   ' to',
   ' jail',
   ' and',
   ' prison',
   '.',
   ' these',
   ' programs',
   ' address',
   '...']},
 {'title': ['cases', ' |', ' international', ' criminal', ' court'],
  'snippet': ['banda',
   '.',
   ' at',
   ' large',
   '.',
   ' case',
   ' also',
   ' involved',
   ' saleh',
   ' mohammed',
   ' jerbo',
   ' jamus',
   ' but',
   ' proceedings',
   ' against',
   ' him',
   ' were',
   ' terminated',
   ' on',
   ' 4',
   ' october',
   ' 2013',
   ' following',
   ' his',
   ' passing',
   '.',
   ' on',
   ' 7',
   ' march',
   '...']},
 {'title': ['2023', '-', '2024', ' term', ' |', ' oyez'],
  'snippet': ['us',
   ' supreme',
   ' court',
   ' cases',
   ' from',
   ' 