# Using Langchain Loaders to Capture and Store Search Engine Results

Code and Answers below from ChatGPT 4o 

- Created on: 2024.06.17, 11:00:00 PM


## USER
How do I use Python with the Langchain loader to capture the results from a web search and then parse out all URLs with their search record metadata.  Provide complete examples for Google, DuckDuckGo, and Bing.  Make sure to use the free SERP API only.  I don't want to have to authenticate.

## ASSISTANT
To capture the results from a web search and parse out all URLs with their search record metadata using Python with LangChain, you can use the free SERP API. Here’s how you can do it for Google, DuckDuckGo, and Bing.


In [None]:

### Prerequisites
1. **Install necessary libraries:**
   You need to install `requests` for making HTTP requests and `LangChain` for handling the search results.

   ```bash
   pip install requests langchain
   ```

### Google Search Example

1. **Set up the SERP API endpoint and parameters:**


## Configuration

***Replace `"YOUR_API_KEY"` with the actual free API key you get from SERP API.***

This example demonstrates how to perform searches on Google, DuckDuckGo, and Bing 
using the free SERP API, parse the results, and integrate it with LangChain for a custom loader.

In [None]:
import requests

import os 
serpapi_api_key = os.environ["serpapi_api_key"]

def google_search(query, serpapi_api_key, timeout=30):
    """Send one search request to the SerpApi endpoint and return its JSON.

    Args:
        query: Search terms, sent as the ``q`` parameter.
        serpapi_api_key: SerpApi key, sent as the ``api_key`` query parameter.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The response body decoded with ``response.json()``.
    """
    url = "https://serpapi.com/search"
    params = {
        "q": query,
        "location": "United States",
        "hl": "en",
        "gl": "us",
        "api_key": serpapi_api_key,  # Use the free key provided by SERP API
    }
    response = requests.get(url, params=params, timeout=timeout)
    # response.url is deliberately not printed: it embeds the API key.
    print(f"Status Code: {response.status_code}")
    return response.json()

def parse_google_results(results):
    """Reduce a search response dict to a list of title/link/snippet records.

    Only the ``organic_results`` list is read; a response without that key
    yields an empty list. Fields missing from an entry come back as ``None``.
    """
    wanted_fields = ('title', 'link', 'snippet')
    organic = results.get('organic_results', [])
    return [
        {field: entry.get(field) for field in wanted_fields}
        for entry in organic
    ]

from datetime import datetime
import json

def save_results_to_file(query, results):
    """Dump ``results`` as indented JSON into a file named after ``query``.

    The file is created in the current working directory. Its name is the
    query with spaces turned into underscores and Windows-forbidden
    characters dropped, followed by a ``YYYYMMDD_HHMMh`` timestamp.
    """
    # Characters that Windows does not allow in filenames
    forbidden = str.maketrans("", "", '<>:"/\\|?*')
    stem = query.replace(" ", "_").translate(forbidden)

    # Minute-resolution stamp keeps successive runs apart
    stamp = datetime.now().strftime("%Y%m%d_%H%Mh")
    filename = f"{stem}_{stamp}.json"

    with open(filename, 'w') as handle:
        json.dump(results, handle, indent=4)
    print(f"Results saved to {filename}")
    
if __name__ == "__main__":
    # Demo run: issue one search, keep the raw response, and print the
    # reduced title/link/snippet records.
    query = "Python web scraping"
    results = google_search(query, serpapi_api_key)
    parsed_results = parse_google_results(results)

    # Save the results to a JSON file
    # (the full raw response is written, not the reduced records)
    save_results_to_file(query, results)
    
    # Number of organic entries extracted from this response
    print(f"Total results: {len(parsed_results)}")
    for result in parsed_results:
        print(result)
        
        
# TODO: check monthly account usage so far:
# TODO:     https://serpapi.com/account?api_key=SECRET_API_KEY

## V2 query for "using Python langchain Search Results Loader youtube.com"

This version only returns 10 results per page.  

In [None]:
import os 
import json
import requests
from datetime import datetime
serpapi_api_key = os.environ["serpapi_api_key"]

def google_search(query, serpapi_api_key, timeout=30):
    """Query the SerpApi search endpoint and return the decoded JSON body.

    Args:
        query: Search terms, sent as the ``q`` parameter.
        serpapi_api_key: SerpApi key, sent as the ``api_key`` query parameter.
        timeout: Seconds to wait for the HTTP response; prevents the call
            from blocking forever when the server never answers.

    Returns:
        The response body decoded with ``response.json()``.
    """
    url = "https://serpapi.com/search"
    params = {
        "q": query,
        "location": "United States",
        "hl": "en",
        "gl": "us",
        "api_key": serpapi_api_key,  # Use the free key provided by SERP API
    }
    response = requests.get(url, params=params, timeout=timeout)
    # response.url is deliberately not printed: it embeds the API key.
    print(f"Status Code: {response.status_code}\n")
    return response.json()

def parse_google_results(results):
    """Extract title, link and snippet from every organic search result.

    Reads ``results['organic_results']`` (empty when the key is absent) and
    returns one dict per entry; absent fields are reported as ``None``.
    """
    def _summarize(entry):
        # Keep only the three fields of interest, in a fixed order.
        return {
            'title': entry.get('title'),
            'link': entry.get('link'),
            'snippet': entry.get('snippet'),
        }

    return [_summarize(entry) for entry in results.get('organic_results', [])]

def save_results_to_file(query, results):
    """Write ``results`` as indented JSON to a query-derived, timestamped file.

    The filename is built from ``query`` (spaces become underscores and
    Windows-forbidden characters are removed) plus a ``YYYYMMDD_HHMMh``
    stamp. The file lands in the current working directory.
    """
    # Strip characters that Windows rejects in filenames
    strip_table = str.maketrans("", "", '<>:"/\\|?*')
    safe_query = query.replace(" ", "_").translate(strip_table)

    # Timestamp suffix distinguishes runs of the same query
    filename = "{}_{}.json".format(
        safe_query, datetime.now().strftime("%Y%m%d_%H%Mh")
    )

    with open(filename, 'w') as out:
        json.dump(results, out, indent=4)
    print(f"Results saved to {filename}\n")
    
if __name__ == "__main__":
    # Demo run for the V2 query; the earlier sample query is kept for reference.
    # query = "Python web scraping"
    query = "using Python langchain Search Results Loader youtube.com"
    
    # response.json() is returned from google_search() and assigned to `results`
    results = google_search(query, serpapi_api_key)     
    parsed_results = parse_google_results(results)

    # Save the results to a JSON file
    # (the full raw response is written, not the reduced records)
    save_results_to_file(query, results)
    
    # Number of organic entries extracted from this response
    print(f"Total results: {len(parsed_results)}\n")
    for result in parsed_results:
        print(result)
        
        
# TODO: check monthly account usage so far:
# TODO:     https://serpapi.com/account?api_key=SECRET_API_KEY

# V3 - Count Search Results

In [None]:
import requests

import os 
serpapi_api_key = os.environ["serpapi_api_key"]

def google_search_results_count(query, serpapi_api_key, timeout=30):
    """Run a SerpApi search and report the total result count it advertises.

    Args:
        query: Search terms, sent as the ``q`` parameter.
        serpapi_api_key: SerpApi key, sent as the ``api_key`` query parameter.
        timeout: Seconds to wait for the HTTP response; prevents the call
            from blocking forever when the server never answers.

    Returns:
        ``(response_json, results_count)``. ``results_count`` is the value of
        ``search_information.total_results`` in the response, or ``None``
        when the response lacks it. Any non-200 status yields ``({}, 0)``.
    """
    url = "https://serpapi.com/search"
    params = {
        "q": query,
        "location": "United States",
        "hl": "en",
        "gl": "us",
        "api_key": serpapi_api_key,  # Use the free key provided by SERP API
    }
    response = requests.get(url, params=params, timeout=timeout)
    # response.url is deliberately not printed: it embeds the API key.
    print(f"Status Code: {response.status_code}")

    if response.status_code != 200:
        return {}, 0

    response_json = response.json()
    results_count = response_json.get('search_information', {}).get('total_results')
    print(f"Number of search results returned: {results_count}")
    return response_json, results_count

def parse_google_results(results):
    """Return title/link/snippet dicts for each entry in ``organic_results``.

    A response without ``organic_results`` produces an empty list; a field
    missing from an entry is returned as ``None``.
    """
    return [
        {
            'title': hit.get('title'),
            'link': hit.get('link'),
            'snippet': hit.get('snippet'),
        }
        for hit in results.get('organic_results', [])
    ]

from datetime import datetime
import json

def save_results_to_file(query, results):
    """Persist ``results`` as indented JSON under a name derived from ``query``.

    Spaces in the query become underscores, Windows-forbidden characters are
    dropped, and a ``YYYYMMDD_HHMMh`` timestamp is appended. The file is
    written to the current working directory.
    """
    # Remove characters that are illegal in Windows filenames
    illegal = str.maketrans("", "", '<>:"/\\|?*')
    base_name = query.replace(" ", "_").translate(illegal)

    # Timestamp suffix keeps separate runs in separate files
    moment = datetime.now().strftime("%Y%m%d_%H%Mh")
    filename = base_name + "_" + moment + ".json"

    with open(filename, 'w') as sink:
        json.dump(results, sink, indent=4)
    print(f"Results saved to {filename}")
    
if __name__ == "__main__":
    # Demo run for the V3 query; the earlier sample query is kept for reference.
    # query = "Python web scraping"
    query = "using Python langchain Search Results Loader youtube.com"

    response_json, results_count = google_search_results_count(query, serpapi_api_key)

    # Parse the response fetched above. The previous code read `results`,
    # which this cell never assigns (NameError, or stale data from an
    # earlier cell).
    parsed_results = parse_google_results(response_json)

    # Save the raw response to a JSON file
    save_results_to_file(query, response_json)

    # Number of organic entries extracted from this response
    print(f"Total results: {len(parsed_results)}")
    for result in parsed_results:
        print(result)
        
        
# TODO: check monthly account usage so far:
# TODO:     https://serpapi.com/account?api_key=SECRET_API_KEY

## Avoid transmitting API keys in the URL endpoint for APIs

Use alternative methods such as setting the API key in HTTP headers or using POST requests with the API key in the body, depending on what the API supports. However, it's important to check the specific API documentation because not all APIs support these methods. For SerpApi and many others, the common method is to use the API key as a query parameter, but we'll explore a general approach using headers for APIs that support it.

### General Approach Using HTTP Headers
- Check API Documentation: First, verify in the API documentation if it allows authentication through HTTP headers.

- Use the Authorization Header: If supported, you can typically use the Authorization header to pass the API key or token.

#### Example with Requests Library in Python
If the API supports passing the API key through headers, you can modify your request like this:

In [None]:
import requests

import os 
serpapi_api_key = os.environ["serpapi_api_key"]

def google_search_results_count(query, serpapi_api_key, timeout=30):
    """Run a search with the API key sent in an ``Authorization`` header.

    This variant keeps the key out of the request URL by sending it as a
    Bearer token instead of a query parameter.

    NOTE(review): this only works against an API that accepts header-based
    authentication; confirm the target endpoint supports it before relying
    on this variant.

    Args:
        query: Search terms, sent as the ``q`` parameter.
        serpapi_api_key: API key, sent as ``Authorization: Bearer <key>``.
        timeout: Seconds to wait for the HTTP response; prevents the call
            from blocking forever when the server never answers.

    Returns:
        ``(response_json, results_count)``. ``results_count`` is the value of
        ``search_information.total_results`` in the response, or ``None``
        when the response lacks it. Any non-200 status yields ``({}, 0)``.
    """
    url = "https://serpapi.com/search"
    headers = {
        "Authorization": f"Bearer {serpapi_api_key}"
    }
    params = {
        "q": query,
        "location": "United States",
        "hl": "en",
        "gl": "us",
    }
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    print(f"Status Code: {response.status_code}")

    if response.status_code != 200:
        return {}, 0

    response_json = response.json()
    results_count = response_json.get('search_information', {}).get('total_results')
    print(f"Number of search results returned: {results_count}")
    return response_json, results_count

def parse_google_results(results):
    """Project each organic result onto its title, link and snippet.

    ``results`` is a response dict; its ``organic_results`` list defaults to
    empty when absent, and missing fields are returned as ``None``.
    """
    columns = ['title', 'link', 'snippet']
    rows = []
    for item in results.get('organic_results', []):
        rows.append(dict((column, item.get(column)) for column in columns))
    return rows

from datetime import datetime
import json

def save_results_to_file(query, results):
    """Store ``results`` as indented JSON in a timestamped, query-named file.

    The name is ``<sanitized query>_<YYYYMMDD_HHMMh>.json`` where the query
    has spaces replaced by underscores and Windows-forbidden characters
    removed. The file is created in the current working directory.
    """
    # Drop every character Windows refuses in a filename
    banned = '<>:"/\\|?*'
    underscored = query.replace(" ", "_")
    cleaned = "".join(ch for ch in underscored if ch not in banned)

    # Timestamp suffix separates repeated runs
    run_stamp = datetime.now().strftime("%Y%m%d_%H%Mh")
    filename = f"{cleaned}_{run_stamp}.json"

    with open(filename, 'w') as target:
        json.dump(results, target, indent=4)
    print(f"Results saved to {filename}")
    
if __name__ == "__main__":
    # Demo run for the header-authenticated variant; the earlier sample
    # query is kept for reference.
    # query = "Python web scraping"
    query = "using Python langchain Search Results Loader youtube.com"

    response_json, results_count = google_search_results_count(query, serpapi_api_key)

    # Parse the response fetched above. The previous code read `results`,
    # which this cell never assigns (NameError, or stale data from an
    # earlier cell).
    parsed_results = parse_google_results(response_json)

    # Save the raw response to a JSON file
    save_results_to_file(query, response_json)

    # Number of organic entries extracted from this response
    print(f"Total results: {len(parsed_results)}")
    for result in parsed_results:
        print(result)
        
        
# TODO: check monthly account usage so far:
# TODO:     https://serpapi.com/account?api_key=SECRET_API_KEY

In [None]:
### TODO: DuckDuckGo Search Example

##### 1. **Set up the SERP API endpoint and parameters:**

import requests

def duckduckgo_search(query, api_key="YOUR_API_KEY", timeout=30):
    """Run a DuckDuckGo search through the SerpApi endpoint.

    Args:
        query: Search terms, sent as the ``q`` parameter.
        api_key: SerpApi key, sent as the ``api_key`` query parameter. The
            default is the placeholder the example shipped with; pass a real
            key to get results.
        timeout: Seconds to wait for the HTTP response; prevents the call
            from blocking forever when the server never answers.

    Returns:
        The response body decoded with ``response.json()``.
    """
    url = "https://serpapi.com/search"
    # NOTE(review): location/hl/gl are copied from the Google example;
    # confirm the duckduckgo engine honours them.
    params = {
        "q": query,
        "location": "United States",
        "hl": "en",
        "gl": "us",
        "api_key": api_key,
        "engine": "duckduckgo",
    }
    response = requests.get(url, params=params, timeout=timeout)
    return response.json()

def parse_duckduckgo_results(results):
    """Reduce a DuckDuckGo search response to title/link/snippet records.

    Only ``results['organic_results']`` is read (empty when absent); fields
    missing from an entry are returned as ``None``.
    """
    wanted_fields = ('title', 'link', 'snippet')
    return [
        {field: entry.get(field) for field in wanted_fields}
        for entry in results.get('organic_results', [])
    ]

if __name__ == "__main__":
    # Demo run: search DuckDuckGo through SerpApi and print each reduced
    # record. (A stray markdown fence that followed this block made the cell
    # a SyntaxError and has been removed.)
    query = "Python web scraping"
    results = duckduckgo_search(query)
    parsed_results = parse_duckduckgo_results(results)
    for result in parsed_results:
        print(result)


In [None]:

### TODO: Bing Search Example

# 1. **Set up the SERP API endpoint and parameters:**

import requests

def bing_search(query, api_key="YOUR_API_KEY", timeout=30):
    """Run a Bing search through the SerpApi endpoint.

    Args:
        query: Search terms, sent as the ``q`` parameter.
        api_key: SerpApi key, sent as the ``api_key`` query parameter. The
            default is the placeholder the example shipped with; pass a real
            key to get results.
        timeout: Seconds to wait for the HTTP response; prevents the call
            from blocking forever when the server never answers.

    Returns:
        The response body decoded with ``response.json()``.
    """
    url = "https://serpapi.com/search"
    # NOTE(review): location/hl/gl are copied from the Google example;
    # confirm the bing engine honours them.
    params = {
        "q": query,
        "location": "United States",
        "hl": "en",
        "gl": "us",
        "api_key": api_key,
        "engine": "bing",
    }
    response = requests.get(url, params=params, timeout=timeout)
    return response.json()

def parse_bing_results(results):
    """Reduce a Bing search response to title/link/snippet records.

    Only ``results['organic_results']`` is read (empty when absent); fields
    missing from an entry are returned as ``None``.
    """
    def _pick(entry):
        # Keep only the three fields of interest, in a fixed order.
        return {
            'title': entry.get('title'),
            'link': entry.get('link'),
            'snippet': entry.get('snippet'),
        }

    return list(map(_pick, results.get('organic_results', [])))

if __name__ == "__main__":
    # Demo run: search Bing through SerpApi and print each reduced record.
    query = "Python web scraping"
    # bing_search() is called with its built-in placeholder API key here.
    results = bing_search(query)
    parsed_results = parse_bing_results(results)
    for result in parsed_results:
        print(result)




In [None]:
## Complete Example with LangChain Integration

### To integrate this with LangChain, you need to create a custom `Loader` class that uses these search functions.

#### 1. **Create the custom loader:**

# `langchain.loaders` does not exist; BaseLoader lives in the document_loaders package.
from langchain.document_loaders.base import BaseLoader

class SERPLoader(BaseLoader):
    """Loader that routes one query to a SerpApi-backed search function.

    ``load()`` returns the raw JSON response of the selected engine and
    ``parse()`` reduces such a response to title/link/snippet records.
    Supported engines are ``"google"``, ``"duckduckgo"`` and ``"bing"``.
    """

    def __init__(self, engine, query, api_key=None):
        """Remember the engine name, the query, and an optional SerpApi key.

        Args:
            engine: One of ``"google"``, ``"duckduckgo"``, ``"bing"``.
            query: Search terms to submit.
            api_key: Key passed to ``google_search``. When ``None`` it is
                read from the ``serpapi_api_key`` environment variable at
                load time.
        """
        self.engine = engine
        self.query = query
        self.api_key = api_key

    def load(self):
        """Run the search on the configured engine and return its raw JSON.

        Raises:
            ValueError: If ``engine`` is not a supported name.
            KeyError: If the Google engine is used without ``api_key`` and
                the ``serpapi_api_key`` environment variable is not set.
        """
        if self.engine == "google":
            # google_search() requires the API key as its second argument;
            # the previous one-argument call raised TypeError.
            import os

            key = self.api_key
            if key is None:
                key = os.environ["serpapi_api_key"]
            return google_search(self.query, key)
        elif self.engine == "duckduckgo":
            return duckduckgo_search(self.query)
        elif self.engine == "bing":
            return bing_search(self.query)
        else:
            raise ValueError("Unsupported search engine")

    def parse(self, results):
        """Reduce a raw response from ``load()`` to title/link/snippet dicts.

        Raises:
            ValueError: If ``engine`` is not a supported name.
        """
        if self.engine == "google":
            return parse_google_results(results)
        elif self.engine == "duckduckgo":
            return parse_duckduckgo_results(results)
        elif self.engine == "bing":
            return parse_bing_results(results)
        else:
            raise ValueError("Unsupported search engine")

if __name__ == "__main__":
    # Demo run: fetch Google results through the loader, then print each
    # reduced record.
    loader = SERPLoader(engine="google", query="Python web scraping")
    results = loader.load()
    parsed_results = loader.parse(results)
    for result in parsed_results:
        print(result)


# Use LangChain loader to get search engine results without SERP API

## USER
Is there another langchain loader to scrape results from search engines that does not use SERPAPI.com ?

## ASSISTANT
Yes, you can use a LangChain loader to scrape results directly from search engines without using the SERP API. One common approach is to use BeautifulSoup along with requests to scrape the search engine results page (SERP). Here's how you can create custom loaders for Google, DuckDuckGo, and Bing.

### Prerequisites
1. **Install necessary libraries:**
   You need to install `requests`, `beautifulsoup4`, and `LangChain`.

   ```bash
   pip install requests beautifulsoup4 langchain
   ```


In [3]:
## Google Search Example

### 1. **Set up the Google search loader:**

import requests
from bs4 import BeautifulSoup
#from langchain.loaders import BaseLoader
from langchain.document_loaders.base import BaseLoader



class GoogleSearchLoader(BaseLoader):
    """Fetch a Google results page and scrape title/link/snippet records.

    ``load()`` returns the page HTML as text; ``parse()`` turns such HTML
    into a list of dicts. The CSS class used to locate results is tied to
    Google's current markup and may stop matching without notice.
    """

    def __init__(self, query):
        """Store the search terms to submit."""
        self.query = query

    def load(self):
        """Download the results page for ``self.query`` and return its HTML."""
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
        # Passing the query via `params` URL-encodes it, so characters such
        # as '&', '#' or '+' no longer corrupt the request. The timeout keeps
        # a stalled connection from hanging the caller.
        response = requests.get(
            "https://www.google.com/search",
            params={"q": self.query},
            headers=headers,
            timeout=30,
        )
        return response.text

    def parse(self, html):
        """Extract ``{'title', 'link', 'snippet'}`` dicts from results HTML.

        Title nodes without an enclosing link are skipped; a missing snippet
        node yields an empty string instead of raising.
        """
        soup = BeautifulSoup(html, "html.parser")
        results = []
        for g in soup.find_all('div', class_='BNeawe vvjwJb AP7Wnd'):
            anchor = g.find_parent('a')
            if anchor is None or not anchor.get('href'):
                # No usable URL for this title node; nothing to record.
                continue
            snippet_node = g.find_next_sibling('div')
            results.append({
                'title': g.get_text(),
                'link': anchor['href'],
                'snippet': snippet_node.get_text() if snippet_node is not None else '',
            })
        return results
if __name__ == "__main__":
    # Demo run: download the Google results page, scrape it, and print each
    # extracted record.
    loader = GoogleSearchLoader(query="Python web scraping")
    html = loader.load()
    parsed_results = loader.parse(html)
    for result in parsed_results:
        print(result)



In [None]:

### DuckDuckGo Search Example

1. **Set up the DuckDuckGo search loader:**

   ```python
   import requests
   from bs4 import BeautifulSoup
   # `langchain.loaders` does not exist; BaseLoader lives in the document_loaders package.
   from langchain.document_loaders.base import BaseLoader

   class DuckDuckGoSearchLoader(BaseLoader):
       """Fetch DuckDuckGo's HTML results page and scrape result records.

       ``load()`` returns the page HTML as text; ``parse()`` turns such HTML
       into a list of ``{'title', 'link', 'snippet'}`` dicts.
       """

       def __init__(self, query):
           """Store the search terms to submit."""
           self.query = query

       def load(self):
           """Download the results page for ``self.query`` and return its HTML."""
           headers = {
               "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
           # Passing the query via `params` URL-encodes it; the timeout keeps
           # a stalled connection from hanging the caller.
           response = requests.get(
               "https://duckduckgo.com/html/",
               params={"q": self.query},
               headers=headers,
               timeout=30,
           )
           return response.text

       def parse(self, html):
           """Extract title/link/snippet dicts from results HTML.

           A result with no following snippet anchor gets an empty snippet.
           """
           soup = BeautifulSoup(html, "html.parser")
           results = []
           for result in soup.find_all('a', class_='result__a'):
               # Look the snippet up once instead of twice.
               snippet_node = result.find_next('a', class_='result__snippet')
               results.append({
                   'title': result.get_text(),
                   'link': result['href'],
                   'snippet': snippet_node.get_text() if snippet_node else '',
               })
           return results

   if __name__ == "__main__":
       # Demo run: download the DuckDuckGo results page, scrape it, and
       # print each extracted record.
       loader = DuckDuckGoSearchLoader(query="Python web scraping")
       html = loader.load()
       parsed_results = loader.parse(html)
       for result in parsed_results:
           print(result)
   ```

### Bing Search Example

1. **Set up the Bing search loader:**

   ```python
   import requests
   from bs4 import BeautifulSoup
   # `langchain.loaders` does not exist; BaseLoader lives in the document_loaders package.
   from langchain.document_loaders.base import BaseLoader

   class BingSearchLoader(BaseLoader):
       """Fetch a Bing results page and scrape title/link/snippet records.

       ``load()`` returns the page HTML as text; ``parse()`` turns such HTML
       into a list of ``{'title', 'link', 'snippet'}`` dicts.
       """

       def __init__(self, query):
           """Store the search terms to submit."""
           self.query = query

       def load(self):
           """Download the results page for ``self.query`` and return its HTML."""
           headers = {
               "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
           # Passing the query via `params` URL-encodes it; the timeout keeps
           # a stalled connection from hanging the caller.
           response = requests.get(
               "https://www.bing.com/search",
               params={"q": self.query},
               headers=headers,
               timeout=30,
           )
           return response.text

       def parse(self, html):
           """Extract title/link/snippet dicts from results HTML.

           Entries lacking a heading or a link are skipped instead of
           raising; a missing paragraph yields an empty snippet.
           """
           soup = BeautifulSoup(html, "html.parser")
           results = []
           for result in soup.find_all('li', class_='b_algo'):
               heading = result.find('h2')
               anchor = result.find('a')
               if heading is None or anchor is None or not anchor.get('href'):
                   # Incomplete entry; there is no title or URL to record.
                   continue
               paragraph = result.find('p')
               results.append({
                   'title': heading.get_text(),
                   'link': anchor['href'],
                   'snippet': paragraph.get_text() if paragraph else '',
               })
           return results

   if __name__ == "__main__":
       # Demo run: download the Bing results page, scrape it, and print each
       # extracted record.
       loader = BingSearchLoader(query="Python web scraping")
       html = loader.load()
       parsed_results = loader.parse(html)
       for result in parsed_results:
           print(result)
   ```

### Complete Example with LangChain Integration

To integrate this with LangChain, you can use the custom loaders created above. Here’s how to use one of them with LangChain:

```python
# `langchain.loaders` does not exist; BaseLoader lives in the document_loaders package.
from langchain.document_loaders.base import BaseLoader

class CustomLoader(BaseLoader):
    """Facade that delegates to the scraping loader for one search engine.

    Supported engines are ``"google"``, ``"duckduckgo"`` and ``"bing"``;
    any other name raises ``ValueError`` when ``load()`` or ``parse()`` is
    called.
    """

    def __init__(self, engine, query):
        """Store the engine name and the search terms."""
        self.engine = engine
        self.query = query

    def _engine_loader(self):
        """Build the engine-specific loader for ``self.engine``."""
        if self.engine == "google":
            return GoogleSearchLoader(self.query)
        elif self.engine == "duckduckgo":
            return DuckDuckGoSearchLoader(self.query)
        elif self.engine == "bing":
            return BingSearchLoader(self.query)
        else:
            raise ValueError("Unsupported search engine")

    def load(self):
        """Return the results-page HTML fetched by the selected loader."""
        return self._engine_loader().load()

    def parse(self, html):
        """Return title/link/snippet dicts scraped by the selected loader."""
        return self._engine_loader().parse(html)

if __name__ == "__main__":
    # Demo run: use the facade with the Google engine, then print each
    # scraped record.
    loader = CustomLoader(engine="google", query="Python web scraping")
    html = loader.load()
    parsed_results = loader.parse(html)
    for result in parsed_results:
        print(result)
```

This example demonstrates how to perform searches on Google, DuckDuckGo, and Bing using web scraping techniques, parse the results, and integrate them with LangChain through a custom loader. Make sure you comply with each search engine's terms of service and usage policies regarding web scraping.