In [1]:
import os
import requests
import csv
from apify_client import ApifyClient
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Load variables from .env and read the Apify API token
load_dotenv('.env')
APIFY_API_KEY = os.getenv('APIFY_API_TOKEN')
if not APIFY_API_KEY:
    # os.getenv returns None when the variable is missing; stop with a clear
    # message instead of handing None to the client.
    raise RuntimeError(
        "APIFY_API_TOKEN is not set; add it to .env or export it in the environment"
    )

# Initialize the Apify client
client = ApifyClient(APIFY_API_KEY)

# Search query typed by the user; surrounding whitespace is dropped
user_query = input("Enter user query: ").strip()
if not user_query:
    raise ValueError("The search query must not be empty")

# Prepare the Google search actor input
run_input = {
    "queries": user_query,
    "maxPagesPerQuery": 1,
    "resultsPerPage": 5,
    "mobileResults": False,
    "languageCode": "",
    "maxConcurrency": 10,
    "saveHtml": False,
    "saveHtmlToKeyValueStore": False,
    "includeUnfilteredResults": False,
    "customDataFunction": """async ({ input, $, request, response, html }) => {
    return {
      pageTitle: $('title').text(),
    };
  };""",
}

# Run the search actor and wait for it to finish
run = client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)
if run is None:
    # The client types the return value of call() as optional.
    raise RuntimeError("The Google search actor run did not return a run object")

# Only the first dataset item (the first results page) is read
search_pages = client.dataset(run["defaultDatasetId"]).list_items().items
organic_results = search_pages[0].get("organicResults", []) if search_pages else []

# Creating a url list of websites for content crawling; entries without a URL are skipped
url_list = [result["url"] for result in organic_results if result.get("url")]
if not url_list:
    raise RuntimeError("The Google search returned no organic results for this query")

# Prepare the website content crawler actor input.
# Only the first search result is used as the start URL; the crawler may still
# visit further pages from there, capped by "maxCrawlPages".
if not url_list:
    raise RuntimeError("No URLs available to crawl: the search produced no results")

content_crawler_input = {
    "startUrls": [{"url": url_list[0]}],
    "useSitemaps": False,
    "crawlerType": "playwright:firefox",
    "includeUrlGlobs": [],
    "excludeUrlGlobs": [],
    "ignoreCanonicalUrl": False,
    "maxCrawlDepth": 20,
    "maxCrawlPages": 10,
    "initialConcurrency": 0,
    "maxConcurrency": 200,
    "initialCookies": [],
    "proxyConfiguration": {"useApifyProxy": True},
    "maxSessionRotations": 10,
    "maxRequestRetries": 3,
    "requestTimeoutSecs": 60,
    "dynamicContentWaitSecs": 10,
    "maxScrollHeightPixels": 5000,
    "removeElementsCssSelector": """nav, footer, script, style, noscript, svg,
[role=\"alert\"],
[role=\"banner\"],
[role=\"dialog\"],
[role=\"alertdialog\"],
[role=\"region\"][aria-label*=\"skip\" i],
[aria-modal=\"true\"]""",
    "removeCookieWarnings": True,
    "clickElementsCssSelector": "[aria-expanded=\"false\"]",
    "htmlTransformer": "readableText",
    "readableTextCharThreshold": 100,
    "aggressivePrune": False,
    "debugMode": False,
    "debugLog": False,
    "saveHtml": False,
    "saveMarkdown": True,
    "saveFiles": False,
    "saveScreenshots": False,
    "maxResults": 9999999,
    "clientSideMinChangePercentage": 15,
    "renderingTypeDetectionPercentage": 10,
}

# Run the content crawler actor and wait for it to finish
website_content_crawler = client.actor("aYG0l9s7dbB7j3gbS").call(
    run_input=content_crawler_input
)
if website_content_crawler is None:
    # The client types the return value of call() as optional.
    raise RuntimeError("The website content crawler run did not return a run object")

# Fetch and print Actor results from the run's dataset (if there are any)
for item in client.dataset(website_content_crawler["defaultDatasetId"]).iterate_items():
    print(item)

{'url': 'https://www.britannica.com/topic/Haitian-Revolution', 'crawl': {'loadedUrl': 'https://www.britannica.com/topic/Haitian-Revolution', 'loadedTime': '2024-03-07T23:53:52.628Z', 'referrerUrl': 'https://www.britannica.com/topic/Haitian-Revolution', 'depth': 0, 'httpStatusCode': 200}, 'metadata': {'canonicalUrl': 'https://www.britannica.com/topic/Haitian-Revolution', 'title': 'Haitian Revolution | Causes, Summary, & Facts | Britannica', 'description': 'Haitian Revolution (1791–1804), series of conflicts between Haitian slaves, colonists, the armies of the British and French colonizers, and a number of other parties. Through the struggle, the Haitian people ultimately won independence from France and thereby became the first country to be founded by former slaves.', 'author': None, 'keywords': 'Haitian Revolution, encyclopedia, encyclopeadia, britannica, article', 'languageCode': 'en', 'openGraph': [{'property': 'og:type', 'content': 'ARTICLE'}, {'property': 'og:title', 'content': 'H

In [7]:
# Loads the crawled dataset into LangChain Document objects
from langchain_community.document_loaders import ApifyDatasetLoader
from langchain_community.document_loaders.base import Document

loader = ApifyDatasetLoader(
    dataset_id=website_content_crawler["defaultDatasetId"],
    # A dataset item with a missing or null "text" field is mapped to an empty
    # document instead of raising inside the loader.
    dataset_mapping_function=lambda dataset_item: Document(
        page_content=dataset_item.get("text") or "",
        metadata={"source": dataset_item["url"]},
    ),
)

docs = loader.load()

# Split pages into overlapping chunks before embedding
documents = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200
).split_documents(docs)
if not documents:
    raise ValueError("The crawled dataset produced no text chunks to index")

# Stores the chunk embeddings in a FAISS vector store
vector = FAISS.from_documents(documents, OpenAIEmbeddings())

In [6]:
from langchain.indexes import VectorstoreIndexCreator

# Build a queryable index directly from the Apify dataset loader
index = VectorstoreIndexCreator().from_loaders(
    [loader],
)

# Ask a question against the indexed dataset and show the answer with its sources
user_query = input("Enter user query: ")
result = index.query_with_sources(
    user_query,
)

print(result["answer"], result["sources"], sep="\n")

 The French were heavily involved in the Haitian Revolution.

https://www.britannica.com/topic/Haitian-Revolution, https://www.britannica.com/topic/Haitian-Revolution/images-videos, https://www.britannica.com/topic/Haitian-Revolution/additional-info
