In [7]:
import os
import requests
import csv
from apify_client import ApifyClient
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.messages import HumanMessage

# Load .env variables and initialize apifyclient with API token
load_dotenv('.env')
APIFY_API_KEY = os.getenv('APIFY_API_TOKEN')

In [None]:
llm = ChatOpenAI(model = "gpt-3.5" , temperature=0)

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five 
    different versions of the given user question to retrieve relevant documents from a vector 
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search. 
    Provide these alternative questions separated by newlines.
    Original question: {question}""",
)
# Chain
llm_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)

# Other inputs
question = "What are the approaches to Task Decomposition?"

In [3]:
# Initalize apify client
client = ApifyClient(APIFY_API_KEY)

user_query = input("Enter user query here: ")

# prepare google search actor input
run_input = {
      "queries": user_query,
      "maxPagesPerQuery": 1,
      "resultsPerPage": 5,
      "mobileResults": False,
      "languageCode": "",
      "maxConcurrency": 10,
      "saveHtml": False,
      "saveHtmlToKeyValueStore": False,
      "includeUnfilteredResults": False,
      "customDataFunction": """async ({ input, $, request, response, html }) => {
    return {
      pageTitle: $('title').text(),
    };
  };""",
}

# Run
run = client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)

url_list = []

# Creating a url list of websites for content crawling
for item in client.dataset(run["defaultDatasetId"]).list_items().items[0]['organicResults']:
   url_list.append(item['url'])

# prepare website content crawler actor input
# for now only scraping the first link
content_crawler_input = {
    "startUrls": [{ "url": url_list[0]}],
    "useSitemaps": False,
    "crawlerType": "playwright:firefox",
    "includeUrlGlobs": [],
    "excludeUrlGlobs": [],
    "ignoreCanonicalUrl": False,
    "maxCrawlDepth": 20,
    "maxCrawlPages": 10,
    "initialConcurrency": 0,
    "maxConcurrency": 200,
    "initialCookies": [],
    "proxyConfiguration": { "useApifyProxy": True },
    "maxSessionRotations": 10,
    "maxRequestRetries": 3,
    "requestTimeoutSecs": 60,
    "dynamicContentWaitSecs": 10,
    "maxScrollHeightPixels": 5000,
    "removeElementsCssSelector": """nav, footer, script, style, noscript, svg,
[role=\"alert\"],
[role=\"banner\"],
[role=\"dialog\"],
[role=\"alertdialog\"],
[role=\"region\"][aria-label*=\"skip\" i],
[aria-modal=\"true\"]""",
    "removeCookieWarnings": True,
    "clickElementsCssSelector": "[aria-expanded=\"false\"]",
    "htmlTransformer": "readableText",
    "readableTextCharThreshold": 100,
    "aggressivePrune": False,
    "debugMode": False,
    "debugLog": False,
    "saveHtml": False,
    "saveMarkdown": True,
    "saveFiles": False,
    "saveScreenshots": False,
    "maxResults": 9999999,
    "clientSideMinChangePercentage": 15,
    "renderingTypeDetectionPercentage": 10,
}

# Run the content crawler actor and wait for it to finish
website_content_crawler = client.actor("aYG0l9s7dbB7j3gbS").call(run_input= content_crawler_input)

# Fetch and print Actor results from the run's dataset (if there are any)
for item in client.dataset(website_content_crawler["defaultDatasetId"]).iterate_items():
    print(item)

{'url': 'https://en.wikipedia.org/wiki/Cahn%E2%80%93Hilliard_equation', 'crawl': {'loadedUrl': 'https://en.wikipedia.org/wiki/Cahn%E2%80%93Hilliard_equation', 'loadedTime': '2024-03-08T02:57:07.431Z', 'referrerUrl': 'https://en.wikipedia.org/wiki/Cahn%E2%80%93Hilliard_equation', 'depth': 0, 'httpStatusCode': 200}, 'metadata': {'canonicalUrl': 'https://en.wikipedia.org/wiki/Cahn%E2%80%93Hilliard_equation', 'title': 'Cahn–Hilliard equation - Wikipedia', 'description': None, 'author': None, 'keywords': None, 'languageCode': 'en', 'openGraph': [{'property': 'og:title', 'content': 'Cahn–Hilliard equation - Wikipedia'}, {'property': 'og:type', 'content': 'website'}], 'jsonLd': [{'@context': 'https://schema.org', '@type': 'Article', 'name': 'Cahn–Hilliard equation', 'url': 'https://en.wikipedia.org/wiki/Cahn%E2%80%93Hilliard_equation', 'sameAs': 'http://www.wikidata.org/entity/Q5017440', 'mainEntity': 'http://www.wikidata.org/entity/Q5017440', 'author': {'@type': 'Organization', 'name': 'Cont

In [4]:
# Loads the dataset into langchain document format
from langchain_community.document_loaders import ApifyDatasetLoader
from langchain_community.document_loaders.base import Document

loader = ApifyDatasetLoader(
    dataset_id= website_content_crawler["defaultDatasetId"],
    dataset_mapping_function=lambda dataset_item: Document(
        page_content=dataset_item["text"], metadata={"source": dataset_item["url"]}
    ),
)

docs = loader.load()

documents = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200
).split_documents(docs)

# Stores in FAISS vector
vector = FAISS.from_documents(documents, OpenAIEmbeddings())

In [5]:
documents

[Document(page_content='From Wikipedia, the free encyclopedia\nThe Cahn–Hilliard equation (after John W. Cahn and John E. Hilliard)[1] is an equation of mathematical physics which describes the process of phase separation, by which the two components of a binary fluid spontaneously separate and form domains pure in each component. If is the concentration of the fluid, with indicating domains, then the equation is written as \nwhere is a diffusion coefficient with units of and gives the length of the transition regions between the domains. Here is the partial time derivative and is the Laplacian in dimensions. Additionally, the quantity is identified as a chemical potential. \nRelated to it is the Allen–Cahn equation, as well as the stochastic Cahn–Hilliard Equation and the stochastic Allen–Cahn equation. \nFeatures and applications[edit]', metadata={'source': 'https://en.wikipedia.org/wiki/Cahn%E2%80%93Hilliard_equation'}),
 Document(page_content='Related to it is the Allen–Cahn equati

In [6]:
from langchain.indexes import VectorstoreIndexCreator

index = VectorstoreIndexCreator().from_loaders([loader])

# Code for getting summaries from our dataset
user_query = input("Enter user query: ")
result = index.query_with_sources(user_query)

print(result["answer"])
print(result["sources"])

  warn_deprecated(


 The Cahn-Hilliard equation is a mathematical model used to describe the segregation of binary fluids into domains, with the segregated domains growing in time as a power law. The equation has been studied extensively and has been observed in real experiments and numerical simulations. It has also been applied to other fields, such as elasticity and fluid dynamics. 
https://en.wikipedia.org/wiki/Cahn%E2%80%93Hilliard_equation
