In [2]:
import os
import requests
import csv
from apify_client import ApifyClient
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import ApifyDatasetLoader
from langchain_community.document_loaders.base import Document
from langchain_core.messages import HumanMessage

In [6]:
load_dotenv('.env')

def google_search_actor(question):
    # Returns a list of urls from a question on google search using apify's google search actor
    APIFY_API_KEY = os.getenv('APIFY_API_TOKEN')

    # Initalize apify client
    client = ApifyClient(APIFY_API_KEY)

    # prepare google search actor input
    run_input = {
        "queries": question,
        "maxPagesPerQuery": 1,
        "resultsPerPage": 2,
        "mobileResults": False,
        "languageCode": "",
        "maxConcurrency": 10,
        "saveHtml": False,
        "saveHtmlToKeyValueStore": False,
        "includeUnfilteredResults": False,
        "customDataFunction": """async ({ input, $, request, response, html }) => {
        return {
        pageTitle: $('title').text(),
        };
    };""",
    }

    # running the actor
    serp = client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)

    url_list = []

    # Creating a url list of websites for content crawling
    for item in client.dataset(serp["defaultDatasetId"]).list_items().items[0]['organicResults']:
        url_list.append(item['url'])
        print("Adding url to url list:", item)

    print(len(url_list), "urls")

    return url_list

def content_crawler(urls):
    # Takes a list of urls and web scrapes using Apify's content crawler
    # Load .env variables and initialize apifyclient with API token
    APIFY_API_KEY = os.getenv('APIFY_API_TOKEN')

    # Initalize apify client
    client = ApifyClient(APIFY_API_KEY)
    
    content_crawler_input = {
        "startUrls": [{"url": url} for url in urls],
        "useSitemaps": False,
        "crawlerType": "playwright:firefox",
        "includeUrlGlobs": [],
        "excludeUrlGlobs": [{ "url": "https://www.britannica.com"}],
        "ignoreCanonicalUrl": False,
        "maxCrawlDepth": 5,
        "maxCrawlPages": 10,
        "initialConcurrency": 0,
        "maxConcurrency": 200,
        "initialCookies": [],
        "proxyConfiguration": { "useApifyProxy": True },
        "maxSessionRotations": 10,
        "maxRequestRetries": 3,
        "requestTimeoutSecs": 60,
        "dynamicContentWaitSecs": 10,
        "maxScrollHeightPixels": 5000,
        "removeElementsCssSelector": """nav, footer, script, style, noscript, svg,
        [role=\"alert\"],
        [role=\"banner\"],
        [role=\"dialog\"],
        [role=\"alertdialog\"],
        [role=\"region\"][aria-label*=\"skip\" i],
        [aria-modal=\"true\"]""",
        "removeCookieWarnings": True,
        "clickElementsCssSelector": "[aria-expanded=\"false\"]",
        "htmlTransformer": "readableText",
        "readableTextCharThreshold": 100,
        "aggressivePrune": False,
        "debugMode": False,
        "debugLog": False,
        "saveHtml": False,
        "saveMarkdown": True,
        "saveFiles": False,
        "saveScreenshots": False,
        "maxResults": 9999999,
        "clientSideMinChangePercentage": 15,
        "renderingTypeDetectionPercentage": 10,
    }

    # Run the content crawler actor and wait for it to finish
    website_content_crawler = client.actor("aYG0l9s7dbB7j3gbS").call(run_input= content_crawler_input)

    # Loads the dataset into langchain document format
    loader = ApifyDatasetLoader(
        dataset_id= website_content_crawler["defaultDatasetId"],
        dataset_mapping_function=lambda dataset_item: Document(
            page_content=dataset_item["text"], metadata={"source": dataset_item["url"]}
        ),
    )

    docs = loader.load()

    return docs


def run_actors(query):
    chat = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)
    output = chat.invoke(
        [
            HumanMessage(
                content = f"""You are an AI language model assistant. Your task is to generate two 
        different versions of the given user question to retrieve relevant documents from a vector 
        database. By generating multiple perspectives on the user question, your goal is to help
        the user overcome some of the limitations of the distance-based similarity search. 
        Provide these alternative Google Search prompts separated by newlines.
        Query: {query}"""
            )
        ]
    )

    output_string = output.content
    output_list = output_string.split('\n')

    for i in range(len(output_list)):
        output_list[i] = output_list[i][3:]

    doc_list = []

    for question in output_list:
        print("Querying:", question)
        urls_list = google_search_actor(question)
        doc_list.append(content_crawler(urls_list))
    
    return doc_list

In [7]:
user_query = input("Give me a question:" )

x = run_actors(user_query)

Querying: "Explore the impact of Steve Jobs on the technology industry."
Adding url to url list: {'title': 'How Steve Jobs Changed the World', 'url': 'https://www.investopedia.com/articles/personal-finance/012815/how-steve-jobs-changed-world.asp', 'displayedUrl': 'https://www.investopedia.com › ... › Entrepreneurs', 'description': "From purchasing Pixar in 1986 to supporting charities and environmental causes, Jobs' achievements and innovations continue to affect industries and lifestyles\xa0...", 'emphasizedKeywords': ['affect industries and lifestyles'], 'siteLinks': [], 'productInfo': {}, 'type': 'organic', 'position': 1}
Adding url to url list: {'title': "10 Lessons From the Legacy of Apple's Steve Jobs", 'url': 'https://spectrum.ieee.org/10-lessons-from-steve-jobs', 'displayedUrl': 'https://spectrum.ieee.org › 10-lessons-from-steve-jobs', 'description': "Jobs's innovations made a profound impact. He redefined computing, enhancing the user experience, and created products and servi

In [None]:
# Load .env variables and initialize apifyclient with API token
load_dotenv('.env')
APIFY_API_KEY = os.getenv('APIFY_API_TOKEN')

# Initalize apify client
client = ApifyClient(APIFY_API_KEY)

# prepare google search actor input
run_input = {
      "queries": question,
      "maxPagesPerQuery": 1,
      "resultsPerPage": 5,
      "mobileResults": False,
      "languageCode": "",
      "maxConcurrency": 10,
      "saveHtml": False,
      "saveHtmlToKeyValueStore": False,
      "includeUnfilteredResults": False,
      "customDataFunction": """async ({ input, $, request, response, html }) => {
    return {
      pageTitle: $('title').text(),
    };
  };""",
}

serp = client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)

url_list = []

# Creating a url list of websites for content crawling
for item in client.dataset(serp["defaultDatasetId"]).list_items().items[0]['organicResults']:
   url_list.append(item['url'])

# prepare website content crawler actor input
# for now only scraping the first link
content_crawler_input = {
    "startUrls": [{ "url": url_list[0]}],
    "useSitemaps": False,
    "crawlerType": "playwright:firefox",
    "includeUrlGlobs": [],
    "excludeUrlGlobs": [],
    "ignoreCanonicalUrl": False,
    "maxCrawlDepth": 20,
    "maxCrawlPages": 10,
    "initialConcurrency": 0,
    "maxConcurrency": 200,
    "initialCookies": [],
    "proxyConfiguration": { "useApifyProxy": True },
    "maxSessionRotations": 10,
    "maxRequestRetries": 3,
    "requestTimeoutSecs": 60,
    "dynamicContentWaitSecs": 10,
    "maxScrollHeightPixels": 5000,
    "removeElementsCssSelector": """nav, footer, script, style, noscript, svg,
[role=\"alert\"],
[role=\"banner\"],
[role=\"dialog\"],
[role=\"alertdialog\"],
[role=\"region\"][aria-label*=\"skip\" i],
[aria-modal=\"true\"]""",
    "removeCookieWarnings": True,
    "clickElementsCssSelector": "[aria-expanded=\"false\"]",
    "htmlTransformer": "readableText",
    "readableTextCharThreshold": 100,
    "aggressivePrune": False,
    "debugMode": False,
    "debugLog": False,
    "saveHtml": False,
    "saveMarkdown": True,
    "saveFiles": False,
    "saveScreenshots": False,
    "maxResults": 9999999,
    "clientSideMinChangePercentage": 15,
    "renderingTypeDetectionPercentage": 10,
}

# Run the content crawler actor and wait for it to finish
website_content_crawler = client.actor("aYG0l9s7dbB7j3gbS").call(run_input= content_crawler_input)

# Loads the dataset into langchain document format
loader = ApifyDatasetLoader(
    dataset_id= website_content_crawler["defaultDatasetId"],
    dataset_mapping_function=lambda dataset_item: Document(
        page_content=dataset_item["text"], metadata={"source": dataset_item["url"]}
    ),
)

documents = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200
).split_documents(loader.load())

In [4]:
# Stores in FAISS vector
vector = FAISS.from_documents(documents, OpenAIEmbeddings())

NameError: name 'documents' is not defined

In [None]:
from langchain.indexes import VectorstoreIndexCreator

index = VectorstoreIndexCreator().from_loaders([loader])

# Code for getting summaries from our dataset
user_query = input("Enter user query: ")
result = index.query_with_sources(user_query)

print(result["answer"])
print(result["sources"])

In [18]:
from apify_client import ApifyClient

# Initialize the ApifyClient with your API token
APIFY_API_KEY = os.getenv('APIFY_API_TOKEN')
client = ApifyClient(APIFY_API_KEY)

from apify_client import ApifyClient

# Initialize the ApifyClient with your Apify API token
client = ApifyClient("<YOUR_API_TOKEN>")

# Prepare the Actor input
run_input = {
    "startUrls": [{ "url": "https://hbr.org/2012/04/the-real-leadership-lessons-of-steve-jobs" }],
    "isUrlArticleDefinition": {
        "minDashes": 4,
        "hasDate": True,
        "linkIncludes": [
            "article",
            "storyid",
            "?p=",
            "id=",
            "/fpss/track",
            ".html",
            "/content/",
        ],
    },
    "proxyConfiguration": { "useApifyProxy": True },
    "extendOutputFunction": """($) => {
    const result = {};
    // Uncomment to add a title to the output
    // result.pageTitle = $('title').text().trim();

    return result;
}""",
}

# Run the Actor and wait for it to finish
run = client.actor("lukaskrivka/article-extractor-smart").call(run_input=run_input)

# Fetch and print Actor results from the run's dataset (if there are any)
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)

ApifyApiError: User was not found or authentication token is not valid