In [48]:
import csv
import os
import re

import requests
from apify_client import ApifyClient
from dotenv import load_dotenv
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import ApifyDatasetLoader
from langchain_community.document_loaders.base import Document
from langchain_community.vectorstores import FAISS
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

In [117]:
# Read the local .env file so that the os.getenv('APIFY_API_TOKEN') lookups in
# the Apify helper functions can find the token.
# NOTE(review): ChatOpenAI / OpenAIEmbeddings presumably rely on an OpenAI key
# from the same file — confirm.
load_dotenv('.env')

def google_search_actor(question):
    """Run Apify's Google Search actor for ``question`` and collect result URLs.

    Args:
        question: Search query string, passed to the actor as its ``queries``
            input.

    Returns:
        list: The ``url`` of each organic result found in the first dataset
        item of the run. The list is empty (instead of raising) when the run
        returns nothing, the dataset is empty, or the results carry no URLs.

    Side effects:
        Prints the number of URLs collected.
    """
    # The Apify token is read from the environment.
    APIFY_API_KEY = os.getenv('APIFY_API_TOKEN')

    # Initalize apify client
    client = ApifyClient(APIFY_API_KEY)

    # prepare google search actor input
    run_input = {
        "queries": question,
        "maxPagesPerQuery": 3,
        "resultsPerPage": 3,
        "mobileResults": False,
        "languageCode": "",
        "maxConcurrency": 10,
        "saveHtml": False,
        "saveHtmlToKeyValueStore": False,
        "includeUnfilteredResults": False,
        "customDataFunction": """async ({ input, $, request, response, html }) => {
        return {
        pageTitle: $('title').text(),
        };
    };""",
    }

    # running the actor (blocks until the run has finished)
    serp = client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)

    url_list = []

    if serp:
        pages = client.dataset(serp["defaultDatasetId"]).list_items().items

        # Only the first dataset item is read, exactly as before.
        # NOTE(review): with maxPagesPerQuery=3 the dataset presumably holds one
        # item per results page — confirm whether later pages should be used too.
        first_page = pages[0] if pages else {}

        # Creating a url list of websites for content crawling; results that
        # are empty or lack a 'url' are skipped rather than raising KeyError.
        for item in first_page.get('organicResults') or []:
            url = item.get('url') if isinstance(item, dict) else None
            if url:
                url_list.append(url)

    print(len(url_list), "urls")

    return url_list

In [121]:
def content_crawler(urls):
    """Scrape ``urls`` with Apify's content crawler actor and wrap the results.

    Args:
        urls: Iterable of URL strings to use as the crawl's start URLs.
            Duplicates are dropped (first occurrence wins) so a page returned
            by several searches is only submitted and logged once.

    Returns:
        ApifyDatasetLoader: A loader over the run's default dataset. Each
        dataset item is mapped to a ``Document`` whose ``page_content`` is the
        item's ``text`` and whose ``metadata["source"]`` is the item's ``url``.

    Raises:
        RuntimeError: If the actor call returns no run object, so there is no
            dataset to load from.

    Side effects:
        Prints one "Scraping <url>" line per start URL.
    """
    # The Apify token is read from the environment.
    APIFY_API_KEY = os.getenv('APIFY_API_TOKEN')

    # Initalize apify client
    client = ApifyClient(APIFY_API_KEY)

    # dict.fromkeys keeps the original order while removing repeated URLs.
    unique_urls = list(dict.fromkeys(urls))

    content_crawler_input = {
        "startUrls": [{"url": url} for url in unique_urls],
        "useSitemaps": False,
        "crawlerType": "playwright:firefox",
        "includeUrlGlobs": [],
        # NOTE(review): Apify glob inputs are usually written as {"glob": ...};
        # confirm that a {"url": ...} entry is honoured by this actor.
        "excludeUrlGlobs": [{ "url": "https://www.britannica.com"}],
        "ignoreCanonicalUrl": False,
        "maxCrawlDepth": 2,
        "maxCrawlPages": 4,
        "initialConcurrency": 0,
        "maxConcurrency": 200,
        "initialCookies": [],
        "proxyConfiguration": { "useApifyProxy": True },
        "maxSessionRotations": 10,
        "maxRequestRetries": 3,
        "requestTimeoutSecs": 60,
        "dynamicContentWaitSecs": 10,
        "maxScrollHeightPixels": 5000,
        "removeElementsCssSelector": """nav, footer, script, style, noscript, svg,
        [role=\"alert\"],
        [role=\"banner\"],
        [role=\"dialog\"],
        [role=\"alertdialog\"],
        [role=\"region\"][aria-label*=\"skip\" i],
        [aria-modal=\"true\"]""",
        "removeCookieWarnings": True,
        "clickElementsCssSelector": "[aria-expanded=\"false\"]",
        "htmlTransformer": "readableText",
        "readableTextCharThreshold": 100,
        "aggressivePrune": False,
        "debugMode": False,
        "debugLog": False,
        "saveHtml": False,
        "saveMarkdown": True,
        "saveFiles": False,
        "saveScreenshots": False,
        "maxResults": 9999999,
        "clientSideMinChangePercentage": 15,
        "renderingTypeDetectionPercentage": 10,
    }

    for url in unique_urls:
        print("Scraping", url)

    # Run the content crawler actor and wait for it to finish
    website_content_crawler = client.actor("aYG0l9s7dbB7j3gbS").call(run_input=content_crawler_input)

    if not website_content_crawler:
        # Fail with a clear message instead of a TypeError on the subscript below.
        raise RuntimeError("Content crawler actor run returned no run object")

    def _to_document(dataset_item):
        # Tolerate items without text/url so one sparse item cannot abort the load.
        return Document(
            page_content=dataset_item.get("text") or "",
            metadata={"source": dataset_item.get("url", "")},
        )

    # Loads the dataset into langchain document format
    loader = ApifyDatasetLoader(
        dataset_id=website_content_crawler["defaultDatasetId"],
        dataset_mapping_function=_to_document,
    )

    return loader

In [104]:
def flatten_comprehension(matrix):
    """Flatten one level of nesting.

    Args:
        matrix: Iterable of iterables (the "rows").

    Returns:
        list: Every item of every row, in row order.
    """
    flattened = []
    for row in matrix:
        # extend() appends the row's items one by one, keeping their order.
        flattened.extend(row)
    return flattened

In [120]:
def get_prompts(topic):
    """Ask the chat model for alternative Google Search prompts about ``topic``.

    Args:
        topic: The user's subject or question, inserted into the instruction
            prompt as the query.

    Returns:
        list: One cleaned search prompt per non-empty line of the model's
        reply. Leading list markers ("1. ", "2) ", "- ", "* ") and surrounding
        double quotes are removed; blank lines are dropped.

    Side effects:
        Prints each generated prompt.
    """
    chat = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)

    response = chat.invoke(
        [
            HumanMessage(
                content = f"""You are an AI language model assistant. Your task is to generate three 
        sets of keywords of the given user question to retrieve relevant documents from a vector 
        database. By generating multiple perspectives on the user question, your goal is to help
        the user overcome some of the limitations of the distance-based similarity search. 
        Provide these alternative Google Search prompts separated by newlines. For example, if given an input of "Steve Jobs," 
        you should output things like "Steve Jobs accomplishments," "Steve Jobs biography," and "Steve Jobs innovations." Tailor
        these prompts to help someone who is working on an academic project, so make them intellectual. Make them broad and concise
        enough so there is still plenty of availability for searching.
        Query: {topic}"""
            )
        ]
    )

    # Strip only a real list marker at the start of a line. A fixed [3:] slice
    # would also eat the first characters of unnumbered lines and leave a
    # stray space after two-digit numbers such as "10. ".
    list_marker = re.compile(r'^\s*(?:\d+[.)]|[-*•])\s+')

    output_list = []
    for line in response.content.splitlines():
        query = list_marker.sub('', line).strip().strip('"').strip()
        # Blank lines would otherwise become empty search queries.
        if query:
            output_list.append(query)

    for query in output_list:
        print("Generated queries", query)

    return output_list

In [133]:
def run_actors(question):
    """Build a queryable index of web content about ``question``.

    Generates search prompts with get_prompts, collects result URLs for each
    prompt with google_search_actor, crawls the combined URL list with
    content_crawler, and passes the resulting loader to
    VectorstoreIndexCreator().from_loaders.

    Args:
        question: Topic or question to research.

    Returns:
        Whatever VectorstoreIndexCreator().from_loaders returns for the
        crawl loader.
    """
    # One list of URLs per generated search prompt.
    urls_per_prompt = [
        google_search_actor(search_prompt)
        for search_prompt in get_prompts(question)
    ]

    crawl_loader = content_crawler(flatten_comprehension(urls_per_prompt))

    return VectorstoreIndexCreator().from_loaders([crawl_loader])

In [135]:
# End-to-end demo: build an index for one hard-coded topic, then answer a
# single question typed by the user and print the answer and its sources.
# NOTE(review): run_actors makes a chat-model call and several Apify actor
# runs, so every re-run of this cell repeats that work.
index = run_actors("Lebron James")

# input() blocks the cell until the user submits a question.
user_query = input("Enter user query: ")
result = index.query_with_sources(user_query)

# The result is read by its "answer" and "sources" keys.
print(result["answer"])
print(result["sources"])


Generated queries Lebron James basketball career
Generated queries Lebron James impact on sports
Generated queries Lebron James philanthropy and activism
3 urls
2 urls
3 urls
Web Scraping https://en.wikipedia.org/wiki/LeBron_James
Web Scraping https://www.britannica.com/biography/LeBron-James
Web Scraping https://www.espn.com/nba/player/stats/_/id/1966/lebronjames
Web Scraping https://www.espn.com/nba/story/_/id/35653907/beyond-points-winning-lebron-james-legacy-better-worse-empire
Web Scraping https://dailyfreepress.com/2023/10/31/lebron-james-two-decades-of-dominance-on-and-off-the-court-editorial/
Web Scraping https://www.lebronjamesfamilyfoundation.org/
Web Scraping https://www.nba.com/news/lebron-james-off-court-legacy-complements-nba-success
Web Scraping https://www.nytimes.com/2021/03/09/business/lebron-james-community-development.html
 LeBron James has won four championships.

https://www.espn.com/nba/story/_/id/35653907/beyond-points-winning-lebron-james-legacy-better-worse-em

In [130]:
# Manual indexing path: load the documents, split them into overlapping
# chunks, and build a FAISS vector store from the chunks.
# NOTE(review): `loader` is not assigned at module level in any visible cell
# (run_actors only holds it as a local), so this cell presumably depends on a
# leftover kernel variable and would raise NameError after Restart & Run All —
# confirm.
x = loader.load()

# Chunks of size 1000 with an overlap of 200 between neighbours
# (units presumably characters — TODO confirm against the splitter's defaults).
documents = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200
).split_documents(x)

# Build the FAISS store from the chunks using OpenAIEmbeddings.
vector = FAISS.from_documents(documents, OpenAIEmbeddings())