In [1]:
import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os

# Regex pattern to match a URL
HTTP_URL_PATTERN = r'^http[s]*://.+'

# Define root domain to crawl
domain = "slack.com"
full_url = "https://slack.com/intl/en-in/help/articles"

# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)

        # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks
        if tag == "a" and "href" in attrs:
            self.hyperlinks.append(attrs["href"])

# Function to get the hyperlinks from a URL
def get_hyperlinks(url):
    
    # Try to open the URL and read the HTML
    try:
        # Open the URL and read the HTML
        with urllib.request.urlopen(url) as response:

            # If the response is not HTML, return an empty list
            if not response.info().get('Content-Type').startswith("text/html"):
                return []
            
            # Decode the HTML
            html = response.read().decode('utf-8')
    except Exception as e:
        print(e)
        return []

    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks

# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    clean_links = []
    for link in set(get_hyperlinks(url)):
        clean_link = None

        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            # Parse the URL and check if the domain is the same
            url_obj = urlparse(link)
            if url_obj.netloc == local_domain:
                clean_link = link

        # If the link is not a URL, check if it is a relative link
        else:
            if link.startswith("/"):
                link = link[1:]
            elif link.startswith("#") or link.startswith("mailto:"):
                continue
            clean_link = "https://" + local_domain + "/" + link

        if clean_link is not None:
            if clean_link.endswith("/"):
                clean_link = clean_link[:-1]
            clean_links.append(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(set(clean_links))


def crawl(url):
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Create a queue to store the URLs to crawl
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = set([url])

    # Create a directory to store the text files
    if not os.path.exists("text/"):
            os.mkdir("text/")

    if not os.path.exists("text/"+local_domain+"/"):
            os.mkdir("text/" + local_domain + "/")

    # Create a directory to store the csv files
    if not os.path.exists("processed"):
            os.mkdir("processed")

    # While the queue is not empty, continue crawling
    while queue:

        # Get the next URL from the queue
        url = queue.pop()
        print(url) # for debugging and to see the progress

        # Save text from the url to a <url>.txt file
        with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w") as f:

            # Get the text from the URL using BeautifulSoup
            soup = BeautifulSoup(requests.get(url).text, "html.parser")

            # Get the text but remove the tags
            text = soup.get_text()

            # If the crawler gets to a page that requires JavaScript, it will stop the crawl
            if ("You need to enable JavaScript to run this app." in text):
                print("Unable to parse page " + url + " due to JavaScript being required")
            
            # Otherwise, write the text to the file in the text directory
            f.write(text)

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)

crawl(full_url)

https://slack.com/intl/en-in/help/articles
https://slack.com/enterprise
http://slack.com/demo
https://slack.com/intl/ja-jp/demo?unknown_domain=1
https://slack.com/intl/ja-jp/what-is-slack
https://slack.com/intl/es-la/what-is-slack
https://slack.com/intl/es-la/get
https://slack.com/intl/en-in/downloads/mac
https://slack.com/intl/zh-tw/downloads/mac
https://slack.com/intl/zh-tw/accessibility
https://slack.com/intl/zh-tw/help/articles/360019434914-Use-dark-mode-in-Slack
https://slack.com/intl/zh-tw/help/categories/200122103
https://slack.com/intl/zh-tw/help/articles/360060363633-管理工作空間的待處理邀請和邀請連結
'ascii' codec can't encode characters in position 43-59: ordinal not in range(128)
https://slack.com/intl/zh-tw/help/articles/8328303095443-瞭解-Slack-中的頻道管理員
'ascii' codec can't encode characters in position 44-45: ordinal not in range(128)
https://slack.com/intl/zh-tw/help/articles/115005855543-Slack-的廠商與匯款詳細資料
'ascii' codec can't encode characters in position 49-58: ordinal not in range(128)
htt

KeyboardInterrupt: 

In [6]:
import os
import pandas as pd
import csv

def remove_newlines(text):
    """Remove newlines from text and replace with spaces"""
    if text is None:
        return ""
    text = str(text)
    text = text.replace('\n', ' ')
    text = text.replace('\\n', ' ')
    text = ' '.join(text.split())
    return text

def process_text_files(domain):
    """Process text files for a given domain"""
    
    # Create directories if they don't exist
    os.makedirs(f"text/{domain}", exist_ok=True)
    os.makedirs("processed", exist_ok=True)
    
    # Create a list to store the text files
    texts = []
    
    # Get all the text files in the text directory
    text_dir = f"text/{domain}/"
    for file in os.listdir(text_dir):
        # Skip directories and hidden files
        file_path = os.path.join(text_dir, file)
        if os.path.isfile(file_path) and not file.startswith('.'):
            try:
                # Open the file and read the text
                with open(file_path, "r", encoding='utf-8') as f:
                    text = f.read()
                    
                    # Process the filename - keep the original name without extension
                    # Example: 'slack.com_about_leadership.txt' -> 'slack.com_about_leadership'
                    processed_name = os.path.splitext(file)[0]
                    texts.append((processed_name, text))
                    
            except Exception as e:
                print(f"Error processing file {file}: {str(e)}")
    
    # Create a dataframe from the list of texts
    df = pd.DataFrame(texts, columns=['fname', 'text'])
    
    # Process only the text column, leaving fname as is
    df['text'] = df['text'].apply(remove_newlines)
    
    # Save to CSV with proper escaping
    output_path = 'processed/scraped.csv'
    try:
        df.to_csv(output_path, 
                 index=False,
                 escapechar='\\',
                 doublequote=True,
                 encoding='utf-8',
                 quoting=csv.QUOTE_ALL)
        print(f"Processed {len(texts)} files and saved to {output_path}")
    except Exception as e:
        print(f"Error saving CSV: {str(e)}")
        try:
            # Replace problematic characters
            df['text'] = df['text'].replace(r'[\x00-\x1F\x7F-\x9F]', '', regex=True)
            df.to_csv(output_path,
                     index=False,
                     encoding='utf-8',
                     escapechar='\\')
            print(f"Saved using alternative method to {output_path}")
        except Exception as e2:
            print(f"Both save attempts failed: {str(e2)}")
            df.to_json('processed/scraped.json', orient='records', lines=True)
            print("Saved as JSON instead")
    
    return df

# Usage example
try:
    domain = "slack.com"  # or whatever domain you're processing
    df = process_text_files(domain)
    
    print("\nFirst few rows of the processed data:")
    print(df.head())
    
    print("\nDataset Statistics:")
    print(f"Total documents: {len(df)}")
    print(f"Average document length: {df['text'].str.len().mean():.0f} characters")
    
    # Print some example filenames to verify
    print("\nExample filenames:")
    print(df['fname'].head().tolist())
    
except Exception as e:
    print(f"Error in main execution: {str(e)}")

Processed 69 files and saved to processed/scraped.csv

First few rows of the processed data:
                                               fname  \
0                 slack.com_intl_zh-tw_downloads_mac   
1  slack.com_intl_en-gb_help_requests_new?aid=115...   
2                               slack.com_enterprise   
3  slack.com_help_articles_115002037526-Minimum-r...   
4  slack.com_intl_fr-fr_salesforce-slack-integration   

                                                text  
0  Mac | 下載 | Slack跳到主要內容功能協作頻道整理團隊與工作Slack Conne...  
1  Contact us | SlackSkip to main content MenuSig...  
2  Slack for enterprises | SlackSkip to main cont...  
3  System requirements for using Slack | SlackSki...  
4  Intégrations Slack et Salesforce | SlackPasser...  

Dataset Statistics:
Total documents: 69
Average document length: 8466 characters

Example filenames:
['slack.com_intl_zh-tw_downloads_mac', 'slack.com_intl_en-gb_help_requests_new?aid=115002037526&from_hc=1&src=article', 'slack.com_enterpr

In [8]:
df.head()

Unnamed: 0,fname,text
0,slack.com_intl_zh-tw_downloads_mac,Mac | 下載 | Slack跳到主要內容功能協作頻道整理團隊與工作Slack Conne...
1,slack.com_intl_en-gb_help_requests_new?aid=115...,Contact us | SlackSkip to main content MenuSig...
2,slack.com_enterprise,Slack for enterprises | SlackSkip to main cont...
3,slack.com_help_articles_115002037526-Minimum-r...,System requirements for using Slack | SlackSki...
4,slack.com_intl_fr-fr_salesforce-slack-integration,Intégrations Slack et Salesforce | SlackPasser...


In [9]:
from langchain_community.document_loaders import DataFrameLoader

loader = DataFrameLoader(df, page_content_column="text")

In [11]:
documents = loader.load();


In [19]:
documents[1]

Document(metadata={'fname': 'slack.com_intl_en-gb_help_requests_new?aid=115002037526&from_hc=1&src=article'}, page_content='Contact us | SlackSkip to main content MenuSign inGet startedCreate workspaceGetting startedUsing SlackWorkspace administrationSlack status Contact usAlready use Slack? Sign in so we can tailor your support experience. If that’s not possible, we’d still like to hear from you.Your chat is in progressDon’t worry about taking notes – we’ll send you a copy of the chat at the end of the conversation.Thank you for contacting us!We’ve emailed you a copy of the chat. If you need to get back in touch, you can reply to the email and continue the conversation.VISIT THE HELP CENTRESEE YOUR SUPPORT HISTORYPrivacy policyYour email addressTopicChangeSelect a topic:Audio and videoBilling and subscriptionsConnection problemsManaging channelsManaging membersNotificationsSigning inSlack ConnectOr tell us what you need help with:Clear textRelated questionsRelated articlesCan you give

In [35]:
from IPython.display import display
from IPython.display import Markdown
import textwrap
def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
to_markdown(documents[1].page_content + "\n\nSource: " + documents[1].metadata["fname"])


> Contact us | SlackSkip to main content MenuSign inGet startedCreate workspaceGetting startedUsing SlackWorkspace administrationSlack status Contact usAlready use Slack? Sign in so we can tailor your support experience. If that’s not possible, we’d still like to hear from you.Your chat is in progressDon’t worry about taking notes – we’ll send you a copy of the chat at the end of the conversation.Thank you for contacting us!We’ve emailed you a copy of the chat. If you need to get back in touch, you can reply to the email and continue the conversation.VISIT THE HELP CENTRESEE YOUR SUPPORT HISTORYPrivacy policyYour email addressTopicChangeSelect a topic:Audio and videoBilling and subscriptionsConnection problemsManaging channelsManaging membersNotificationsSigning inSlack ConnectOr tell us what you need help with:Clear textRelated questionsRelated articlesCan you give us more details?TimezoneGET HELPPrivacy policyDiscover more.From channels to search, learn how Slack works from top to bottom.Visit the Help CentreChange regionSelecting a different region will change the language and content of slack.com.AmericasLatinoamérica (español)Brasil (português)United States (English)EuropeDeutschland (Deutsch)España (español)France (français)Italia (italiano)United Kingdom (English)Asia Pacific简体中文繁體中文India (English)日本 (日本語)대한민국 (한국어)Change regionProductProductWatch demoPricingPaid vs FreeAccessibilityFeatured releasesChange logStatusWhy Slack?Why Slack?Slack vs emailEnterpriseSmall businessProductivityTask managementScaleTrustFeaturesFeaturesChannelsSlack ConnectWorkflow BuilderMessagingHuddlesCanvasListsClipsSearchApps & integrationsFile sharingSlack AISecurityEnterprise Key ManagementSlack AtlasSee all featuresSolutionsSolutionsEngineeringITCustomer serviceSalesProject managementMarketingSecurityManufacture, auto and energyTechnologyMediaFinancial servicesRetailPublic sectorEducationHealth and life sciencesSee all solutionsResourcesResourcesHelp CentreWhat’s newResources librarySlack blogCommunityCustomer storiesEventsDevelopersPartnersPartner offersSlack MarketplaceSlack CertifiedCompanyCompanyAbout usNewsMedia kitBrand centreCareersSlack shopEngineering blogDesign blogContact usDownload SlackPrivacyTermsCookie preferencesYour privacy choices©2024 Slack Technologies, LLC, a Salesforce company. All rights reserved. Various trademarks held by their respective owners.
> 
> Source: slack.com_intl_en-gb_help_requests_new?aid=115002037526&from_hc=1&src=article

In [20]:
from langchain import PromptTemplate
from langchain.docstore.document import Document
from langchain.document_loaders import WebBaseLoader
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.vectorstores import Chroma
from typing import Optional


USER_AGENT environment variable not set, consider setting it to identify your requests.


In [30]:
import getpass
import os
#AIzaSyCdeeO1ma2cEyvbK20F-6SMHTwzMnzU84U
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = "AIzaSyCdeeO1ma2cEyvbK20F-6SMHTwzMnzU84U"

In [42]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector = embeddings.embed_query("hello, world!")
vector[:5]
#gemini_embeddings = VertexAIEmbeddings(model_name="textembedding-gecko@003")

[0.05168594419956207,
 -0.030764883384108543,
 -0.03062233328819275,
 -0.02802734263241291,
 0.01813093200325966]

In [33]:
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    },
)

  from .autonotebook import tqdm as notebook_tqdm


In [45]:
# Save to disk
vectorstore = Chroma.from_documents(
    documents=documents,  # Data
    embedding=gemini_embeddings,  # Embedding model
    persist_directory="../chroma_db1",  # Directory to save data
)    

In [None]:
#vectorstore.get()


In [47]:
# https://python.langchain.com/v0.2/docs/integrations/vectorstores/chroma/
#https://python.langchain.com/v0.2/api_reference/chroma/vectorstores/langchain_chroma.vectorstores.Chroma.html#langchain_chroma.vectorstores.Chroma.as_retriever
# Load from disk
vectorstore_disk = Chroma(
    persist_directory="../chroma_db1",  # Directory of db
    embedding_function=gemini_embeddings,  # Embedding model
)
# Get the Retriever interface for the store to use later.
# When an unstructured query is given to a retriever it will return documents.
# Read more about retrievers in the following link.
# https://python.langchain.com/docs/modules/data_connection/retrievers/
# https://python.langchain.com/docs/how_to/vectorstore_retriever/
retriever = vectorstore_disk.as_retriever(search_kwargs={"k": 10})

  vectorstore_disk = Chroma(


In [48]:
retriever


VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x10c4649a0>, search_kwargs={'k': 10})

In [49]:
# Prompt template to query Gemini
llm_prompt_template = """You are an assistant for question-answering tasks.
Use the following context to answer the question.
If you don't know the answer, just say that you don't know.
Keep the answer concise.\n
Add the metadata or source of the document where you get the answer. \n
Question: {question} \nContext: {context} \nAnswer:"""

llm_prompt = PromptTemplate.from_template(llm_prompt_template)

print(llm_prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} template="You are an assistant for question-answering tasks.\nUse the following context to answer the question.\nIf you don't know the answer, just say that you don't know.\nKeep the answer concise.\n\nAdd the metadata or source of the document where you get the answer. \n\nQuestion: {question} \nContext: {context} \nAnswer:"


In [50]:
# Combine data from documents to readable string format.
def format_docs(anonymized_docs):
    return "\n\n".join(doc.page_content for doc in anonymized_docs)


# Create stuff documents chain using LCEL.
#
# This is called a chain because you are chaining together different elements
# with the LLM. In the following example, to create the stuff chain, you will
# combine the relevant context from the website data matching the question, the
# LLM model, and the output parser together like a chain using LCEL.
#
# The chain implements the following pipeline:
# 1. Extract the data relevant to the question from the Chroma
#    vector store and save it to the variable `context`.
# 2. `RunnablePassthrough` option to provide `question` when invoking
#    the chain.
# 3. The `context` and `question` are then passed to the prompt where they
#    are populated in the respective variables.
# 4. This prompt is then passed to the LLM (`gemini-pro`).
# 5. Output from the LLM is passed through an output parser
#    to structure the model's response.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | llm_prompt
    | llm
    | StrOutputParser()
)
     

In [51]:
# This method is typically used to ask a question to the RAG (Retrieval-Augmented Generation) model containing Embeddings created with anonymized PII data.
# The RAG model is a type of question-answering model that retrieves relevant documents from a corpus and then generates an answer based on those documents.
# The result of this method call will be the answer to the question as generated by the RAG model before De-anonymization - This is not what we want.

rag_chain.invoke("What is Slack and how does it work?")

'Slack is a productivity platform that connects people, knowledge, and tools in a single place. It uses channels for team communication, offers searchable message history, integrates with various apps, and provides automation features like Workflow Builder, along with a paid AI add-on for enhanced search and knowledge access.\n\nSource: [https://slack.com/intl/en-gb/what-is-slack](https://slack.com/intl/en-gb/what-is-slack)\n'