# Web Data Crawling 

In [1]:
import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
import time

# Regex pattern to match a URL
HTTP_URL_PATTERN = r'^http[s]*://.+'

class HyperlinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.hyperlinks = []

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == "a" and "href" in attrs:
            self.hyperlinks.append(attrs["href"])

def get_hyperlinks(url):
    try:
        with urllib.request.urlopen(url) as response:
            if not response.info().get('Content-Type').startswith("text/html"):
                return []
            html = response.read().decode('utf-8')
    except Exception as e:
        print(f"Error getting hyperlinks: {str(e)}")
        return []

    parser = HyperlinkParser()
    parser.feed(html)
    return parser.hyperlinks

def get_domain_hyperlinks(local_domain, url):
    clean_links = []
    for link in set(get_hyperlinks(url)):
        clean_link = None

        if re.search(HTTP_URL_PATTERN, link):
            url_obj = urlparse(link)
            if url_obj.netloc == local_domain:
                clean_link = link
        else:
            if link.startswith("/"):
                link = link[1:]
            elif link.startswith("#") or link.startswith("mailto:"):
                continue
            clean_link = "https://" + local_domain + "/" + link

        if clean_link is not None:
            if clean_link.endswith("/"):
                clean_link = clean_link[:-1]
            clean_links.append(clean_link)

    return list(set(clean_links))

def crawl(url, max_pages=50, max_depth=3):
    local_domain = urlparse(url).netloc
    queue = deque([(url, 0)])  # (url, depth) pairs
    seen = set([url])
    processed_pages = 0

    # Create necessary directories
    os.makedirs("text", exist_ok=True)
    os.makedirs(f"text/{local_domain}", exist_ok=True)
    os.makedirs("processed", exist_ok=True)

    print(f"\nStarting crawl of {local_domain}")
    print(f"Maximum pages to crawl: {max_pages}")
    print(f"Maximum depth: {max_depth}\n")

    while queue and processed_pages < max_pages:
        url, depth = queue.pop()
        
        if depth >= max_depth:
            continue

        print(f"Crawling: {url}")
        print(f"Page {processed_pages + 1} of {max_pages} (Depth: {depth})")

        try:
            # Get the page content
            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.text, "html.parser")
            text = soup.get_text()

            # Skip if JavaScript is required
            if "You need to enable JavaScript to run this app." in text:
                print(f"JavaScript required for: {url}")
                continue

            # Create filename from URL
            filename = url[8:].replace("/", "_")
            if len(filename) > 100:  # Limit filename length
                filename = filename[:100]

            # Save the content
            file_path = f'text/{local_domain}/{filename}.txt'
            with open(file_path, "w", encoding='utf-8') as f:
                f.write(text)

            processed_pages += 1
            print(f"Saved content to: {file_path}")

            # Check if we've reached the page limit
            if processed_pages >= max_pages:
                print(f"\nReached maximum pages limit ({max_pages})")
                break

            # Get new URLs to process
            new_links = get_domain_hyperlinks(local_domain, url)
            print(f"Found {len(new_links)} new links")

            # Add new URLs to the queue
            for link in new_links:
                if link not in seen:
                    seen.add(link)
                    queue.append((link, depth + 1))

            # Optional: Add a small delay to be nice to the server
            time.sleep(0.5)

        except Exception as e:
            print(f"Error processing {url}: {str(e)}")
            continue

    print(f"\nCrawling completed:")
    print(f"Domain: {local_domain}")
    print(f"Total pages processed: {processed_pages}")
    print(f"Remaining URLs in queue: {len(queue)}")
    
    return local_domain


if __name__ == "__main__":
    # Example usage
    test_url = "https://slack.com/intl/en-in/help/articles/115004071768-What-is-Slack"
    domain = crawl(test_url, max_pages=50, max_depth=3)


Starting crawl of slack.com
Maximum pages to crawl: 50
Maximum depth: 3

Crawling: https://slack.com/intl/en-in/help/articles/115004071768-What-is-Slack
Page 1 of 50 (Depth: 0)
Saved content to: text/slack.com/slack.com_intl_en-in_help_articles_115004071768-What-is-Slack.txt
Found 131 new links
Crawling: https://slack.com/help/categories/200122103
Page 2 of 50 (Depth: 1)
Saved content to: text/slack.com/slack.com_help_categories_200122103.txt
Found 290 new links
Crawling: https://slack.com/intl/en-in/help/articles/360057638533-Understand-the-data-in-your-Slack-analytics-dashboard
Page 3 of 50 (Depth: 2)
Saved content to: text/slack.com/slack.com_intl_en-in_help_articles_360057638533-Understand-the-data-in-your-Slack-analytics-dashboar.txt
Found 123 new links
Crawling: https://slack.com/intl/en-in/help/articles/4408191650451-Slack-Connect--Provide-guidelines-for-members-of-your-organisation
Page 4 of 50 (Depth: 2)
Saved content to: text/slack.com/slack.com_intl_en-in_help_articles_4408

## Data Processing And Cleaning

In [2]:
import os
import pandas as pd
import csv

def remove_newlines(text):
    if text is None:
        return ""
    text = str(text)
    text = text.replace('\n', ' ')
    text = text.replace('\\n', ' ')
    return ' '.join(text.split())

def process_text_files(domain):
    os.makedirs(f"text/{domain}", exist_ok=True)
    os.makedirs("processed", exist_ok=True)
    
    texts = []
    text_dir = f"text/{domain}/"
    
    for file in os.listdir(text_dir):
        file_path = os.path.join(text_dir, file)
        if os.path.isfile(file_path) and not file.startswith('.'):
            try:
                with open(file_path, "r", encoding='utf-8') as f:
                    text = f.read()
                    processed_name = os.path.splitext(file)[0]
                    texts.append((processed_name, text))
            except Exception as e:
                print(f"Error processing file {file}: {str(e)}")
    
    df = pd.DataFrame(texts, columns=['fname', 'text'])
    df['text'] = df['text'].apply(remove_newlines)
    
    output_path = 'processed/scraped.csv'
    try:
        df.to_csv(output_path, 
                 index=False,
                 escapechar='\\',
                 doublequote=True,
                 encoding='utf-8',
                 quoting=csv.QUOTE_ALL)
        print(f"Processed {len(texts)} files and saved to {output_path}")
    except Exception as e:
        print(f"Error saving CSV: {str(e)}")
        df['text'] = df['text'].replace(r'[\x00-\x1F\x7F-\x9F]', '', regex=True)
        df.to_csv(output_path,
                 index=False,
                 encoding='utf-8',
                 escapechar='\\')
    
    return df

if __name__ == "__main__":
    # Example usage
    df = process_text_files(domain)

Processed 50 files and saved to processed/scraped.csv


In [12]:
df.head()

Unnamed: 0,fname,text
0,slack.com_intl_en-in_help_articles_202878523-T...,Try a paid Slack subscription for free | Slack...
1,slack.com_intl_en-in_help_articles_36003563517...,Deploy Slack for macOS | SlackSkip to main con...
2,slack.com_intl_en-in_help_articles_36004216657...,Allow an org domain with Enterprise mobility m...
3,slack.com_intl_en-in_help_articles_28932867593...,Manage list settings in Slack | SlackSkip to m...
4,slack.com_intl_en-in_help_articles_36003617735...,Add display name guidelines | SlackSkip to mai...


## Question Answering Agent Using Gemini, ChromaDB, Langchain

In [4]:
import os
from langchain_community.document_loaders import DataFrameLoader
#from langchain.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.docstore.document import Document
#from langchain import HarmBlockThreshold, HarmCategory
from langchain_community.vectorstores import Chroma
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)
google_api_key = "AIzaSyCdeeO1ma2cEyvbK20F-6SMHTwzMnzU84U"
class QASystem:
    def __init__(self, google_api_key):
        os.environ["GOOGLE_API_KEY"] = google_api_key
        self.vectorstore = None
        self.rag_chain = None
    
    def initialize_vectorstore(self, df):
        try:
            loader = DataFrameLoader(df, page_content_column="text")
            documents = loader.load()
            
            gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
            llm = ChatGoogleGenerativeAI(
                model="gemini-1.5-pro",
                safety_settings={
                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                }
            )
            
            self.vectorstore = Chroma.from_documents(
                documents=documents,
                embedding=gemini_embeddings,
                persist_directory="chroma_db"
            )
            
            retriever = self.vectorstore.as_retriever(search_kwargs={"k": 10})
            
            template = """You are an assistant for question-answering tasks.
            Use the following context to answer the question.
            If you don't know the answer, just say that you don't know.
            Keep the answer concise.
            Add the metadata or source of the document where you get the answer.
            
            Question: {question} 
            Context: {context} 
            Answer:"""
            
            prompt = PromptTemplate.from_template(template)
            
            def format_docs(docs):
                return "\n\n".join(doc.page_content for doc in docs)
            
            self.rag_chain = (
                {"context": retriever | format_docs, "question": RunnablePassthrough()}
                | prompt
                | llm
                | StrOutputParser()
            )
            
            return True
            
        except Exception as e:
            print(f"Error initializing vectorstore: {str(e)}")
            return False
    
    def get_answer(self, question):
        if self.rag_chain is None:
            return "System not initialized. Please initialize first."
        try:
            return self.rag_chain.invoke(question)
        except Exception as e:
            return f"Error generating answer: {str(e)}"

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
QASystem = QASystem("AIzaSyCdeeO1ma2cEyvbK20F-6SMHTwzMnzU84U")

In [9]:
QASystem.initialize_vectorstore(df)

True

In [11]:
QASystem.get_answer("what is slack?")

'Slack is the operating system for work. It brings together people, processes, data, agents, and AI into a single conversational interface, transforming how organizations achieve their goals.  It centralizes projects, information, and data in dedicated spaces called channels, allowing for secure collaboration and quicker completion of work. It also integrates with over 2,600 enterprise-ready apps and allows connections with internal tools.  Slack aims to be the primary location for work completion by integrating people, apps, and data. Source: [What is Slack? | Slack](https://slack.com/help/articles/115000171486-What-is-Slack-)\n'