# Basic Crawler

In [1]:
import os
import requests
from bs4 import BeautifulSoup
import time
from queue import Queue
import random
import aiohttp
import asyncio
import logging
from requests.exceptions import RequestException
import markdownify
from urllib.parse import urljoin
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


class WebCrawler:
    def __init__(self, max_depth=2, delay=2):
        self.current_url = None
        self.max_depth = max_depth
        self.delay = delay
        self.visited = set()
        self.queue = Queue()
        self.rate_limit = 10  # Requests per second
        self.last_request_time = time.time()
        self.base_dir = "/home/ubuntu/open-llm/crawler_dump"

    def wait_for_rate_limit(self):
        current_time = time.time()
        elapsed_time = current_time - self.last_request_time
        if elapsed_time < 1 / self.rate_limit:
            sleep_time = 1 / self.rate_limit - elapsed_time
            time.sleep(sleep_time)
        self.last_request_time = current_time

    def is_allowed(self, url):
        robots_txt = requests.get(f"{url}/robots.txt").text
        allowed_domains = []
        for line in robots_txt.split('\n'):
            if line.startswith("Allow:") or line.startswith("User-agent: *"):
                allowed_domains.append(line.split()[1])
        
        return '*' in allowed_domains or any(domain in allowed_domains for domain in self.visited)

    def wait_between_requests(self):
        """Random delay between requests"""
        time.sleep(random.uniform(1, 5))

    def convert_html_to_markdown(self, html_content):
        """Convert HTML content to markdown"""
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        
        # Get text
        text = soup.get_text()
        
        # Break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # Break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Drop blank lines
        text = '\n'.join(chunk for chunk in chunks if chunk)
        
        # Convert to markdown
        converter = markdownify.MarkdownConverter()
        markdown_text = converter.convert(text)
        
        return markdown_text

    def save_page(self, url, content):
        """Save the page content as markdown"""
        dir_path = os.path.join(self.base_dir, *url.split('/')[2:]).replace(".html","")
        
        # Create directories if they don't exist
        os.makedirs(dir_path, exist_ok=True)
        
        # Save the markdown file
        md_file = os.path.join(dir_path, f"{os.path.basename(url)}.md")
        with open(md_file, "w", encoding="utf-8") as f:
            f.write(self.convert_html_to_markdown(content))

    def save_page_with_images(self, url, content):
        soup = BeautifulSoup(content, 'html.parser')
        
        # Save the HTML file
        self.save_page(url, content)
        
        # Find all images
        images = soup.find_all('img')
        for img in images:
            src = img.get('src')
            if src and src.startswith(('http://', 'https://')):
                image_url = src
            elif src.startswith('/'):
                # Assume relative path, join with base URL
                image_url = urljoin(url, src)
            else:
                continue
            
            # Extract filename from URL
            filename = os.path.basename(image_url)
            
            # Save the image
            image_dir = os.path.join(self.base_dir, *image_url.split('/')[2:])
            os.makedirs(image_dir, exist_ok=True)
            image_path = os.path.join(image_dir, filename)
            response = requests.get(image_url)
            with open(image_path, 'wb') as f:
                f.write(response.content)


    def crawl(self, url):
        try:
        #     if not self.is_allowed(url):
        #         logging.warning(f"Not allowed to crawl: {url}")
        #         return
            if url not in self.visited and url != self.current_url:
                self.visited.add(url)
                self.current_url = url
                logging.info(f"Crawling: {url}")
                
                response = requests.get(url, timeout=10, allow_redirects=True)
                response.raise_for_status()  # Raise an exception for bad status codes
                
                soup = BeautifulSoup(response.text, 'html.parser')
                
                links = soup.find_all('a')
                for link in links:
                    href = link.get('href')
                    # Convert relative URLs to absolute URLs
                    if not href.startswith(('http://', 'https://')):
                        href = urljoin(url, href)

                    # logging.info(f"HREF:{href}")
                    
                    # Add url to Queue
                    self.queue.put(href)
                    self.save_page(url, response.text)
                
                self.wait_between_requests()
                
                while not self.queue.empty() and self.max_depth > 0:
                    new_url = self.queue.get()
                    self.crawl(new_url)
                    self.max_depth -= 1
            
                # Save the current page
                
            
        except RequestException as e:
            logging.error(f"Error fetching {url}: {e}")
        except Exception as e:
            logging.exception(f"Unexpected error while crawling {url}")

    
    async def crawl_async(self, url):
        if not self.is_allowed(url):
            print(f"Not allowed to crawl: {url}")
            return
        
        if url not in self.visited:
            self.visited.add(url)
            print(f"Crawling: {url}")
            
            async with aiohttp.ClientSession() as session:
                try:
                    async with session.get(url, timeout=10) as response:
                        response.raise_for_status()
                        
                        html = await response.text()
                        soup = BeautifulSoup(html, 'html.parser')
                        
                        links = soup.find_all('a')
                        for link in links:
                            href = link.get('href')
                            if href and href.startswith('http'):
                                self.queue.put(href)
                    
                    await self.wait_between_requests()
                    
                    if self.queue.qsize() > 0 and self.max_depth > 0:
                        url_to_crawl = self.queue.get()
                        await self.crawl_async(url_to_crawl)
                        self.max_depth -= 1
                
                except aiohttp.ClientError as e:
                    print(f"Error fetching {url}: {e}")
                
                except Exception as e:
                    print(f"Unexpected error while crawling {url}: {e}")

    async def start_crawling(self, initial_url):
        self.queue.put(initial_url)
        await self.crawl_async(initial_url)


# Usage
crawler = WebCrawler(max_depth=4)
crawler.crawl('https://raag-hindustani.com/')


2024-09-27 10:34:08,594 - INFO - Crawling: https://raag-hindustani.com/
2024-09-27 10:34:11,246 - INFO - Crawling: https://raag-hindustani.com/Introduction.html
2024-09-27 10:34:16,506 - INFO - Crawling: https://raag-hindustani.com/Notes.html
2024-09-27 10:34:19,094 - INFO - Crawling: https://raag-hindustani.com/Scales1.html
2024-09-27 10:34:23,488 - INFO - Crawling: https://raag-hindustani.com/Scales2.html
2024-09-27 10:34:27,144 - INFO - Crawling: https://raag-hindustani.com/Scales3.html
2024-09-27 10:34:29,827 - INFO - Crawling: https://raag-hindustani.com/Scales4.html
2024-09-27 10:34:34,076 - INFO - Crawling: https://raag-hindustani.com/Scales5.html
2024-09-27 10:34:38,262 - INFO - Crawling: https://raag-hindustani.com/Rhythm.html
2024-09-27 10:34:41,569 - INFO - Crawling: https://raag-hindustani.com/Embellishment.html
2024-09-27 10:34:44,375 - INFO - Crawling: https://raag-hindustani.com/Notation.html
2024-09-27 10:34:47,235 - INFO - Crawling: https://raag-hindustani.com/SimpleSo

In [22]:
import os
import shutil

def copy_md_files(source_folder, target_folder):
    # Ensure the target folder exists
    os.makedirs(target_folder, exist_ok=True)

    # Iterate through all files in the source folder and its subfolders
    for root, dirs, files in os.walk(source_folder):
        for file in files:
            if file.endswith('.md'):  # Check if it's an .md file
                source_path = os.path.join(root, file)
                target_path = os.path.join(target_folder, file)
                
                # Copy the .md file to the target folder
                shutil.copy2(source_path, target_path)
                print(f"Copied: {source_path} -> {target_path}")
copy_md_files(source_folder="/home/ubuntu/open-llm/crawler_dump/raag-hindustani.com",target_folder="/home/ubuntu/open-llm/crawler_dump/raag-hindustani-parsed")

Copied: /home/ubuntu/open-llm/crawler_dump/raag-hindustani.com/.md -> /home/ubuntu/open-llm/crawler_dump/raag-hindustani-parsed/.md
Copied: /home/ubuntu/open-llm/crawler_dump/raag-hindustani.com/Scales3/Scales3.html.md -> /home/ubuntu/open-llm/crawler_dump/raag-hindustani-parsed/Scales3.html.md
Copied: /home/ubuntu/open-llm/crawler_dump/raag-hindustani.com/LearningTools/LearningTools.html.md -> /home/ubuntu/open-llm/crawler_dump/raag-hindustani-parsed/LearningTools.html.md
Copied: /home/ubuntu/open-llm/crawler_dump/raag-hindustani.com/Scales4/Scales4.html.md -> /home/ubuntu/open-llm/crawler_dump/raag-hindustani-parsed/Scales4.html.md
Copied: /home/ubuntu/open-llm/crawler_dump/raag-hindustani.com/Improvisation/Improvisation.html.md -> /home/ubuntu/open-llm/crawler_dump/raag-hindustani-parsed/Improvisation.html.md
Copied: /home/ubuntu/open-llm/crawler_dump/raag-hindustani.com/Scales2/Scales2.html.md -> /home/ubuntu/open-llm/crawler_dump/raag-hindustani-parsed/Scales2.html.md
Copied: /hom

# RAG

In [43]:
import boto3
import mysql.connector
from langchain_community.embeddings.bedrock import BedrockEmbeddings
from langchain_community.chat_models.bedrock import BedrockChat
from dotenv import find_dotenv, load_dotenv
import os
load_dotenv(find_dotenv())

genrative_model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
embedding_model_id = "amazon.titan-embed-text-v1"

CONNECT_TIMEOUT = 1000
READ_TIMEOUT = 1000
BOTO_SERVICE = "bedrock-runtime"
REGION = "us-east-1"
ENDPOINT_URL = "https://bedrock-runtime.us-east-1.amazonaws.com"


boto3_client = boto3.client(
    BOTO_SERVICE,
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    region_name=os.getenv("REGION"),
    endpoint_url=ENDPOINT_URL,
)


llm_genrative = BedrockChat(
    model_id=genrative_model_id,
    client=boto3_client,
)

llm_embedding = BedrockEmbeddings(
    model_id=embedding_model_id, client=boto3_client
)


In [24]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document
import os
import nltk
nltk.download('punkt_tab')
markdown_dir = "/home/ubuntu/open-llm/crawler_dump/raag-hindustani-parsed"
docs = []
for file in os.listdir(markdown_dir):
    loader = UnstructuredMarkdownLoader(os.path.join(markdown_dir,file),mode="elements")
    data = loader.load()
    docs.append(data)


[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


'NarrativeText'

In [42]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ],
    chunk_size=512,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
texts = [doc[0].page_content for doc in docs]
splits = text_splitter.create_documents(texts)
print(splits[27].page_content)

Khatka
Andolan
An andolan is a slow oscillation applied to a note. It usually features in ragas that use microtones and is applied to the notes in those ragas that involve the use of microtones. Microtones are unstable pitches that fall between two notes and are difficult to sustain. Artists can use this natural instability to their advantage by mastering the andolan and oscillating the note in a controlled fashion, rather like a graceful tight-rope walker.
Andolan


In [46]:
from langchain_community.vectorstores import FAISS

db = FAISS.from_documents(splits, llm_embedding)

In [50]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
retriever = db.as_retriever()
prompt = hub.pull("rlm/rag-prompt")
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm_genrative
    | StrOutputParser()
)

rag_chain.invoke("What are the notes of raga yaman?")




'Based on the context provided, the notes of raga Yaman are not explicitly stated. However, we can infer that Yaman uses the sharp variant of the fourth note (Ma or M) in the Indian classical music scale. The context mentions that Yaman belongs to the Kalyan scale and uses a key signature with the G-major scale when notating compositions in this raga. But it is important to note that Yaman is not the same as the G-major scale, as it begins on the note C and incorporates the sharpened fourth note.'