In [2]:
import requests
from bs4 import BeautifulSoup
import os

def scrape_azure_documentation(base_url, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the links to documentation pages
    links = soup.find_all('a', href=True)

    for link in links:
        href = link['href']
        if href.startswith('/en-us/azure/'):
            full_url = f"https://docs.microsoft.com{href}"
            print(full_url)
            scrape_documentation_page(full_url, output_dir)

def scrape_documentation_page(url, output_dir):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the page title
    title = soup.find('h1').get_text(strip=True)
    filename = f"{title}.txt"
    filepath = os.path.join(output_dir, filename)

    # Extract the main content of the page
    content = soup.find('main')

    with open(filepath, 'w', encoding='utf-8') as file:
        if content:
            file.write(content.get_text(separator='\n', strip=True))
        else:
            file.write("No content found")

    print(f"Scraped {title}")

if __name__ == "__main__":
    base_url = "https://docs.microsoft.com/en-us/azure/"
    output_dir = "azure_docs"
    scrape_azure_documentation(base_url, output_dir)


https://docs.microsoft.com/en-us/azure/developer/
Scraped Azure developer documentation
https://docs.microsoft.com/en-us/azure/architecture/
Scraped Azure Architecture Center
https://docs.microsoft.com/en-us/azure/cloud-adoption-framework/
Scraped Microsoft Cloud Adoption Framework for Azure
https://docs.microsoft.com/en-us/azure/bot-service/
Scraped Azure AI Bot Service documentation
https://docs.microsoft.com/en-us/azure/bot-service/
Scraped Azure AI Bot Service documentation
https://docs.microsoft.com/en-us/azure/azure-video-indexer/
Scraped Learn about Azure AI Video Indexer
https://docs.microsoft.com/en-us/azure/azure-video-indexer/
Scraped Learn about Azure AI Video Indexer
https://docs.microsoft.com/en-us/azure/devops/artifacts/
Scraped Azure Artifacts documentation
https://docs.microsoft.com/en-us/azure/devops/artifacts/
Scraped Azure Artifacts documentation
https://docs.microsoft.com/en-us/azure/devops/boards/
Scraped Azure Boards documentation
https://docs.microsoft.com/en-us

In [3]:
!git clone https://github.com/MicrosoftDocs/azure-docs.git

Cloning into 'azure-docs'...
remote: Enumerating objects: 7448541, done.[K
remote: Counting objects: 100% (5592/5592), done.[K
remote: Compressing objects: 100% (1908/1908), done.[K
remote: Total 7448541 (delta 3697), reused 5427 (delta 3639), pack-reused 7442949[K
Receiving objects: 100% (7448541/7448541), 22.78 GiB | 20.10 MiB/s, done.
Resolving deltas: 100% (5776890/5776890), done.
Updating files: 100% (82435/82435), done.


In [5]:
import json
import datetime
import time
import urllib 

from azure.core.exceptions import AzureError
from azure.core.credentials import AzureKeyCredential
import pymongo

from openai import AzureOpenAI
from dotenv import load_dotenv
import os

In [6]:
load_dotenv()
COSMOS_MONGO_USER = os.getenv("COSMOS_MONGO_USER")
COSMOS_MONGO_PWD = os.getenv("COSMOS_MONGO_PWD")
COSMOS_MONGO_SERVER = os.getenv("COSMOS_MONGO_SERVER")

AOAI_client = AzureOpenAI(api_key=os.getenv("AZURE_OPENAI_KEY"), azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), api_version="2024-02-01",)

In [9]:
# import os
import time
# import openai
import pymongo
from bson import ObjectId

# Initialize Azure OpenAI client
# openai.api_key = 'your_openai_api_key'

# Initialize MongoDB client
mongo_conn = "mongodb+srv://"+urllib.parse.quote(COSMOS_MONGO_USER)+":"+urllib.parse.quote(COSMOS_MONGO_PWD)+"@"+COSMOS_MONGO_SERVER
client = pymongo.MongoClient(mongo_conn)

db = client['azure']
collection = db['azure_collection']

def generate_embeddings(text):
    '''
    Generate embeddings from a string of text.
    This will be used to vectorize data and user input for interactions with Azure OpenAI.
    '''

#     response = AOAI_client.embeddings.create(
#     input = "The name of the person is surya. he has various skills in azure, openai, machine learning.",
#     model= "text-embedding-ada-002"
# )
    response = AOAI_client.embeddings.create(input=text, model="text-embedding-ada-002")
    response = response.model_dump()
    embeddings = response['data'][0]['embedding']
    time.sleep(0.5)  # To avoid hitting rate limits
    return embeddings

def process_file(file_path, doc_id):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # Optional: Split content into chunks if it's too large
    chunks = [content[i:i + 4000] for i in range(0, len(content), 4000)]
    
    for idx, chunk in enumerate(chunks):
        embedding = generate_embeddings(chunk)
        document = {
            "id": str(ObjectId()),  # Unique ID for MongoDB document
            "title": os.path.basename(file_path),
            "content": chunk,
            "c_vector": embedding,
            "doc_id": doc_id,
            "chunk_id": idx
        }
        collection.insert_one(document)
        print(f"Inserted chunk {idx} of {file_path}")

def process_directory(directory_path):
    doc_id = 0  # Unique document ID for tracking chunks of the same file
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.md'):  # Adjust the file type if needed
                file_path = os.path.join(root, file)
                process_file(file_path, doc_id)
                doc_id += 1

if __name__ == "__main__":
    articles_dir = "./azure-docs/articles"
    process_directory(articles_dir)


  client = pymongo.MongoClient(mongo_conn)


Inserted chunk 0 of ./azure-docs/articles/azure-glossary-cloud-terminology.md
Inserted chunk 1 of ./azure-docs/articles/azure-glossary-cloud-terminology.md
Inserted chunk 2 of ./azure-docs/articles/azure-glossary-cloud-terminology.md
Inserted chunk 3 of ./azure-docs/articles/azure-glossary-cloud-terminology.md
Inserted chunk 0 of ./azure-docs/articles/third-party-notices.md
Inserted chunk 0 of ./azure-docs/articles/defender-for-iot/organizations/set-up-sso.md
Inserted chunk 1 of ./azure-docs/articles/defender-for-iot/organizations/set-up-sso.md
Inserted chunk 0 of ./azure-docs/articles/defender-for-iot/organizations/eiot-defender-for-endpoint.md
Inserted chunk 1 of ./azure-docs/articles/defender-for-iot/organizations/eiot-defender-for-endpoint.md
Inserted chunk 0 of ./azure-docs/articles/defender-for-iot/organizations/release-notes-archive.md
Inserted chunk 1 of ./azure-docs/articles/defender-for-iot/organizations/release-notes-archive.md
Inserted chunk 2 of ./azure-docs/articles/defen

RateLimitError: Error code: 429 - {'error': {'code': 429, 'message': 'The event daily request rate of 5000 calls to has been exceeded. Requests are disabled until UTC midnight.'}}

In [11]:
db.command({
  'createIndexes': 'azure_collection',
  'indexes': [
    {
      'name': 'VectorSearchIndex',
      'key': {
        "c_vector": "cosmosSearch"
      },
      'cosmosSearchOptions': {
        'kind': 'vector-ivf',
        'numLists': 1,
        'similarity': 'COS',
        'dimensions': 1536
      }
    }
  ]
})

{'raw': {'defaultShard': {'numIndexesBefore': 1,
   'numIndexesAfter': 2,
   'createdCollectionAutomatically': False,
   'ok': 1}},
 'ok': 1}