In [1]:
# ZenML
from zenml import pipeline, step

# General
import os
import requests
from dotenv import load_dotenv

# MongoDB
from pymongo import MongoClient

# For data processing
import re

# For logging
from zenml.logger import get_logger

logger = get_logger(__name__)

# Load environment variables
env_path = os.path.abspath(os.path.join(os.getcwd(), '../dotenv.env'))
print(f"Looking for .env at: {env_path}")
loaded = load_dotenv(env_path)
print(f"Load successful: {loaded}")


Looking for .env at: /app/dotenv.env
Load successful: True


In [2]:
@step
def extract_github_data() -> list:
    import time
    logger.info("Starting data extraction from GitHub...")

    GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
    headers = {
        'Authorization': f'token {GITHUB_TOKEN}',
        'Accept': 'application/vnd.github.v3+json'  # Explicitly specify API version
    }

    # List of repositories to scrape
    repositories = [
        {'owner': 'ros2', 'repo': 'ros2_documentation', 'branch': 'rolling'},
        {'owner': 'ros2', 'repo': 'examples', 'branch': 'rolling'},
        {'owner': 'ros2', 'repo': 'demos', 'branch': 'rolling'},
        {'owner': 'ros2', 'repo': 'rclpy', 'branch': 'rolling'},
        {'owner': 'ros2', 'repo': 'rclcpp', 'branch': 'rolling'},
        {'owner': 'ros-navigation', 'repo': 'docs.nav2.org', 'branch': 'master'},
        {'owner': 'ros-navigation', 'repo': 'navigation2', 'branch': 'main'},
        {'owner': 'moveit', 'repo': 'moveit2', 'branch': 'main'},
        {'owner': 'gazebosim', 'repo': 'gz-doc', 'branch': 'master'}
    ]

    data = []

    for repo in repositories:
        owner = repo['owner']
        repo_name = repo['repo']
        branch = repo['branch']
        logger.info(f"Fetching files from repository: {owner}/{repo_name}")

        # Get the list of files in the repository
        tree_url = f'https://api.github.com/repos/{owner}/{repo_name}/git/trees/{branch}?recursive=1'
        response = requests.get(tree_url, headers=headers)
        if response.status_code != 200:
            logger.error(f"Failed to fetch tree for {owner}/{repo_name}: {response.text}")
            continue

        tree = response.json().get('tree', [])
        file_urls = []
        for item in tree:
            if item['type'] == 'blob' and item['path'].endswith(('.md', '.rst', '.py')):
                raw_url = f'https://raw.githubusercontent.com/{owner}/{repo_name}/{branch}/{item["path"]}'
                file_urls.append({'url': raw_url, 'path': item['path']})

        logger.info(f"Found {len(file_urls)} files to download in {owner}/{repo_name}")

        # Fetch and store file contents
        for file_info in file_urls:
            file_url = file_info['url']
            file_path = file_info['path']
            try:
                file_response = requests.get(file_url, headers=headers)
                if file_response.status_code == 200:
                    content = file_response.text
                    data.append({
                        'url': file_url,
                        'path': file_path,
                        'content': content,
                        'source': 'github',
                        'repository': f'{owner}/{repo_name}',
                        'branch': branch
                    })
                    logger.debug(f"Fetched {file_url}")
                else:
                    logger.warning(f"Failed to fetch {file_url}: {file_response.status_code}")
            except Exception as e:
                logger.error(f"Error fetching {file_url}: {e}")

    logger.info(f"Extracted {len(data)} files from GitHub.")
    return data


In [3]:
@step
def transform_data(data: list) -> list:
    logger.info("Starting data transformation...")

    transformed_data = []

    def clean_text(text):
        if text.endswith('.py'):
            # For Python files, keep the content as is
            return text.strip()
        text = re.sub(r'!\[.*?\]\(.*?\)', '', text)  # Remove images
        text = re.sub(r'\[.*?\]\(.*?\)', '', text)    # Remove links
        text = re.sub(r'#.*', '', text)  # Remove headings
        # Remove excessive whitespace and newlines
        text = re.sub(r'\n{3,}', '\n\n', text)
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()
        return text

    for item in data:
        content = clean_text(item['content'])
        if content:
            transformed_item = {
                'url': item['url'],
                'path': item['path'],
                'repository': item['repository'],
                'branch': item['branch'],
                'content': content,
                'source': item['source']
            }
            transformed_data.append(transformed_item)
            logger.debug(f"Transformed data from {item['url']}")
        else:
            logger.warning(f"No content after cleaning for {item['url']}")

    logger.info(f"Transformed {len(transformed_data)} items.")
    return transformed_data


In [4]:
@step
def extract_youtube_data() -> list:
    logger.info("Placeholder for YouTube data extraction.")
    # To be implemented later
    return []

@step
def extract_web_data() -> list:
    logger.info("Placeholder for web data extraction.")
    # To be implemented later
    return []


In [5]:
@step
def load_data(transformed_data: list):
    logger.info("Starting data loading into MongoDB...")

    client = MongoClient('mongodb://rag_mongodb:27017/')
    db = client['rag_db']
    collection = db['raw_data']

    # Insert data into MongoDB
    if transformed_data:
        collection.insert_many(transformed_data)
        logger.info(f"Inserted {len(transformed_data)} documents into MongoDB")
    else:
        logger.warning("No data to insert into MongoDB")


In [6]:
@pipeline
def data_collection_pipeline():
    # GitHub data
    github_data = extract_github_data()
    transformed_github_data = transform_data(github_data)
    load_data(transformed_github_data)

    # YouTube data (placeholder)
    youtube_data = extract_youtube_data()
    # Optionally process YouTube data

    # Web data (placeholder)
    web_data = extract_web_data()
    # Optionally process web data


In [7]:
 pipeline_instance = data_collection_pipeline()

[1;35mInitiating a new run for the pipeline: [0m[1;36mdata_collection_pipeline[1;35m.[0m
[1;35mInitializing the ZenML global configuration version to 0.71.0[0m
[1;35mCreating database tables[0m
[1;35mCreating default workspace 'default' ...[0m
[1;35mCreating default stack in workspace default...[0m
[1;35mSetting the global active workspace to 'default'.[0m
[33mSetting the global active stack to default.[0m
[33mThe current repo active workspace is no longer available. Resetting the active workspace to 'default'.[0m
[33mThe current repo active stack is no longer available. Resetting the active stack to default.[0m
[1;35mRegistered new pipeline: [0m[1;36mdata_collection_pipeline[1;35m.[0m
[1;35mUsing user: [0m[1;36mdefault[1;35m[0m
[1;35mUsing stack: [0m[1;36mdefault[1;35m[0m
[1;35m  artifact_store: [0m[1;36mdefault[1;35m[0m
[1;35m  orchestrator: [0m[1;36mdefault[1;35m[0m
[1;35mYou can visualize your pipeline runs in the [0m[1;36mZenML Dash

In [8]:
from pymongo import MongoClient

client = MongoClient('mongodb://rag_mongodb:27017/')
db = client['rag_db']
collection = db['raw_data']

# Count the number of documents
doc_count = collection.count_documents({})
print(f"Total documents in raw_data collection: {doc_count}")

# Retrieve and print the URLs
documents = collection.find({})
print("Documents in MongoDB:")
for doc in documents:
    print(f"URL: {doc['url']}")
    print(f"Content snippet: {doc['content'][:100]}...")  # Print first 100 characters
    print("-" * 40)

Total documents in raw_data collection: 1206
Documents in MongoDB:
URL: https://raw.githubusercontent.com/ros2/ros2_documentation/rolling/README.md
Content snippet: This repository contains the sources for the ROS 2 documentation that is hosted at . The sources fro...
----------------------------------------
URL: https://raw.githubusercontent.com/ros2/ros2_documentation/rolling/conf.py
Content snippet: import itertools import os import sys import time from docutils.parsers.rst import Directive sys.pat...
----------------------------------------
URL: https://raw.githubusercontent.com/ros2/ros2_documentation/rolling/make_sitemapindex.py
Content snippet: from xml.etree.ElementTree import Element, SubElement, ElementTree from conf import distro_full_name...
----------------------------------------
URL: https://raw.githubusercontent.com/ros2/ros2_documentation/rolling/plugins/sphinx_sitemap_ros.py
Content snippet: ''' The implementation of this sphinx extensions is largely borrowed from the

In [9]:
urls = collection.distinct('url')
print("URLs ingested:")
for url in urls:
    print(url)

URLs ingested:
https://raw.githubusercontent.com/moveit/moveit2/main/.docker/README.md
https://raw.githubusercontent.com/moveit/moveit2/main/.github/ISSUE_TEMPLATE/first_timers_only.md
https://raw.githubusercontent.com/moveit/moveit2/main/.github/PULL_REQUEST_TEMPLATE.md
https://raw.githubusercontent.com/moveit/moveit2/main/CODE_OF_CONDUCT.md
https://raw.githubusercontent.com/moveit/moveit2/main/CONTRIBUTING.md
https://raw.githubusercontent.com/moveit/moveit2/main/MIGRATION.md
https://raw.githubusercontent.com/moveit/moveit2/main/README.md
https://raw.githubusercontent.com/moveit/moveit2/main/doc/MIGRATION_GUIDE.md
https://raw.githubusercontent.com/moveit/moveit2/main/moveit/CHANGELOG.rst
https://raw.githubusercontent.com/moveit/moveit2/main/moveit/scripts/README.md
https://raw.githubusercontent.com/moveit/moveit2/main/moveit/scripts/create_deprecated_headers.py
https://raw.githubusercontent.com/moveit/moveit2/main/moveit/scripts/create_maintainer_table.py
https://raw.githubusercontent