In [1]:
!pip install llama-index-embeddings-huggingface
!pip install chromadb
!pip install llama-index-vector-stores-chroma

Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.5.3-py3-none-any.whl.metadata (767 bytes)
Collecting llama-index-core<0.13.0,>=0.12.0 (from llama-index-embeddings-huggingface)
  Downloading llama_index_core-0.12.34.post1-py3-none-any.whl.metadata (2.4 kB)
Collecting banks<3,>=2.0.0 (from llama-index-core<0.13.0,>=0.12.0->llama-index-embeddings-huggingface)
  Downloading banks-2.1.2-py3-none-any.whl.metadata (12 kB)
Collecting dirtyjson<2,>=1.0.8 (from llama-index-core<0.13.0,>=0.12.0->llama-index-embeddings-huggingface)
  Downloading dirtyjson-1.0.8-py3-none-any.whl.metadata (11 kB)
Collecting filetype<2,>=1.2.0 (from llama-index-core<0.13.0,>=0.12.0->llama-index-embeddings-huggingface)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting griffe (from banks<3,>=2.0.0->llama-index-core<0.13.0,>=0.12.0->llama-index-embeddings-huggingface)
  Downloading griffe-1.7.3-py3-none-any.whl.metadata (5.0 kB)
Collecting nvi

In [2]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from typing import Dict, List
import pandas as pd
from xml.etree import ElementTree as ET
import chromadb
from chromadb.utils import embedding_functions
import numpy as np
import os

class NomicEmbeddingFunction(embedding_functions.EmbeddingFunction):
    def __init__(self):
        self.model = HuggingFaceEmbedding(
            model_name="nomic-ai/nomic-embed-text-v2-moe",
            trust_remote_code=True
        )
    
    def __call__(self, texts: List[str]) -> List[List[float]]:
        prefixed_texts = [f"search_document: {text}" for text in texts]
        embeddings = self.model.get_text_embedding_batch(prefixed_texts)
        return embeddings

class StackExchangeVector:
    def __init__(self, collection_name: str = "bitcoin_stack_exchange"):
        self.collection_name = collection_name
        self.embedding_function = NomicEmbeddingFunction()
        self.client = chromadb.Client()
        
        # Try to get existing collection or create new one
        try:
            self.collection = self.client.get_collection(
                name=collection_name,
                embedding_function=self.embedding_function
            )
            print(f"Using existing collection '{collection_name}'")
        except:
            print(f"Creating new collection '{collection_name}'")
            self.collection = self.client.create_collection(
                name=collection_name,
                embedding_function=self.embedding_function
            )
    
    def add_documents(self, documents: List[Dict]):
        """Add documents to the vector store"""
        texts = []
        metadatas = []
        ids = []
        
        for i, doc in enumerate(documents):
            text = f"Title: {doc['title']}\n\nBody: {doc['body']}"
            
            metadata = {
                'post_id': str(doc['id']),
                'creation_date': str(doc['creation_date']),
                'tags': ','.join(doc['tags']),
                'score': str(doc['score']),
                'user_id': str(doc['user_id']),
                'post_type_id': str(doc['post_type_id'])
            }
            
            texts.append(text)
            metadatas.append(metadata)
            ids.append(f"doc_{doc['id']}")
            
            # Process in batches of 1000
            if len(texts) >= 1000:
                self.collection.add(
                    documents=texts,
                    metadatas=metadatas,
                    ids=ids
                )
                texts, metadatas, ids = [], [], []
        
        # Add remaining documents
        if texts:
            self.collection.add(
                documents=texts,
                metadatas=metadatas,
                ids=ids
            )

    def query(self, query_text: str, n_results: int = 5):
        """Query the vector store"""
        query_text = f"search_document: {query_text}"
        return self.collection.query(
            query_texts=[query_text],
            n_results=n_results
        )
    
    def get_collection_count(self):
        """Get number of documents in collection"""
        return self.collection.count()
    
    def delete_collection(self):
        try:
            count = self.collection.count()
        
            if count == 0:
                self.client.delete_collection(self.collection_name)
                print(f"Empty collection '{self.collection_name}' deleted")
            else:
                print(f"Collection '{self.collection_name}' contains {count} documents. Deletion skipped.")
            
        except Exception as e:
            print(f"Could not delete collection: {str(e)}")

def process_posts_xml(xml_path: str, vector_db: StackExchangeVector):
    """Process Posts.xml file and add to vector database"""
    print(f"Processing {xml_path}...")
    
    context = ET.iterparse(xml_path, events=('end',))
    batch = []
    
    for _, elem in context:
        if elem.tag == 'row':
            attrs = elem.attrib
            
            if 'Id' not in attrs or 'Body' not in attrs:
                continue
                
            doc = {
                'id': attrs['Id'],
                'title': attrs.get('Title', ''),
                'body': attrs['Body'],
                'creation_date': attrs.get('CreationDate', ''),
                'score': int(attrs.get('Score', 0)),
                'user_id': attrs.get('OwnerUserId', ''),
                'post_type_id': attrs.get('PostTypeId', ''),
                'tags': attrs.get('Tags', '').replace('><', ' ').strip('<>').split(),
                'answer_count': int(attrs.get('AnswerCount', 0)),
                'comment_count': int(attrs.get('CommentCount', 0))
            }
            
            batch.append(doc)
            
            if len(batch) >= 1000:
                vector_db.add_documents(batch)
                batch = []
                print(f"Processed {vector_db.get_collection_count()} documents...")
            
            elem.clear()
    
    if batch:
        vector_db.add_documents(batch)
    
    print(f"Finished processing. Total documents: {vector_db.get_collection_count()}")


def extract_links(text: str) -> List[str]:
    """Extract links from text using regex"""
    import re
    url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
    return re.findall(url_pattern, text)

def format_post_content(text: str) -> tuple:
    """Format post content and extract links"""
    links = extract_links(text)
    
    cleaned_text = text.replace('\n', ' ').strip()
    
    return cleaned_text, links

2025-05-02 13:26:42.358255: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746192402.584882      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746192402.650275      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
def main():
    DATA_DIR = "/kaggle/input/bitcoin-stack" 
    
    vector_db = StackExchangeVector()
    
    if vector_db.get_collection_count() == 0:
        xml_files = {
            'posts': 'Posts.xml',
            'comments': 'Comments.xml',
            'users': 'Users.xml',
            'tags': 'Tags.xml',
            'badges': 'Badges.xml',
            'post_history': 'PostHistory.xml',
            'post_links': 'PostLinks.xml',
            'votes': 'Votes.xml'
        }
        
        for file_type, filename in xml_files.items():
            file_path = os.path.join(DATA_DIR, filename)
            if os.path.exists(file_path):
                print(f"Processing {filename}...")
                process_posts_xml(file_path, vector_db)
            else:
                print(f"Warning: {filename} not found in {DATA_DIR}")
    else:
        print("Using existing database, skipping processing...")
    
    print("\nBitcoin Stack Exchange Chatbot")
    print("================================")
    print("Ask any question about Bitcoin. Type 'quit' or 'exit' to end the conversation.")
    print("Type 'clear' to clear the screen.")
    
    while True:
        try:
            query = input("\nYou: ").strip()
            
            if query.lower() in ['quit', 'exit']:
                print("Goodbye!")
                break
            
            if query.lower() == 'clear':
                os.system('cls' if os.name == 'nt' else 'clear')
                continue
            
            if not query:
                continue
            
            print("\nSearching for relevant information...")
            results = vector_db.query(query)
            
            print("\nHere's what I found:")
            print("=====================")
            
            for i, doc in enumerate(results['documents'][0]):
                print(f"\nResult {i+1}:")
                print("-" * 80)

                content, links = format_post_content(doc)

                text = doc[:500] + ("..." if len(doc) > 500 else "")
                wrapped_text = '\n'.join([text[i:i+80] for i in range(0, len(text), 80)])
                print(wrapped_text)
                
                metadata = results['metadatas'][0][i]
                print("\nMetadata:")
                print(f"Post ID: {metadata['post_id']}")
                print(f"Score: {metadata['score']}")
                print(f"Tags: {metadata['tags']}")
                print(f"Created: {metadata['creation_date']}")
                if links:
                    print("\n References:")
                    for idx,link in enumerate(links,1):
                        print(f"{idx}.{link}")

                print("-"*80)
                

        
        except KeyboardInterrupt:
            print("\nGoodbye!")
            break
        except Exception as e:
            print(f"\nError: {str(e)}")
            print("Please try again.")

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f"Fatal error: {str(e)}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.10k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/104k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.90G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Creating new collection 'bitcoin_stack_exchange'
Processing Posts.xml...
Processing /kaggle/input/bitcoin-stack/Posts.xml...
Processed 1000 documents...
Processed 2000 documents...
Processed 3000 documents...
Processed 4000 documents...
Processed 5000 documents...
Processed 6000 documents...
Processed 7000 documents...
Processed 8000 documents...
Processed 9000 documents...
Processed 10000 documents...
Processed 11000 documents...
Processed 12000 documents...
Processed 13000 documents...
Processed 14000 documents...
Processed 15000 documents...
Processed 16000 documents...
Processed 17000 documents...
Processed 18000 documents...
Processed 19000 documents...
Processed 20000 documents...
Processed 21000 documents...
Processed 22000 documents...
Processed 23000 documents...
Processed 24000 documents...
Processed 25000 documents...
Processed 26000 documents...
Processed 27000 documents...
Processed 28000 documents...
Processed 29000 documents...
Processed 30000 documents...
Processed 3100


You:  Can you tell me about bitcoin mining and open source mining softwares?



Searching for relevant information...

Here's what I found:

Result 1:
--------------------------------------------------------------------------------
Title: How to create your own bitcoin mining software?

Body: <p>Can we create m
ining software, if yes, how? </p>


Metadata:
Post ID: 61443
Score: 0
Tags: |mining-software|
Created: 2017-10-27T13:01:14.397
--------------------------------------------------------------------------------

Result 2:
--------------------------------------------------------------------------------
Title: Good Bitcoin Mining Software?

Body: <p>Does anyone know of any of the be
st Bitcoin Mining Software? I have been looking for mining software for a while,
 but I can't seem to find any. I'm using Windows 8 if that helps. </p>


Metadata:
Post ID: 41253
Score: 4
Tags: |mining-software|
Created: 2015-11-06T17:24:28.803
--------------------------------------------------------------------------------

Result 3:
------------------------------------------------


You:  Can you tell me about Segwit?



Searching for relevant information...

Here's what I found:

Result 1:
--------------------------------------------------------------------------------
Title: What is the difference between Segwit and non-segwit?

Body: <p>What's th
e difference? What is the definition of these two terms?</p>


Metadata:
Post ID: 66571
Score: 1
Tags: |segregated-witness|terminology|
Created: 2017-12-26T10:33:45.033
--------------------------------------------------------------------------------

Result 2:
--------------------------------------------------------------------------------
Title: Has SegWit been activated yet?

Body: <p>Is SegWit activated and when exa
ctly did that happen? I keep getting conflicting information, any help would be 
much appreciated!</p>


Metadata:
Post ID: 58144
Score: 1
Tags: |segregated-witness|
Created: 2017-08-16T07:49:31.740
--------------------------------------------------------------------------------

Result 3:
----------------------------------------------------


You:  Is there any discussion related to Bitcoin Improvement Proposals?



Searching for relevant information...

Here's what I found:

Result 1:
--------------------------------------------------------------------------------
Title: Improvements that could be made to how bitcoin propagates its transaction
s and blocks?

Body: <p>As of 2018 what are the known improvements that could be
 made to how bitcoin propagates its transactions and blocks?</p>


Metadata:
Post ID: 80927
Score: 10
Tags: |network|relay|transaction-propagation|compact-blocks|fibre|
Created: 2018-11-11T22:00:18.920
--------------------------------------------------------------------------------

Result 2:
--------------------------------------------------------------------------------
Title: Is there any consensus over specific improvements Bitcoin needs?

Body: <
p>There has been some criticism from cryptography experts about the current Bitc
oin algorithm(s). Now that Bitcoin has been released to the wild and everyone ha
s had time to study it, is there any consensus for improvements tha


You:  exit


Goodbye!
