In [8]:
# Use the %pip magic (not !pip) so packages are installed into the environment of the running kernel.
%pip install transformers sentencepiece langchain_community

Defaulting to user installation because normal site-packages is not writeable
Collecting langchain_community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.41 (from langchain_community)
  Downloading langchain_core-0.3.43-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.20 (from langchain_community)
  Downloading langchain-0.3.20-py3-none-any.whl.metadata (7.7 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain_community)
  Downloading SQLAlchemy-2.0.38-cp39-cp39-macosx_11_0_arm64.whl.metadata (9.6 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain_community)
  Downloading aiohttp-3.11.13-cp39-cp39-macosx_11_0_arm64.whl.metadata (7.7 kB)
Collecting tenacity!=8.4.0,<10,>=8.1.0 (from langchain_community)
  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)


In [12]:
#!/usr/bin/env python3
import os
import sys

# For DeepSeek, we use Hugging Face's AutoTokenizer.
from transformers import AutoTokenizer

# Tokenizers already loaded, keyed by model name, so each one is loaded only once.
_DEEPSEEK_TOKENIZERS = {}


def count_deepseek_tokens(text, model_name="deepseek-ai/DeepSeek-R1-Distill-Llama-70B"):
    """Return the number of tokens ``text`` encodes to with a Hugging Face tokenizer.

    Args:
        text: The string to tokenize.
        model_name: Hugging Face model id whose tokenizer is used. Defaults to
            the DeepSeek-R1 distill model this notebook was written for.

    Returns:
        The length of the token id list produced by ``tokenizer.encode(text)``.
    """
    tokenizer = _DEEPSEEK_TOKENIZERS.get(model_name)
    if tokenizer is None:
        # from_pretrained is the costly step; this function is called once per
        # file, so load each tokenizer a single time and reuse it afterwards.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _DEEPSEEK_TOKENIZERS[model_name] = tokenizer

    # NOTE(review): encode() is called with its defaults, which may include
    # special tokens (e.g. a BOS token) in the count -- confirm this is intended.
    return len(tokenizer.encode(text))

# For Ollama, we use the LangChain community wrapper.
try:
    from langchain_community.llms.ollama import Ollama
except ImportError:
    print("Please install langchain_community (e.g. pip install langchain_community) to use Ollama functions.")
    sys.exit(1)

# Ollama wrapper instances already created, keyed by (model, base_url).
_OLLAMA_CLIENTS = {}


def count_ollama_tokens(text, model="llama3.2", base_url="http://localhost:11434"):
    """Return the token count that the LangChain Ollama wrapper reports for ``text``.

    Args:
        text: The string to measure.
        model: Ollama model name passed to the wrapper.
        base_url: Address of the Ollama server. Make sure Ollama is running
            locally (default port: 11434).

    Returns:
        Whatever ``Ollama.get_num_tokens(text)`` returns.
    """
    key = (model, base_url)
    client = _OLLAMA_CLIENTS.get(key)
    if client is None:
        # Build the wrapper once per configuration instead of once per file.
        client = Ollama(model=model, base_url=base_url)
        _OLLAMA_CLIENTS[key] = client

    # NOTE(review): get_num_tokens appears to be inherited from LangChain's base
    # language-model class, whose default counting uses a generic fallback
    # tokenizer rather than the tokenizer of the Ollama model -- confirm before
    # comparing these numbers against the DeepSeek counts.
    return client.get_num_tokens(text)

def process_file(file_path):
    """Read one file as UTF-8 text and measure it.

    Args:
        file_path: Path of the file to read.

    Returns:
        ``(file_path, character_count, deepseek_token_count, ollama_token_count)``,
        or ``None`` when the file could not be opened or decoded (a
        "Skipping ..." line is printed in that case).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
    except Exception as err:
        # Best effort: unreadable or non-UTF-8 files are reported and skipped.
        print(f"Skipping {file_path}: {err}")
        return None

    return (
        file_path,
        len(contents),
        count_deepseek_tokens(contents),
        count_ollama_tokens(contents),
    )

def main(root_dir="/Users/reva/Documents/geek_projects/dungbeetle"):
    """Walk ``root_dir`` recursively and print size and token counts per file.

    Args:
        root_dir: Directory to scan. The default is the path this notebook was
            originally run against, so calling ``main()`` behaves as before.

    Files for which ``process_file`` returns ``None`` are left out of the report.
    """
    # os.walk yields nothing for a missing path, which would otherwise look
    # like a successful run over an empty tree.
    if not os.path.isdir(root_dir):
        print(f"Directory not found: {root_dir}")
        return

    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            result = process_file(os.path.join(dirpath, filename))
            if result is None:
                continue
            file_path, text_length, deepseek_count, ollama_count = result
            print(f"File: {file_path}")
            print(f"Text length: {text_length} characters")
            print(f"DeepSeek token count: {deepseek_count}")
            print(f"Ollama token count: {ollama_count}")
            print("-" * 40)

# Entry point: run the directory scan when this code is executed directly
# (the captured output below this cell shows it ran when the cell was executed).
if __name__ == "__main__":
    main()

File: /Users/reva/Documents/geek_projects/dungbeetle/go.mod
Text length: 2176 characters
DeepSeek token count: 868
Ollama token count: 1046
----------------------------------------
File: /Users/reva/Documents/geek_projects/dungbeetle/LICENSE
Text length: 1110 characters
DeepSeek token count: 234
Ollama token count: 286
----------------------------------------
File: /Users/reva/Documents/geek_projects/dungbeetle/Dockerfile
Text length: 494 characters
DeepSeek token count: 165
Ollama token count: 212
----------------------------------------
File: /Users/reva/Documents/geek_projects/dungbeetle/Makefile
Text length: 968 characters
DeepSeek token count: 306
Ollama token count: 388
----------------------------------------
File: /Users/reva/Documents/geek_projects/dungbeetle/config.toml.sample
Text length: 1126 characters
DeepSeek token count: 388
Ollama token count: 475
----------------------------------------
File: /Users/reva/Documents/geek_projects/dungbeetle/go.sum
Text length: 12062 cha