In [5]:
# Sample passage used as splitter input: three paragraphs separated by blank lines.
text = (
    "Cargo is Rust’s build system and package manager. Most Rustaceans use this tool to manage their Rust projects because Cargo handles a lot of tasks for you, such as building your code, downloading the libraries your code depends on, and building those libraries. (We call the libraries that your code needs dependencies.)"
    "\n\n"
    "The simplest Rust programs, like the one we’ve written so far, don’t have any dependencies. If we had built the “Hello, world!” project with Cargo, it would only use the part of Cargo that handles building your code. As you write more complex Rust programs, you’ll add dependencies, and if you start a project using Cargo, adding dependencies will be much easier to do."
    "\n\n"
    "Because the vast majority of Rust projects use Cargo, the rest of this book assumes that you’re using Cargo too. Cargo comes installed with Rust if you used the official installers discussed in the “Installation” section. If you installed Rust through some other means, check whether Cargo is installed by entering the following in your terminal:"
)

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

import time

from pathlib import Path
from typing import List

In [18]:
# Load the Hugging Face tokenizer for the "sentence-transformers/all-mpnet-base-v2" checkpoint.
hf_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")

In [22]:
from langchain_text_splitters import SentenceTransformersTokenTextSplitter

# Build the splitter directly from the model name. This class loads the
# sentence-transformers model named by `model_name` and splits on that model's
# tokens, so constructing it via `from_huggingface_tokenizer` with a separately
# loaded tokenizer only added a dependency on another cell without changing how
# text is split.
splitter = SentenceTransformersTokenTextSplitter(
    model_name="sentence-transformers/all-mpnet-base-v2",
    chunk_overlap=50,
    tokens_per_chunk=200,
)

In [24]:
# Sanity check: token count the splitter reports for a short string.
tokens = splitter.count_tokens(text="whats up ?")
print(tokens)

6


In [14]:
# Split the sample passage and report how many chunks were produced.
chunks = splitter.split_text(text)
print(len(chunks))


2


In [None]:
def txt_chunker(txt: str, hf_tokenizer, chunk_size: int, chunk_overlap: int) -> List[str]:
    """Split ``txt`` into chunks using a Hugging Face tokenizer to measure length.

    Args:
        txt: Text to split.
        hf_tokenizer: Either a model name / local path (``str`` or ``Path``),
            which is loaded with ``AutoTokenizer.from_pretrained``, or an
            already-loaded tokenizer object, which is used as-is.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The chunks returned by ``RecursiveCharacterTextSplitter.split_text``.
    """
    # The parameter name suggests a tokenizer object, but the original body only
    # worked with a model identifier. Accept both so a tokenizer that is already
    # loaded is neither reloaded nor rejected.
    if isinstance(hf_tokenizer, (str, Path)):
        tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer)
    else:
        tokenizer = hf_tokenizer

    recursive_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer=tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    return recursive_splitter.split_text(txt)

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Module-level embedding model; embed_query reads this global.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

def embed_query(text: str):
    """Embed ``text`` with the module-level ``embedding_model``.

    Returns:
        A ``(vector, size)`` tuple: the value returned by
        ``embedding_model.embed_query(text)`` and its ``len``.
    """
    vector = embedding_model.embed_query(text)
    return vector, len(vector)

In [30]:
# Embed a sample query, keeping both the vector and its length.
embedded_query, dimension = embed_query("what is it really like ?")

In [31]:
# Display the length of the embedding vector returned above.
dimension

768