In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Sample passage used as input for the text-splitting demos.
# The leading space and the trailing newline are part of the value.
text = (
    " We propose using pre-trained computer vision models from Hugging Face "
    "to identify signs of diabetic retinopathy, cataracts, or keratopathy "
    "from retinal images. "
    "The output will be sent to a RAG (Retrieval-Augmented Generation) model "
    "trained on past diagnostic reports and relevant knowledge resources. "
    "LangChain will handle the Retrieval and Large Language Models like GPT-4 "
    "or Gemini will handle the Generator part. "
    "Visual aids like Grad-CAM highlight critical image regions, while models "
    "like BLIP can assist in generating context-aware image captions, "
    "improving clinical transparency and interpretability.\n"
)

In [None]:
# Text-based splitting: recursively break the passage into chunks of up to
# 150 characters, with up to 20 characters shared between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=150)
res = splitter.split_text(text)

# Show the chunk list first, then the number of chunks on its own line.
print(res, len(res), sep="\n")

['We propose using pre-trained computer vision models from Hugging Face to identify signs of diabetic retinopathy, cataracts, or keratopathy from', 'or keratopathy from retinal images. The output will be sent to a RAG (Retrieval-Augmented Generation) model trained on past diagnostic reports and', 'reports and relevant knowledge resources. LangChain will handle the Retrieval and Large Language Models like GPT-4 or Gemini will handle the', 'will handle the Generator part. Visual aids like Grad-CAM highlight critical image regions, while models like BLIP can assist in generating', 'in generating context-aware image captions, improving clinical transparency and interpretability.']
5


In [15]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings


In [16]:
# Embedding client bound to the "models/embedding-001" model.
# NOTE(review): no API key is passed here; presumably it is picked up from the
# environment (e.g. GOOGLE_API_KEY). Confirm before running on a fresh kernel.
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
)


In [18]:
# Two short sample sentences to embed.
texts = [
    "LangChain is awesome!",
    "I love using Gemini for AI tasks.",
]

# One embedding vector is returned per input text, in the same order.
vectors = embedding_model.embed_documents(texts)

# Inspect the raw embedding of the first sentence.
print(vectors[0])


[0.03144112601876259, 0.02207336388528347, -0.06684137880802155, -0.027639253064990044, 0.04905693978071213, -0.0022192017640918493, 0.009112073108553886, -0.022648105397820473, 0.043565407395362854, 0.03172788769006729, -0.020840661600232124, 0.013432425446808338, -0.02880503609776497, -0.012558565475046635, 0.018212011083960533, -0.05374441668391228, 0.03609275072813034, 0.0031154772732406855, 0.012375977821648121, 0.03321681171655655, 0.003163316985592246, -0.001514298957772553, 0.026060765609145164, -0.0033286898396909237, 0.013443011790513992, -0.003552513662725687, 0.012667755596339703, -0.03934071585536003, -0.058355577290058136, -0.007579524535685778, -0.015159137547016144, 0.01060421485453844, -0.05965876579284668, 0.014186260290443897, 0.018498633056879044, -0.02068965882062912, -0.001821015728637576, 0.020443158224225044, -0.003624444594606757, -0.04729121923446655, 0.03617805242538452, -0.06587669253349304, -0.03201816976070404, -0.0028463415801525116, -0.02025587297976017,

In [None]:
# Length-based splitting: cut the passage into fixed-width pieces of 50
# characters. With separator="" the cut may land in the middle of a word.
from langchain.text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter(separator="", chunk_size=50, chunk_overlap=0)
res = splitter.split_text(text)
print(res)

# Note: for RAG-based models, a chunk overlap of roughly 10-20% of the chunk
# size is the usual guideline; this demo uses no overlap at all.

['We propose using pre-trained computer vision mode', 'ls from Hugging Face to identify signs of diabetic', 'retinopathy, cataracts, or keratopathy from retin', 'al images. The output will be sent to a RAG (Retri', 'eval-Augmented Generation) model trained on past d', 'iagnostic reports and relevant knowledge resources', '. LangChain will handle the Retrieval and Large La', 'nguage Models like GPT-4 or Gemini will handle the', 'Generator part. Visual aids like Grad-CAM highlig', 'ht critical image regions, while models like BLIP', 'can assist in generating context-aware image capti', 'ons, improving clinical transparency and interpret', 'ability.']
