## Setup and Import Libraries

In [2]:
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import TextLoader
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Sentence-embedding model used to vectorise each sentence later in the notebook.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

## Loading Document

In [4]:
# Load the source document. The encoding is pinned so that decoding does not
# depend on the platform's default locale encoding.
# NOTE(review): assumes data.txt is UTF-8 (the content shown below is plain ASCII).
loader = TextLoader(file_path="data.txt", encoding="utf-8")
text = loader.load()
text

[Document(metadata={'source': 'data.txt'}, page_content='LangChain is a framework for building applications with LLMs.\nLangChain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.\nYou can create chains, agents, memory and retrievers.\nThe Eiffel Tower is located in Paris.\nFrance is a popular tourist destination')]

## 1. Split into Sentences

In [5]:
# Extract the raw string from the first loaded Document under a *new* name.
# Rebinding `text` itself (list of Documents -> str) would make this cell raise
# AttributeError when run a second time, because a str has no `.page_content`.
page_text = text[0].page_content

# One sentence per non-blank line: strip each line once, then drop the empties.
stripped_lines = (line.strip() for line in page_text.split("\n"))
sentences = [line for line in stripped_lines if line]
sentences

['LangChain is a framework for building applications with LLMs.',
 'LangChain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.',
 'You can create chains, agents, memory and retrievers.',
 'The Eiffel Tower is located in Paris.',
 'France is a popular tourist destination']

## 2. Embed Each Sentence

In [6]:
# Encode every sentence with the model loaded above. The grouping step indexes
# `embeddings[i]` alongside `sentences[i]`, so the two must stay aligned.
embeddings = model.encode(sentences)

## 3. Initialize Parameters

In [7]:
# Finished chunks are collected here, each as a single joined string.
chunks = list()

# The running chunk starts with the first sentence; later sentences are either
# appended to it or start a new chunk.
current_chunk = [sentences[0]]

# Minimum cosine similarity between neighbouring sentences for them to share a chunk.
threshold = 0.7

## 4. Semantic Grouping Based on Threshold

In [8]:
# Re-initialise the accumulators in this cell so it is idempotent: re-running it
# must not append duplicate chunks to a list left over from an earlier run, nor
# continue from a stale `current_chunk`.
chunks = []
current_chunk = sentences[:1]  # a fresh list; empty when there are no sentences

# Compare each sentence with its predecessor. A similarity below `threshold`
# closes the running chunk and starts a new one with the current sentence.
for position in range(1, len(sentences)):
    similarity = cosine_similarity(
        X=[embeddings[position - 1]],
        Y=[embeddings[position]]
    )[0][0]

    if similarity >= threshold:
        current_chunk.append(sentences[position])
    else:
        chunks.append(" ".join(current_chunk))
        current_chunk = [sentences[position]]

# Flush the trailing chunk; skipped when `sentences` is empty so that no
# empty-string chunk is emitted.
if current_chunk:
    chunks.append(" ".join(current_chunk))

print("\n Semantic Chunks:")
for idx, chunk in enumerate(chunks):
    print(f"Chunk {idx+1}: \n{chunk}")


 Semantic Chunks:
Chunk 1: 
LangChain is a framework for building applications with LLMs. LangChain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.
Chunk 2: 
You can create chains, agents, memory and retrievers.
Chunk 3: 
The Eiffel Tower is located in Paris.
Chunk 4: 
France is a popular tourist destination
