<a href="https://colab.research.google.com/github/saincoder/xeven_AI/blob/master/RAG_langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Use the %pip magic (not !pip) so packages are installed into the environment of the running kernel.
%pip install openai langchain



***Import Libraries***

In [None]:
import os
import openai
import sys
import langchain

In [None]:
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

***Define chunk size and overlap***

In [None]:
# Shared settings for both text splitters defined below:
# `chunk_size` is passed as each splitter's chunk_size and
# `overlap` as its chunk_overlap.
chunk_size, overlap = 26, 4

***Splitters***

In [None]:
# RecursiveCharacterTextSplitter
# Configured with a list of separators (blank line, newline, space, empty
# string) plus the shared chunk size and overlap defined earlier.

recursive_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=chunk_size,
    chunk_overlap=overlap,
)

In [None]:
# CharacterTextSplitter
# Configured with a single separator (a blank line) and the same shared
# chunk size and overlap as the recursive splitter.

character_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=chunk_size,
    chunk_overlap=overlap,
)

***PDF loader and splitter***

In [None]:
# Use the %pip magic (not !pip) so packages are installed into the environment of the running kernel.
%pip install langchain_community pypdf



In [None]:
# Import the loader from langchain_community (installed in the cell above);
# the old `langchain.document_loaders` path is a deprecated alias.
from langchain_community.document_loaders import PyPDFLoader

In [None]:
# NOTE(review): absolute path; presumably the PDF was uploaded to the Colab
# session storage before running this cell — confirm.
pdf_path = "/content/National AI Policy Consultation Draft V1.pdf"

# Build the loader, then load the PDF and split it into the entries that the
# following cells count and iterate over.
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()

In [None]:
# Number of entries returned by `loader.load_and_split()`.
len(pages)

48

In [None]:
# Flatten every page into one list of text chunks.
# `split_text` returns the chunks for one page's text; `extend` appends them
# all at once instead of looping over them one by one.
chunk_list = []

for page in pages:
    chunk_list.extend(recursive_splitter.split_text(page.page_content))

# Total number of chunks across all pages.
print(len(chunk_list))

6068


In [28]:
# Inspect a single chunk (index 6) produced by the recursive splitter.
chunk_list[6]

'Ministry of Information'

***Embedding***

In [29]:
# Use the %pip magic (not !pip) so packages are installed into the environment of the running kernel.
%pip install transformers langchain_huggingface

Collecting langchain_huggingface
  Downloading langchain_huggingface-0.1.0-py3-none-any.whl.metadata (1.3 kB)
Collecting sentence-transformers>=2.6.0 (from langchain_huggingface)
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Downloading langchain_huggingface-0.1.0-py3-none-any.whl (20 kB)
Downloading sentence_transformers-3.1.0-py3-none-any.whl (249 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.1/249.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers, langchain_huggingface
Successfully installed langchain_huggingface-0.1.0 sentence-transformers-3.1.0


In [30]:
from langchain_huggingface import HuggingFaceEmbeddings

In [31]:
# Embedding model used for the rest of the notebook: for single-chunk
# embeddings and as the embedding backend of the semantic splitter.
embedding_model = HuggingFaceEmbeddings(model_name = "BAAI/bge-small-en-v1.5")

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [33]:
# Embed one chunk (the same index-6 chunk inspected earlier) to look at the resulting vector.
em = embedding_model.embed_query(chunk_list[6])

In [34]:
# Length of the embedding returned for that chunk.
len(em)

384

In [36]:
# Use the %pip magic (not !pip) so packages are installed into the environment of the running kernel.
%pip install langchain_experimental

Collecting langchain_experimental
  Downloading langchain_experimental-0.3.0-py3-none-any.whl.metadata (1.7 kB)
Downloading langchain_experimental-0.3.0-py3-none-any.whl (206 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m206.9/206.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain_experimental
Successfully installed langchain_experimental-0.3.0


***Semantic Splitter***

In [38]:
from langchain_experimental.text_splitter import SemanticChunker

In [39]:
# Semantic splitter backed by the embedding model created earlier, with the
# breakpoint threshold type set to "percentile".
text_splitter = SemanticChunker(
    embedding_model,
    breakpoint_threshold_type="percentile",
)

In [58]:
# Sample paragraph fed to the semantic splitter in the next cell.
text = """
LangChain was launched in October 2022 as an open source project by Harrison Chase, while working at machine learning startup Robust Intelligence. The project quickly garnered popularity,with improvements from hundreds of contributors on GitHub, trending discussions on Twitter, lively activity on the project's Discord server, many YouTube tutorials, and meetups in San Francisco and London. In April 2023, LangChain had incorporated and the new startup raised over $20 million in funding at a valuation of at least $200 million from venture firm Sequoia Capital, a week after announcing a $10 million seed investment from Benchmark.
"""

In [59]:
# Split the sample paragraph with the SemanticChunker; the result is indexed and counted in the cells below.
sementic_chunks = text_splitter.split_text(text)

In [60]:
# How many chunks the semantic splitter produced for `text`.
len(sementic_chunks)

2

In [61]:
# Show the second chunk (index 1).
sementic_chunks[1]

'In April 2023, LangChain had incorporated and the new startup raised over $20 million in funding at a valuation of at least $200 million from venture firm Sequoia Capital, a week after announcing a $10 million seed investment from Benchmark. '

In [62]:
# Embed the second semantic chunk with the same embedding model that backs the splitter.
sementic_embedded = embedding_model.embed_query(sementic_chunks[1])

In [63]:
# NOTE(review): this displays the entire embedding vector; a slice such as
# `sementic_embedded[:5]` would keep the saved output short.
sementic_embedded

[-0.005689424928277731,
 -0.05107267573475838,
 -0.03656911104917526,
 -0.034872956573963165,
 0.013701609335839748,
 0.03620830923318863,
 -0.021605556830763817,
 -0.010672099888324738,
 -0.024081865325570107,
 -0.014485052786767483,
 0.04184197261929512,
 -0.008313529193401337,
 0.02375268004834652,
 0.005160125903785229,
 0.009059514850378036,
 0.0004216594388708472,
 -0.01481869351118803,
 -0.11312606930732727,
 0.009464450180530548,
 0.08164601773023605,
 0.005042689386755228,
 -0.03859242424368858,
 0.06224462762475014,
 -0.039803724735975266,
 0.0573345385491848,
 0.03177516162395477,
 0.007829270325601101,
 0.027559194713830948,
 0.003131481586024165,
 -0.11708537489175797,
 0.04535522684454918,
 -0.04184343293309212,
 0.07148673385381699,
 0.039082784205675125,
 -0.04085511341691017,
 0.0488431453704834,
 -0.049167387187480927,
 0.00043051940156146884,
 0.01583886705338955,
 0.06017820164561272,
 -0.01572207361459732,
 0.036649081856012344,
 -0.04416380077600479,
 0.0033368375