# LangChain

#### Document Loaders

In [10]:
from langchain_community.document_loaders import TextLoader

# Create TextLoader Object
loader = TextLoader("Data/speech.txt")

# Get file Contents
speech_doc = loader.load()

speech_doc[0]

Document(metadata={'source': 'Data/speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.\n\nJust because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\n…\n\nIt will be all the easier for us to conduct ourselves as belligerents in a high spirit of right and fairnes

In [11]:
from langchain_community.document_loaders import PyPDFLoader

pdf_loader = PyPDFLoader("Data/attention.pdf")
pdf_loader = pdf_loader.load()
pdf_loader

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-03T00:07:29+00:00', 'author': '', 'keywords': '', 'moddate': '2023-08-03T00:07:29+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'Data/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlu

In [12]:
from langchain_community.document_loaders import WebBaseLoader
import bs4
wb_loader = WebBaseLoader(web_path="https://lilianweng.github.io/posts/2023-06-23-agent/",
                          bs_kwargs=dict(parse_only = bs4.SoupStrainer(
                              class_ = ("post-title","post-content","post-header")
                          )),
                          )
wb_loader.load()

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='\n\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n\n\nBuilding agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview#\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistake

In [13]:
from langchain_community.document_loaders import ArxivLoader
ax_loader = ArxivLoader(query="2304.10557",load_max_docs = 2).load()
ax_loader

[Document(metadata={'Published': '2024-02-08', 'Title': 'An Introduction to Transformers', 'Authors': 'Richard E. Turner', 'Summary': 'The transformer is a neural network component that can be used to learn\nuseful representations of sequences or sets of data-points. The transformer has\ndriven recent advances in natural language processing, computer vision, and\nspatio-temporal modelling. There are many introductions to transformers, but\nmost do not contain precise mathematical descriptions of the architecture and\nthe intuitions behind the design choices are often also missing. Moreover, as\nresearch takes a winding path, the explanations for the components of the\ntransformer can be idiosyncratic. In this note we aim for a mathematically\nprecise, intuitive, and clean description of the transformer architecture. We\nwill not discuss training as this is rather standard. We assume that the reader\nis familiar with fundamental topics in machine learning including multi-layer\nperceptr

In [14]:
from langchain_community.document_loaders import WikipediaLoader
wp_loader = WikipediaLoader(query="Chelsea FC", load_max_docs=2).load()
wp_loader

[Document(metadata={'title': 'Chelsea F.C.', 'summary': "Chelsea Football Club is a professional football club based in London, England. The club was founded in 1905 and named after neighbouring area Chelsea. They compete in the Premier League, the top tier of English football, playing their home games at Stamford Bridge. Since 2022, the club has been owned by BlueCo.\nChelsea won their first major domestic trophy, the First Division championship, in 1955. They won their first Premier League title in the 2004–05 season under José Mourinho. In total, Chelsea have won six top-flight league titles. They have also won eight FA Cups, five League Cups, and four FA Community Shields, making them the fifth-most successful club in English football.\nAt international level, Chelsea won their first European trophy in 1971, lifting the Cup Winners' Cup, which they won again in 1998. They went on to win their first UEFA Champions League title in 2012 and repeated the feat in 2021. Chelsea have won 

# Text Splitters

In [15]:
# Recursive Split Text by Characters
from langchain_text_splitters import RecursiveCharacterTextSplitter
char_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=50)
final_chunked_documents = char_splitter.split_documents(pdf_loader)
final_chunked_documents

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-03T00:07:29+00:00', 'author': '', 'keywords': '', 'moddate': '2023-08-03T00:07:29+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'Data/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu'),
 Document(metadata={'producer'

In [16]:
final_chunked_documents[1]

Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-03T00:07:29+00:00', 'author': '', 'keywords': '', 'moddate': '2023-08-03T00:07:29+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'Data/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='University of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,')

In [20]:
with open('Data/speech.txt') as file:
    speech_txt = file.readlines()

from langchain_text_splitters import CharacterTextSplitter
char_text_splitter = CharacterTextSplitter(chunk_size = 100, chunk_overlap = 20) #By default splits on '\n\n'
speech_doc = char_text_splitter.create_documents(speech_txt)
char_splitter.split_documents(speech_doc)

[Document(metadata={}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.'),
 Document(metadata={}, page_content='Just because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.'),
 Document(metadata={}, page_content='…'),
 Document(metadata={}, page_content='It will be all the eas

In [29]:
from langchain_text_splitters import HTMLHeaderTextSplitter
headers_to_split_on  = [('h1','H1'), ('h2','H2'),('h3','H3')]
html_head_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_splits = html_head_splitter.split_text_from_url("https://longform.org/random")
html_splits

[Document(metadata={'H1': 'Longfor'}, page_content='Longfor'),
 Document(metadata={'H1': 'Longfor'}, page_content='m  \nMenu  \nPodcast  \nBest Of  \n2021  \n2020  \n2019  \n2018  \n2017  \n2016  \n2015  \n2014  \n2013  \n2012'),
 Document(metadata={'H1': 'Longfor', 'H2': 'Archive'}, page_content='Archive'),
 Document(metadata={'H1': 'Longfor', 'H2': 'Archive'}, page_content='Sections  \nPublications  \nWriters  \nTags  \nRandom Article'),
 Document(metadata={'H1': 'Longfor', 'H2': 'Contact'}, page_content='Contact'),
 Document(metadata={'H1': 'Longfor', 'H2': 'Contact'}, page_content='podcast@longform.org'),
 Document(metadata={'H1': 'Random Article'}, page_content='Random Article'),
 Document(metadata={}, page_content='History  \nWorld  \nThe Couple Who Saved China\'s Ancient Architectural Treasures Before They Were Lost Forever  \nHow a poet and an architect rescued a nation’s riches.  \nTony Perrottet  \nSmithsonian  \nJan 2017  \n25  \nmin  \nPermalink  \nGive Me Another  \nvar _g

In [32]:
import json
import requests

json_data = requests.get("https://api.smith.langchain.com/openapi.json").json()

from langchain_text_splitters import RecursiveJsonSplitter
json_splitter = RecursiveJsonSplitter(max_chunk_size = 300)
splitted_json = json_splitter.split_json(json_data)
splitted_json

[{'openapi': '3.1.0',
  'info': {'title': 'LangSmith', 'version': '0.1.0'},
  'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'tags': ['tracer-sessions'],
     'summary': 'Get Tracing Project Prebuilt Dashboard',
     'description': 'Get a prebuilt dashboard for a tracing project.'}}}},
 {'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'operationId': 'get_tracing_project_prebuilt_dashboard_api_v1_sessions__session_id__dashboard_post',
     'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}]}}}},
 {'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'parameters': [{'name': 'session_id',
       'in': 'path',
       'required': True,
       'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}},
      {'name': 'accept',
       'in': 'header',
       'required': False,
       'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
        'title': 'Accept'}}]}}}},
 {'paths': {'/api/v1/sessions/{session_id}/dashb

In [37]:
json_doc = json_splitter.create_documents([json_data])
json_text = json_splitter.split_text(json_data)
json_text

['{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}, "paths": {"/api/v1/sessions/{session_id}/dashboard": {"post": {"tags": ["tracer-sessions"], "summary": "Get Tracing Project Prebuilt Dashboard", "description": "Get a prebuilt dashboard for a tracing project."}}}}',
 '{"paths": {"/api/v1/sessions/{session_id}/dashboard": {"post": {"operationId": "get_tracing_project_prebuilt_dashboard_api_v1_sessions__session_id__dashboard_post", "security": [{"API Key": []}, {"Tenant ID": []}, {"Bearer Auth": []}]}}}}',
 '{"paths": {"/api/v1/sessions/{session_id}/dashboard": {"post": {"parameters": [{"name": "session_id", "in": "path", "required": true, "schema": {"type": "string", "format": "uuid", "title": "Session Id"}}, {"name": "accept", "in": "header", "required": false, "schema": {"anyOf": [{"type": "string"}, {"type": "null"}], "title": "Accept"}}]}}}}',
 '{"paths": {"/api/v1/sessions/{session_id}/dashboard": {"post": {"requestBody": {"required": true, "content": {"appl

## Embedding Techniques

In [38]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [44]:
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')# to get the api key 

In [None]:
from langchain_openai import OpenAIEmbeddings
# embedding_model = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1056)
embedding_model

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x00000180637D70E0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001806456AB70>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [48]:
vector = embedding_model.embed_query('Hi meet cool AI Engineer Rahul')
len(vector)

1536

In [49]:
vector

[0.026116836816072464,
 -0.05655137822031975,
 -0.008108857087790966,
 -0.017020700499415398,
 -0.006904375273734331,
 -0.05876288190484047,
 0.010195307433605194,
 0.017376121133565903,
 -0.034910205751657486,
 0.036331888288259506,
 -0.00037537195021286607,
 -0.06924121081829071,
 -0.02568243443965912,
 -0.018889950588345528,
 0.038464415818452835,
 -0.019455989822745323,
 -0.05110159143805504,
 -0.027064625173807144,
 -0.004518449772149324,
 0.04802127927541733,
 0.003603570628911257,
 0.042413532733917236,
 0.033830780535936356,
 0.045230571180582047,
 0.028960201889276505,
 -0.024300241842865944,
 0.028828565031290054,
 0.006358080543577671,
 -0.029907990247011185,
 -0.032356444746255875,
 -0.006486427038908005,
 -0.031461309641599655,
 -0.028012413531541824,
 0.01277539785951376,
 0.02652491256594658,
 0.012545033358037472,
 -0.010359854437410831,
 -0.008385295048356056,
 -0.029855335131287575,
 0.032119497656822205,
 0.02147004008293152,
 -0.031171709299087524,
 -0.0037648263387

In [58]:
from langchain_community.document_loaders import WikipediaLoader
wp_chels = WikipediaLoader('Chelsea FC', load_max_docs=1, doc_content_chars_max=150).load()

from langchain_text_splitters import RecursiveCharacterTextSplitter
rct_splitter =  RecursiveCharacterTextSplitter(chunk_size = 100, chunk_overlap = 10)
splitted_chels = rct_splitter.split_documents(wp_chels)
splitted_chels

[Document(metadata={'title': 'Chelsea F.C.', 'summary': "Chelsea Football Club is a professional football club based in London, England. The club was founded in 1905 and named after neighbouring area Chelsea. They compete in the Premier League, the top tier of English football, playing their home games at Stamford Bridge. Since 2022, the club has been owned by BlueCo.\nChelsea won their first major domestic trophy, the First Division championship, in 1955. They won their first Premier League title in the 2004–05 season under José Mourinho. In total, Chelsea have won six top-flight league titles. They have also won eight FA Cups, five League Cups, and four FA Community Shields, making them the fifth-most successful club in English football.\nAt international level, Chelsea won their first European trophy in 1971, lifting the Cup Winners' Cup, which they won again in 1998. They went on to win their first UEFA Champions League title in 2012 and repeated the feat in 2021. Chelsea have won 

In [None]:
from langchain_openai import OpenAIEmbeddings
# embedding_model_200 = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=200)


## Vector Stores

We need to specify the chunks and the embedding model to the vector store

In [63]:
from langchain_community.vectorstores import Chroma
db = Chroma.from_documents(splitted_chels, embedding_model_200)
db

<langchain_community.vectorstores.chroma.Chroma at 0x18040972270>

In [64]:
splitted_chels

[Document(metadata={'title': 'Chelsea F.C.', 'summary': "Chelsea Football Club is a professional football club based in London, England. The club was founded in 1905 and named after neighbouring area Chelsea. They compete in the Premier League, the top tier of English football, playing their home games at Stamford Bridge. Since 2022, the club has been owned by BlueCo.\nChelsea won their first major domestic trophy, the First Division championship, in 1955. They won their first Premier League title in the 2004–05 season under José Mourinho. In total, Chelsea have won six top-flight league titles. They have also won eight FA Cups, five League Cups, and four FA Community Shields, making them the fifth-most successful club in English football.\nAt international level, Chelsea won their first European trophy in 1971, lifting the Cup Winners' Cup, which they won again in 1998. They went on to win their first UEFA Champions League title in 2012 and repeated the feat in 2021. Chelsea have won 

## Retreival

In [None]:
db.similarity_search('FIFA Club World Cup')

[Document(metadata={'title': 'Chelsea F.C.', 'summary': "Chelsea Football Club is a professional football club based in London, England. The club was founded in 1905 and named after neighbouring area Chelsea. They compete in the Premier League, the top tier of English football, playing their home games at Stamford Bridge. Since 2022, the club has been owned by BlueCo.\nChelsea won their first major domestic trophy, the First Division championship, in 1955. They won their first Premier League title in the 2004–05 season under José Mourinho. In total, Chelsea have won six top-flight league titles. They have also won eight FA Cups, five League Cups, and four FA Community Shields, making them the fifth-most successful club in English football.\nAt international level, Chelsea won their first European trophy in 1971, lifting the Cup Winners' Cup, which they won again in 1998. They went on to win their first UEFA Champions League title in 2012 and repeated the feat in 2021. Chelsea have won 

# Ollama

ollama run deepseek-r1

In [67]:
from langchain_community.embeddings import OllamaEmbeddings #embedding_model_ollama
embeddings = (OllamaEmbeddings(model='deepseek-r1')) #uses llama2 by default
embeddings

  embeddings = (OllamaEmbeddings(model='deepseek-r1')) #uses llama2 by default


OllamaEmbeddings(base_url='http://localhost:11434', model='deepseek-r1', embed_instruction='passage: ', query_instruction='query: ', mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None, show_progress=False, headers=None, model_kwargs=None)

In [69]:
e1 = embeddings.embed_documents([
    'Rahul is awesome',
    'Rahul likes football'
])

e1

[[-0.15448495745658875,
  -0.6926223635673523,
  -0.7453211545944214,
  0.947333037853241,
  1.057903528213501,
  -0.09228920191526413,
  -0.8756649494171143,
  1.5245290994644165,
  -0.3782190680503845,
  -1.001697301864624,
  0.9422664642333984,
  -0.8521125316619873,
  2.1606392860412598,
  -0.3868449926376343,
  -0.02920275740325451,
  -0.7415686845779419,
  0.7047206163406372,
  1.2753828763961792,
  -0.4011259377002716,
  -0.10657032579183578,
  -0.8795906901359558,
  2.0669057369232178,
  0.15557166934013367,
  -0.09791991859674454,
  0.4669077396392822,
  0.9069708585739136,
  3.3044443130493164,
  -0.42921754717826843,
  0.047935355454683304,
  0.7128990292549133,
  0.998235285282135,
  1.334089994430542,
  1.3867683410644531,
  -0.08808445930480957,
  -1.022193431854248,
  -0.6181337833404541,
  0.2825552225112915,
  0.6603310704231262,
  1.4711894989013672,
  2.3950836658477783,
  -0.2039232701063156,
  0.6041474938392639,
  0.6310683488845825,
  -1.2578717470169067,
  0.554

In [70]:
len(e1[0])

4096

In [71]:
embeddings.embed_query('Rahul loves Chelsea')

[-0.6666659712791443,
 -1.37960946559906,
 -0.4200440049171448,
 1.8376914262771606,
 -1.7352780103683472,
 0.5450518131256104,
 -1.2318446636199951,
 -0.9398179650306702,
 0.7055336833000183,
 0.5176600813865662,
 -1.0402992963790894,
 0.9362249970436096,
 1.072624921798706,
 -1.510810375213623,
 -0.030678771436214447,
 -0.2862352430820465,
 -0.26879337430000305,
 1.6082559823989868,
 0.3417363464832306,
 0.2645445168018341,
 0.16070134937763214,
 1.2600237131118774,
 0.29692065715789795,
 0.1268613636493683,
 2.2902066707611084,
 -0.6457675099372864,
 5.587779521942139,
 -0.9745488166809082,
 -1.4508419036865234,
 0.2747668921947479,
 0.7315395474433899,
 2.3513102531433105,
 3.1382012367248535,
 -0.410849928855896,
 -1.0621951818466187,
 -0.10163719207048416,
 2.4061834812164307,
 0.7427308559417725,
 0.4570517838001251,
 0.222525954246521,
 -1.569640874862671,
 -0.9364069104194641,
 -0.6357571482658386,
 0.4663279056549072,
 0.2727792263031006,
 -1.3353880643844604,
 0.691948056221

Good Embedding Model - ollama pull mxbai-embed-large<br>
Link : <a href="https://ollama.com/blog/embedding-models"> Ollama Embedding Models </a>

In [74]:
from langchain_community.embeddings import OllamaEmbeddings
ollama_large_embedding_model = OllamaEmbeddings(model="mxbai-embed-large")
ollama_large_embedding_model.embed_query("Wow")

[0.5712276697158813,
 -0.28125476837158203,
 -0.21544502675533295,
 0.8511478304862976,
 -1.0221401453018188,
 -0.47920405864715576,
 0.20080439746379852,
 0.5973071455955505,
 0.42919713258743286,
 0.48286235332489014,
 0.04554199427366257,
 0.28006577491760254,
 0.26639801263809204,
 -0.28517675399780273,
 -0.22017334401607513,
 -0.5160816311836243,
 -0.4513038396835327,
 -0.25922393798828125,
 -0.37257611751556396,
 0.26101332902908325,
 -0.10229511559009552,
 0.3866146504878998,
 -1.100141167640686,
 -0.013583267107605934,
 0.040744125843048096,
 0.19068533182144165,
 -0.02526649460196495,
 0.20072044432163239,
 0.7828330993652344,
 1.4152724742889404,
 -0.4870462119579315,
 0.17929324507713318,
 -0.043616071343421936,
 -0.7803052663803101,
 -0.4559413194656372,
 -0.7714446783065796,
 0.47561150789260864,
 -0.6244851350784302,
 -0.11110685765743256,
 -0.21775318682193756,
 -0.11477459967136383,
 -0.019099391996860504,
 0.5587961673736572,
 -0.8350534439086914,
 -0.7821701169013977,

In [75]:
vec = ollama_large_embedding_model.embed_query("Rahul Loves Chelsea")
len(vec)

1024

In [80]:
import  os
from dotenv import load_dotenv
load_dotenv()
# os.environ['HF_TOKEN']

True

### Hugging Face Embeddings

In [97]:
from langchain_huggingface import HuggingFaceEmbeddings
hf_embeddings = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")
hf_embeddings

HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [None]:
len(hf_embeddings.embed_query("Rahul loves Chelsea FC"))

384

In [123]:
from langchain_community.document_loaders import WikipediaLoader
chelsea_doc = WikipediaLoader(query="Chelsea FC", load_max_docs=2).load()
rct_splitter = RecursiveCharacterTextSplitter(chunk_size = 200, chunk_overlap = 20) # 
chelsea_splitted = rct_splitter.split_documents(chelsea_doc)
chelsea_splitted

[Document(metadata={'title': 'Chelsea F.C.', 'summary': "Chelsea Football Club is a professional football club based in London, England. The club was founded in 1905 and named after neighbouring area Chelsea. They compete in the Premier League, the top tier of English football, playing their home games at Stamford Bridge. Since 2022, the club has been owned by BlueCo.\nChelsea won their first major domestic trophy, the First Division championship, in 1955. They won their first Premier League title in the 2004–05 season under José Mourinho. In total, Chelsea have won six top-flight league titles. They have also won eight FA Cups, five League Cups, and four FA Community Shields, making them the fifth-most successful club in English football.\nAt international level, Chelsea won their first European trophy in 1971, lifting the Cup Winners' Cup, which they won again in 1998. They went on to win their first UEFA Champions League title in 2012 and repeated the feat in 2021. Chelsea have won 

In [124]:
chelsea_splitted[0].page_content

'Chelsea Football Club is a professional football club based in London, England. The club was founded in 1905 and named after neighbouring area Chelsea. They compete in the Premier League, the top tier'

In [None]:
# Embedding a whole document
hf_embeddings.embed_documents([doc.page_content for doc in chelsea_splitted])

[[0.012255469337105751,
  -0.11888374388217926,
  -0.008558960631489754,
  -0.06582336127758026,
  0.02170279435813427,
  0.005050234496593475,
  0.012982677668333054,
  0.031992558389902115,
  0.04131972789764404,
  0.029696149751544,
  -0.01494476106017828,
  -0.027559300884604454,
  0.052869848906993866,
  -0.025943418964743614,
  -0.01797466166317463,
  0.03139641135931015,
  -0.07382793724536896,
  -0.10019196569919586,
  0.0006222152733244002,
  -0.04261394962668419,
  -0.004663946107029915,
  -0.07970238476991653,
  -0.025819899514317513,
  0.024766532704234123,
  -0.0560879185795784,
  0.02573341131210327,
  0.03902921825647354,
  0.018001163378357887,
  -0.037377603352069855,
  0.02166208252310753,
  0.03390992805361748,
  0.004990147892385721,
  0.12867455184459686,
  0.027960380539298058,
  -0.05341865122318268,
  -0.006767567712813616,
  0.07348272204399109,
  0.025489728897809982,
  0.05128934606909752,
  0.03822258114814758,
  -0.08584585785865784,
  -0.11507318913936615,

## Embedding and Storing in a Vector Database

In [None]:
# Creating a vector store for the whole document after embedding it !!!!
from langchain_community.vectorstores import Chroma
chelsea_db = Chroma.from_documents(documents=chelsea_splitted,embedding= hf_embeddings, collection_name='ChelseaFC')
chelsea_db.similarity_search('who is manager', k=1)

[Document(metadata={'summary': "Chelsea Football Club is a professional football club based in London, England. The club was founded in 1905 and named after neighbouring area Chelsea. They compete in the Premier League, the top tier of English football, playing their home games at Stamford Bridge. Since 2022, the club has been owned by BlueCo.\nChelsea won their first major domestic trophy, the First Division championship, in 1955. They won their first Premier League title in the 2004–05 season under José Mourinho. In total, Chelsea have won six top-flight league titles. They have also won eight FA Cups, five League Cups, and four FA Community Shields, making them the fifth-most successful club in English football.\nAt international level, Chelsea won their first European trophy in 1971, lifting the Cup Winners' Cup, which they won again in 1998. They went on to win their first UEFA Champions League title in 2012 and repeated the feat in 2021. Chelsea have won the UEFA Europa League tw

In [1]:
from langchain_community.document_loaders import WikipediaLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Creatign a document using Wikipedia loader for ChelseaFC
chelsea_doc = WikipediaLoader(query="Chelsea FC", load_max_docs=10).load()

# Creating chunks of 200 dimension 
rct_splitter = RecursiveCharacterTextSplitter(chunk_size = 200, chunk_overlap = 20)
chelsea_splitted = rct_splitter.split_documents(chelsea_doc)

# Chelsea_splitted now contains the chunks of 200 size
chelsea_splitted[0]

Document(metadata={'title': 'Chelsea F.C.', 'summary': "Chelsea Football Club is a professional football club based in London, England. The club was founded in 1905 and named after neighbouring area Chelsea. They compete in the Premier League, the top tier of English football, playing their home games at Stamford Bridge. Since 2022, the club has been owned by BlueCo.\nChelsea won their first major domestic trophy, the First Division championship, in 1955. They won their first Premier League title in the 2004–05 season under José Mourinho. In total, Chelsea have won six top-flight league titles. They have also won eight FA Cups, five League Cups, and four FA Community Shields, making them the fifth-most successful club in English football.\nAt international level, Chelsea won their first European trophy in 1971, lifting the Cup Winners' Cup, which they won again in 1998. They went on to win their first UEFA Champions League title in 2012 and repeated the feat in 2021. Chelsea have won t

In [4]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings

ollama_large_embedding_model = OllamaEmbeddings(model="mxbai-embed-large")
chelsea_faiss_db = FAISS.from_documents(chelsea_splitted, ollama_large_embedding_model)