In [69]:
!pip install --upgrade --quiet pinecone-client pinecone-text pinecone-notebooks
!pip install langchain-community --trusted-host
!pip install pinecone-client
!pip install transformers
!pip install tokenizers
!pip install datasets


Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

--trusted-host option requires 1 argument




In [70]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
import getpass
import os
import time

In [71]:
## Pinecone API key, taken from the Pinecone account.
## It is read from the PINECONE_API_KEY environment variable, with an interactive
## prompt as a fallback, so the secret is never stored in the notebook itself.
## NOTE: a key was previously hardcoded in this cell; treat it as leaked and rotate it.
## HF key will need to be copied into environment on terminal
my_api_key = os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: ")

In [72]:
from pinecone import Pinecone,ServerlessSpec
index_name="hybrid-search-langchain-pinecone"

## initialize Pinecone client
## ----> an SSL error in this environment calls for the proxy and for ssl verification to be false
## NOTE(review): ssl_verify=False turns off TLS certificate checking for every Pinecone
## request; prefer pointing the client at the proxy's CA bundle once that is available.
## NOTE(review): proxy_url points at a .pac file rather than a proxy endpoint — confirm
## this is the value the client actually needs.
pc=Pinecone(
    api_key=my_api_key,
    proxy_url='http://127.0.0.1:9000/systemproxy-6a430247.pac',
    ssl_verify=False
)

## create index, populated onto Pinecone site (skipped when it already exists)
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        ## must equal the embedding size (all-MiniLM-L6-v2 reports 384 further down)
        dimension=384,
        ## dot product is the metric Pinecone documents for sparse-dense (hybrid) queries
        metric='dotproduct',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )
    ## index creation is asynchronous: block until the new index reports ready,
    ## otherwise the cells that use it right after this one can fail
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)





In [73]:
## fetch a handle to the index from the Pinecone client `pc`
index = pc.Index(index_name)
## bare last expression so the notebook displays the handle
index

<pinecone.data.index.Index at 0x164c7d2fce0>

In [74]:
from transformers import BertTokenizerFast

## load the pretrained fast BERT tokenizer for the uncased base checkpoint
## NOTE(review): `tokenizer` is not referenced by any later cell visible here — confirm it is still needed
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(bert_checkpoint)



In [75]:
import requests
from huggingface_hub import configure_http_backend


## this was an SSL workaround --> may not be necessary for all configs
def backend_factory() -> requests.Session:
    """Build a requests session with certificate verification switched off.

    Returns:
        A new ``requests.Session`` whose ``verify`` attribute is ``False``.
    """
    insecure_session = requests.Session()
    insecure_session.verify = False
    return insecure_session

## register the factory so huggingface_hub builds its HTTP sessions through it
configure_http_backend(backend_factory=backend_factory)

In [76]:
## dense vector embeddings (the sparse side comes from the BM25 encoder built later)
from dotenv import load_dotenv

## load variables from a .env file, if one is present, before the HF import below
load_dotenv()

## using HF embeddings
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model_name = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
## bare last expression so the notebook displays the configured model
embeddings



HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [77]:
## BM25 encoder
from pinecone_text.sparse import BM25Encoder

## `default` is a static factory, so it is taken from the class directly instead of
## from a throwaway BM25Encoder() instance.
## `bm25` is kept as an alias so any existing reference to that name still resolves.
bm25 = BM25Encoder.default

## encoder initialised with the library's pre-made default parameters
## NOTE(review): a later cell calls this_bm.fit(...), which presumably replaces these
## parameters — confirm whether loading the defaults is needed at all
this_bm = bm25()
this_bm

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x164b00c1100>

In [78]:
## NLTK 'punkt_tab' data download
## NOTE(review): presumably here to avoid a missing-resource error when the BM25
## encoder tokenizes the sentences — confirm
import nltk
nltk.download('punkt_tab')

## toy corpus used to fit the sparse encoder
sentences = [
    "In 2019 I graduated in high school",
    "In 2023 I graduated college",
    "In 2024 I started work",
]

## fit the BM25 term statistics on these sentences
this_bm.fit(sentences)

## write the fitted values to a json file, then read them back into a fresh encoder
bm25_params_file = "bm25_values.json"
this_bm.dump(bm25_params_file)
this_bm = BM25Encoder().load(bm25_params_file)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DJ427YT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
100%|██████████| 3/3 [00:00<00:00, 2995.93it/s]


In [79]:
## build the retriever on Pinecone's hybrid search:
## dense vectors from `embeddings`, sparse vectors from the fitted BM25 encoder
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=this_bm,
    index=index,
)
## bare last expression so the notebook displays the retriever configuration
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x00000164B0110770>, index=<pinecone.data.index.Index object at 0x00000164C7D2FCE0>)

In [80]:
## index the same corpus the BM25 encoder was fitted on
## `sentences` is reused instead of a second copy of the literal list, so the indexed
## texts cannot drift from the fitted ones
retriever.add_texts(sentences)

100%|██████████| 1/1 [00:00<00:00,  1.33it/s]


In [81]:
## ask for the most recent event; no indexed sentence states this directly
query = "What did I do last"
retriever.invoke(query)



[Document(metadata={'score': 0.162696287}, page_content='In 2024 I started work'),
 Document(metadata={'score': 0.108049192}, page_content='In 2023 I graduated college'),
 Document(metadata={'score': 0.0991076902}, page_content='In 2019 I graduated in high school')]

In [82]:
## same query as the previous cell, run a second time
query = "What did I do last"
retriever.invoke(query)



[Document(metadata={'score': 0.162696287}, page_content='In 2024 I started work'),
 Document(metadata={'score': 0.108049192}, page_content='In 2023 I graduated college'),
 Document(metadata={'score': 0.0991076902}, page_content='In 2019 I graduated in high school')]

In [83]:
## question that one indexed sentence (high school graduation) answers directly
query = "When did I graduate high school"
retriever.invoke(query)



[Document(metadata={'score': 0.570266247}, page_content='In 2019 I graduated in high school'),
 Document(metadata={'score': 0.349998623}, page_content='In 2023 I graduated college'),
 Document(metadata={'score': 0.150219992}, page_content='In 2024 I started work')]

In [84]:
## question that one indexed sentence (college graduation) answers directly
query = "When did I graduate college"
retriever.invoke(query)



[Document(metadata={'score': 0.581695557}, page_content='In 2023 I graduated college'),
 Document(metadata={'score': 0.384112954}, page_content='In 2019 I graduated in high school'),
 Document(metadata={'score': 0.173161224}, page_content='In 2024 I started work')]

In [85]:
## question with no exact answer in the corpus: nothing indexed mentions starting high school
query = "When did I start high school"
retriever.invoke(query)



[Document(metadata={'score': 0.456700087}, page_content='In 2019 I graduated in high school'),
 Document(metadata={'score': 0.253886878}, page_content='In 2024 I started work'),
 Document(metadata={'score': 0.248470828}, page_content='In 2023 I graduated college')]