In [1]:
import os
from dotenv import load_dotenv

# Read key/value pairs from a local .env file (if one exists) into os.environ.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here gives a
            clear message instead of the opaque ``TypeError`` that
            ``os.environ[name] = None`` would raise, or an authentication
            error much later in the notebook.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "define it in your shell or in a .env file next to this notebook."
        )
    return value


# Credentials are taken from the environment only; nothing is hard-coded here.
pinecone_api_key = _require_env("PINECONE_KEY")
hf_api_key = _require_env("HUGGINGFACEHUB_API_TOKEN")


In [None]:
# Install the LangChain community integrations and the Hugging Face embeddings wrapper imported in later cells.
%pip install   langchain-community langchain-huggingface

In [3]:
# Install the Pinecone client and pinecone-text (source of the BM25 sparse encoder imported later).
# NOTE(review): pinecone-notebooks is not imported anywhere in the visible cells — confirm it is needed.
%pip install --upgrade --quiet  pinecone-client pinecone-text pinecone-notebooks

Note: you may need to restart the kernel to use updated packages.


In [2]:
from langchain_community.retrievers import PineconeHybridSearchRetriever

In [3]:
import os
import time

from pinecone import Pinecone, ServerlessSpec

index_name = "ctgov-pinecone-hybrid-search"

# Must equal the output size of the dense embedding model used with this index.
EMBEDDING_DIMENSION = 384
# Upper bound (seconds) on how long to wait for the index to report ready.
INDEX_READY_TIMEOUT_S = 120

# initialize Pinecone client
pc = Pinecone(api_key=pinecone_api_key)

# Create the index only if it does not exist yet, so re-running the cell is safe.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,  # dimensionality of dense model
        metric="dotproduct",  # sparse values supported only for dotproduct
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Index creation is asynchronous: poll until the index reports ready so the
# following cells do not talk to an index that is still initializing.
_ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pc.describe_index(index_name).status["ready"]:
    if time.monotonic() > _ready_deadline:
        raise TimeoutError(
            f"Pinecone index {index_name!r} was not ready after "
            f"{INDEX_READY_TIMEOUT_S} seconds"
        )
    time.sleep(1)

  from tqdm.autonotebook import tqdm


In [4]:
# Handle to the index named by `index_name`; handed to the hybrid retriever in a later cell.
index = pc.Index(index_name)

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings

# Dense embedding model. The repr shown below reports word_embedding_dimension=384,
# which matches the `dimension` the Pinecone index was created with.
# NOTE(review): `hf_embeedings` is misspelled, but the retriever cell references
# this exact name, so it is kept as-is here.
hf_embeedings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Bare expression so the notebook displays the configured model.
hf_embeedings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [6]:
from pinecone_text.sparse import BM25Encoder

# Start from the encoder's packaged default tf-idf values.
# NOTE(review): a later cell calls fit() on this encoder with the local corpus and
# then reloads it from disk — confirm whether the default() call is still needed.
bm25_encoder = BM25Encoder().default()

In [7]:
# Clinical-trial titles used to fit the sparse (BM25) encoder.
corpus = [
    "Safety and Efficacy of KLS-1 Monotherapy in Malignant Neoplasms",
    "A Single Group Study to Evaluate the Effects of a Probiotic Supplement on Vaginal Health.",
    "Clinical Study of FB1001 in Patients",
    "Study of the Effect of HFA-152a and HFA-134a Propellants on Mucociliary Clearance in Healthy Participants",
    "Investigation of Optimal Dosage Regimen for HRS9531 Tablets in Healthy Subjects",
    "Effectiveness of Qualia NAD+ Supplementation on Intracellular NAD Levels",
    "A Clinical Trial of Hepalatide for Injection in Patients With Chronic Hepatitis D",
    "A Study of BL-M14D1 in Patients With Locally Advanced or Metastatic Small Cell Lung Cancer, Neuroendocrine Tumors and Other Solid Tumors",
    "A Clinical Trial to Assess the Impact of an Alcohol Alternative Herbal Tincture on Signs of Stress Anxiety and Sleep Quality.",
    "Phase 1/2 Study of Autologous SCG142 TCR T Cells in Patients With HPV16/52-positive Carcinoma",
]

# Fit on the local corpus, write the fitted values to disk, then read them
# back into a fresh encoder instance.
bm25_params_path = "bm25_values.json"
bm25_encoder.fit(corpus)
bm25_encoder.dump(bm25_params_path)
bm25_encoder = BM25Encoder().load(bm25_params_path)

100%|██████████| 10/10 [00:00<00:00, 1101.79it/s]


In [8]:
# Hybrid retriever wired to the dense embedding model, the BM25 sparse
# encoder, and the Pinecone index handle created in the cells above.
retriever = PineconeHybridSearchRetriever(
    index=index,
    embeddings=hf_embeedings,
    sparse_encoder=bm25_encoder,
)

In [10]:
# Index the same titles the BM25 encoder was fitted on. Reusing `corpus`
# (instead of a second copy of the literal list) keeps the indexed texts and
# the fitted corpus from drifting apart.
retriever.add_texts(corpus)

100%|██████████| 1/1 [00:03<00:00,  3.21s/it]


In [11]:
# Query text uses the correct spelling "Hepatitis" (previously "Hepatities").
retriever.invoke("What are the Hepatitis trials?")

[Document(page_content='A Clinical Trial of Hepalatide for Injection in Patients With Chronic Hepatitis D'),
 Document(page_content='A Clinical Trial to Assess the Impact of an Alcohol Alternative Herbal Tincture on Signs of Stress Anxiety and Sleep Quality.'),
 Document(page_content='Study of the Effect of HFA-152a and HFA-134a Propellants on Mucociliary Clearance in Healthy Participants'),
 Document(page_content='Investigation of Optimal Dosage Regimen for HRS9531 Tablets in Healthy Subjects')]

In [12]:
# Phase-specific probe; only one corpus title mentions a phase ("Phase 1/2 Study ...").
retriever.invoke("Phase 1 titles")

[Document(page_content='Phase 1/2 Study of Autologous SCG142 TCR T Cells in Patients With HPV16/52-positive Carcinoma'),
 Document(page_content='Safety and Efficacy of KLS-1 Monotherapy in Malignant Neoplasms'),
 Document(page_content='Investigation of Optimal Dosage Regimen for HRS9531 Tablets in Healthy Subjects'),
 Document(page_content='Clinical Study of FB1001 in Patients')]

In [13]:
# Same probe with the exact "1/2" wording; in the recorded output only the
# fourth hit differs from the "Phase 1 titles" query.
retriever.invoke("Phase 1/2 titles")

[Document(page_content='Phase 1/2 Study of Autologous SCG142 TCR T Cells in Patients With HPV16/52-positive Carcinoma'),
 Document(page_content='Safety and Efficacy of KLS-1 Monotherapy in Malignant Neoplasms'),
 Document(page_content='Investigation of Optimal Dosage Regimen for HRS9531 Tablets in Healthy Subjects'),
 Document(page_content='A Study of BL-M14D1 in Patients With Locally Advanced or Metastatic Small Cell Lung Cancer, Neuroendocrine Tumors and Other Solid Tumors')]

In [14]:
# Topic-style query; the recorded top two hits are the oncology titles
# (small cell lung cancer / malignant neoplasms).
retriever.invoke("Extract all the cancer drugs")

[Document(page_content='A Study of BL-M14D1 in Patients With Locally Advanced or Metastatic Small Cell Lung Cancer, Neuroendocrine Tumors and Other Solid Tumors'),
 Document(page_content='Safety and Efficacy of KLS-1 Monotherapy in Malignant Neoplasms'),
 Document(page_content='Investigation of Optimal Dosage Regimen for HRS9531 Tablets in Healthy Subjects'),
 Document(page_content='Effectiveness of Qualia NAD+ Supplementation on Intracellular NAD Levels')]

In [15]:
# Negated form of the "cancer drugs" query. The recorded hits are identical to
# that query's results, i.e. the negation is not reflected in this ranking.
retriever.invoke("Extract all the non cancer drugs")

[Document(page_content='A Study of BL-M14D1 in Patients With Locally Advanced or Metastatic Small Cell Lung Cancer, Neuroendocrine Tumors and Other Solid Tumors'),
 Document(page_content='Safety and Efficacy of KLS-1 Monotherapy in Malignant Neoplasms'),
 Document(page_content='Investigation of Optimal Dosage Regimen for HRS9531 Tablets in Healthy Subjects'),
 Document(page_content='Effectiveness of Qualia NAD+ Supplementation on Intracellular NAD Levels')]

In [18]:
%%time
# Time a single retrieval call (%%time has to stay on the first line of the cell).
retriever.invoke("Titles with the tablets")

CPU times: total: 719 ms
Wall time: 292 ms


[Document(page_content='Investigation of Optimal Dosage Regimen for HRS9531 Tablets in Healthy Subjects'),
 Document(page_content='A Study of BL-M14D1 in Patients With Locally Advanced or Metastatic Small Cell Lung Cancer, Neuroendocrine Tumors and Other Solid Tumors'),
 Document(page_content='Clinical Study of FB1001 in Patients'),
 Document(page_content='A Clinical Trial to Assess the Impact of an Alcohol Alternative Herbal Tincture on Signs of Stress Anxiety and Sleep Quality.')]