In [1]:
!pip install -qU \
    openai \
    "pinecone-client[grpc]"\
    langchain \
    tiktoken \
    sentence_transformers

In [15]:
pip install python-dotenv pandas

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting pandas
  Downloading pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.0.3 tzdata-2023.3
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
documents = []

with open('TrainData/train.jsonl', 'r') as f:
    for line in f:
        documents.append(json.loads(line))

len(documents)

7

In [40]:
documents[0]

{'id': 'bd488e7e4462-0',
 'text': 'embeddings operations.\nAugment ed data included with pr ompts . When using the "on your data" feature,\nthe service retrieves relevant data from a configured data store and augments the\nprompt to produce generations that are grounded with your data.\nTraining & v alidation data . You can provide your own training data consisting of\nprompt-completion pairs for the purposes of fine-tuning an OpenAI model .\nThe diagram below illustrates how your data is processed. This diagram covers three\ndifferent types of processing:\n1. How the Azure OpenAI Service processes your prompts to generate content\n(including when additional data from a connected data source is added to a\nprompt using Azure OpenAI on your data).\n2. How the Azure OpenAI Service creates a fine-tuned (custom) model with your\ntraining data.\n3. How the Azure OpenAI Service and Microsoft personnel analyze prompts,\ncompletions and images for harmful content and for patterns suggesting th

In [3]:
index_name = 'langchain-retrieval-agent-sample'

In [4]:
import pinecone
import os

# Load Pinecone API key
api_key = os.getenv('PINECONE_API_KEY') or 'YOUR_API_KEY'
# Set Pinecone environment. Find next to API key in console
env = os.getenv('PINECONE_ENVIRONMENT') or "YOUR_ENV"

pinecone.init(api_key=api_key, environment=env)

  from tqdm.autonotebook import tqdm


In [24]:
import time

if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

# we create a new index
pinecone.create_index(
    name=index_name,
    metric='cosine',
    dimension=768  # 1536 dim of text-embedding-ada-002
)

# wait for index to be initialized
while not pinecone.describe_index(index_name).status['ready']:
    time.sleep(1)

In [25]:
index = pinecone.GRPCIndex(index_name)
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [6]:
from langchain.embeddings import HuggingFaceEmbeddings
hf_embeddings = HuggingFaceEmbeddings()


In [22]:
len(res[0])

768

In [41]:
from tqdm.auto import tqdm
from uuid import uuid4
import pandas as pd

batch_size = 100

texts = []
metadatas = []

data = pd.DataFrame(documents)
print(data.head(),documents[0])

for i in tqdm(range(0, len(data), batch_size)):
    # get end of batch
    i_end = min(len(data), i+batch_size)
    batch = data.iloc[i:i_end]
    # first get metadata fields for this record
    metadatas = [{
        'id': record['id'],
        'text': record['text']
    } for j, record in batch.iterrows()]
    # get the list of contexts / documents
    docs = batch['text']
    # create document embeddings
    embeds = hf_embeddings.embed_documents(docs)
    # get IDs
    ids = batch['id']
    # add everything to pinecone
    index.upsert(vectors=zip(ids, embeds, metadatas))

               id                                               text  \
0  bd488e7e4462-0  embeddings operations.\nAugment ed data includ...   
1  80c36771df0e-0  Models (base or fine-tuned) deployed in your r...   
2  204f06379f97-0  Customers can upload their training data to th...   
3  204f06379f97-1  OpenAI resource). A separate data store is loc...   
4  3bea13b1ddc6-0  g p py p\nthe authorized Microsoft employees a...   

                           source  
0  SourceData\DPS_Openai_Docs.pdf  
1  SourceData\DPS_Openai_Docs.pdf  
2  SourceData\DPS_Openai_Docs.pdf  
3  SourceData\DPS_Openai_Docs.pdf  
4  SourceData\DPS_Openai_Docs.pdf   {'id': 'bd488e7e4462-0', 'text': 'embeddings operations.\nAugment ed data included with pr ompts . When using the "on your data" feature,\nthe service retrieves relevant data from a configured data store and augments the\nprompt to produce generations that are grounded with your data.\nTraining & v alidation data . You can provide your own training 

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.74s/it]


In [42]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 7}},
 'total_vector_count': 7}

In [7]:
from langchain.vectorstores import Pinecone

text_field = "text"

# switch back to normal index for langchain
index = pinecone.Index(index_name)

vectorstore = Pinecone(
    index, hf_embeddings.embed_query, text_field
)

In [9]:
query = "How can customers get an exemption from abuse monitoring and human review"

vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)


[Document(page_content='g p py p\nthe authorized Microsoft employees are located in the European Economic Area.\nSome customers may want to use the Azure OpenAI Service for a use case that involves\nthe processing of sensitive, highly confidential, or legally-regulated input data but\nwhere the likelihood of harmful outputs and/or misuse is low. These customers may\nconclude that they do not want or do not have the right to permit Microsoft to process\nsuch data for abuse detection, as described above, due to their internal policies or\napplicable legal regulations. T o address these concerns, Microsoft allows customers who\nmeet additional Limited Access eligibility criteria and attest to specific use cases to\napply to modify the Azure OpenAI content management features by completing this\nform .\nIf Microsoft approves a customer\'s request to modify abuse monitoring, then Microsoft\ndoes not store any prompts and completions associated with the approved Azure\nsubscription for which

In [1]:
print("hello")

hello
