In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
from tqdm.auto import tqdm
import ast
import os
import time

In [3]:
from UDCUtils import UDCUtils
utils = UDCUtils()

In [4]:
pinecone_api_key = utils.get_pinecone_api_key()

In [5]:
if pinecone_api_key==None:
    pinecone_api_key = "a85f0b7c-6ce3-458d-aa18-168f39e4420b" ##default key

In [6]:
pinecone_obj = Pinecone(api_key=pinecone_api_key)

index_name = utils.create_dlai_index_name('dl-ai-001')
print(index_name)

dl-ai-001-118d1-7f10-4f3c-a4a6-41d7caaf84cc:fx


In [8]:
index_name = index_name[:-3]

if index_name in pinecone_obj.list_indexes():
    print(1)
    pinecone_obj.delete_index(index_name)

pinecone_obj.create_index(
    name=index_name,
    dimension=1536,
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-east-1')
)

In [9]:
index = pinecone_obj.Index(index_name)

In [None]:
#!wget -q -O lesson2-wiki.csv.zip "https://www.dropbox.com/scl/fi/yxzmsrv2sgl249zcspeqb/lesson2-wiki.csv.zip?rlkey=paehnoxjl3s5x53d1bedt4pmc&dl=0"

In [None]:
#!unzip lesson2-wiki.csv.zip

In [10]:
import pandas as pd

In [11]:
data = pd.read_csv("wiki.csv")

In [12]:
data.head()

Unnamed: 0,id,metadata,values
1,1-0,"{'chunk': 0, 'source': 'https://simple.wikiped...","[-0.011254455894231796, -0.01698738895356655, ..."
2,1-1,"{'chunk': 1, 'source': 'https://simple.wikiped...","[-0.0015197008615359664, -0.007858820259571075..."
3,1-2,"{'chunk': 2, 'source': 'https://simple.wikiped...","[-0.009930099360644817, -0.012211072258651257,..."
4,1-3,"{'chunk': 3, 'source': 'https://simple.wikiped...","[-0.011600767262279987, -0.012608098797500134,..."
5,1-4,"{'chunk': 4, 'source': 'https://simple.wikiped...","[-0.026462381705641747, -0.016362832859158516,..."


In [13]:
max_articles_num = 500

In [14]:
batch_size=300
max_len=4194303
prepped = []
for id, row in tqdm(data.iterrows(), total=max_articles_num):
    id = row['id']
    values = ast.literal_eval(row['values'])
    metadata = ast.literal_eval(row['metadata'])
    #print(f"{len(id)};{len(values)};{len(metadata)}")
    prepped.append({
        'id':row['id'],
        'values':values,
        'metadata':metadata
    })
    if(len(prepped)==batch_size):
        index.upsert(prepped)
        prepped=[]

index.upsert(prepped)

  0%|          | 0/500 [00:00<?, ?it/s]

{'upserted_count': 100}

In [15]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}

In [16]:
open_api_key = utils.get_openai_api_key()

openai_client = OpenAI(api_key=open_api_key)

In [17]:
def get_embeddings(articles, model="text-embedding-ada-002"):
    return openai_client.embeddings.create(input=articles, model=model)

In [18]:
query = "When did India won its first cricket world cup match?"

embedding = get_embeddings([query])

results = index.query(vector=embedding.data[0].embedding, top_k=3, include_metadata=True)

text = [rec['metadata']['text'] for rec in results['matches']]

print('\n'.join(text))

1901  2000 
1907 –  Colin Blythe takes 17 wickets for 48 runs against Northamptonshire at Northampton in one day. It is the best analysis ever recorded either for a county cricket match or a single day's bowling, and not bettered in first-class cricket until 1956.
1909 – The Alaska-Yukon-Pacific Exposition world's fair opens in Seattle, Washington.
1910 – Robert Falcon Scott's South Pole expedition leaves the United Kingdom.
1916 - Louis Brandeis becomes the first Jew to be appointed to the United States Supreme Court.
1918 – World War I: Battle for Belleau Wood begins –
1921 – Tulsa Race Riot: A race riot in Tulsa, Oklahoma kills at least 85 people.
1922 – Official founding of the Royal Ulster Constabulary.
1925 – Lou Gehrig of the New York Yankees played the first game in his record streak of 2,130 games in a row, an endurance record in Major League Baseball that stood until Cal Ripken, Jr. broke it in 1995.
1926 - In Poland, the National Assembly elects Ignacy Moscicki as President.

In [26]:
##prompt template
from langchain.prompts import ChatPromptTemplate
template_style = """
Answer the following question based on the context provided in the text enclosed in three backticks.
Question: {query}
Context:```{context}```
Answer:
"""
print(template_style)


Answer the following question based on the context provided in the text enclosed in three backticks.
Question: {query}
Context:```{context}```
Answer:



In [27]:
prompt_template = ChatPromptTemplate.from_template(template_style)

In [29]:
prompt_template.messages[0].prompt.input_variables

['context', 'query']

In [43]:
query = "When did India won its first cricket world cup match?"
context_lst = [rec['metadata']['text'] for rec in results['matches']]
context = "\n".join(context_lst)
prompt = prompt_template.format_messages(
    context=context,
    query=query
)

In [47]:
print(type(prompt))

<class 'list'>


In [48]:
print(len(prompt
         ))

1


In [53]:
print(type(prompt[0]))

<class 'langchain_core.messages.human.HumanMessage'>


In [57]:
from langchain_core.messages.human import HumanMessage

message = prompt[0]

#print(message)

print(type(message))

<class 'langchain_core.messages.human.HumanMessage'>


In [59]:
print(type(message.content))

<class 'str'>


In [60]:
response = openai_client.completions.create(
    model="gpt-3.5-turbo-instruct",
    prompt=message.content,
    temperature=0,
    max_tokens=636,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    stop=None
)

In [62]:
print(response.choices[0].text)

India won its first cricket world cup match in 1983.
