In [None]:
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation notices from the client libraries.
warnings.filterwarnings('ignore')

In [None]:
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
from tqdm.auto import tqdm
import ast
import os
import time

In [None]:
# Project-local helper; later cells use it to fetch API keys and to build the
# Pinecone index name.
from UDCUtils import UDCUtils
utils = UDCUtils()

In [None]:
# Ask the helper for the Pinecone API key (the following cell handles a None result).
pinecone_api_key = utils.get_pinecone_api_key()

In [None]:
# Fall back to the environment when the helper did not supply a key.
# A credential must never be hardcoded in the notebook: anything committed here
# is readable by everyone with access to the file. The key that used to be
# embedded in this cell should be treated as compromised and rotated.
if pinecone_api_key is None:
    pinecone_api_key = os.environ.get("PINECONE_API_KEY")

# Fail here with a clear message instead of at the first Pinecone request.
if not pinecone_api_key:
    raise RuntimeError(
        "Pinecone API key not found: configure it for UDCUtils or set the "
        "PINECONE_API_KEY environment variable."
    )

In [None]:
# Pinecone client used for all index management calls below.
pinecone_obj = Pinecone(api_key=pinecone_api_key)

# Build the index name through the course helper and echo it for reference.
index_name = utils.create_dlai_index_name('dl-ai-001')
print(index_name)

In [None]:
# Drop the last three characters of the generated name.
# NOTE(review): this rewrites index_name in place, so re-running this cell
# without first re-running the cell that builds index_name trims three more
# characters each time.
index_name = index_name[:-3]

# list_indexes() yields index description objects rather than plain strings,
# so testing a name against it directly never matches and a stale index would
# survive, making create_index fail on a re-run. Compare against .names().
if index_name in pinecone_obj.list_indexes().names():
    # Remove the existing index so the notebook starts from an empty one.
    pinecone_obj.delete_index(index_name)

pinecone_obj.create_index(
    name=index_name,
    # Must equal the length of the vectors upserted and queried later
    # (queries use text-embedding-ada-002 embeddings, which have 1536 dims).
    dimension=1536,
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-east-1')
)

In [None]:
# Handle to the index; used below for upserts, stats and queries.
index = pinecone_obj.Index(index_name)

In [None]:
#!wget -q -O lesson2-wiki.csv.zip "https://www.dropbox.com/scl/fi/yxzmsrv2sgl249zcspeqb/lesson2-wiki.csv.zip?rlkey=paehnoxjl3s5x53d1bedt4pmc&dl=0"

In [None]:
#!unzip lesson2-wiki.csv.zip

In [None]:
import pandas as pd

In [None]:
# Load the article table from wiki.csv in the working directory. The upsert
# loop further down reads its 'id', 'values' and 'metadata' columns.
data = pd.read_csv("wiki.csv")

In [None]:
# Preview the first rows to check that the file loaded as expected.
data.head()

In [None]:
# Article count for the Pinecone upload; read by the upsert cell below.
max_articles_num = 500

In [None]:
# Upload the pre-computed article vectors to Pinecone in fixed-size batches.
batch_size=300
# NOTE(review): max_len is not referenced anywhere in this cell — presumably a
# leftover request-size limit; confirm before removing.
max_len=4194303

# Enforce the article cap. Previously max_articles_num was only passed to tqdm
# as the progress total, so every row of `data` was uploaded regardless and the
# progress bar overran whenever the file held more rows than the cap.
articles = data.head(max_articles_num)

prepped = []
for _, row in tqdm(articles.iterrows(), total=len(articles)):
    # 'values' and 'metadata' are stored in the CSV as Python-literal strings,
    # so they are parsed back into Python objects before upload.
    prepped.append({
        'id': row['id'],
        'values': ast.literal_eval(row['values']),
        'metadata': ast.literal_eval(row['metadata']),
    })
    # Send a full batch and start collecting the next one.
    if len(prepped) >= batch_size:
        index.upsert(prepped)
        prepped = []

# Flush the final partial batch. Skip it when the row count is an exact
# multiple of batch_size: an empty upsert is at best a wasted request and may
# be rejected by the service.
if prepped:
    index.upsert(prepped)

In [None]:
# Display the index statistics as a check after the upsert above.
index.describe_index_stats()

In [None]:
# Fetch the OpenAI API key through the course helper.
open_api_key = utils.get_openai_api_key()

# Client shared by the embedding and completion calls further down.
openai_client = OpenAI(api_key=open_api_key)

In [None]:
def get_embeddings(articles, model="text-embedding-ada-002"):
    """Request embeddings for ``articles`` from the OpenAI embeddings endpoint.

    Args:
        articles: Input forwarded unchanged as the ``input`` argument of the
            request (the notebook passes a list of strings).
        model: Name of the embedding model to use.

    Returns:
        The response object returned by ``openai_client.embeddings.create``,
        unmodified.
    """
    response = openai_client.embeddings.create(model=model, input=articles)
    return response

In [None]:
# Embed the question, retrieve the three closest articles from the index and
# print their stored text.
query = "When did India won its first cricket world cup match?"

# A single-element batch: the embedding for the query is the first data item.
embedding = get_embeddings([query])

results = index.query(
    vector=embedding.data[0].embedding,
    top_k=3,
    include_metadata=True,  # needed so each match carries its 'text' field
)

text = [
    match['metadata']['text']
    for match in results['matches']
]

print("\n".join(text))

In [None]:
# Prompt template: asks the model to answer {query} using only the retrieved
# {context}, which is delimited by triple backticks inside the prompt.
from langchain.prompts import ChatPromptTemplate
template_style = """
Answer the following question based on the context provided in the text enclosed in three backticks.
Question: {query}
Context:```{context}```
Answer:
"""
print(template_style)

In [None]:
# Build a LangChain chat prompt template from the raw template string.
prompt_template = ChatPromptTemplate.from_template(template_style)

In [None]:
# Show the placeholder names of the first message's prompt (the template
# string contains {query} and {context}).
prompt_template.messages[0].prompt.input_variables

In [None]:
# Fill the template with the question and the text of the retrieved matches.
query = "When did India won its first cricket world cup match?"

# One entry per match, joined into a single newline-separated context block.
context_lst = [
    match['metadata']['text']
    for match in results['matches']
]
context = "\n".join(context_lst)

prompt = prompt_template.format_messages(query=query, context=context)

In [None]:
# Inspect what format_messages returned.
print(type(prompt))

In [None]:
# Number of messages produced by the template.
message_count = len(prompt)
print(message_count)

In [None]:
# Inspect the type of the first formatted message.
print(type(prompt[0]))

In [None]:
# NOTE(review): HumanMessage is imported but not referenced in the visible cells.
from langchain_core.messages.human import HumanMessage

# First formatted message; its .content is sent to the completion endpoint below.
message = prompt[0]

#print(message)

print(type(message))

In [None]:
# Inspect the type of the message body that is passed as the completion prompt.
print(type(message.content))

In [None]:
# Send the filled-in prompt to the completions endpoint.
completion_kwargs = {
    "model": "gpt-3.5-turbo-instruct",
    "prompt": message.content,
    # Generation settings: temperature 0 with no penalties and no stop sequence.
    "temperature": 0,
    "max_tokens": 636,
    "top_p": 1,
    "frequency_penalty": 0,
    "presence_penalty": 0,
    "stop": None,
}
response = openai_client.completions.create(**completion_kwargs)

In [None]:
# Print the generated text of the first completion choice.
print(response.choices[0].text)