In [None]:
# Install a global 'ignore' filter: after this cell runs, no warning of any
# category is shown for the rest of the session.
# NOTE(review): this also hides warnings that may matter; consider a narrower filter.
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
from pinecone import Pinecone, ServerlessSpec
import os
import time
import sys
import ast
#from tqdm.auto import tqdm, trange
from tqdm import tqdm
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Project-local helper object; the cells below use it to fetch the Pinecone and
# OpenAI API keys and to build index names.
from UDCUtils import UDCUtils
utils = UDCUtils()

In [None]:
# Load the Pinecone API key through the project helper.
pincone_api_key = utils.get_pinecone_api_key()
# Report only whether a key was found: echoing the secret itself would store it
# in the saved notebook output.
"Pinecone API key loaded" if pincone_api_key else "Pinecone API key NOT found"

In [None]:
# Load the OpenAI API key through the project helper.
openai_api_key = utils.get_openai_api_key()
# Report only whether a key was found: echoing the secret itself would store it
# in the saved notebook output.
"OpenAI API key loaded" if openai_api_key else "OpenAI API key NOT found"

In [None]:
# Fall back to the environment when the helper returned no key.
# Credentials must never be embedded in the notebook: a key that was previously
# hard-coded here should be treated as leaked and rotated.
if pincone_api_key is None:
    pincone_api_key = os.environ.get("PINECONE_API_KEY")

# Fail early with an actionable message instead of a later authentication error.
if not pincone_api_key:
    raise RuntimeError(
        "Pinecone API key not found: configure UDCUtils or set the "
        "PINECONE_API_KEY environment variable."
    )

# Confirm readiness without echoing the secret into the cell output.
"Pinecone API key ready"

In [None]:
#!wget -q --show-progress -O all-the-news-3.zip "https://www.dropbox.com/scl/fi/wruzj2bwyg743d0jzd7ku/all-the-news-3.zip?rlkey=rgwtwpeznbdadpv3f01sznwxa&dl=1"

In [None]:
#!unzip all-the-news-3.zip

In [None]:
# CSV file with the news articles; `file_name` is reused by the indexing loops below.
file_name = "all-the-news-3.csv"
# Read only the first 100 rows and display the head of the frame.
df = pd.read_csv(file_name, nrows=100)
df.head()

In [None]:
#%pip install --upgrade --quiet tqdm

In [None]:
# OpenAI client used by get_embeddings() further down in this notebook.
openai_client = OpenAI(api_key=openai_api_key)

# Build the Pinecone index name through the project helper and show it.
index_name = utils.create_dlai_index_name("dev-001")
print(index_name)

In [None]:
# Connect to Pinecone with the key loaded earlier.
pinecone_obj = Pinecone(api_key=pincone_api_key)

# Drop the last three characters of the generated name.
# NOTE(review): this rewrites index_name in place, so re-running this cell
# shortens the name again; re-run the cell that builds index_name first.
index_name = index_name[:-3]

# list_indexes() yields index descriptions rather than plain strings, so the
# membership test must be made against the index names; comparing the string
# to the descriptions never matches and a stale index would never be deleted.
if index_name in pinecone_obj.list_indexes().names():
    pinecone_obj.delete_index(index_name)

In [None]:
# Show the truncated index name that the next cell creates.
print(index_name)

In [None]:
# Create a serverless cosine-similarity index (AWS, us-east-1).
# NOTE(review): dimension=1536 presumably matches the length of the vectors
# returned by get_embeddings(); confirm if the embedding model changes.
pinecone_obj.create_index(name=index_name, dimension=1536, metric="cosine",
                         spec=ServerlessSpec(cloud="aws", region="us-east-1"))

# Handle used below to upsert and query the title embeddings.
index = pinecone_obj.Index(index_name)

In [None]:
def get_embeddings(articles, model_name = "text-embedding-ada-002"):
    """Request embeddings for `articles` from the OpenAI embeddings endpoint.

    Args:
        articles: Text input(s), forwarded unchanged as the `input` argument.
        model_name: Identifier of the embedding model to use.

    Returns:
        The response object of `openai_client.embeddings.create`; callers in
        this notebook read the vectors from `response.data[i].embedding`.
    """
    response = openai_client.embeddings.create(model=model_name, input=articles)
    return response

In [None]:
# Embed the article titles chunk by chunk and upsert them into the title index.
total_rows = 20000
chunksize=400
progress_bar = tqdm(total=total_rows)

chunks = pd.read_csv(file_name, chunksize=chunksize, nrows=total_rows)

prepped=[]
chunk_num = 0

for chunk in chunks:
    # Keep each row's position inside the chunk so ids stay
    # chunksize*chunk_num + position, and skip rows whose title is missing or
    # blank: the embeddings endpoint needs real text.
    titled_rows = [
        (i, title)
        for i, title in enumerate(chunk["title"].tolist())
        if isinstance(title, str) and title.strip()
    ]
    if titled_rows:
        titles = [title for _, title in titled_rows]
        embeddings = get_embeddings(titles)
        prepped = [
            {"id": str(chunksize*chunk_num+i),
             "metadata": {"title": title},
             "values": embeddings.data[k].embedding}
            for k, (i, title) in enumerate(titled_rows)
        ]
        # Upsert every chunk as soon as it is embedded. The records are rebuilt
        # for each chunk, so holding a small chunk back for a size threshold
        # would let the next chunk overwrite (and silently drop) it.
        index.upsert(prepped)
        prepped = []
    chunk_num = chunk_num + 1
    # The progress bar replaces the per-chunk counter print.
    progress_bar.update(len(chunk))

progress_bar.close()

In [None]:
# Number of CSV chunks processed by the title-indexing loop above.
print(chunk_num)

In [None]:
# Display the statistics Pinecone reports for the title index after the upserts.
index.describe_index_stats()

In [None]:
def get_recommendations(pinecone_index, search_term, top_k=10):
    """Query `pinecone_index` for the records closest to `search_term`.

    Args:
        pinecone_index: Index handle whose `query` method is called.
        search_term: Text that is embedded with `get_embeddings` before querying.
        top_k: Number of matches requested from the index.

    Returns:
        The query response (metadata included); callers in this notebook
        iterate `response.matches` and read `.score` and `.metadata`.
    """
    query_vector = get_embeddings([search_term]).data[0].embedding
    return pinecone_index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
    )

In [None]:
# Query the title index and print "score:title" for every match.
response = get_recommendations(index, "president obama")

for records in response.matches:
    # Single quotes inside the f-string: reusing the outer double quotes is a
    # SyntaxError on Python versions before 3.12.
    print(f"{records.score}:{records.metadata['title']}")

### We will search for articles that contain these terms

In [None]:
# Build the name of the second index, which holds the article embeddings.
# The last three characters of the generated name are dropped, as for the
# first index.
index_name = utils.create_dlai_index_name("dl-ai-002")[:-3]
print(index_name)

# list_indexes() yields index descriptions rather than plain strings, so the
# membership test must be made against the index names; otherwise an existing
# index is never deleted before create_index is called for the same name.
if index_name in pinecone_obj.list_indexes().names():
    pinecone_obj.delete_index(index_name)

# Serverless cosine-similarity index (AWS, us-east-1) with 1536-dimensional vectors.
pinecone_obj.create_index(
    name=index_name,
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

# Handle used below to upsert and query the article embeddings.
index_article = pinecone_obj.Index(index_name)

In [None]:
#%pip install ipywidgets==7.7

In [None]:
from tqdm import tqdm

In [None]:
# Run a short 100-step loop under tqdm, pausing 10 ms per step.
# NOTE(review): presumably a smoke test that the progress bar renders; confirm it is still needed.
from time import sleep
for i in tqdm(range(100)):
    sleep(0.01)

In [None]:
#help(index_article.delete)
# Ask Pinecone to delete all records (delete_all=True) from the article index
# before it is populated below.
# NOTE(review): the index is re-created in an earlier cell, so it is presumably
# already empty at this point; confirm this call is still needed.
index_article.delete(delete_all=True)

In [None]:
# Index the article bodies: split every article into overlapping text pieces,
# embed each piece and upsert it into the article index with the article's
# title as metadata.
prepped = []

chunksize=200        # CSV rows read per pandas chunk
total_rows=10000     # rows of the CSV to process
split_size = 200     # max characters per text piece (no longer tied to chunksize)
upsert_batch = 200   # records sent per upsert request


def _upsert_in_batches(target_index, records, batch_size):
    """Upsert `records` into `target_index` in slices of at most `batch_size`."""
    for start in range(0, len(records), batch_size):
        target_index.upsert(records[start:start + batch_size])


print(file_name)
chunks = pd.read_csv(file_name, chunksize=chunksize, nrows=total_rows)

# Splitter that cuts an article into pieces of at most split_size characters,
# with 20 characters of overlap between neighbouring pieces.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=split_size,
    chunk_overlap=20
)

# Running count of embedded pieces; it doubles as the next free record id, so
# ids are unique across articles and chunks. (The previous product
# chunksize * chunk_num * i was 0 for the whole first chunk and for every first
# row, so records kept overwriting each other.)
embed_cnt = 0
chunk_num = 0
article_progress = tqdm(total=total_rows)
for chunk in chunks:
    titles = chunk["title"].to_list()
    articles = chunk["article"].to_list()
    for i, article in enumerate(articles):
        article_progress.update(1)
        # Missing CSV cells arrive as NaN (not None); skip them instead of
        # embedding the literal text "nan".
        if pd.isna(article):
            continue
        tokens = text_splitter.split_text(str(article))
        # Nothing to embed for blank articles; do not send an empty request.
        if not tokens:
            continue
        # A missing title would otherwise be stored as NaN metadata.
        title = "" if pd.isna(titles[i]) else titles[i]
        embeddings = get_embeddings(tokens)
        prepped.extend(
            {
                "id": str(embed_cnt + j),
                "metadata": {"title": title},
                "values": embeddings.data[j].embedding,
            }
            for j in range(len(tokens))
        )
        embed_cnt += len(tokens)
        if len(prepped) >= upsert_batch:
            _upsert_in_batches(index_article, prepped, upsert_batch)
            prepped.clear()

    chunk_num = chunk_num + 1

# Flush the final partial batch; without this the last records are never indexed.
if prepped:
    _upsert_in_batches(index_article, prepped, upsert_batch)
    prepped.clear()

article_progress.close()

In [None]:
# Display the statistics Pinecone reports for the article index after the upserts.
index_article.describe_index_stats()

In [None]:
# Query the article index and print each distinct title once, together with
# the score of the first match that carried it.
reco = get_recommendations(index_article, 'President Obama', top_k=10)
seen = {}
for match in reco.matches:
    title = match.metadata['title']
    if title in seen:
        # Several pieces of one article share a title; report it only once.
        continue
    seen[title] = '.'
    print(f'{match.score} : {title}')