In [1]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm, trange

import pandas as pd
import time
import os

In [6]:
from dotenv import load_dotenv

load_dotenv()

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')


In [3]:
with open('./all-the-news-3.csv', 'r') as f:
    header = f.readline()
    print(header)

date,year,month,day,author,title,article,url,section,publication



In [5]:
df = pd.read_csv('./all-the-news-3.csv', nrows=99)
df.head()

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ


In [7]:
openai_client = OpenAI()

pc = Pinecone(api_key=PINECONE_API_KEY)

# Set index name
INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")

# Check if index exists and delete it
if pc.has_index(INDEX_NAME):
    pc.delete_index(INDEX_NAME)

# List of indexes
print("List of indexes:")
print(pc.list_indexes())


if not pc.has_index(INDEX_NAME):
    # Create index
    pc.create_index(
        name=INDEX_NAME, 
        dimension=1536, 
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        )
    )
    
    # Access index
    index = pc.Index(INDEX_NAME)
    print(index)

List of indexes:
[]
<pinecone.db_data.index.Index object at 0x1230246e0>


In [9]:
def get_embeddings(articles, model="text-embedding-ada-002"):
   return openai_client.embeddings.create(input = articles, model=model)

In [11]:
CHUNK_SIZE=400
TOTAL_ROWS=10000
progress_bar = tqdm(total=TOTAL_ROWS)
chunks = pd.read_csv('./all-the-news-3.csv', chunksize=CHUNK_SIZE, 
                     nrows=TOTAL_ROWS)
chunk_num = 0
for chunk in chunks:
    titles = chunk['title'].tolist()
    embeddings = get_embeddings(titles)
    prepped = [{'id':str(chunk_num*CHUNK_SIZE+i), 'values':embeddings.data[i].embedding,
                'metadata':{'title':titles[i]},} for i in range(0,len(titles))]
    chunk_num = chunk_num + 1
    if len(prepped) >= 200:
      index.upsert(prepped)
      prepped = []
    progress_bar.update(len(chunk))

  0%|          | 0/10000 [00:00<?, ?it/s]

In [12]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000,
 'vector_type': 'dense'}

In [13]:
def get_recommendations(pinecone_index, search_term, top_k=10):
  embed = get_embeddings([search_term]).data[0].embedding
  res = pinecone_index.query(vector=embed, top_k=top_k, include_metadata=True)
  return res

In [14]:
reco = get_recommendations(index, 'obama')
for r in reco.matches:
    print(f'{r.score} : {r.metadata["title"]}')

0.849975586 : Barack Obama just stepped off the sidelines to defend Obamacare
0.847730279 : President Obama has a new plan to fight the opioid epidemic
0.84756279 : “Our democracy is at stake”: Obama delivers his first post-presidency campaign speech
0.846980929 : Obama: if you were fine with big government until it served black people, rethink your biases
0.845982969 : President Obama: Michelle & I Are Gonna Be Renters
0.84375006 : Vox Sentences: Obama got a warmer welcome in Hiroshima than the Japanese prime minister
0.84294188 : Obama meets with national security team on Syria, Islamic State
0.842331529 : Watch President Obama dance the tango in Argentina
0.840565503 : Barack Obama in talks to create shows for Netflix: New York Times
0.839662194 : Obama and Supreme Court Tag Team on Juvenile Justice Reform


In [15]:
if INDEX_NAME in [index.name for index in pc.list_indexes()]:
    pc.delete_index(name=INDEX_NAME)

pc.create_index(
    name=INDEX_NAME, 
    dimension=1536, 
    metric='cosine',
    spec=ServerlessSpec(
        cloud='aws', 
        region='us-east-1'
    )
)

articles_index = pc.Index(INDEX_NAME)

In [16]:
def embed(embeddings, title, prepped, embed_num):
  for embedding in embeddings.data:
    prepped.append({'id':str(embed_num), 'values':embedding.embedding, 'metadata':{'title':title}})
    embed_num += 1
    if len(prepped) >= 100:
        articles_index.upsert(prepped)
        prepped.clear()
  return embed_num

In [18]:
news_data_rows_num = 100

embed_num = 0 #keep track of embedding number for 'id'
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, 
    chunk_overlap=20) # how to chunk each article
prepped = []
df = pd.read_csv('./all-the-news-3.csv', nrows=news_data_rows_num)
articles_list = df['article'].tolist()
titles_list = df['title'].tolist()

for i in range(0, len(articles_list)):
    print(".",end="")
    art = articles_list[i]
    title = titles_list[i]
    if art is not None and isinstance(art, str):
      texts = text_splitter.split_text(art)
      embeddings = get_embeddings(texts)
      embed_num = embed(embeddings, title, prepped, embed_num)

....................................................................................................

In [19]:
articles_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 1000}},
 'total_vector_count': 1000,
 'vector_type': 'dense'}

In [20]:
reco = get_recommendations(articles_index, 'obama', top_k=100)
seen = {}
for r in reco.matches:
    title = r.metadata['title']
    if title not in seen:
        print(f'{r.score} : {title}')
        seen[title] = '.'

0.821533263 : Why Obama is vetting Nevada's Republican governor for the Supreme Court
0.817749 : U.S. lawmakers ask for disclosure of number of Americans under surveillance
0.813659728 : NYPD Honcho Insulted by 'Hamilton' Star Lin-Manuel Miranda Celebrating Obama's Controversial Prisoner Release
0.80670166 : Trump keeping options open as Republican feud rages
0.806030273 : Why Jews Are Getting Themselves Arrested at ICE Centers Around the Country
0.801025391 : Michael Bloomberg Is Seriously Considering a Presidential Run
0.798828185 : The most revealing Republican ad of the election is an attack ad against Tim Kaine
0.79876709 : Exclusive: Trump considering fracking mogul Harold Hamm as energy secretary - sources
0.798339903 : The government official in charge of ethics just harshly condemned Trump’s plan
0.797668457 : Trump tells anti-abortion marchers he will support them
0.793273926 : Exclusive: China shuns U.S. request for talks on airline website dispute over Taiwan
0.792907774 : 