In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation notices from the client libraries;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm, trange
from DLAIUtils import Utils

import pandas as pd
import time
import os

### Setup APIs

In [50]:
# The Utils helper (from DLAIUtils) hands out the API keys, so no secret is
# hardcoded in the notebook.
# NOTE(review): presumably it reads them from the environment / a .env file - confirm.
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()
OPENAI_API_KEY = utils.get_openai_api_key()
# Single OpenAI client reused by get_embeddings() for every embedding request.
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [51]:
# Disabled: manual (re)creation of the Pinecone index - delete it if it
# already exists, then create a 1536-dimension cosine serverless index.
# Kept for reference only; the notebook obtains its index through
# utils.get_pinecone_index() instead.
#
# INDEX_NAME = utils.create_dlai_index_name('dl-ai')
# pinecone = Pinecone(api_key=PINECONE_API_KEY)

# if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
#   pinecone.delete_index(INDEX_NAME)

# pinecone.create_index(name=INDEX_NAME, dimension=1536, metric='cosine',
#   spec=ServerlessSpec(cloud='aws', region='us-east-1'))

# index = pinecone.Index(INDEX_NAME)

In [52]:
# pinecone.list_indexes()

In [54]:
# Pinecone client and the index name used throughout the notebook.
pinecone = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

# Handle used for all upserts and queries below.
# NOTE(review): get_pinecone_index presumably creates (or recreates) the index
# when needed - confirm against DLAIUtils.
index = utils.get_pinecone_index(INDEX_NAME, pinecone)

### Load data

In [55]:
# Location of the news CSV. A raw string keeps every backslash literal: the
# previous non-raw literal relied on the invalid escape "\d" (a SyntaxWarning
# on recent Python versions) and had to hand-escape "\\a" to avoid a BEL char.
# NOTE(review): this is an absolute, machine-specific path - adjust locally.
data_csv = r"E:\data\dl_ai_data\all-the-news-3.csv"

In [56]:
# Peek at the CSV header line without loading the whole file.
with open(data_csv, mode='r') as csv_file:
    header = csv_file.readline()
print(header)

date,year,month,day,author,title,article,url,section,publication



In [57]:
# Load only the first 99 rows - enough to inspect the columns cheaply.
df = pd.read_csv(data_csv, nrows=99)
df.head()

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ


In [58]:
# Show every column of the first record vertically.
df.iloc[0]

date                                         2016-12-09 18:31:00
year                                                        2016
month                                                       12.0
day                                                            9
author                                               Lee Drutman
title          We should take concerns about the health of li...
article        This post is part of Polyarchy, an independent...
url            https://www.vox.com/polyarchy/2016/12/9/138983...
section                                                      NaN
publication                                                  Vox
Name: 0, dtype: object

### Embed titles

In [59]:
def get_embeddings(text, model="text-embedding-ada-002"):
    """Request embeddings for ``text`` from the shared OpenAI client.

    Args:
        text: Input forwarded as ``input`` to the embeddings endpoint; the
            notebook passes lists of strings.
        model: Name of the embedding model to use.

    Returns:
        The raw response of ``openai_client.embeddings.create``; callers read
        the vectors from ``response.data[i].embedding``.
    """
    response = openai_client.embeddings.create(input=text, model=model)
    return response

In [60]:
CHUNK_SIZE = 400    # rows read from the CSV and embedded per request
TOTAL_ROWS = 10000  # total number of rows to index

chunks = pd.read_csv(data_csv, chunksize=CHUNK_SIZE,
                     nrows=TOTAL_ROWS)

# The context manager closes the progress bar even if an API call raises.
with tqdm(total=TOTAL_ROWS) as progress_bar:
    for chunk_num, chunk in enumerate(chunks):
        titles = chunk['title'].tolist()

        # Remember each title's position inside the chunk so the vector id
        # stays equal to the zero-based CSV row number, and skip missing (NaN)
        # or empty titles, which the embeddings endpoint cannot accept.
        valid = [(i, title) for i, title in enumerate(titles)
                 if isinstance(title, str) and title]

        if valid:
            embeddings = get_embeddings([title for _, title in valid])
            prepped = [
                {
                    'id': str(chunk_num * CHUNK_SIZE + i),
                    'values': item.embedding,
                    'metadata': {'title': title},
                }
                for (i, title), item in zip(valid, embeddings.data)
            ]
            # Upsert every chunk. The previous `len(prepped) >= 200` gate
            # silently dropped any chunk shorter than 200 rows (e.g. the last
            # one when TOTAL_ROWS is not a multiple of CHUNK_SIZE).
            index.upsert(prepped)

        progress_bar.update(len(chunk))

100%|█████████████████████████████████████████████████████████████████████| 10000/10000 [02:33<00:00, 69.85it/s]

In [61]:
# Sanity check: total_vector_count should match the number of titles upserted above.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}

### Recommender

In [62]:
def get_recommendations(pinecone_index, search_term, top_k=10):
    """Return the ``top_k`` index entries closest to ``search_term``.

    The search term is embedded with ``get_embeddings`` and the resulting
    vector is used to query ``pinecone_index``; metadata is included so
    callers can read the stored title of every match.
    """
    response = get_embeddings([search_term])
    query_vector = response.data[0].embedding
    return pinecone_index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
    )

In [63]:
# Query the title index and list each match with its similarity score.
reco = get_recommendations(index, 'lucian')
for match in reco.matches:
    score, title = match.score, match.metadata["title"]
    print(f'{score} : {title}')

0.810361087 : 'Lucy's Crush,' Today's Comic by Akvile Magicdust
0.793873727 : A disciple of Brazil's dictatorship moves closer to the presidency
0.789339423 : Blaise Cendrars: A Poet for the Twenty-First Century
0.788513303 : The Incredible Prescience of Leonardo da Vinci
0.784133732 : Liliana Porter Shows How Everything Familiar Must Be Magnified or Forgotten
0.782182634 : Arto Lindsay Explores the Brazilian Avant-Garde at the Whitney Museum
0.782138944 : The Halcyon Days of Postcolonial Mali Through the Lens of Malick Sidibé
0.78121978 : Ricotta Pansotti 
0.781023145 : Larry Fink Illuminates the Drama of Boxing
0.780783474 : Sun Ra Had It Right


100%|█████████████████████████████████████████████████████████████████████| 10000/10000 [02:50<00:00, 69.85it/s]

### Embed articles

In [64]:
# Reset the index before embedding article chunks instead of titles.
# NOTE(review): this assumes get_pinecone_index drops and recreates the index,
# discarding the title vectors upserted earlier - confirm in DLAIUtils.
index = utils.get_pinecone_index(INDEX_NAME, pinecone)

In [69]:
def embed_articles(embeddings, title, prepped, embed_num,
                   pinecone_index=None, batch_size=100):
    """Queue one article's chunk embeddings and upsert full batches.

    Args:
        embeddings: Embeddings response; each item of ``embeddings.data``
            exposes the vector as ``.embedding``.
        title: Article title stored as metadata on every chunk vector.
        prepped: Caller-owned list of pending vectors. It is mutated in place
            and shared across calls so batches can span several articles.
        embed_num: Next free integer id; one id is consumed per embedding.
        pinecone_index: Index to upsert into. Defaults to the notebook-level
            ``index`` when omitted.
        batch_size: Number of pending vectors that triggers an upsert.

    Returns:
        The next unused id (``embed_num`` plus the number of embeddings).

    Note:
        Vectors left in ``prepped`` after the final call are NOT upserted by
        this function; the caller must flush them.
    """
    for embedding in embeddings.data:
        prepped.append(
            {
                'id': str(embed_num),
                'values': embedding.embedding,
                'metadata': {'title': title},
            }
        )
        embed_num += 1

        if len(prepped) >= batch_size:
            # Resolve the target lazily so the global is only needed when no
            # explicit index was supplied.
            target = index if pinecone_index is None else pinecone_index
            target.upsert(prepped)
            prepped.clear()

    return embed_num

In [67]:
# Number of news rows whose articles get chunked and embedded.
news_data_rows_num = 100

# How to chunk each article: pieces of up to 400 characters with a
# 20-character overlap between neighbouring pieces.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)

# Note: this rebinds `df`, replacing the sample frame loaded earlier.
df = pd.read_csv(data_csv, nrows=news_data_rows_num)
titles_list = df['title'].tolist()
articles_list = df['article'].tolist()

In [70]:
prepped = []   # pending vectors, shared across articles so batches stay full
embed_num = 0  # running counter used as the vector 'id'

for art, title in zip(tqdm(articles_list), titles_list):
    # Missing articles come back from pandas as NaN (a float); skip them.
    if not isinstance(art, str):
        continue

    texts = text_splitter.split_text(art)
    if not texts:
        # Nothing to embed; avoids sending an empty input list to the API.
        continue

    embeddings = get_embeddings(texts)
    embed_num = embed_articles(embeddings, title, prepped, embed_num)

# embed_articles only upserts once the buffer reaches its batch size, so the
# final partial batch would otherwise never reach the index.
if prepped:
    index.upsert(prepped)
    prepped.clear()

....................................................................................................

In [None]:
# The article chunks were upserted into `index`; no `articles_index` is ever
# defined in this notebook, so querying it raised a NameError.
reco = get_recommendations(index, 'obama', top_k=100)

# Several chunks of one article can match, so print each title only once
# (matches are iterated in the order returned, keeping the first hit's score).
seen = set()
for r in reco.matches:
    title = r.metadata['title']
    if title not in seen:
        print(f'{r.score} : {title}')
        seen.add(title)