In [1]:
import pandas as pd
from tqdm import tqdm

# Load the Data

In [2]:
# Path of the source pickle. NOTE(review): this is an empty placeholder, so the
# load below cannot succeed until it is set to a real file.
dataset = ""

# Columns carried forward from the raw frame, in display order.
kept_columns = [
    'paragraph_id', 'article_id', 'relevancy_rank',
    'brand', 'title', 'text', 'minilm_embeddings',
]

# Old column name -> new column name.
column_aliases = {
    'article_id': 'parent_id',
    'paragraph_id': 'id',
    'minilm_embeddings': 'embedding',
}

# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
articles = pd.read_pickle(dataset)[kept_columns].rename(columns=column_aliases)

print(articles.shape)
articles.head(2)

(97685, 7)


Unnamed: 0,id,parent_id,relevancy_rank,brand,title,text,embedding
0,46683031823_6_Google,46683031823,2,Google,Governor Ron DeSantis accompanied wife to canc...,Everyone who reads our reporting knows the Gel...,"[-0.016234327, 0.096927114, -0.0736138, 0.0110..."
1,46686258924_0_Google,46686258924,1,Google,How to enable 2-step verification on your Goog...,Proceed to the next point and use the phone to...,"[-0.07577787, 0.05864354, 0.061040197, -0.0203..."


In [3]:
# Write the trimmed, renamed `articles` frame to ../data/articles.pkl.
articles.to_pickle('../data/articles.pkl')

# Jina

## Set Up the DocArray

In [4]:
from docarray import DocumentArray, Document

In [6]:
# Build one parent Document per article; its chunks are that article's
# paragraph rows converted by DocumentArray.from_dataframe.
# NOTE(review): n_dim=384 presumably matches the embedding width — confirm.
da = DocumentArray(storage='weaviate', config={'n_dim': 384})

# Split the frame once instead of re-filtering every row for each parent_id
# (the boolean-mask-per-id approach is quadratic in practice).
# sort=False keeps groups in first-appearance order, like `.unique()` did.
# Rows with a missing parent_id are skipped (groupby drops NaN keys by default).
paragraphs_by_parent = articles.groupby('parent_id', sort=False)

for parent_id, paragraphs in tqdm(paragraphs_by_parent,
                                  total=paragraphs_by_parent.ngroups):
    d = Document(
        id=str(parent_id),
        chunks=DocumentArray.from_dataframe(paragraphs),
    )
    da.append(d)

da.summary()

100%|██████████| 35449/35449 [05:54<00:00, 99.90it/s] 


In [7]:
# Fetch a single parent Document by id; 46683031823 is the parent_id of the
# first row shown in the `articles.head(2)` preview.
da['46683031823']

In [7]:
# da['@c'].plot_embeddings()

## Nearest Neighbor Query

In [8]:
def print_results(d):
    """Print a two-line summary of one search hit.

    Args:
        d: An object exposing ``id``, ``text``, a ``tags`` mapping with
            'relevancy_rank' and 'title', and a ``scores`` mapping whose
            'euclidean' entry has a numeric ``value``.

    The first line carries the id, relevancy rank, distance (4 decimals) and
    title; the second carries the text chunk followed by a blank line.
    """
    tags = d.tags
    distance = d.scores['euclidean'].value
    headline = (
        f"✅ ID: {d.id}. Rel rank: {tags['relevancy_rank']}. "
        f"Euclidean distance: {distance:.4f}. Title: {tags['title']}"
    )
    print(headline)
    print(f"  Text Chunk: {d.text}\n")

In [10]:
# Query: the '@r[9]c' selector picks chunks under root index 9 of `da`; each
# is searched against all chunks ('@c') for its 5 nearest neighbours.
query = da['@r[9]c']
results = da['@c'].find(query, metric='euclidean', limit=5, exclude_self=True)

# Print each query chunk followed by its hits.
for idx, doc in enumerate(query):
    print('🔎 Query:')
    print(f"ID: {doc.id}. Title: {doc.tags['title']}")
    print(f"Text Chunk: {doc.text}")
    print('\n📝 Results:')

    # `find` may hand back a list (one result set per query) or a single
    # result set; in the latter case the same set is printed for every query.
    matches = results[idx] if isinstance(results, list) else results
    for d in matches:
        print_results(d)

    print('==========================================================================================\n')

🔎 Query:
ID: 46695367845_1_Google. Title: Samsung is adding Stadia and GeForce Now game streaming to its TVs in 2022
Text Chunk: The new Gaming Hub that can be found on Samsung’s 2022 TVs will give users quick access to a few different cloud gaming platforms ::: It will be interesting to see how cloud gaming services increase in popularity as they begin to integrate into more devices ::: Have any thoughts on this? Let us know down below in the comments or carry the discussion over to our Twitter or Facebook ::: Samsung had a pretty big announcement ahead of this year’s Consumer Electronics Show (CES). The company unveiled a few new smart TVs that will have support for cloud gaming services Google Stadia and Nvidia GeForce Now ::: It’s cool to see the popularity of cloud gaming starting to rise. Now, users can access cool gaming services like Google Stadia and Nvidia GeForce Now without having to purchase any dedicated hardware ::: The Hub will have support for Nvidia GeForce Now, Googl

In [None]:
# TODO: Set up a prototype endpoint for Google articles only
# TODO: Set up a production endpoint for all articles