In [9]:
!pip install pandas pyarrow

Collecting pyarrow
  Downloading pyarrow-13.0.0-cp39-cp39-manylinux_2_28_x86_64.whl (40.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pyarrow
Successfully installed pyarrow-13.0.0


In [10]:
import pandas as pd
import numpy as np
import os

df = pd.read_parquet("short_articles.parquet")
df = df[df["text"].str.len() > 0].sample(frac=0.25)
df = df.reset_index()

In [11]:
## collect our ids for each article
ids = df["article_id"].tolist()
## collect the properties that we will attach to each vector
properties = df.apply(
    lambda r:{ 
        "url": r.url, 
        "title": r.title, 
        "title_len": r.title_len, 
        "text": r.text, 
        "text_len": r.text_len}
    , axis=1
).tolist()


In [34]:
from qwak.exceptions import QwakException
from qwak.vector_store import VectorStoreClient

## Create vector client and fetch collection
client = VectorStoreClient()

# Retrieve a collection or create a new one
collection_name = "wikipedia-vectorizer-demo"
try:
    collection = client.get_collection_by_name(collection_name)
except QwakException:
    collection = client.create_collection(
        name=collection_name,
        description="Indexing Wikipedia articles ",
        dimension=384,
        metric="cosine",
        vectorizer="sentence_transformer"  # The name of a deployed realtime model on Qwak
    )

In [None]:
data_len = -1
collection.upsert(
    ## List of the article ids
    ids=ids[:data_len],
    # Natural inputs
    natural_inputs=df['text'][:data_len].tolist(),
    ## List of dict of the article properties
    properties=properties[:data_len]
)

In [39]:
from qwak.vector_store import VectorStoreClient

## Search vector store using vector provided by model
search_results = collection.search(
    natural_input="Ducks", 
    top_results=3, 
    output_properties=["title", "title_len", "url"], 
    include_distance=True, 
    include_vector=False
)

[SearchResult(properties={'title': 'Bifröst', 'title_len': 7.0, 'url': 'https://simple.wikipedia.org/wiki/Bifr%C3%B6st'}, id='090ac4be-654b-46e1-964e-e4d9e28f3244', vector=None, distance=0.7651671), SearchResult(properties={'title': 'Commonwealth Games', 'title_len': 18.0, 'url': 'https://simple.wikipedia.org/wiki/Commonwealth%20Games'}, id='4d6d2830-98a0-430b-8863-2edbe4738637', vector=None, distance=0.8012285), SearchResult(properties={'title': 'Coelurosauria', 'title_len': 13.0, 'url': 'https://simple.wikipedia.org/wiki/Coelurosauria'}, id='d59e9fc4-48bc-4d55-83c1-3b9d9ffe1b2e', vector=None, distance=0.82558024)]


In [40]:
[print(x.properties, x.distance) for x in search_results]

{'title': 'Bifröst', 'title_len': 7.0, 'url': 'https://simple.wikipedia.org/wiki/Bifr%C3%B6st'} 0.7651671
{'title': 'Commonwealth Games', 'title_len': 18.0, 'url': 'https://simple.wikipedia.org/wiki/Commonwealth%20Games'} 0.8012285
{'title': 'Coelurosauria', 'title_len': 13.0, 'url': 'https://simple.wikipedia.org/wiki/Coelurosauria'} 0.82558024


[None, None, None]