In [1]:
from vespa.package import ApplicationPackage

app_package = ApplicationPackage(name = "ourData")

In [2]:
emb_dim = 384

In [3]:
from vespa.package import Field
from vespa.package import HNSW


app_package.schema.add_fields(        
    Field(
        name = "id", type = "string", 
        indexing = ["attribute", "summary"]
    ),
    Field(
        name = "content", type = "string", 
        indexing = ["index", "summary"], 
        index = "enable-bm25"
    ),
    Field(
        name = "content_bert", type = "tensor<float>(x[{}])".format(emb_dim), 
        indexing = ["attribute", "index"],
        ann=HNSW(
            distance_metric="angular",
            max_links_per_node=16,
            neighbors_to_explore_at_insert=500,
        )
    )        
)

In [4]:
from vespa.package import QueryTypeField

app_package.query_profile_type.add_fields(        
    QueryTypeField(
        name="ranking.features.query(content_bert)",
        type="tensor<float>(x[{}])".format(emb_dim)
    )
)

In [5]:
from vespa.package import RankProfile


app_package.schema.add_rank_profile(
    RankProfile(
        name = "bert_content", 
        first_phase = "sum(query(content_bert)*attribute(content_bert))"
    )
)

In [6]:
from vespa.deployment import VespaDocker

vespa_docker = VespaDocker(port=8089)

app = vespa_docker.deploy(
    application_package = app_package,
)

Waiting for configuration server, 0/300 seconds...
Waiting for configuration server, 5/300 seconds...
Waiting for configuration server, 10/300 seconds...
Waiting for application status, 0/300 seconds...
Waiting for application status, 5/300 seconds...
Waiting for application status, 10/300 seconds...
Waiting for application status, 15/300 seconds...
Waiting for application status, 20/300 seconds...
Waiting for application status, 25/300 seconds...
Finished deployment.


In [7]:
from sentence_transformers import SentenceTransformer

bert_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

In [8]:
import numpy as np

def normalized_bert_encoder(text):
    vector = bert_model.encode([text])[0].tolist()
    norm = np.linalg.norm(vector)
    if norm > 0.0:
        vector = vector / norm
    return vector.tolist()

In [9]:
from pandas import read_csv

docs = read_csv("test1.csv")
docs = docs[100:500]
docs.shape

(400, 2)

In [10]:
ids = [i for i in range(docs.shape[0])]
docs['id'] = ids
docs.shape

(400, 3)

In [11]:
docs.head(2)

Unnamed: 0.1,Unnamed: 0,content,id
100,100,""" Team of six"" Our lean âTeam of sixâ appr...",0
101,101,""" The Reckoning demonstrates how financial tra...",1


In [12]:
for idx, row in docs.iterrows():
    response = app.feed_data_point(
        schema = "ourData",
        data_id = row["id"],
        fields = {
            "id": row["id"],
            "content": row["content"],
            "content_bert": {"values": normalized_bert_encoder(row["content"])}
        }
    )

In [13]:
from vespa.query import QueryModel, QueryRankingFeature, Union, WeakAnd, ANN, RankProfile

query_model = QueryModel(
    query_properties=[QueryRankingFeature(name="content_bert", mapping=normalized_bert_encoder)],
    match_phase=Union(
        WeakAnd(field="content", hits=10), 
        ANN(
            doc_vector="content_bert", 
            query_vector="content_bert", 
            hits=10, 
            label="ann_content"
        )
    ),
    rank_profile=RankProfile(name="bert_content")
)

In [14]:
query_results = app.query(query="Team of six", query_model=query_model, debug_request=False)

In [15]:
query_results.hits

[{'id': 'id:ourData:ourData::0',
  'relevance': 0.5045033693313599,
  'source': 'ourData_content',
  'fields': {'sddocname': 'ourData',
   'documentid': 'id:ourData:ourData::0',
   'id': '0',
   'content': '" Team of six" Our lean â\x80\x9cTeam of sixâ\x80\x9d approach to conversion projects has also proved its worth. The principle is to put together a team with an expert from each of six domains: project management, core, development, finance/controlling, business partner/logistics and â\x80\x9cmiscellaneousâ\x80\x9d modules. Other experts, e.g. for interfaces, BI, permissions or forms, can be brought in as needed. The small core team is able to work with low resource requirements and efficient pathways. The result is top-quality projects in a short timeframe. Opportunity for further business optimization We also have competence in the fields of Managed Services, Analytics and HR, and can weave these themes into your project in the form of IT architecture, BI strategy, digitalizing HR

In [16]:
query_str = docs[:1].content.to_string()[7:]

In [17]:
type(query_str)

str

In [18]:
query_str

'" Team of six" Our lean â\x80\x9cTeam of sixâ\x80\x9d appr...'

In [19]:
query_res = app.query(query=query_str, query_model=query_model, debug_request=False)

In [20]:
query_res.hits

[]

In [21]:
query_res.json

{'root': {'id': 'toplevel',
  'relevance': 1.0,
  'fields': {'totalCount': 0},
  'errors': [{'code': 4,
    'summary': 'Invalid query parameter',
    'message': 'Could not create query from YQL: query:L1:81 no viable alternative at input \'([{"targetNumHits": 10}]weakAnd(content contains """, content contains "\'',
    'stackTrace': 'com.yahoo.processing.IllegalInputException: com.yahoo.search.yql.ProgramCompileException: query:L1:81 no viable alternative at input \'([{"targetNumHits": 10}]weakAnd(content contains """, content contains "\'\n\tat com.yahoo.search.yql.YqlParser.parseYqlProgram(YqlParser.java:818)\n\tat com.yahoo.search.yql.YqlParser.parse(YqlParser.java:274)\n\tat com.yahoo.search.yql.MinimalQueryInserter.insertQuery(MinimalQueryInserter.java:95)\n\tat com.yahoo.search.yql.MinimalQueryInserter.search(MinimalQueryInserter.java:80)\n\tat com.yahoo.search.Searcher.process(Searcher.java:134)\n\tat com.yahoo.processing.execution.Execution.process(Execution.java:112)\n\tat com

In [22]:
import re

query_str = "".join(x for x in query_str if x.isalpha() or x.isspace())


In [23]:
query_str

' Team of six Our lean âTeam of sixâ appr'

In [24]:
query_res = app.query(query=query_str, query_model=query_model, debug_request=False)

In [25]:
query_res.hits

[]

In [26]:
query_res.json

{'root': {'id': 'toplevel',
  'relevance': 1.0,
  'fields': {'totalCount': 0},
  'errors': [{'code': 4,
    'summary': 'Invalid query parameter',
    'message': 'Could not create query from YQL: The word of a word item can not be empty',
    'stackTrace': 'java.lang.IllegalArgumentException: The word of a word item can not be empty\n\tat com.yahoo.protect.Validator.ensureNonEmpty(Validator.java:20)\n\tat com.yahoo.prelude.query.WordItem.setWord(WordItem.java:75)\n\tat com.yahoo.prelude.query.WordItem.<init>(WordItem.java:62)\n\tat com.yahoo.prelude.query.WordItem.<init>(WordItem.java:45)\n\tat com.yahoo.search.yql.YqlParser.segment(YqlParser.java:1510)\n\tat com.yahoo.search.yql.YqlParser.instantiateWordItem(YqlParser.java:1472)\n\tat com.yahoo.search.yql.YqlParser.instantiateWordItem(YqlParser.java:1416)\n\tat com.yahoo.search.yql.YqlParser.instantiateWordItem(YqlParser.java:1408)\n\tat com.yahoo.search.yql.YqlParser.instantiateLeafItem(YqlParser.java:1276)\n\tat com.yahoo.search.yql.

In [27]:
query_str = docs[1:2].content.to_string()[8:]

In [28]:
query_res = app.query(query=query_str, query_model=query_model, debug_request=False)

In [29]:
query_res.hits

[]

In [30]:
query_res.json

{'root': {'id': 'toplevel',
  'relevance': 1.0,
  'fields': {'totalCount': 0},
  'errors': [{'code': 4,
    'summary': 'Invalid query parameter',
    'message': 'Could not create query from YQL: The word of a word item can not be empty',
    'stackTrace': 'java.lang.IllegalArgumentException: The word of a word item can not be empty\n\tat com.yahoo.protect.Validator.ensureNonEmpty(Validator.java:20)\n\tat com.yahoo.prelude.query.WordItem.setWord(WordItem.java:75)\n\tat com.yahoo.prelude.query.WordItem.<init>(WordItem.java:62)\n\tat com.yahoo.prelude.query.WordItem.<init>(WordItem.java:45)\n\tat com.yahoo.search.yql.YqlParser.segment(YqlParser.java:1510)\n\tat com.yahoo.search.yql.YqlParser.instantiateWordItem(YqlParser.java:1472)\n\tat com.yahoo.search.yql.YqlParser.instantiateWordItem(YqlParser.java:1416)\n\tat com.yahoo.search.yql.YqlParser.instantiateWordItem(YqlParser.java:1408)\n\tat com.yahoo.search.yql.YqlParser.instantiateLeafItem(YqlParser.java:1276)\n\tat com.yahoo.search.yql.