In [1]:
from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer
from pymilvus.model.sparse import BM25EmbeddingFunction

# Build the built-in analyzer for English ('en'); built-in analyzers exist for
# several languages.
analyzer = build_default_analyzer(language="en")

# Tiny sample corpus used only to demonstrate the analyzer.
corpus = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

# Calling the analyzer on a string returns that string's tokens.
tokens = analyzer(corpus[0])
print("tokens:", tokens)

tokens: ['artifici', 'intellig', 'found', 'academ', 'disciplin', '1956']


In [5]:
from docuverse.engines.search_engine import SearchEngine
# Path to the YAML setup for the first search engine. `get_tokens` later reaches
# `el_server.retriever.client.indices.analyze`, so this engine's retriever is
# expected to expose an Elasticsearch-style client (the "es" in the file name
# suggests Elasticsearch -- TODO confirm against the YAML).
config = "../experiments/milvus_bm25/fiqa_es_setup.yaml"
el_server = SearchEngine(config)

In [6]:
# NOTE: despite its name, `milvus_server` is the *path* of a YAML setup file;
# the engine object itself is `ms_server`. Later cells read the fitted BM25
# embedding function from `ms_server.retriever.bm25_ef`.
milvus_server = "../experiments/milvus_bm25/fiqa_setup.yaml"
ms_server = SearchEngine(milvus_server)

In [7]:
def get_tokens(text, is_question=False, index="beir-fiqa-es-bm25-512-100-08292024"):
    """Return the tokens the `el_server` index analyzer produces for `text`.

    Args:
        text: The string to analyze.
        is_question: When true the index's "text_stop" analyzer is used,
            otherwise "text_no_stop". (Judging by the analyzer names and the
            notebook outputs, "text_stop" drops stop words -- confirm against
            the index settings.)
        index: Name of the index whose analyzers are applied. Defaults to the
            FiQA index this notebook was written against.

    Returns:
        A list with the "token" field of every entry in the response's
        "tokens" list, in response order.
    """
    analyzer_name = "text_stop" if is_question else "text_no_stop"
    response = el_server.retriever.client.indices.analyze(
        index=index,
        analyzer=analyzer_name,
        text=text,
    )
    return [entry["token"] for entry in response["tokens"]]

In [9]:
# Elasticsearch document-mode tokens for a sample passage.
# The text is written as adjacent string literals inside the call parentheses
# so it may span several source lines; a plain "..." literal cannot contain a
# raw line break.
get_tokens(
    "Full disclosure: I\u2019m an intern for EquityZen, so I\u2019m familiar with this space but can speak with the most accuracy about EquityZen. Observations about other players in the space are my own. The employee liquidity landscape is evolving. EquityZen and Equidate help shareholders (employees, ex-employees, etc.) in private companies get liquidity for shares they already own. ESOFund and 137 Ventures help with option financing, and provide loans (and exotic structures on loans) to cover costs of exercising options and any associated tax hit. EquityZen is a private company marketplace that led the second wave of VC-backed secondary markets starting early 2013. The mission is to help achieve liquidity for employees and other private company shareholder, but in a company-approved way. EquityZen transacts with share transfers and also a proprietary derivative structure which transfers economics of a company's shares without changing voting and information rights. This structure typically makes the transfer process cheaper and faster as less paperwork is involved. Accredited investors find the process appealing because they get access to companies they usually cannot with small check sizes. To address the questions in Dzt's post: 1). EquityZen doesn't take a 'loan shark' approach meaning they don't front shareholders money so that they can purchase their stock. With EquityZen, you\u2019re either selling your shares or selling all the economic risk\u2014upside and downside\u2014in exchange for today\u2019s value. 2). EquityZen only allows company approved deals on the platform. As a result, companies are more friendly towards the process and they tend to allow these deals to take place. Non-company approved deals pose risks for buyers and sellers and are ultimately unsustainable. As a buyer, without company blessing, you\u2019re taking on significant counterparty risk from the seller (will they make good on their promise to deliver shares in the future?) "
    "or the risk that the transfer is impermissible under relevant restrictions and your purchase is invalid. As a seller, you\u2019re running the risk of violating your equity agreements, which can have severe penalties, like forfeiture of your stock."
)

['full',
 'disclosure',
 'i’m',
 'an',
 'intern',
 'for',
 'equityzen',
 'so',
 'i’m',
 'familiar',
 'with',
 'this',
 'space',
 'but',
 'can',
 'speak',
 'with',
 'the',
 'most',
 'accuracy',
 'about',
 'equityzen',
 'observations',
 'about',
 'other',
 'player',
 'in',
 'the',
 'space',
 'are',
 'my',
 'own',
 'the',
 'employee',
 'liquidity',
 'landscape',
 'is',
 'evolve',
 'equityzen',
 'and',
 'equidate',
 'help',
 'shareholder',
 'employee',
 'ex',
 'employee',
 'etc',
 'in',
 'private',
 'company',
 'get',
 'liquidity',
 'for',
 'shares',
 'they',
 'already',
 'own',
 'esofund',
 'and',
 '137',
 'venture',
 'help',
 'with',
 'option',
 'finance',
 'and',
 'provide',
 'loan',
 'and',
 'exotic',
 'structure',
 'on',
 'loan',
 'to',
 'cover',
 'costs',
 'of',
 'exercise',
 'option',
 'and',
 'any',
 'associate',
 'tax',
 'hit',
 'equityzen',
 'is',
 'a',
 'private',
 'company',
 'marketplace',
 'that',
 'led',
 'the',
 'second',
 'wave',
 'of',
 'vc',
 'back',
 'secondary',
 'mark

In [10]:
# Inspect the fitted BM25 embedding function's attributes. In the output below,
# `idf` maps each token to a two-element list whose second element is the
# token's integer index (the first is presumably its IDF weight -- confirm).
# Caution: `idf` has one entry per vocabulary token, so this output is very long.
ms_server.retriever.bm25_ef.__dict__

{'analyzer': <pymilvus.model.sparse.bm25.tokenizers.Analyzer at 0x7f9868c2a7d0>,
 'corpus_size': 60811,
 'avgdl': 72.23837792504645,
 'idf': {"'m": [2.0128220287367498, 0],
  'say': [1.797401572056108, 1],
  "n't": [0.1865219196832566, 2],
  'like': [0.9311121477168811, 3],
  'idea': [2.848023106830336, 4],
  'on-the-job': [10.099210573874345, 5],
  'train': [4.522395491737962, 6],
  'ca': [2.4700275698441416, 7],
  'expect': [2.8805644115406857, 8],
  'compani': [1.5510075023376722, 9],
  'worker': [4.012435622472623, 10],
  'job': [2.4484981715298435, 11],
  're': [1.691193431156039, 12],
  'build': [3.340367960376983, 13],
  'softwar': [4.24732004115064, 14],
  'perhap': [3.836864468139309, 15],
  'educ': [3.8093723360753735, 16],
  'system': [2.9835833336405093, 17],
  'u.s.': [4.221620643739114, 18],
  'student': [3.8496774756908385, 19],
  'worri': [3.9274420457253942, 20],
  'littl': [2.935508578203997, 21],
  'get': [0.83418468523743, 22],
  'market': [1.7246147564601966, 23],


In [11]:
# Check the container type of the vocabulary table (exploration probe).
type(ms_server.retriever.bm25_ef.idf)

dict

In [12]:
# Module-level cache for the index -> token lookup table; it starts empty and
# is filled by `get_milvus_tokens` from `ms_server.retriever.bm25_ef.idf`.
rev_ff = []

In [13]:
def get_milvus_tokens(text, is_question=False, with_scores=False):
    """Return the tokens the Milvus BM25 embedding function keeps for `text`.

    The embedding function (`ms_server.retriever.bm25_ef`) encodes text into a
    sparse result exposing `.indices` (token ids) and `.data` (the matching
    values). The ids are mapped back to token strings through a reverse lookup
    table built from `bm25_ef.idf`, whose values are lists holding the token's
    id at position 1.

    Args:
        text: The string to encode.
        is_question: Use `encode_queries` when true, `encode_documents` otherwise.
        with_scores: When true, return `[token, value]` pairs instead of bare
            tokens.

    Returns:
        A list of tokens, or of `[token, value]` lists, in the order of the
        encoder's `.indices`.
    """
    global rev_ff
    bm25 = ms_server.retriever.bm25_ef
    vocab = bm25.idf

    # `rev_ff` is the module-level index -> token table. Rebuild it whenever it
    # is missing, has the wrong size, or was built from a different `idf` dict
    # (e.g. `ms_server` was re-created); otherwise ids would be silently mapped
    # through an outdated vocabulary.
    cached = globals().get("rev_ff")
    is_stale = (
        not isinstance(cached, list)
        or len(cached) != len(vocab)
        or getattr(get_milvus_tokens, "_vocab_ref", None) is not vocab
    )
    if is_stale:
        table = [0] * len(vocab)
        for token, entry in vocab.items():
            table[entry[1]] = token
        rev_ff = table
        # Keep a reference (not an id()) so a replaced dict is always detected.
        get_milvus_tokens._vocab_ref = vocab

    encode = bm25.encode_queries if is_question else bm25.encode_documents
    encoded = encode([text])
    pairs = zip(encoded.indices, encoded.data)
    if with_scores:
        return [[rev_ff[token_id], value] for token_id, value in pairs]
    return [rev_ff[token_id] for token_id, _ in pairs]
# Smoke test: document-mode tokens for a short phrase.
get_milvus_tokens("Quick Brown Foxes!")

['quick', 'brown', 'fox']

In [14]:
# Milvus BM25 document-mode tokens for the same sample passage.
# The text is written as adjacent string literals inside the call parentheses
# so it may span several source lines; a plain "..." literal cannot contain a
# raw line break.
get_milvus_tokens(
    "Full disclosure: I\u2019m an intern for EquityZen, so I\u2019m familiar with this space but can speak with the most accuracy about EquityZen. Observations about other players in the space are my own. The employee liquidity landscape is evolving. EquityZen and Equidate help shareholders (employees, ex-employees, etc.) in private companies get liquidity for shares they already own. ESOFund and 137 Ventures help with option financing, and provide loans (and exotic structures on loans) to cover costs of exercising options and any associated tax hit. EquityZen is a private company marketplace that led the second wave of VC-backed secondary markets starting early 2013. The mission is to help achieve liquidity for employees and other private company shareholder, but in a company-approved way. EquityZen transacts with share transfers and also a proprietary derivative structure which transfers economics of a company's shares without changing voting and information rights. This structure typically makes the transfer process cheaper and faster as less paperwork is involved. Accredited investors find the process appealing because they get access to companies they usually cannot with small check sizes. To address the questions in Dzt's post: 1). EquityZen doesn't take a 'loan shark' approach meaning they don't front shareholders money so that they can purchase their stock. With EquityZen, you\u2019re either selling your shares or selling all the economic risk\u2014upside and downside\u2014in exchange for today\u2019s value. 2). EquityZen only allows company approved deals on the platform. As a result, companies are more friendly towards the process and they tend to allow these deals to take place. Non-company approved deals pose risks for buyers and sellers and are ultimately unsustainable. "
    "As a buyer, without company blessing, you\u2019re taking on significant counterparty risk from the seller (will they make good on their promise to deliver shares in the future?) or the risk that the transfer is impermissible under relevant restrictions and your purchase is invalid. As a seller, you\u2019re running the risk of violating your equity agreements, which can have severe penalties, like forfeiture of your stock."
)


["n't",
 'like',
 'compani',
 'get',
 'market',
 'exchang',
 'place',
 "'s",
 'good',
 'accredit',
 'investor',
 'small',
 'employe',
 'mean',
 'make',
 'sell',
 'equiti',
 '’',
 '1',
 'purchas',
 'valu',
 'relev',
 'less',
 'without',
 'futur',
 'deriv',
 'option',
 'etc',
 'involv',
 'deliv',
 'question',
 'also',
 'today',
 'take',
 'help',
 'risk',
 'cover',
 'result',
 'share',
 'stock',
 'seller',
 'buyer',
 'way',
 'typic',
 'cost',
 'transact',
 'associ',
 'size',
 'intern',
 'money',
 'post',
 'familiar',
 'inform',
 'provid',
 'chang',
 'approach',
 '2',
 'allow',
 'start',
 'hit',
 'second',
 'right',
 'usual',
 'liquid',
 'either',
 'financ',
 'alreadi',
 'access',
 'restrict',
 'deal',
 'tax',
 'sever',
 'penalti',
 'signific',
 'earli',
 'achiev',
 'address',
 'platform',
 'find',
 'run',
 'privat',
 'ventur',
 'check',
 'loan',
 'friend',
 'tend',
 'econom',
 'speak',
 'toward',
 'full',
 'transfer',
 'sharehold',
 'structur',
 'process',
 'disclosur',
 'exercis',
 'ulti

In [15]:
# Query-mode (is_question=True) Milvus BM25 tokens for a short question.
get_milvus_tokens("30% share in business", True)

['busi', 'share', '30']

In [16]:
# Same question through the Elasticsearch question analyzer ("text_stop").
get_tokens("30% share in business", True)

['30', 'share', 'business']

In [17]:
def compare_systems(text, is_question=False):
    """Print the sorted tokens each system produces for `text`, one line per system.

    The Elasticsearch tokens (`get_tokens`) are printed first, then the Milvus
    BM25 tokens (`get_milvus_tokens`); `is_question` is forwarded to both.
    Nothing is returned.
    """
    elastic_tokens = sorted(get_tokens(text, is_question=is_question))
    print(f"Elastic: {elastic_tokens}")

    milvus_tokens = sorted(get_milvus_tokens(text, is_question=is_question))
    print(f"Milvus:  {milvus_tokens}")

In [18]:
# Side-by-side tokens for a question, using each system's question/query mode.
compare_systems("Could an ex-employee of a company find themself stranded with shares they cannot sell (and a tax bill)?", is_question=True)

Elastic: ['bill', 'cannot', 'company', 'could', 'employee', 'ex', 'find', 'sell', 'shares', 'stranded', 'tax', 'themself']
Milvus:  ['bill', 'compani', 'could', 'ex-employe', 'find', 'sell', 'share', 'strand', 'tax', 'themself']


In [19]:
# Sample question text reused by the comparison cell that follows.
question="Better ways to invest money held by my small, privately-held Canadian corporation?"

In [20]:
# NOTE(review): `question` is analyzed here in document mode (is_question=False),
# so Elasticsearch uses "text_no_stop" and Milvus uses encode_documents. Confirm
# this is intentional; pass is_question=True to compare the query-side analysis.
compare_systems(question, is_question=False)

Elastic: ['better', 'by', 'canada', 'corporation', 'held', 'held', 'invest', 'money', 'my', 'private', 'small', 'to', 'ways']
Milvus:  ['better', 'canadian', 'corpor', 'held', 'invest', 'money', 'privately-held', 'small', 'way']
