In [1]:
#| default_exp 25_msmarco-substring

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
#| export
import pandas as pd, ast, numpy as np, os, json, argparse
from typing import Optional, List, Dict
from tqdm.auto import tqdm
import scipy.sparse as sp

from sugar.core import *

## Helper functions

In [4]:
#| export
def save_raw(fname:str, ids:List, txt:List):
    df = pd.DataFrame({"identifier": ids, "text": txt})
    df.to_csv(fname, index=False)

def load_raw(fname:str):
    df = pd.read_csv(fname, keep_default_na=False, na_filter=False)
    return df["identifier"].tolist(), df["text"].tolist()
    

In [32]:
#| export
def convert_df_to_outputs(df):
    qry_outputs, lbl_outputs = dict(), dict()

    for i in tqdm(range(df.shape[0])):
        row = df.iloc[i]

        qry_id, lbl_id = row["id"], row["document_id"]
        query, label, substring = row["query"], row["document"], row["raw_model_response"]

        try:
            substring = ast.literal_eval(substring)
        except:
            raise ValueError(f"Error occurred while processing line {i}.")

        qry_outputs.setdefault(str(qry_id), {"id": qry_id, "query": query, "substring":[]})["substring"].extend(substring)
        lbl_outputs.setdefault(str(lbl_id), {"id": lbl_id, "document": label, "substring":[]})["substring"].extend(substring)

    return qry_outputs, lbl_outputs
    

In [33]:
#| export
def get_substring_matrix_from_outputs(outputs:Dict, ids:List, vocab:Optional[Dict]=None):
    if vocab is None: vocab = dict()

    data, indices, indptr = [], [], [0]
    for i in tqdm(ids, total=len(ids)):
        for c in outputs.get(i, {}).get("substring", []):
            idx = vocab.setdefault(c["original_substring"], len(vocab))
            indices.append(idx)
            data.append(1.0)
        indptr.append(len(indices))

    return sp.csr_matrix((data, indices, indptr), dtype=np.float32)
    

In [7]:
#| export
def get_substring_info(outputs:Dict, meta_name:str):
    all_meta, substr2meta = dict(), dict()
    for key, value in tqdm(outputs.items(), total=len(outputs)):
        for o in value["substring"]:
            substr, metas = o["original_substring"], o[meta_name]
            meta_idx = [all_meta.setdefault(p, len(all_meta)) for p in metas]
            substr2meta.setdefault(substr, []).extend(meta_idx)
    return all_meta, substr2meta
    

In [8]:
#| export
def get_matrix_from_mapping(mapping:Dict, ids:List):
    data, indices, indptr = [], [], [0]
    for i in tqdm(ids, total=len(ids)):
        idx = mapping[i]
        indices.extend(idx)
        data.extend([1] * len(idx))
        indptr.append(len(data))
    return sp.csr_matrix((data, indices, indptr), dtype=np.float32)
    

In [112]:
#| export
def get_intent_matrix_from_outputs(outputs:Dict, ids:List, vocab:Optional[Dict]=None, 
                                   for_labels:Optional[bool]=False):
    np.random.seed(1000)
    if vocab is None: vocab = dict()
        
    data, indices, indptr = [], [], [0]
    for i in tqdm(ids, total=len(ids)):
        for c in outputs.get(i, {}).get("substring", []):

            if for_labels:
                for o in c["intent_phrases"]:
                    if o in vocab:
                        intent = o
                        break
                else:
                    intent = str(np.random.choice(c["intent_phrases"]))
            else:
                intent = str(np.random.choice(c["intent_phrases"]))
            
            idx = vocab.setdefault(intent, len(vocab))
            indices.append(idx)
            data.append(1.0)
        indptr.append(len(indices))

    return sp.csr_matrix((data, indices, indptr), dtype=np.float32)
    

In [67]:
#| export
def save_substring(trn_substr:sp.csr_matrix, tst_substr:sp.csr_matrix, lbl_substr:sp.csr_matrix,
                   all_substr:Dict, save_dir:str, substr_name:str):
    n_trn_substr = trn_substr.shape[1]
    nnz = tst_substr[:, :n_trn_substr].getnnz(axis=1)
    print(f"% of training {substr_name} in test: {np.where(nnz != 0)[0].shape[0]/tst_substr.shape[0]:.4f}")

    nnz = sum(lbl_substr.getnnz(axis=1) > 0)
    print(f"# of document with substrings metadata: {nnz}/{lbl_substr.shape[0]}")
    
    trn_substr.sum_duplicates()
    trn_substr.sort_indices()
    
    tst_substr.sum_duplicates()
    tst_substr.sort_indices()

    lbl_substr.sum_duplicates()
    lbl_substr.sort_indices()

    # save substrings
    all_substr_txt = sorted(all_substr, key=lambda x: all_substr[x])
    all_substr_ids = list(range(len(all_substr)))
    
    os.makedirs(save_dir, exist_ok=True)
    n_trn_substr = trn_substr.shape[1]
    
    sp.save_npz(f"{save_dir}/{substr_name}_trn_X_Y.npz", trn_substr)
    
    sp.save_npz(f"{save_dir}/{substr_name}_tst_X_Y.npz", tst_substr[:, :n_trn_substr])
    sp.save_npz(f"{save_dir}/all-{substr_name}_tst_X_Y.npz", tst_substr)

    sp.save_npz(f"{save_dir}/{substr_name}_lbl_X_Y.npz", lbl_substr[:, :n_trn_substr])
    sp.save_npz(f"{save_dir}/all-{substr_name}_lbl_X_Y.npz", lbl_substr)
    
    os.makedirs(f"{save_dir}/raw_data/", exist_ok=True)
    save_raw(f"{save_dir}/raw_data/{substr_name}.raw.csv", all_substr_ids[:n_trn_substr], all_substr_txt[:n_trn_substr])
    save_raw(f"{save_dir}/raw_data/all-{substr_name}.raw.csv", all_substr_ids, all_substr_txt)
    

In [68]:
#| export
def extract_and_save_substring_metadata(outputs:Dict, save_dir:str, meta_name:str, replace_substr_with_metadata:Optional[bool]=False):
    # extract metadata
    all_metas, substr2meta = get_substring_info(outputs, meta_name)
    print(f"Total metadata: {len(all_metas)}")
    
    substr_ids, substr_txt = load_raw(f"{save_dir}/raw_data/substring.raw.csv")
    all_substr_ids, all_substr_txt = load_raw(f"{save_dir}/raw_data/all-substring.raw.csv")
    
    substr_meta = get_matrix_from_mapping(substr2meta, all_substr_txt)

    # save metadata
    meta_txt = sorted(all_metas, key=lambda x: all_metas[x])
    meta_ids = list(range(len(meta_txt)))
    
    n_trn_substr = len(substr_txt)
    
    substr_meta.sum_duplicates()
    substr_meta.sort_indices()

    meta_name = meta_name.replace("_", "-")
    sp.save_npz(f"{save_dir}/{meta_name}_substring_X_Y.npz", substr_meta[:n_trn_substr])
    sp.save_npz(f"{save_dir}/{meta_name}_all-substring_X_Y.npz", substr_meta)
    
    save_raw(f"{save_dir}/raw_data/substring_{meta_name}.raw.csv", meta_ids, meta_txt)

    if replace_substr_with_metadata:
        idx = np.array([np.random.choice(substr_meta.indices[i:j]) for i,j in zip(substr_meta.indptr, substr_meta.indptr[1:])])
        substr_meta_ids, substr_meta_txt = [meta_ids[i] for i in idx], [meta_txt[i] for i in idx]

        save_raw(f"{save_dir}/raw_data/substring-{meta_name}.raw.csv", substr_meta_ids[:n_trn_substr], substr_meta_txt[:n_trn_substr])
        save_raw(f"{save_dir}/raw_data/all-substring-{meta_name}.raw.csv", substr_meta_ids, substr_meta_txt)
    

## Load data

In [7]:
meta_name = "intent_substring"
raw_dir = "/Users/suchith720/Desktop/Projects/mogicX/"

out_file = f"{raw_dir}/data/gpt_substring/{meta_name}/msmarco_{meta_name}.csv"

In [8]:
data_dir = "/Users/suchith720/Projects/"
trn_ids, trn_txt = load_raw_file(f"{data_dir}/data/beir/msmarco/XC/raw_data/train.raw.txt")
tst_ids, tst_txt = load_raw_file(f"{data_dir}/data/beir/msmarco/XC/raw_data/test.raw.txt")
lbl_ids, lbl_txt = load_raw_file(f"{data_dir}/data/beir/msmarco/XC/raw_data/label.raw.txt")

In [9]:
df = pd.read_csv(out_file)

In [10]:
lbl_txt2ids = {v:k for k,v in zip(lbl_ids, lbl_txt)}

In [11]:
df["document_id"] = [lbl_txt2ids[o] for o in df["document"]]

In [12]:
df.head()

Unnamed: 0,ModelLabelingFlowVersionID,HitUid,JudgmentUid,HitState,JudgmentSubmitTime,Debug,HitID,id,query,document,JudgmentID,raw_model_response,document_id
0,32997,BNBj5BJfVdggebIttGYdhQ==,fdc6cb54-7bbf-455b-9241-acf979b1baad,0,12/26/2025 12:00:07 AM,"{""Steps"":[{""i"":0,""Model"":""gpt-5-chat-2025-08-0...",1804029040,1185869,)what was the immediate impact of the success ...,The presence of communication amid scientific ...,1804029040,"[ {""original_substring"":""hundreds of thousand...",0
1,32997,ppBthCWnQ0+mtzA57WZehg==,9ac2acaa-6113-4bc1-8ee7-c2c932c53c0e,0,12/26/2025 12:00:07 AM,"{""Steps"":[{""i"":0,""Model"":""gpt-5-chat-2025-08-0...",1804029041,1185868,_________ justice is designed to repair the ha...,The approach is based on a theory of justice t...,1804029041,"[ {""original_substring"":""Restorative justice""...",16
2,32997,1OD7ZOmFoc9sz6VpYKlGCA==,6bdf6c6c-dfec-4b6d-af09-e6d9f4263bfe,0,12/26/2025 12:00:07 AM,"{""Steps"":[{""i"":0,""Model"":""gpt-5-chat-2025-08-0...",1804029042,597651,what color is amber urine,"Colorâurine can be a variety of colors, most...",1804029042,"[ {""original_substring"":""shades of yellow"",""d...",49
3,32997,O7wKe6vk48gsfQBU9oNa9A==,a73332aa-82e0-461c-93a3-44706d0420bb,0,12/26/2025 12:00:07 AM,"{""Steps"":[{""i"":0,""Model"":""gpt-5-chat-2025-08-0...",1804029043,403613,is autoimmune hepatitis a bile acid synthesis ...,Inborn errors of bile acid synthesis can produ...,1804029043,"[ {""original_substring"":""inborn errors of bil...",60
4,32997,wby5jG2JHgVb3q7E94U4pg==,9790f4a6-a3d4-4f86-89b1-011cd601312b,0,12/26/2025 12:00:07 AM,"{""Steps"":[{""i"":0,""Model"":""gpt-5-chat-2025-08-0...",1804029044,1183785,elegxo meaning,The word convict here (elegcw /elegxo) means t...,1804029044,"[ {""original_substring"":""bring to light"",""der...",389


In [35]:
qry_outputs, lbl_outputs = convert_df_to_outputs(df)

  0%|          | 0/540188 [00:00<?, ?it/s]



In [36]:
len(qry_outputs), len(lbl_outputs)

(509919, 523596)

## `Substrings`

In [27]:
all_substr = dict()

In [24]:
trn_substr = get_substring_matrix_from_outputs(outputs, trn_ids, all_substr)

  0%|          | 0/502939 [00:00<?, ?it/s]

In [25]:
print(f"Total concepts: {len(all_substr)}")

Total concepts: 905273


In [26]:
tst_substr = get_substring_matrix_from_outputs(qry_outputs, tst_ids, all_substr)

  0%|          | 0/6980 [00:00<?, ?it/s]

In [27]:
print(f"Total concepts: {len(all_substr)}")

Total concepts: 917124


In [28]:
n_qry_substr = len(all_substr)

In [29]:
lbl_substr = get_substring_matrix_from_outputs(lbl_outputs, lbl_ids, all_substr)

  0%|          | 0/8841823 [00:00<?, ?it/s]

In [53]:
assert n_qry_substr == len(all_substr), "Substring count mismatch: Extra substrings were added from labels, which is not allowed."

In [206]:
n_trn_substr = trn_substr.shape[1]
nnz = tst_substr[:, :n_trn_substr].getnnz(axis=1)
print(f"% of training substrings in test: {np.where(nnz != 0)[0].shape[0]/tst_substr.shape[0]:.4f}")

% of training substrings in test: 0.3211


In [36]:
nnz = sum(lbl_substr.getnnz(axis=1) > 0)
print(f"# of document with substrings metadata: {nnz}/{lbl_substr.shape[0]}")

# of document with substrings metadata: 513006/8841823


In [37]:
trn_substr.sum_duplicates()
tst_substr.sum_duplicates()
lbl_substr.sum_duplicates()

trn_substr.sort_indices()
tst_substr.sort_indices()
lbl_substr.sort_indices()

#### Save concepts

__Naming convention__

1. `concept-substring_trn_X_Y.npz`
2. `concept-substring_tst_X_Y.npz`
3. `all-concept-substring_tst_X_Y.npz`
4. `concept-substring.raw.csv`
5. `all-concept-substring.raw.csv`

In [208]:
all_substr_txt = sorted(all_substr, key=lambda x: all_substr[x])
all_substr_ids = list(range(len(all_substr)))

In [51]:
save_dir = f"/Users/suchith720/Projects/data/beir/msmarco/XC/{meta_name}/"
os.makedirs(save_dir, exist_ok=True)

In [52]:
n_trn_substr = trn_substr.shape[1]

sp.save_npz(f"{save_dir}/substring_trn_X_Y.npz", trn_substr)
sp.save_npz(f"{save_dir}/substring_tst_X_Y.npz", tst_substr[:, :n_trn_substr])
sp.save_npz(f"{save_dir}/substring_lbl_X_Y.npz", lbl_substr[:, :n_trn_substr])

sp.save_npz(f"{save_dir}/all-substring_tst_X_Y.npz", tst_substr)
sp.save_npz(f"{save_dir}/all-substring_lbl_X_Y.npz", lbl_substr)

In [139]:
os.makedirs(f"{save_dir}/raw_data/", exist_ok=True)

In [212]:
save_raw(f"{save_dir}/raw_data/substring.raw.csv", all_substr_ids[:n_trn_substr], all_substr_txt[:n_trn_substr])
save_raw(f"{save_dir}/raw_data/all-substring.raw.csv", all_substr_ids, all_substr_txt)

## `Derived phrases`

In [54]:
all_phrases, substr2phrases = get_substring_info(qry_outputs, "derived_phrases")

  0%|          | 0/509919 [00:00<?, ?it/s]

In [55]:
print(f"Total phrases: {len(all_phrases)}")

Total phrases: 8926790


In [56]:
data_dir = f"/Users/suchith720/Projects/data/beir/msmarco/XC/{meta_name}/"
substr_ids, substr_txt = load_raw(f"{data_dir}/raw_data/substring.raw.csv")
all_substr_ids, all_substr_txt = load_raw(f"{data_dir}/raw_data/all-substring.raw.csv")

In [57]:
substr_phrases = get_matrix_from_mapping(substr2phrases, all_substr_txt)

  0%|          | 0/1678116 [00:00<?, ?it/s]

In [227]:
phrase_txt = sorted(all_phrases, key=lambda x: all_phrases[x])
phrase_ids = list(range(len(phrase_txt)))

#### Save phrases

__Naming convention__

1. `derived-phrases_substring_X_Y.npz`
2. `derived-phrases_all-substring_X_Y.npz`
3. `derived-phrases.raw.csv`

In [58]:
save_dir = f"/Users/suchith720/Projects/data/beir/msmarco/XC/{meta_name}/"

In [229]:
n_trn_substr = len(substr_txt)
print(f"Number of train concepts: {n_trn_substr}")

Number of train concepts: 1009097


In [230]:
substr_phrases.sum_duplicates()
substr_phrases.sort_indices()

In [231]:
sp.save_npz(f"{save_dir}/derived-phrases_substring_X_Y.npz", substr_phrases[:n_trn_substr])
sp.save_npz(f"{save_dir}/derived-phrases_all-substring_X_Y.npz", substr_phrases)

In [232]:
save_raw(f"{save_dir}/raw_data/derived-phrases.raw.csv", phrase_ids, phrase_txt)

## `Intent phrases`

In [88]:
all_intents, substr2intents = get_substring_info(qry_outputs, "intent_phrases")

  0%|          | 0/509919 [00:00<?, ?it/s]

In [89]:
print(f"Total intents: {len(all_intents)}")

Total intents: 5757643


In [90]:
data_dir = f"/Users/suchith720/Projects/data/beir/msmarco/XC/{meta_name}/"

substr_ids, substr_txt = load_raw(f"{data_dir}/raw_data/substring.raw.csv")
all_substr_ids, all_substr_txt = load_raw(f"{data_dir}/raw_data/all-substring.raw.csv")

In [91]:
substr_intents = get_matrix_from_mapping(substr2intents, all_substr_txt)

  0%|          | 0/917124 [00:00<?, ?it/s]

  0%|          | 0/917124 [00:00<?, ?it/s]

In [92]:
intent_txt = sorted(all_intents, key=lambda x: all_intents[x])
intent_ids = list(range(len(intent_txt)))

In [103]:
all_intents = dict()

In [104]:
trn_intent = get_intent_matrix_from_outputs(qry_outputs, trn_ids, all_intents)

  0%|          | 0/502939 [00:00<?, ?it/s]

In [105]:
tst_intent = get_intent_matrix_from_outputs(qry_outputs, tst_ids, all_intents)

  0%|          | 0/6980 [00:00<?, ?it/s]

In [106]:
n_qry_intent = len(all_intents)

In [107]:
lbl_intent = get_intent_matrix_from_outputs(lbl_outputs, lbl_ids, all_intents, for_labels=True)

  0%|          | 0/8841823 [00:00<?, ?it/s]

In [108]:
assert n_qry_intent == len(all_intents), "Intent-substring count mismatch: Extra substrings were added from labels, which is not allowed."

In [373]:
n_trn_intent = trn_intent.shape[1]
nnz = tst_intent[:, :n_trn_intent].getnnz(axis=1)
print(f"% of training intents in test: {np.where(nnz != 0)[0].shape[0]/trn_intent.shape[0]:.4f}")

% of training intents in test: 0.0004


In [51]:
nnz = sum(lbl_intent.getnnz(axis=1) > 0)
print(f"# of document with substrings metadata: {nnz}/{lbl_intent.shape[0]}")

# of document with substrings metadata: 513006/8841823


In [55]:
trn_intent.sum_duplicates()
tst_intent.sum_duplicates()
lbl_intent.sum_duplicates()

trn_intent.sort_indices()
tst_intent.sort_indices()
lbl_intent.sort_indices()

In [56]:
all_intent_txt = sorted(all_intents, key=lambda x: all_intents[x])
all_intent_ids = list(range(len(all_intents)))

In [57]:
save_dir = f"/Users/suchith720/Projects/data/beir/msmarco/XC/{meta_name}/"
os.makedirs(save_dir, exist_ok=True)

In [58]:
n_trn_intent = trn_intent.shape[1]

sp.save_npz(f"{save_dir}/intent_trn_X_Y.npz", trn_intent)
sp.save_npz(f"{save_dir}/intent_tst_X_Y.npz", tst_intent[:, :n_trn_intent])
sp.save_npz(f"{save_dir}/intent_lbl_X_Y.npz", lbl_intent[:, :n_trn_intent])

sp.save_npz(f"{save_dir}/all-intent_tst_X_Y.npz", tst_intent)
sp.save_npz(f"{save_dir}/all-intent_lbl_X_Y.npz", lbl_intent)

In [60]:
os.makedirs(f"{save_dir}/raw_data/", exist_ok=True)

In [61]:
save_raw(f"{save_dir}/raw_data/intent.raw.csv", all_intent_ids[:n_trn_intent], all_intent_txt[:n_trn_intent])
save_raw(f"{save_dir}/raw_data/all-intent.raw.csv", all_intent_ids, all_intent_txt)

## `Driver`

In [109]:
#| export
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--meta_name', type=str)
    return parser.parse_known_args()[0]
    

In [64]:
#| export
if __name__ == "__main__":
    input_args = parse_args()
    
    # arguements
    meta_name = input_args.meta_name
    raw_dir = "/Users/suchith720/Desktop/Projects/mogicX/"
    save_dir = f"/Users/suchith720/Projects/data/beir/msmarco/XC/{meta_name}/"
    
    # load raw data
    data_dir = "/Users/suchith720/Projects/data/beir/msmarco/XC"
    trn_ids, trn_txt = load_raw_file(f"{data_dir}/raw_data/train.raw.txt")
    tst_ids, tst_txt = load_raw_file(f"{data_dir}/raw_data/test.raw.txt")
    
    lbl_ids, lbl_txt = load_raw_file(f"{data_dir}/raw_data/label.raw.txt")
    lbl_txt2ids = {v:k for k,v in zip(lbl_ids, lbl_txt)}
    
    df = pd.read_csv(f"{raw_dir}/data/gpt_substring/{meta_name}/msmarco_{meta_name}.csv")
    df["document_id"] = [lbl_txt2ids[o] for o in df["document"]]
    
    qry_outputs, lbl_outputs = convert_df_to_outputs(df)
    
    # extract substrings
    all_substr = dict()
    
    trn_substr = get_substring_matrix_from_outputs(qry_outputs, trn_ids, all_substr)
    print(f"Total train substrings: {len(all_substr)}")
    
    tst_substr = get_substring_matrix_from_outputs(qry_outputs, tst_ids, all_substr)
    print(f"Total test substrings: {len(all_substr)}")

    n_qry_substr = len(all_substr)
    lbl_substr = get_substring_matrix_from_outputs(lbl_outputs, lbl_ids, all_substr)
    assert n_qry_substr == len(all_substr), "Substring count mismatch: Extra substrings were added from labels, which is not allowed."

    save_substring(trn_substr, tst_substr, lbl_substr, all_substr, save_dir, "substring")

    # substring metadata
    extract_and_save_substring_metadata(qry_outputs, save_dir, "derived_phrases")

    output_keys = set(qry_outputs[next(iter(qry_outputs))]["substring"][0].keys())
    if "intent_phrases" in output_keys:
        all_intent = dict()
        
        trn_intent = get_intent_matrix_from_outputs(qry_outputs, trn_ids, all_intent)
        print(f"Total train intents: {len(all_intent)}")
        
        tst_intent = get_intent_matrix_from_outputs(qry_outputs, tst_ids, all_intent)
        print(f"Total test intents: {len(all_intent)}")

        n_qry_intent = len(all_intent)
        lbl_intent = get_intent_matrix_from_outputs(lbl_outputs, lbl_ids, all_intent, for_labels=True)
        assert n_qry_intent == len(all_intent), "Intent-substring count mismatch: Extra substrings were added from labels, which is not allowed."

        save_substring(trn_intent, tst_intent, lbl_intent, all_intent, save_dir, "intent")
        
        extract_and_save_substring_metadata(qry_outputs, save_dir, "intent_phrases", replace_substr_with_metadata=True)


## `Display examples`

In [32]:
data_dir = "/Users/suchith720/Projects/data/beir/msmarco/XC"

trn_ids, trn_txt = load_raw_file(f"{data_dir}/raw_data/train.raw.txt")
tst_ids, tst_txt = load_raw_file(f"{data_dir}/raw_data/test.raw.txt")

In [393]:
np.random.seed(1000)
rnd_idx = [10, 100, 1000, 10_000, 100_000] + np.random.permutation(trn_substr.shape[0])[:10].tolist()

In [396]:
data_dir = "/Users/suchith720/Projects/data/beir/msmarco/XC/narrow_substring/"

trn_substr = sp.load_npz(f"{data_dir}/substring_trn_X_Y.npz")
substr_ids, substr_txt = load_raw(f"{data_dir}/raw_data/substring.raw.csv")

In [401]:
for idx in rnd_idx:
    print(f"Query: {trn_txt[idx]}")
    substr = ", ".join([substr_txt[i] for i in trn_substr[idx].indices])
    print(f"Substring: {substr}", end="\n\n")
    

Query: most dependable affordable cars
Substring: Suzuki SX4, most reliable and affordable all-wheel-drive cars

Query: temperature sorrento italy september
Substring: average daily temperature is 21Â°C

Query: meaning of adherence
Substring: related to the verb adhere, meaning “to stick”, a person who adheres to a plan

Query: how many people has trump fired in his administration?
Substring: fired 3 officials

Query: what is the sales tax for leander, tx
Substring: 8.250%

Query: wadesboro population
Substring: 5,711

Query: how long does it take for sperm to die once it hits air
Substring: up to three hours in the open air

Query: what is the akc
Substring: in the United States, registry of purebred dog pedigrees

Query: what is wrong with your eyes when they are very itchy with contacts and you start seeing blurry
Substring: dry eye syndrome, eyes are dry, eyes are irritated by the contacts

Query: histone meaning
Substring: soluble in water, a simple protein, insoluble in dilute am

In [402]:
data_dir = "/Users/suchith720/Projects/data/beir/msmarco/XC/intent_substring/"

trn_intent = sp.load_npz(f"{data_dir}/intent_trn_X_Y.npz")
intent_ids, intent_txt = load_raw(f"{data_dir}/raw_data/intent.raw.csv")

In [403]:
for idx in rnd_idx:
    print(f"Query: {trn_txt[idx]}")
    intent = ", ".join([intent_txt[i] for i in trn_intent[idx].indices])
    print(f"Intent: {intent}", end="\n\n")
    

Query: most dependable affordable cars
Intent: Suzuki SX4 is a dependable and low-cost vehicle

Query: temperature sorrento italy september
Intent: average September day temperature in Sorrento is 21°C

Query: meaning of adherence
Intent: the meaning of adherence is to stick

Query: how many people has trump fired in his administration?
Intent: Trump has fired three officials connected to his administration

Query: what is the sales tax for leander, tx
Intent: Leander, TX currently charges 8.250% sales tax

Query: wadesboro population
Intent: Wadesboro's population is 5,711 people

Query: how long does it take for sperm to die once it hits air
Intent: in ideal conditions sperm survives up to three hours

Query: what is the akc
Intent: AKC serves as a purebred dog pedigree registry, AKC functions within the United States

Query: what is wrong with your eyes when they are very itchy with contacts and you start seeing blurry
Intent: eye dryness can lead to blurry vision when wearing conta

In [20]:
np.random.seed(1000)
idxs = np.random.choice(list(outputs.keys()), size=10)

In [31]:
for i in idxs:
    o = outputs[i]
    example = {
        "Query": o["query"],
        "Document": o["document"],
        "Intent": [np.random.choice(x["intent_phrases"]) for x in o["substring"]],
    }
    print(json.dumps(example, indent=4))

{
    "Query": "is lamb a baby sheep",
    "Document": "There are no other names for a baby lamb because baby sheep are called lambs. A lamb is a young sheep that is less than 14-months-old, so newborn sheep are called lambs.",
    "Intent": [
        "baby sheep go by the name lamb",
        "lamb describes a young sheep",
        "lambs are sheep less than 14 months in age"
    ]
}
{
    "Query": "what kind of beauty is lord byron defining in she walks in beauty",
    "Document": "She Walks in Beauty Analysis. 1  In the first line of She Walks in Beauty, Lord Byron compares a woman's beauty to the night. He uses a simile to liken her beauty to that of cloudless climes and starry skies, emphasizing the clarity and the brightness of her beauty. She Walks in Beauty is a short poem consisting of three stanzas of six lines each.",
    "Intent": [
        "She Walks in Beauty opens by comparing her beauty to the night",
        "the poem likens her beauty to cloudless climes and starry ski