In [1]:
#| default_exp 27_msmarco-intent-substring

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
#| export
import pandas as pd, ast, numpy as np, os, json, argparse
from typing import Optional, List, Dict
from tqdm.auto import tqdm
import scipy.sparse as sp

from sugar.core import *

## Helper functions

In [4]:
#| export
def save_raw(fname:str, ids:List, txt:List):
    """Write the parallel lists `ids` and `txt` to `fname` as a CSV.

    The file has two columns, `identifier` and `text`; the DataFrame index is not written.
    """
    columns = {"identifier": ids, "text": txt}
    pd.DataFrame(columns).to_csv(fname, index=False)

def load_raw(fname:str):
    """Read a CSV with `identifier` and `text` columns and return `(identifiers, texts)` as two lists.

    NA detection is switched off, so fields such as "NA" or an empty string stay strings.
    """
    table = pd.read_csv(fname, keep_default_na=False, na_filter=False)
    identifiers, texts = (table[column].tolist() for column in ("identifier", "text"))
    return identifiers, texts
    

In [5]:
#| export
def convert_df_to_query_dict(df):
    """Group the rows of `df` by query id.

    Reads the columns `id`, `document_id`, `query`, `document` and `raw_model_response`.
    `raw_model_response` is a string holding a Python literal and is parsed with
    `ast.literal_eval`.

    Returns a dict keyed by `str(id)`. Each value holds `query_id` and `query_text`
    (taken from the first row seen for that query) plus three parallel lists,
    `label_id`, `label_text` and `substring`, with one entry per row of that query.

    Raises `ValueError`, chained to the underlying error, naming the positional row
    whose `raw_model_response` could not be parsed.
    """
    outputs = dict()

    # Walk the five columns in lockstep instead of materialising a Series per row
    # with `df.iloc[i]`; iterating a Series yields plain Python scalars.
    rows = zip(df["id"], df["document_id"], df["query"], df["document"], df["raw_model_response"])

    for i, (query_id, label_id, query, label, substring) in enumerate(tqdm(rows, total=df.shape[0])):
        try:
            substring = ast.literal_eval(substring)
        except Exception as e:
            # `Exception`, not a bare `except`, so Ctrl-C / SystemExit still propagate;
            # `from e` keeps the real parse error visible in the traceback.
            raise ValueError(f"Error occurred while processing line {i}.") from e

        key = str(query_id)
        if key not in outputs:
            # only build the per-query record the first time the query is seen
            outputs[key] = {
                "query_id": query_id,
                "query_text": query,
                "label_id": [],
                "label_text": [],
                "substring": [],
            }

        o = outputs[key]
        o["substring"].append(substring)
        o["label_id"].append(label_id)
        o["label_text"].append(label)

    return outputs
    

In [51]:
#| export
def get_substring_matrix_from_query_dict(outputs:Dict, query_ids:List, vocab:Optional[Dict]=None, lbl_intent:Optional[Dict]=None,
                                         phrases:Optional[Dict]=None, intents:Optional[Dict]=None, substrs:Optional[Dict]=None,
                                         seed:Optional[int]=None):
    """Build a query x intent CSR matrix from the per-query records in `outputs`.

    For every id in `query_ids` (ids missing from `outputs` give an empty row) and every
    generation entry `s` attached to it, one phrase is drawn at random from
    `s["intent_phrases"]`, mapped to a column index through `vocab` (new phrases get the
    next free index) and recorded with value 1.0.

    The dicts passed in are updated in place and returned, so a second call can continue
    from the state left by a first one:
      - `vocab`:      intent phrase -> column index
      - `lbl_intent`: label id      -> list of intent indices
      - `phrases`:    intent index  -> all `derived_phrases` seen for it
      - `intents`:    intent index  -> all `intent_phrases` seen for it
      - `substrs`:    intent index  -> all `original_substring` values seen for it

    `seed`, when given, reseeds NumPy's global RNG before any phrase is drawn.

    Returns `(qry_intent, vocab, lbl_intent, phrases, intents, substrs)`; `qry_intent`
    is float32 with one row per entry of `query_ids` and at least `len(vocab)` columns.
    """
    if seed is not None: np.random.seed(seed)

    if vocab is None: vocab = dict()

    if phrases is None: phrases = dict()
    if intents is None: intents = dict()
    if substrs is None: substrs = dict()
    if lbl_intent is None: lbl_intent = dict()

    data, indices, indptr = [], [], [0]
    for q in tqdm(query_ids, total=len(query_ids)):
        o = outputs.get(q, {})

        generations = o.get("substring", [])
        lbl_ids = o.get("label_id", [])

        # one list of generations per label of the query
        assert len(generations) == len(lbl_ids)

        for gens, l in zip(generations, lbl_ids):
            for s in gens:
                intent = np.random.choice(s["intent_phrases"])

                idx = vocab.setdefault(intent, len(vocab))
                indices.append(idx)
                data.append(1.0)

                lbl_intent.setdefault(l, []).append(idx)
                phrases.setdefault(idx, []).extend(s["derived_phrases"])
                intents.setdefault(idx, []).extend(s["intent_phrases"])
                substrs.setdefault(idx, []).append(s["original_substring"])

        indptr.append(len(indices))

    # Pass the shape explicitly. Left to infer it, scipy uses `max(indices) + 1`, which
    # (a) fails with "unable to infer matrix dimensions" when nothing was generated and
    # (b) yields fewer columns than the vocabulary when a pre-filled `vocab` is supplied
    #     and the highest indices do not occur in this batch.
    # The `max(...)` keeps the width the inferred shape would have had in every case
    # where inference used to succeed.
    n_cols = max(len(vocab), max(indices, default=-1) + 1)
    qry_intent = sp.csr_matrix((data, indices, indptr), shape=(len(indptr) - 1, n_cols), dtype=np.float32)

    return qry_intent, vocab, lbl_intent, phrases, intents, substrs
    

In [7]:
#| export
def get_intent_matrix_from_dict(intents:Dict, ids:List, n_cols:Optional[int]=None):
    """Build a float32 CSR matrix with one row per entry of `ids`.

    Row `r` gets a 1.0 at every column index listed in `intents[ids[r]]`; ids that are
    absent from `intents` give an empty row. Repeated indices are kept as separate
    stored entries.

    `n_cols` fixes the number of columns. When omitted, the width is the largest index
    used plus one (0 when no index is used at all).

    Raises `ValueError` if `n_cols` is smaller than the largest index requires.
    """
    indices, indptr = [], [0]
    for i in tqdm(ids):
        indices.extend(intents.get(i, []))
        indptr.append(len(indices))

    needed = max(indices, default=-1) + 1
    if n_cols is None:
        n_cols = needed
    elif n_cols < needed:
        raise ValueError(f"n_cols={n_cols} is too small for column index {needed - 1}.")

    # Explicit shape: scipy cannot infer dimensions when `indices` is empty.
    data = [1.0] * len(indices)
    return sp.csr_matrix((data, indices, indptr), shape=(len(indptr) - 1, n_cols), dtype=np.float32)
    

In [40]:
#| export
def get_metadata_matrix_from_dict(metadata:Dict, ids:List):
    """Build a float32 CSR matrix of `ids` x distinct metadata items, plus the item vocabulary.

    Each item in `metadata[i]` is assigned a column index in order of first appearance
    and recorded with value 1.0 in the row of `i`; repeated items are kept as separate
    stored entries.

    Returns `(matrix, vocab)` where `vocab` maps item -> column index.
    Raises `KeyError` if an id is missing from `metadata`.
    """
    vocab, indices, indptr = dict(), [], [0]
    for i in tqdm(ids):
        items = metadata[i]
        indices.extend([vocab.setdefault(o, len(vocab)) for o in items])
        indptr.append(len(indices))

    # Explicit shape: identical to what scipy would infer whenever any item exists,
    # and it keeps the all-empty case from failing with "unable to infer matrix dimensions".
    data = [1.0] * len(indices)
    matrix = sp.csr_matrix((data, indices, indptr), shape=(len(indptr) - 1, len(vocab)), dtype=np.float32)
    return matrix, vocab
    

In [84]:
#| export
def extract_intents(outputs:Dict, trn_ids:List, tst_ids:List, lbl_ids:List, seed:Optional[int]=100):
    """Build the train/test/label intent matrices and the per-intent metadata matrices.

    The train queries are processed first. The test queries are then processed with the
    vocabulary and metadata dicts returned by the train pass, using the same `seed`.

    Returns a 5-tuple:
      `(trn_intent, tst_intent, lbl_intent)`,
      `(intent_ids, intent_txt)`,
      `(intent_phrase, phrase_vocab)`,
      `(intent_intents, intents_vocab)`,
      `(intent_substr, substr_vocab)`.
    """
    trn_intent, intent_vocab, lbl_map, phrases, intents, substrs = get_substring_matrix_from_query_dict(
        outputs, trn_ids, seed=seed,
    )

    # the second pass continues from (and adds to) everything the first pass produced
    carried = dict(vocab=intent_vocab, lbl_intent=lbl_map, phrases=phrases, intents=intents, substrs=substrs)
    tst_intent, intent_vocab, lbl_map, phrases, intents, substrs = get_substring_matrix_from_query_dict(
        outputs, tst_ids, seed=seed, **carried,
    )

    lbl_intent = get_intent_matrix_from_dict(lbl_map, lbl_ids)

    # intent texts ordered by vocabulary index; the ids are simply 0..n-1
    by_index = sorted(intent_vocab.items(), key=lambda kv: kv[1])
    intent_txt = [str(text) for text, _ in by_index]
    intent_ids = list(range(len(intent_txt)))

    # each call returns a (matrix, vocab) pair
    phrase_meta = get_metadata_matrix_from_dict(phrases, intent_ids)
    intents_meta = get_metadata_matrix_from_dict(intents, intent_ids)
    substr_meta = get_metadata_matrix_from_dict(substrs, intent_ids)

    return (trn_intent, tst_intent, lbl_intent), (intent_ids, intent_txt), phrase_meta, intents_meta, substr_meta
    

In [76]:
#| export
def save_intents(save_dir:str, trn_intent:sp.csr_matrix, tst_intent:sp.csr_matrix, lbl_intent:sp.csr_matrix,
                 intent_ids:List, intent_txt:List):
    """Print coverage statistics and write the intent matrices and intent texts under `save_dir`.

    The number of columns of `trn_intent` is taken as the number of training intents.
    Files carrying the `all-` prefix hold every intent; the others are restricted to the
    first `trn_intent.shape[1]` intents.

    Written files:
      - `intent_trn_X_Y.npz`
      - `intent_tst_X_Y.npz`, `all-intent_tst_X_Y.npz`
      - `intent_lbl_X_Y.npz`, `all-intent_lbl_X_Y.npz`
      - `raw_data/intent.raw.csv`, `raw_data/all-intent.raw.csv`

    Note: the three matrices are canonicalised in place (`sum_duplicates`, `sort_indices`),
    so the caller's objects are modified.
    """
    n_trn_intent = trn_intent.shape[1]

    # share of test rows with at least one stored entry among the training-intent columns
    n_tst = tst_intent.shape[0]
    n_hit = np.count_nonzero(tst_intent[:, :n_trn_intent].getnnz(axis=1))
    pct = 100 * n_hit / n_tst if n_tst else 0.0  # an empty test matrix would otherwise divide by zero
    print(f"% of training intents in test: {pct:.4f}")

    # `np.count_nonzero` counts in C; the builtin `sum` would loop over every row in Python
    nnz = np.count_nonzero(lbl_intent.getnnz(axis=1))
    print(f"# of document with intent metadata: {nnz}/{lbl_intent.shape[0]}")

    for matrix in (trn_intent, tst_intent, lbl_intent):
        matrix.sum_duplicates()
        matrix.sort_indices()

    os.makedirs(save_dir, exist_ok=True)

    sp.save_npz(f"{save_dir}/intent_trn_X_Y.npz", trn_intent)

    sp.save_npz(f"{save_dir}/intent_tst_X_Y.npz", tst_intent[:, :n_trn_intent])
    sp.save_npz(f"{save_dir}/all-intent_tst_X_Y.npz", tst_intent)

    sp.save_npz(f"{save_dir}/intent_lbl_X_Y.npz", lbl_intent[:, :n_trn_intent])
    sp.save_npz(f"{save_dir}/all-intent_lbl_X_Y.npz", lbl_intent)

    os.makedirs(f"{save_dir}/raw_data/", exist_ok=True)
    save_raw(f"{save_dir}/raw_data/intent.raw.csv", intent_ids[:n_trn_intent], intent_txt[:n_trn_intent])
    save_raw(f"{save_dir}/raw_data/all-intent.raw.csv", intent_ids, intent_txt)
    

In [77]:
#| export
def save_intent_metadata(save_dir:str, intent_phrase:sp.csr_matrix, phrase_vocab:Dict, intent_intents:sp.csr_matrix,
                         intents_vocab:Dict, intent_substr:sp.csr_matrix, substr_vocab:Dict):
    """Write the three intent-metadata matrices and their vocabularies under `save_dir`.

    Each matrix is canonicalised in place (`sum_duplicates`, `sort_indices`) and saved as
    `.npz`; each vocabulary is saved as a raw CSV under `raw_data/`, with texts ordered
    by their vocabulary index and ids `0..n-1`.
    """
    def get_raw_info(vocab):
        ordered = sorted(vocab, key=lambda x:vocab[x])
        return list(range(len(vocab))), [str(text) for text in ordered]

    # (matrix, vocabulary, matrix file, vocabulary file), in the order they are written
    targets = [
        (intent_phrase, phrase_vocab,
         f"{save_dir}/derived-substring_all-intent_X_Y.npz",
         f"{save_dir}/raw_data/all-intent_derived-substring.raw.csv"),
        (intent_intents, intents_vocab,
         f"{save_dir}/derived-intent_all-intent_X_Y.npz",
         f"{save_dir}/raw_data/all-intent_derived-intent.raw.csv"),
        (intent_substr, substr_vocab,
         f"{save_dir}/substring_all-intent_X_Y.npz",
         f"{save_dir}/raw_data/all-intent_substring.raw.csv"),
    ]

    for matrix, _, _, _ in targets:
        matrix.sum_duplicates()
        matrix.sort_indices()

    os.makedirs(save_dir, exist_ok=True)
    for matrix, _, npz_path, _ in targets:
        sp.save_npz(npz_path, matrix)

    os.makedirs(f"{save_dir}/raw_data/", exist_ok=True)
    for _, vocab, _, csv_path in targets:
        ids, txt = get_raw_info(vocab)
        save_raw(csv_path, ids, txt)
    

## Load data

In [9]:
meta_name = "intent_substring"
raw_dir = "/Users/suchith720/Desktop/Projects/mogicX/"

out_file = f"{raw_dir}/data/gpt_substring/{meta_name}/msmarco_{meta_name}.csv"

In [10]:
data_dir = "/Users/suchith720/Projects/"
trn_ids, trn_txt = load_raw_file(f"{data_dir}/data/beir/msmarco/XC/raw_data/train.raw.txt")
tst_ids, tst_txt = load_raw_file(f"{data_dir}/data/beir/msmarco/XC/raw_data/test.raw.txt")
lbl_ids, lbl_txt = load_raw_file(f"{data_dir}/data/beir/msmarco/XC/raw_data/label.raw.txt")

In [11]:
df = pd.read_csv(out_file)

In [12]:
# document text -> label id; if the same text occurs more than once, the last id wins
lbl_txt2ids = dict(zip(lbl_txt, lbl_ids))

In [13]:
df["document_id"] = [lbl_txt2ids[o] for o in df["document"]]

In [14]:
df.head()

Unnamed: 0,ModelLabelingFlowVersionID,HitUid,JudgmentUid,HitState,JudgmentSubmitTime,Debug,HitID,id,query,document,JudgmentID,raw_model_response,document_id
0,32997,BNBj5BJfVdggebIttGYdhQ==,fdc6cb54-7bbf-455b-9241-acf979b1baad,0,12/26/2025 12:00:07 AM,"{""Steps"":[{""i"":0,""Model"":""gpt-5-chat-2025-08-0...",1804029040,1185869,)what was the immediate impact of the success ...,The presence of communication amid scientific ...,1804029040,"[ {""original_substring"":""hundreds of thousand...",0
1,32997,ppBthCWnQ0+mtzA57WZehg==,9ac2acaa-6113-4bc1-8ee7-c2c932c53c0e,0,12/26/2025 12:00:07 AM,"{""Steps"":[{""i"":0,""Model"":""gpt-5-chat-2025-08-0...",1804029041,1185868,_________ justice is designed to repair the ha...,The approach is based on a theory of justice t...,1804029041,"[ {""original_substring"":""Restorative justice""...",16
2,32997,1OD7ZOmFoc9sz6VpYKlGCA==,6bdf6c6c-dfec-4b6d-af09-e6d9f4263bfe,0,12/26/2025 12:00:07 AM,"{""Steps"":[{""i"":0,""Model"":""gpt-5-chat-2025-08-0...",1804029042,597651,what color is amber urine,"Colorâurine can be a variety of colors, most...",1804029042,"[ {""original_substring"":""shades of yellow"",""d...",49
3,32997,O7wKe6vk48gsfQBU9oNa9A==,a73332aa-82e0-461c-93a3-44706d0420bb,0,12/26/2025 12:00:07 AM,"{""Steps"":[{""i"":0,""Model"":""gpt-5-chat-2025-08-0...",1804029043,403613,is autoimmune hepatitis a bile acid synthesis ...,Inborn errors of bile acid synthesis can produ...,1804029043,"[ {""original_substring"":""inborn errors of bil...",60
4,32997,wby5jG2JHgVb3q7E94U4pg==,9790f4a6-a3d4-4f86-89b1-011cd601312b,0,12/26/2025 12:00:07 AM,"{""Steps"":[{""i"":0,""Model"":""gpt-5-chat-2025-08-0...",1804029044,1183785,elegxo meaning,The word convict here (elegcw /elegxo) means t...,1804029044,"[ {""original_substring"":""bring to light"",""der...",389


In [15]:
outputs = convert_df_to_query_dict(df)

  0%|          | 0/540188 [00:00<?, ?it/s]



In [17]:
len(outputs)

509919

## `Intents`

In [52]:
o = get_substring_matrix_from_query_dict(outputs, trn_ids, seed=100)
trn_intent, intent_vocab, lbl_intent, phrases, intents, substrs = o

  0%|          | 0/502939 [00:00<?, ?it/s]

In [53]:
print(f"Total concepts: {len(intent_vocab)}")

Total concepts: 1211757


In [54]:
o = get_substring_matrix_from_query_dict(outputs, tst_ids, vocab=intent_vocab, lbl_intent=lbl_intent, 
                                         phrases=phrases, intents=intents, substrs=substrs, seed=100)
tst_intent, intent_vocab, lbl_intent, phrases, intents, substrs = o

  0%|          | 0/6980 [00:00<?, ?it/s]

In [55]:
print(f"Total concepts: {len(intent_vocab)}")

Total concepts: 1228913


In [56]:
lbl_intent = get_intent_matrix_from_dict(lbl_intent, lbl_ids)

  0%|          | 0/8841823 [00:00<?, ?it/s]

In [57]:
intent_txt = list(map(str, sorted(intent_vocab, key=lambda x:intent_vocab[x])))
intent_ids = list(range(len(intent_txt)))

In [58]:
intent_phrase, phrase_vocab = get_metadata_matrix_from_dict(phrases, intent_ids)

  0%|          | 0/1228913 [00:00<?, ?it/s]

In [59]:
intent_intents, intents_vocab = get_metadata_matrix_from_dict(intents, intent_ids)

  0%|          | 0/1228913 [00:00<?, ?it/s]

In [60]:
intent_substr, substr_vocab = get_metadata_matrix_from_dict(substrs, intent_ids)

  0%|          | 0/1228913 [00:00<?, ?it/s]

In [62]:
n_trn_intent = trn_intent.shape[1]
nnz = tst_intent[:, :n_trn_intent].getnnz(axis=1)
print(f"% of training intents in test: {100 * np.where(nnz != 0)[0].shape[0]/tst_intent.shape[0]:.4f}")

% of training intents in test: 2.3782


In [63]:
nnz = sum(lbl_intent.getnnz(axis=1) > 0)
print(f"# of document with substrings metadata: {nnz}/{lbl_intent.shape[0]}")

# of document with substrings metadata: 513006/8841823


In [64]:
# Canonicalise each CSR matrix in place: merge repeated (row, column) entries,
# then sort the column indices within every row.
trn_intent.sum_duplicates()
trn_intent.sort_indices()

tst_intent.sum_duplicates()
tst_intent.sort_indices()

lbl_intent.sum_duplicates()
lbl_intent.sort_indices()

#### Save intents

__Naming convention__

1. `intent_trn_X_Y.npz`
2. `intent_tst_X_Y.npz`
3. `intent_lbl_X_Y.npz`
4. `all-intent_tst_X_Y.npz`
5. `all-intent_lbl_X_Y.npz`
6. `intent.raw.csv`
7. `all-intent.raw.csv`
8. `substring_all-intent_X_Y.npz`
9. `derived-substring_all-intent_X_Y.npz`
10. `derived-intent_all-intent_X_Y.npz`
11. `all-intent_substring.raw.csv`
12. `all-intent_derived-substring.raw.csv`
13. `all-intent_derived-intent.raw.csv`

In [69]:
save_dir = f"/Users/suchith720/Projects/data/beir/msmarco/XC/intent_substring/"

In [78]:
save_intents(save_dir, trn_intent, tst_intent, lbl_intent, intent_ids, intent_txt)

% of training intents in test: 2.3782
# of document with intent metadata: 513006/8841823


In [79]:
save_intent_metadata(save_dir, intent_phrase, phrase_vocab, intent_intents, intents_vocab, 
                     intent_substr, substr_vocab)

## `Driver`

In [64]:
#| export
if __name__ == "__main__":
    # NOTE(review): `parse_args` is not defined in this notebook (presumably it comes from
    # `sugar.core` - confirm), and `input_args` is not used below; the paths are hard-coded.
    input_args = parse_args()

    # files
    data_dir = "/Users/suchith720/Projects/data/beir/msmarco/XC"
    intent_file = "/Users/suchith720/Desktop/Projects/mogicX/data/gpt_substring/intent_substring/msmarco_intent_substring.csv"

    save_dir = "/Users/suchith720/Projects/data/beir/msmarco/XC/intent_substring/"

    # load raw data
    trn_ids, trn_txt = load_raw_file(f"{data_dir}/raw_data/train.raw.txt")
    tst_ids, tst_txt = load_raw_file(f"{data_dir}/raw_data/test.raw.txt")
    lbl_ids, lbl_txt = load_raw_file(f"{data_dir}/raw_data/label.raw.txt")

    df = pd.read_csv(intent_file)
    # document text -> label id, used to attach a `document_id` to every row
    lbl_txt2ids = {v:k for k,v in zip(lbl_ids, lbl_txt)}
    df["document_id"] = [lbl_txt2ids[o] for o in df["document"]]

    outputs = convert_df_to_query_dict(df)

    # extract metadata
    # (the original call read `extract_intents(outputs:Dict, ...)`, which is a SyntaxError:
    #  annotations are not allowed on call arguments)
    o = extract_intents(outputs, trn_ids, tst_ids, lbl_ids, seed=100)
    (trn_intent, tst_intent, lbl_intent), (intent_ids, intent_txt), (intent_phrase, phrase_vocab), (intent_intents, intents_vocab), (intent_substr, substr_vocab) = o

    # save metadata
    save_intents(save_dir, trn_intent, tst_intent, lbl_intent, intent_ids, intent_txt)
    save_intent_metadata(save_dir, intent_phrase, phrase_vocab, intent_intents, intents_vocab,
                         intent_substr, substr_vocab)
    
    

## `Display examples`

In [32]:
data_dir = "/Users/suchith720/Projects/data/beir/msmarco/XC"

trn_ids, trn_txt = load_raw_file(f"{data_dir}/raw_data/train.raw.txt")
tst_ids, tst_txt = load_raw_file(f"{data_dir}/raw_data/test.raw.txt")

In [393]:
np.random.seed(1000)
# `trn_substr` is only loaded in a later cell, so on a fresh kernel this cell raised NameError.
# Size the permutation from `trn_txt`, which is loaded in the cell above.
# NOTE(review): assumes len(trn_txt) == trn_substr.shape[0] (one matrix row per training query) - confirm.
rnd_idx = [10, 100, 1000, 10_000, 100_000] + np.random.permutation(len(trn_txt))[:10].tolist()

In [396]:
data_dir = "/Users/suchith720/Projects/data/beir/msmarco/XC/narrow_substring/"

trn_substr = sp.load_npz(f"{data_dir}/substring_trn_X_Y.npz")
substr_ids, substr_txt = load_raw(f"{data_dir}/raw_data/substring.raw.csv")

In [401]:
for idx in rnd_idx:
    print(f"Query: {trn_txt[idx]}")
    substr = ", ".join([substr_txt[i] for i in trn_substr[idx].indices])
    print(f"Substring: {substr}", end="\n\n")
    

Query: most dependable affordable cars
Substring: Suzuki SX4, most reliable and affordable all-wheel-drive cars

Query: temperature sorrento italy september
Substring: average daily temperature is 21Â°C

Query: meaning of adherence
Substring: related to the verb adhere, meaning “to stick”, a person who adheres to a plan

Query: how many people has trump fired in his administration?
Substring: fired 3 officials

Query: what is the sales tax for leander, tx
Substring: 8.250%

Query: wadesboro population
Substring: 5,711

Query: how long does it take for sperm to die once it hits air
Substring: up to three hours in the open air

Query: what is the akc
Substring: in the United States, registry of purebred dog pedigrees

Query: what is wrong with your eyes when they are very itchy with contacts and you start seeing blurry
Substring: dry eye syndrome, eyes are dry, eyes are irritated by the contacts

Query: histone meaning
Substring: soluble in water, a simple protein, insoluble in dilute am

In [402]:
data_dir = "/Users/suchith720/Projects/data/beir/msmarco/XC/intent_substring/"

trn_intent = sp.load_npz(f"{data_dir}/intent_trn_X_Y.npz")
intent_ids, intent_txt = load_raw(f"{data_dir}/raw_data/intent.raw.csv")

In [403]:
for idx in rnd_idx:
    print(f"Query: {trn_txt[idx]}")
    intent = ", ".join([intent_txt[i] for i in trn_intent[idx].indices])
    print(f"Intent: {intent}", end="\n\n")
    

Query: most dependable affordable cars
Intent: Suzuki SX4 is a dependable and low-cost vehicle

Query: temperature sorrento italy september
Intent: average September day temperature in Sorrento is 21°C

Query: meaning of adherence
Intent: the meaning of adherence is to stick

Query: how many people has trump fired in his administration?
Intent: Trump has fired three officials connected to his administration

Query: what is the sales tax for leander, tx
Intent: Leander, TX currently charges 8.250% sales tax

Query: wadesboro population
Intent: Wadesboro's population is 5,711 people

Query: how long does it take for sperm to die once it hits air
Intent: in ideal conditions sperm survives up to three hours

Query: what is the akc
Intent: AKC serves as a purebred dog pedigree registry, AKC functions within the United States

Query: what is wrong with your eyes when they are very itchy with contacts and you start seeing blurry
Intent: eye dryness can lead to blurry vision when wearing conta

In [20]:
np.random.seed(1000)
idxs = np.random.choice(list(outputs.keys()), size=10)

In [31]:
for i in idxs:
    o = outputs[i]
    # `convert_df_to_query_dict` stores the query under "query_text" and keeps one entry per
    # document in the parallel lists "label_text" / "substring", so print one example per document.
    for document, generations in zip(o["label_text"], o["substring"]):
        example = {
            "Query": o["query_text"],
            "Document": document,
            "Intent": [np.random.choice(x["intent_phrases"]) for x in generations],
        }
        print(json.dumps(example, indent=4))

{
    "Query": "is lamb a baby sheep",
    "Document": "There are no other names for a baby lamb because baby sheep are called lambs. A lamb is a young sheep that is less than 14-months-old, so newborn sheep are called lambs.",
    "Intent": [
        "baby sheep go by the name lamb",
        "lamb describes a young sheep",
        "lambs are sheep less than 14 months in age"
    ]
}
{
    "Query": "what kind of beauty is lord byron defining in she walks in beauty",
    "Document": "She Walks in Beauty Analysis. 1  In the first line of She Walks in Beauty, Lord Byron compares a woman's beauty to the night. He uses a simile to liken her beauty to that of cloudless climes and starry skies, emphasizing the clarity and the brightness of her beauty. She Walks in Beauty is a short poem consisting of three stanzas of six lines each.",
    "Intent": [
        "She Walks in Beauty opens by comparing her beauty to the night",
        "the poem likens her beauty to cloudless climes and starry ski