In [1]:
import os
import tempfile
 
from primeqa.ir.dense.colbert_top.colbert.utils.utils import create_directory, print_message
from primeqa.ir.dense.colbert_top.colbert.infra import Run, RunConfig
from primeqa.ir.dense.colbert_top.colbert.infra.config import ColBERTConfig
from primeqa.ir.dense.colbert_top.colbert.training.training import train
from primeqa.ir.dense.colbert_top.colbert.indexing.collection_indexer import encode
from primeqa.ir.dense.colbert_top.colbert.searcher import Searcher
import pandas as pd
import numpy as np
import torch
from IPython.display import display, HTML

  from .autonotebook import tqdm as notebook_tqdm


{"time":"2023-10-04 20:17:41,826", "name": "faiss.loader", "level": "INFO", "message": "Loading faiss."}
{"time":"2023-10-04 20:17:41,846", "name": "faiss.loader", "level": "INFO", "message": "Successfully loaded faiss."}
{"time":"2023-10-04 20:17:41,918", "name": "numexpr.utils", "level": "INFO", "message": "Note: NumExpr detected 40 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8."}
{"time":"2023-10-04 20:17:41,919", "name": "numexpr.utils", "level": "INFO", "message": "NumExpr defaulting to 8 threads."}


In [2]:
# Single-machine rendezvous settings for the (one-process) distributed run.
os.environ.update({"MASTER_ADDR": "localhost", "MASTER_PORT": "29500"})

# Fix every random number generator used below so repeated runs are comparable.
random_seed = 2  # arbitrary; any fixed integer works
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

# Ask cuDNN for deterministic kernels and turn off its benchmark-based autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
# Training triples file and the pretrained encoder to fine-tune.
text_triples_fn = '/home1/tejomay/NL2bash/train_sbert_triples.tsv'
model_type = 'roberta-base'

# Scratch area for checkpoints and indexes. mkdtemp() is used rather than a
# `with tempfile.TemporaryDirectory()` block because the context manager
# deletes the directory as soon as the block exits, while the training,
# indexing and search cells below keep reading and writing under output_dir.
# The directory is NOT removed automatically; when finished, clean up with
# shutil.rmtree(working_dir).
working_dir = tempfile.mkdtemp()
output_dir = os.path.join(working_dir, 'output_dir')

In [4]:
# Settings for the training run; the next cell forwards them to RunConfig and
# ColBERTConfig. The learning rate is not set here, so the library default
# applies (a commented-out experiment used 3e-5).
args_dict = dict(
    root=output_dir,             # where experiment artifacts are written
    experiment='test_training',
    triples=text_triples_fn,
    model_type=model_type,
    bsize=16,
    epochs=3,
    nranks=1,
    amp=True,
)

In [5]:
# Train inside a Run context built from the run-level subset of args_dict.
# `latest_model_fn` receives whatever train() returns and is passed as the
# checkpoint to the indexing and search cells.
with Run().context(
    RunConfig(**{key: args_dict[key] for key in ('root', 'experiment', 'nranks', 'amp')})
):
    colBERTConfig = ColBERTConfig(**args_dict)
    latest_model_fn = train(colBERTConfig, text_triples_fn, None, None)

{
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "index_path": null,
    "index_location": null,
    "nbits": 1,
    "kmeans_niters": 20,
    "num_partitions_max": 10000000,
    "similarity": "cosine",
    "bsize": 16,
    "accumsteps": 1,
    "lr": 3e-6,
    "maxsteps": 500000,
    "save_every": null,
    "resume": false,
    "resume_optimizer": false,
    "warmup": null,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": false,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "shuffle_every_epoch": false,
    "save_steps": 2000,
    "save_epochs": -1,
    "epochs": 3,
    "input_arguments": {},
    "model_type": "roberta-base",
    "init_from_lm": null,
    "local_models_repository": null,
    "ranks_fn": null,
    "output_dir": null,
    "topK": 100,
    "student_teacher_temperature": 1.0,
    "student_teacher_top_loss_weight": 0.5,
    "teacher_model_type": "xlm-roberta-base",
   

Some weights of the model checkpoint at roberta-base were not used when initializing HF_ColBERT_Roberta: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing HF_ColBERT_Roberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HF_ColBERT_Roberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HF_ColBERT_Roberta were not initialized from the model checkpoint at roberta-base and are newly initialized: ['bert.encoder.layer.9.attention.self.value.bias', 'bert.encoder.layer.9.output.dense.bias', 'bert.encoder.layer.5.attention.output.LayerNorm.weight', 'be

{"time":"2023-10-04 20:18:11,126", "name": "torch.distributed.distributed_c10d", "level": "INFO", "message": "Added key: store_based_barrier_key:1 to store for rank: 0"}
{"time":"2023-10-04 20:18:11,130", "name": "torch.distributed.distributed_c10d", "level": "INFO", "message": "Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes."}
[Oct 04, 20:18:11] 3 epochs of 17247 examples
[Oct 04, 20:18:11] batch size: 16
[Oct 04, 20:18:11] maxsteps set to 3234
[Oct 04, 20:18:11] start batch idx: 0
[Oct 04, 20:18:11] #> Roberta QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
[Oct 04, 20:18:11] #> Input: $ get the label of a fat32 partition, 		 True, 		 None
[Oct 04, 20:18:11] #> Output IDs: torch.Size([32]), tensor([    0, 50261,   120,     5,  6929,     9,    10,  5886,  2881, 34255,
            2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
   



[Oct 04, 20:18:11] #>>>>> D: torch.Size([99, 768]), tensor([[-3.4392e-02,  7.8284e-02, -2.5861e-02,  ..., -8.6018e-02,
         -4.3695e-02,  2.9021e-02],
        [-7.9147e-04,  6.3184e-02, -8.0885e-02,  ..., -7.6783e-01,
          2.3322e-01,  2.6924e-01],
        [-1.6554e-01,  2.2936e-01, -6.7748e-02,  ..., -5.6548e-01,
          8.8873e-05,  5.7844e-02],
        ...,
        [-3.3283e-02,  1.7113e-01, -4.8443e-02,  ..., -3.8648e-02,
          2.3856e-01,  4.0179e-01],
        [-4.2342e-02,  2.7886e-02, -1.7624e-02,  ..., -3.0329e-01,
          6.8351e-02,  7.0057e-02],
        [ 6.4403e-02,  9.1860e-02, -4.8039e-02,  ..., -4.9786e-01,
          1.5160e-01,  2.0572e-01]], device='cuda:0',
       grad_fn=<SelectBackward0>)
[Oct 04, 20:18:11] #>>>>> self.linear doc : Parameter containing:
tensor([[ 0.0086,  0.0093,  0.0139,  ..., -0.0191, -0.0069,  0.0419],
        [ 0.0118, -0.0243, -0.0022,  ...,  0.0394,  0.0157, -0.0088],
        [-0.0005, -0.0073, -0.0413,  ...,  0.0431,  0.0153,

In [13]:
# Tab-separated passage collection that will be indexed.
collection_fn = '/home1/tejomay/NL2bash/index_descriptions.tsv'

# Load the collection only to eyeball its first rows.
data = pd.read_csv(collection_fn, sep='\t')
preview = data.head(3)
print(preview)
print("---------------------------")

# Settings for the indexing run; consumed by the next cell.
# Optional overrides left at their defaults: num_partitions_max, kmeans_niters,
# nway, rank.
# NOTE(review): 'model_type' is not set here, and the config dump logged by the
# indexing run reports "bert-base-uncased" - confirm the encoder type is taken
# from the checkpoint rather than from this default.
indexing_root = os.path.join(output_dir, 'test_indexing')
args_dict = dict(
    root=indexing_root,
    experiment='test_indexing',
    checkpoint=latest_model_fn,   # model produced by the training cell
    collection=collection_fn,
    index_root=os.path.join(indexing_root, 'indexes'),
    index_name='index_name',
    doc_maxlen=512,
    nranks=1,
    amp=True,
)

   id                                               text      title
0   1  fatlabel display change volume label volume ID...   fatlabel
1   2  apt-cache performs variety operations APT 's p...  apt-cache
2   3  updatedb creates updates database used locate ...   updatedb
---------------------------


In [14]:
# Build the index inside a Run context built from the run-level subset of
# args_dict: create the index directory first, then encode the collection.
with Run().context(
    RunConfig(**{key: args_dict[key] for key in ('root', 'experiment', 'nranks', 'amp')})
):
    colBERTConfig = ColBERTConfig(**args_dict)
    create_directory(colBERTConfig.index_path_)
    encode(colBERTConfig, collection_fn, None, None)



[Oct 04, 20:46:04] #> Creating directory /tmp/tmpi75hzfrb/output_dir/test_indexing/indexes/index_name 


{
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "index_path": null,
    "index_location": null,
    "nbits": 1,
    "kmeans_niters": 20,
    "num_partitions_max": 10000000,
    "similarity": "cosine",
    "bsize": 32,
    "accumsteps": 1,
    "lr": 3e-6,
    "maxsteps": 500000,
    "save_every": null,
    "resume": false,
    "resume_optimizer": false,
    "warmup": null,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": false,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "shuffle_every_epoch": false,
    "save_steps": 2000,
    "save_epochs": -1,
    "epochs": 10,
    "input_arguments": {},
    "model_type": "bert-base-uncased",
    "init_from_lm": null,
    "local_models_repository": null,
    "ranks_fn": null,
    "output_dir": null,
    "topK": 100,
    "student_teacher

0it [00:00, ?it/s]

[Oct 04, 20:47:30] [0] 		 #> Encoding 1270 passages..
[Oct 04, 20:47:36] [0] 		 #> Saving chunk 0: 	 1,270 passages and 206,651 embeddings. From #0 onward.


1it [00:06,  6.35s/it]

[Oct 04, 20:47:36] offset: 0
[Oct 04, 20:47:36] chunk codes size(0): 206651
[Oct 04, 20:47:36] codes size(0): 206651
[Oct 04, 20:47:36] codes size(): torch.Size([206651])
[Oct 04, 20:47:36] >>>>partition.size(0): 2048
[Oct 04, 20:47:36] >>>>num_partition: 2048
[Oct 04, 20:47:36] #> Optimizing IVF to store map from centroids to list of pids..
[Oct 04, 20:47:36] #> Building the emb2pid mapping..
[Oct 04, 20:47:36] len(emb2pid) = 206651



100%|██████████| 2048/2048 [00:00<00:00, 48693.29it/s]

[Oct 04, 20:47:36] #> Saved optimized IVF to /tmp/tmpi75hzfrb/output_dir/test_indexing/indexes/index_name/ivf.pid.pt
[Oct 04, 20:47:36] [0] 		 #> Saving the indexing metadata to /tmp/tmpi75hzfrb/output_dir/test_indexing/indexes/index_name/metadata.json ..





In [15]:
# Queries to run against the index built above.
queries_fn = "/home1/tejomay/NL2bash/queries.tsv"

# Settings for the search run; consumed by the next cell.
# 'nway' is left at its default.
args_dict = dict(
    root=output_dir,
    experiment='test_indexing',
    checkpoint=latest_model_fn,   # model produced by the training cell
    model_type=model_type,
    # Same location the indexing cell wrote to: <index_root>/<index_name>.
    index_location=os.path.join(output_dir, 'test_indexing', 'indexes', 'index_name'),
    queries=queries_fn,
    bsize=1,
    topK=3,                       # second argument to search_all() in the next cell
    rank=0,
    nranks=1,
    amp=True,
)

In [16]:
# Search inside a Run context built from the run-level subset of args_dict.
# `rankings` holds the result of search_all() and is inspected in the next cell.
with Run().context(
    RunConfig(**{key: args_dict[key] for key in ('root', 'experiment', 'nranks', 'amp')})
):
    colBERTConfig = ColBERTConfig(**args_dict)
    searcher = Searcher(
        args_dict['index_location'],
        checkpoint=args_dict['checkpoint'],
        config=colBERTConfig,
    )
    rankings = searcher.search_all(args_dict['queries'], args_dict['topK'])

[Oct 04, 20:48:51] #> base_config.py from_path /tmp/tmpi75hzfrb/output_dir/test_indexing/indexes/index_name/metadata.json
[Oct 04, 20:48:51] #> base_config.py from_path args loaded! 
[Oct 04, 20:48:51] #> base_config.py from_path args replaced ! 
[Oct 04, 20:48:51] #> base_config.py load_from_checkpoint /tmp/tmpi75hzfrb/output_dir/test_training/2023-10/04/20.17.41/checkpoints/colbert
[Oct 04, 20:48:51] #> base_config.py load_from_checkpoint /tmp/tmpi75hzfrb/output_dir/test_training/2023-10/04/20.17.41/checkpoints/colbert/artifact.metadata
[Oct 04, 20:48:51] #> base_config.py from_path /tmp/tmpi75hzfrb/output_dir/test_training/2023-10/04/20.17.41/checkpoints/colbert/artifact.metadata
[Oct 04, 20:48:51] #> base_config.py from_path args loaded! 
[Oct 04, 20:48:51] #>>>>> at ColBERT name (model type) : /tmp/tmpi75hzfrb/output_dir/test_training/2023-10/04/20.17.41/checkpoints/colbert
[Oct 04, 20:48:51] #>>>>> at BaseColBERT name (model type) : /tmp/tmpi75hzfrb/output_dir/test_training/2023-

[Oct 04, 20:48:55] json file (get_query_tokenizer): /tmp/tmpi75hzfrb/output_dir/test_training/2023-10/04/20.17.41/checkpoints/colbert/config.json
[Oct 04, 20:48:55] get query model type: roberta-base
[Oct 04, 20:48:58] json file (get_doc_tokenizer): /tmp/tmpi75hzfrb/output_dir/test_training/2023-10/04/20.17.41/checkpoints/colbert/config.json
[Oct 04, 20:48:58] get doc model type: roberta-base
[Oct 04, 20:49:01] #> Loading codec...
[Oct 04, 20:49:01] #> base_config.py from_path /tmp/tmpi75hzfrb/output_dir/test_indexing/indexes/index_name/metadata.json
[Oct 04, 20:49:01] #> base_config.py from_path args loaded! 
[Oct 04, 20:49:01] #> base_config.py from_path args replaced ! 
[Oct 04, 20:49:01] #> Loading IVF...
[Oct 04, 20:49:01] #> Loading the queries from /home1/tejomay/NL2bash/queries.tsv ...
[Oct 04, 20:49:01] #> Got 10 queries. All QIDs are unique.

[Oct 04, 20:49:01] #> Roberta QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
[Oct 04, 20:49:01] #> Input: $ get

100%|██████████| 10/10 [00:00<00:00, 91.54it/s]


In [17]:
# Which entry of rankings.flat_ranking to inspect.
RANKING_ROW = 8


def print_matching_lines(tsv_path, record_id):
    """Print every line of a file whose first field equals ``record_id``.

    Args:
        tsv_path: Path of the text file to scan.
        record_id: Identifier to look for; compared as ``str(record_id)``
            against the first whitespace-delimited field of each line.

    The file is read line by line instead of being loaded whole, and blank
    lines are skipped (indexing the first field of an empty split would
    raise IndexError).
    """
    wanted = str(record_id)
    with open(tsv_path, 'r') as f:
        for line in f:
            fields = line.split()
            if fields and fields[0] == wanted:
                # The line keeps its own newline, so print() leaves a blank
                # line after each match.
                print(line)


# NOTE(review): presumably each flat_ranking entry starts with
# (query id, passage id, ...) - confirm against the Searcher's ranking format.
selected_ranking = rankings.flat_ranking[RANKING_ROW]
print_matching_lines(queries_fn, selected_ranking[0])
print_matching_lines(collection_fn, selected_ranking[1])

3	search for a package in your current sources

535	Leiningen automating Clojure projects without setting hair fire . Working Clojure projects tools designed Java exercise frustration . Leiningen , write Clojure .	lein

