In [1]:
from itertools import product

from src.utils import *
from src.config import Config
from src.db_handler import DBHandler
from src.libshift_search import LibshiftSearch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Project configuration object; later cells read config.ARTIFACTS_PATH from it
# and hand it to DBHandler.
# NOTE(review): dev_mode=False presumably selects the full (non-dev) artifact
# set -- confirm against src.config.
config = Config(dev_mode=False)

# Directory the export cells write their CSV/JSON files into.
OUTPUT_PATH = 'output'
# exist_ok=True creates the directory when it is missing and does nothing when
# it already exists, so there is no separate exists() check that could race
# with another process creating the same directory.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [None]:

# Libraries whose snapshots are loaded via get_snapshot_dict.
LIBS = [
    'pydantic',
    'scipy',
    'pandas',
    'sqlalchemy',
    'numpy',
    'pytorch',
]

# Features searched over; each one is assigned its own model in every combination.
FEATURES = [
    "name",
    "code",
    "docstring",
    "nodoc",
]

# Cut-offs handed to LibshiftSearch as `top_ks`.
TOPKs = [1, 3, 5, 7, 10, 15]

# Candidate model identifiers (`org/name` strings).
models = [
    'Alibaba-NLP/gte-large-en-v1.5',
    'avsolatorio/GIST-Embedding-v0',
    'avsolatorio/GIST-large-Embedding-v0',
    'ibm-granite/granite-embedding-125m-english',
    'intfloat/e5-large-v2',
    'w601sxs/b1ade-embed',
]

# Full grid: every way of assigning one model to each feature, i.e.
# len(models) ** len(FEATURES) dicts of the form {feature: model}.
combinations = [
    {feature: model for feature, model in zip(FEATURES, assignment)}
    for assignment in product(models, repeat=len(FEATURES))
]

In [4]:

# Union of the columns filter_read_cols reports for every model combination.
# sorted() pins the order: iterating a set of strings depends on per-process
# hash randomisation, so list(set(...)) could produce a different column order
# after every kernel restart.
# NOTE(review): assumes the returned column names are mutually orderable
# (e.g. all strings) -- confirm against filter_read_cols.
model_cols = sorted({col for combo in combinations for col in filter_read_cols(combo)})

# Final column selection: the id column, the raw feature columns, then the
# per-model columns collected above.
filter_cols = ['id'] + FEATURES + model_cols

In [5]:
# Input artifacts, all located under config.ARTIFACTS_PATH.
removed_methods_file = f"{config.ARTIFACTS_PATH}/removed_methods/removed_methods.pkl"
validation_file = f"{config.ARTIFACTS_PATH}/validation_data/clean_val.csv"
folder = f"{config.ARTIFACTS_PATH}/snapshot_embeddings/"

# NOTE(review): unpickling can execute arbitrary code; only load pickles that
# were produced by a trusted pipeline.
removed_df = pd.read_pickle(removed_methods_file)
val_df = pd.read_csv(validation_file)

# One snapshot per entry in LIBS, loaded from `folder` by the project helper.
snapshot_dict = get_snapshot_dict(folder, LIBS)



Loading snapshots: 100%|██████████| 6/6 [05:00<00:00, 50.01s/repo]

Loaded 6 snapshots from /Volumes/AnushHD/libshiftartifacts//snapshot_embeddings/





In [6]:
# Grid search: run LibshiftSearch once per model combination and collect the
# outputs of every run.
db = DBHandler(config)
libwise_agg_df = pd.DataFrame()  # not populated in this cell; kept so the name stays defined

# Loop-invariant, and also used by the export cells to name their output files,
# so it is set once up front rather than inside the loop.
mode = "cosine"

# Per-run frames are gathered in lists and concatenated once after the loop;
# concatenating onto a growing DataFrame on every iteration re-copies all
# previous rows each time.
result_frames = []
search_frames = []
matches = []

try:
    for best_config in tqdm(combinations):
        search = LibshiftSearch(
            model_dict=best_config,
            removed_df=removed_df,
            snapshot_dictionary=snapshot_dict,
            validation_df=val_df,
            features=FEATURES,
            db_handler=db,
            top_ks=TOPKs,
        )
        # combined_hits_df is returned by the controller but not used in this cell.
        search_data, results, match_json, combined_hits_df = search.controller(mode)

        result_frames.append(results)
        # Tag the rows of *this* run with the combination that produced them
        # before accumulating. Assigning the dict to a column of the accumulated
        # frame would align the dict's keys against the row index instead of
        # broadcasting the value, and would never touch the current run's rows.
        # NOTE(review): assumes search_data is a DataFrame -- confirm against
        # LibshiftSearch.controller.
        search_frames.append(
            search_data.assign(config=[best_config] * len(search_data))
        )
        matches.extend(match_json)
finally:
    # Release the database handle even if a run fails part-way through the grid.
    # The partial result_frames / search_frames / matches lists stay available
    # in the kernel for inspection.
    db.close()

# pd.concat raises on an empty list, so fall back to an empty frame when the
# grid itself is empty.
results_df = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()
search_df = pd.concat(search_frames, ignore_index=True) if search_frames else pd.DataFrame()


Cleaning SQLite lock files in: /Volumes/AnushHD/libshiftartifacts/similarity_cache.db


Removed lock file: similarity_cache.db-wal


100%|██████████| 1296/1296 [1:40:43<00:00,  4.66s/it]


In [15]:
# Write the accumulated results frame to the output directory; the file name
# carries the similarity mode used for the run.
results_csv = os.path.join(OUTPUT_PATH, f"updated_grid_search_results_{mode}.csv")
results_df.to_csv(results_csv, index=False)



In [16]:
# Write the accumulated search-data frame next to the results file, without
# the row index.
search_csv = os.path.join(OUTPUT_PATH, f"updated_grid_search_data_{mode}.csv")
search_df.to_csv(search_csv, index=False)

In [17]:
import json

# Dump every match record collected during the grid search as indented JSON.
matches_json = os.path.join(OUTPUT_PATH, f"updated_grid_search_matches_{mode}.json")
with open(matches_json, 'w') as out_file:
    json.dump(matches, out_file, indent=4)