In [5]:
import os

# Capture the kernel's starting directory only once. Without this guard every
# re-run of the cell would climb one more level up the directory tree, because
# os.getcwd() would already return the parent set by the previous run.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
current_dir = _NOTEBOOK_START_DIR

# Get the parent directory path (this should be the project root folder)
parent_dir = os.path.dirname(current_dir)

# Change the current working directory to the parent directory
os.chdir(parent_dir)

# Verify the new working directory
new_dir = os.getcwd()
print("Current working directory:", new_dir)


Current working directory: d:\PROJECTS\REVIEWED PROJECTS\camille_projects-p03_semantic-search\semantic-search


In [6]:
# Project modules: prefer the package layout (`src.<module>`) and fall back to
# flat module imports when the `src` package cannot be imported.
try:
    from src import config
    from src import utils
    from src import preprocessor
    from src import build_index
    from src import embeddings
    from src import search
except ImportError:
    # Only import failures trigger the fallback. A bare `except:` would also
    # swallow unrelated errors (KeyboardInterrupt, exceptions raised while a
    # module is being loaded) and hide the real cause.
    import config
    import utils
    import preprocessor
    import build_index
    import embeddings
    import search  # used by the text-search cell; was missing from the fallback

import pandas as pd
import os
from pprint import pprint
import logging
from functools import reduce


# Shared logger object exposed by the project's utils module.
logger = utils.logger

In [None]:
# Step 1 - collect the raw data and preprocess it.
# The preprocessed data is dumped into a new file; `r` is the value returned by
# the preprocessor and is indexed with "path_to_processed_text" in the next step.
logger.info("Collecting and preprocessing data ...")
output_filename = utils.get_raw_data_from_aws_mongo()
r = preprocessor.preprocess_data(
    output_filename,
    section_by=config.TEXT_SECTION_TYPE,
    input_types=config.TRAIN_DATA_INPUT_TYPES,
    sample_size=config.SAMPLE_SIZE,
)


In [5]:
# Step 2 - compute embeddings for the processed text and dump them into a new JSON file.
pdf = pd.read_csv(r["path_to_processed_text"])

section_ids = pdf["section_id"].values.tolist()
section_vectors = embeddings.get_embeddings_from_lemmatized_sentences(pdf["text"].values.tolist())

# zip pairs ids and embeddings positionally
# (assumes one embedding is returned per input text, in order - TODO confirm).
data = dict(zip(section_ids, section_vectors))

embeddings_filename = f"{config.TEXT_SECTION_TYPE}_" + "_".join(config.TRAIN_DATA_INPUT_TYPES) + "_embeddings.json"
utils.save_json(data, os.path.join(config.DATA_DIR, "processed", embeddings_filename))

In [6]:
# Step 3 - read the saved embeddings back from disk and build the annoy index from them.
embeddings_filename = f"{config.TEXT_SECTION_TYPE}_" + "_".join(config.TRAIN_DATA_INPUT_TYPES) + "_embeddings.json"
_embeddings = utils.load_json(os.path.join(config.DATA_DIR, "processed", embeddings_filename))

# The index implementation is selected by config.SEARCH_INDEX_TYPE.
index = build_index.init_index(index_type=config.SEARCH_INDEX_TYPE)

index_file = os.path.join(config.MODELS_DIR, config.SEARCH_INDEX)
index.build(_embeddings, index_file)


In [7]:
# Load the index from disk, then query it with three of the stored embeddings.
index_file = os.path.join(config.MODELS_DIR, config.SEARCH_INDEX)
index.load(index_file)

ids_filename = f"{config.TEXT_SECTION_TYPE}_" + "_".join(config.TRAIN_DATA_INPUT_TYPES) + "_ids.json"
ids_mapper = utils.load_json(os.path.join(config.MODELS_DIR, ids_filename))

# The loaded embeddings are looked up by string keys.
query_emb = [_embeddings[section_key] for section_key in ("0", "1", "20")]

# Ask for 5 hits per query vector, passing the section-id -> article-id lookup.
results = index.search(query_emb, 5, ids_lookup=ids_mapper["section_id_to_article_id"])
print(results)

[[(0, 0.0), (618, 0.9278080463409424), (13039, 0.9894830584526062), (2865, 0.9895022511482239), (154, 0.9913226366043091)], [(1, 0.0), (3101, 0.9495759010314941), (228, 0.9650539755821228), (13057, 0.9904249310493469), (993, 0.990619421005249)], [(20, 0.0), (9652, 1.0283305644989014), (13144, 1.099655270576477), (13189, 1.1079797744750977), (5983, 1.1306354999542236)]]


In [15]:
# Free-text search: load the id mapping and per-article section stats, then
# run a handful of example queries through search.search with 5 as the third argument.
artifact_prefix = f"{config.TEXT_SECTION_TYPE}_" + "_".join(config.TRAIN_DATA_INPUT_TYPES)

ids_mapper = utils.load_json(os.path.join(config.MODELS_DIR, artifact_prefix + "_ids.json"))
stats_payload = utils.load_json(os.path.join(config.MODELS_DIR, artifact_prefix + "_stats.json"))
sections_stats = stats_payload["sections_by_article"]

# Example queries (kept verbatim so the recorded output stays reproducible).
queries = [
    "Carbon emission",
    "debt reduction among companies",
    "lack of medical devices in hospitals",
    "covid-19 vaccine development",
    "women entrepreneuship in india",
]
print(queries)

results = search.search(index, queries, 5, ids_mapper, sections_stats)
pprint(results)

['Carbon emission', 'debt reduction among companies', 'lack of medical devices in hospitals', 'covid-19 vaccine development', 'women entrepreneuship in india']
[{'query': 'Carbon emission', 'results': []},
 {'query': 'debt reduction among companies',
  'results': [{'article_id': 1386,
               'category': 'business, finance & economics',
               'score': 0.53,
               'subcategory': 'equity - private & public',
               'title': 'We Think Balco Group (STO:BALCO) Can Stay On Top Of '
                        'Its Debt'},
              {'article_id': 13029,
               'category': 'us media',
               'score': 0.48,
               'subcategory': 'us - entrepreneur & startup',
               'title': 'Egyptian B2B trucking startup Trella secures further '
                        '$6m debt funding'},
              {'article_id': 1389,
               'category': 'business, finance & economics',
               'score': 0.46,
               'subcategory': 'eq

In [None]:
# Sample JSON payload: the "query" list repeats the example queries used in the
# text-search cell, and "k" is set to 3.
# NOTE(review): presumably a request body for a search endpoint - confirm.
# This is a bare string literal; it is not assigned to any name.
'''
{
    "query" :[
    "Carbon emission",
    "debt reduction among companies",
    "lack of medical devices in hospitals",
    "covid-19 vaccine development",
    "women entrepreneuship in india"
],
    "k" : 3
}
'''