In [1]:
import pandas as pd
from tqdm import tqdm
from src.config import CLEAN_FILE_PATH

# Load the cleaned dataset from the parquet file configured in src.config.
data = pd.read_parquet(str(CLEAN_FILE_PATH))

# Preview the first rows (head() returns five rows by default).
data.head()

Unnamed: 0,id,title,date,content,domain,url
0,92151dc1d9bfd38bf55079fd769ba2bb,Qatar to Slash Emissions as LNG Expansion Adva...,2021-01-13,Qatar Petroleum QP is targeting aggressive cut...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
1,9f0a279c373edf56b634cf6dd1e38698,India Launches Its First 700 MW PHWR,2021-01-15,Nuclear Power Corp. of India Ltd. NPCIL synchr...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
2,7c349533119a90fa56515421e69c0e45,New Chapter for US-China Energy Trade,2021-01-20,New US President Joe Biden took office this we...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
3,5acd9a610210fdf0afd261c35b1fe287,Japan: Slow Restarts Cast Doubt on 2030 Energy...,2021-01-22,The slow pace of Japanese reactor restarts con...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
4,2159fa0bb2c86e538844a6307bb9b677,NYC Pension Funds to Divest Fossil Fuel Shares,2021-01-25,Two of New York City s largest pension funds s...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...


In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters, named so they can be tuned in one place.
CHUNK_SIZE = 1000  # upper bound on chunk length, as measured by length_function
CHUNK_OVERLAP = 0  # consecutive chunks share no text

recursive_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,  # lengths are counted in characters
    is_separator_regex=False,
)

In [3]:
import src.utils as utils

# Build the chunked documents from the article frame with the configured splitter.
# create_documents lives in src.utils (not shown here); the recorded output of this
# cell reports how many documents were created versus rows in the source frame.
documents = utils.create_documents(data, recursive_text_splitter)

RecursiveCharacterTextSplitter: Number of documents created: 51050, Number of rows in source df: 9584, Percentage of documents created: 532.66%


In [4]:
# Display the first chunk to sanity-check its metadata and text content.
documents[0]

Document(metadata={'url': 'https://www.energyintel.com/0000017b-a7dc-de4c-a17b-e7de685b0000', 'domain': 'energyintel', 'title': 'Qatar to Slash Emissions as LNG Expansion Advances', 'date': '2021-01-13', 'origin_doc_id': '92151dc1d9bfd38bf55079fd769ba2bb'}, page_content='Qatar Petroleum QP is targeting aggressive cuts in its greenhouse gas emissions as it prepares to launch Phase 2 of its planned 48 million ton per year LNG expansion. In its latest Sustainability Report published on Wednesday, QP said its goals include reducing the emissions intensity of Qatar s LNG facilities by 25 and of its upstream facilities by at least 15 . The company is also aiming to reduce gas flaring intensity across its upstream facilities by more than 75 and has raised its carbon capture and storage ambitions from 5 million tons yr to 7 million tons yr by 2027. About 2.2 million tons yr of the carbon capture goal will come from the 32 million ton yr Phase 1 of the LNG expansion, also known as the North Fie

In [5]:
from src.custom_embeddings import bge_m3_embed, qwen2_embed, nomic_embed

# Embedding models under comparison; the cells below iterate over this list.
embedding_models = [
    bge_m3_embed,
    qwen2_embed,
    nomic_embed,
]

In [6]:
# Smoke-test every embedding model on one query and show a slice of each vector.
for model in embedding_models:
    # Print the name first so it is visible even if the embedding call is slow or fails.
    print(model.model_name)

    embedding = model.embed_query("The company is also aiming to reduce gas flaring?")

    # First 20 components, then a blank separator line.
    print(embedding[:20], end="\n\n")

BAAI_bge_m3
[-0.06979186, -0.025753867, -0.03614089, -0.024861958, -0.0046290997, -0.040024407, 0.0053514526, 0.048534703, -0.022353465, 0.04812591, -0.018711504, 0.0028638635, 0.013053457, -0.02528933, 0.019399017, -0.07191015, 0.06280524, -0.033632394, 0.00043143766, -0.0013761874]

Alibaba-NLP_gte-Qwen2-7B
[-0.018390294, 0.012254118, -0.00019189798, -0.0013482721, -0.020569412, 0.0106858825, -0.008780294, -0.019365883, -0.010403235, -0.029067058, 0.0055389707, 0.008411029, 0.0028378677, 0.0095735295, 0.0027125, 0.0008405331, -0.011944118, 0.004219191, 0.0076861763, 0.011488236]

nomic-ai_nomic-embed-text-v1_5
[0.023326509, 0.012770157, -0.20600034, 0.049915466, 0.009018341, -0.0049664956, -0.035630602, 0.011669081, -0.0045382995, -0.051034022, 0.08799291, 0.02442176, 0.080629095, 0.033952773, 0.07275261, -0.0061811754, -0.022289516, -0.016545277, -0.012478867, -0.0034721778]



In [7]:
from src.vectorstorage import EmbeddingVectorStorage


def get_col_name_vectordb(embeddings, text_splitter) -> str:
    """Build the vector-store collection name for an embedding/splitter pair.

    Args:
        embeddings: Object exposing a ``model_name`` attribute.
        text_splitter: Splitter instance; only its class name is used.

    Returns:
        ``"<model_name>_<SplitterClassName>"``.
    """
    return f"{embeddings.model_name}_{text_splitter.__class__.__name__}"

# One vector store per embedding model, keyed by the model's name.
vector_stores = {}

# NOTE: this loop embeds the full document set once per model; the recorded run
# took roughly 15, 40 and 3 minutes for the three models respectively.
# NOTE(review): it is not visible from here whether include_documents skips
# documents that are already stored -- confirm before re-running this cell.
for model in embedding_models:
    # Collection name is derived from the model name and the splitter class.
    collection_name = get_col_name_vectordb(model, recursive_text_splitter)
    print(f"Collection name: {collection_name}")

    vector_storage = EmbeddingVectorStorage(
        method_of_embedding=model,
        collection=collection_name,
    )

    vector_storage.include_documents(documents, should_verbose=True)

    # Register the store only after its documents were included without error.
    vector_stores[model.model_name] = vector_storage

print(vector_stores)

Collection name: BAAI_bge_m3_RecursiveCharacterTextSplitter


100%|██████████| 2/2 [15:20<00:00, 460.21s/it]


Collection name: Alibaba-NLP_gte-Qwen2-7B_RecursiveCharacterTextSplitter


100%|██████████| 2/2 [40:11<00:00, 1205.92s/it]


Collection name: nomic-ai_nomic-embed-text-v1_5_RecursiveCharacterTextSplitter


100%|██████████| 2/2 [02:42<00:00, 81.08s/it] 

{'BAAI_bge_m3': VectorStorage(method_of_embedding=CustomHuggingFaceEndpointEmbeddings, group=BAAI_bge_m3_RecursiveCharacterTextSplitter), 'Alibaba-NLP_gte-Qwen2-7B': VectorStorage(method_of_embedding=CustomHuggingFaceEndpointEmbeddings, group=Alibaba-NLP_gte-Qwen2-7B_RecursiveCharacterTextSplitter), 'nomic-ai_nomic-embed-text-v1_5': VectorStorage(method_of_embedding=CustomHuggingFaceEndpointEmbeddings, group=nomic-ai_nomic-embed-text-v1_5_RecursiveCharacterTextSplitter)}





In [8]:
query = "The company is also aiming to reduce gas flaring?"

# Run the same query against every vector store and print each hit with its score.
for model_name, vector_store in vector_stores.items():
    print(f"Results for model: {model_name}")

    try:
        results = vector_store.search_similar_w_scores(query)

        for doc, score in results:
            print(f"Document: {doc}")
            print(f"Score: {score}")
    except Exception as e:
        # Best-effort: report the failure and carry on with the remaining stores.
        print(f"Error searching in vector store '{model_name}': {e}")

    # Blank separator after each model's section, on success and on failure alike.
    print()

Results for model: BAAI_bge_m3
Document: page_content='their natural gas prices off of NGI s Mexico Gas Price Index. Read the analysis of our fifth survey of active players in Mexico, showcasing market driven insights now. , The company plans to electrify its operations with low carbon power, which may include wind, solar, hydrogen, natural gas with carbon capture and storage, or other emerging technologies, the firm said. ExxonMobil plans to expand its methane detection programs utilizing satellite surveillance and a network of ground based sensors for continuous monitoring, and aerial flyovers that identify leaks for rapid repairs. , By the end of this year, ExxonMobil plans to reduce flaring volumes across its Permian operations by more than 75 versus 2019, and to eliminate all routine flaring in the basin by the end of 2022. The company is also securing alternative natural gas delivery points across the basin to minimize non routine flaring, management said. , The net zero goal for