In [1]:
import pandas as pd
from tqdm import tqdm

# Load the preprocessed cleantech dataset from its parquet file.
df = pd.read_parquet('../data/preprocessed/clean_cleantech.parquet')

# Preview the first five rows to check the available columns.
df.head(5)

Unnamed: 0,id,title,date,content,domain,url
0,92151dc1d9bfd38bf55079fd769ba2bb,Qatar to Slash Emissions as LNG Expansion Adva...,2021-01-13,Qatar Petroleum QP is targeting aggressive cut...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
1,9f0a279c373edf56b634cf6dd1e38698,India Launches Its First 700 MW PHWR,2021-01-15,Nuclear Power Corp. of India Ltd. NPCIL synchr...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
2,7c349533119a90fa56515421e69c0e45,New Chapter for US-China Energy Trade,2021-01-20,New US President Joe Biden took office this we...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
3,5acd9a610210fdf0afd261c35b1fe287,Japan: Slow Restarts Cast Doubt on 2030 Energy...,2021-01-22,The slow pace of Japanese reactor restarts con...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
4,2159fa0bb2c86e538844a6307bb9b677,NYC Pension Funds to Divest Fossil Fuel Shares,2021-01-25,Two of New York City s largest pension funds s...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...


In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter used to cut each article's text into chunks before embedding.
recursive_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # upper bound on chunk length, measured by `length_function`
    chunk_overlap=0,  # consecutive chunks do not share any text
    length_function=len,  # chunk length is counted in characters
    is_separator_regex=False,  # separators are treated as literal strings
)

In [3]:
def create_documents(df: pd.DataFrame, text_splitter, verbose=True):
    """Split every row's ``content`` into chunk documents that carry the row's metadata.

    Args:
        df: Frame containing the columns ``url``, ``domain``, ``title``,
            ``date``, ``id`` and ``content``.
        text_splitter: Object exposing ``create_documents(texts, metadatas)``
            whose result supports ``len()``.
        verbose: When true, print how many documents were created compared
            with the number of source rows.

    Returns:
        The value returned by ``text_splitter.create_documents``. Each
        metadata dict holds ``url``, ``domain``, ``title``, ``date`` and
        ``origin_doc_id`` (the row's ``id``); missing values are replaced by
        the string ``'None'``.

    Raises:
        ValueError: If any required column is absent from ``df``.
    """
    metadata_cols = ['url', 'domain', 'title', 'date', 'id']
    required_cols = metadata_cols + ['content']
    if not all(col in df.columns for col in required_cols):
        raise ValueError(
            f"DataFrame must contain all metadata columns and a 'content' column: {required_cols}")

    def _is_missing(value):
        # pandas may represent a missing value as None, NaN, NaT or pd.NA
        # depending on the column dtype; treat all of them alike. The scalar
        # guard keeps list-like cells from producing an ambiguous truth value.
        return value is None or (pd.api.types.is_scalar(value) and pd.isna(value))

    records = df[metadata_cols].rename(columns={'id': 'origin_doc_id'}).to_dict('records')
    metadata = [
        {key: 'None' if _is_missing(value) else value for key, value in record.items()}
        for record in records
    ]

    # Hand the splitter a plain list of texts rather than a Series.
    docs = text_splitter.create_documents(df['content'].tolist(), metadata)

    if verbose:
        n_rows = len(df)
        # An empty frame would otherwise raise ZeroDivisionError here.
        percentage = len(docs) / n_rows * 100 if n_rows else 0.0
        print(
            f"{text_splitter.__class__.__name__}: "
            f"Number of documents created: {len(docs)}, "
            f"Number of rows in source df: {n_rows}, "
            f"Percentage of documents created: {percentage:.2f}%")

    return docs


# Chunk every row of `df` with the recursive splitter; prints a short summary.
documents = create_documents(df, recursive_text_splitter)

RecursiveCharacterTextSplitter: Number of documents created: 51050, Number of rows in source df: 9584, Percentage of documents created: 532.66%


In [4]:
# Inspect the first chunk to check its metadata and text.
documents[0]

Document(metadata={'url': 'https://www.energyintel.com/0000017b-a7dc-de4c-a17b-e7de685b0000', 'domain': 'energyintel', 'title': 'Qatar to Slash Emissions as LNG Expansion Advances', 'date': '2021-01-13', 'origin_doc_id': '92151dc1d9bfd38bf55079fd769ba2bb'}, page_content='Qatar Petroleum QP is targeting aggressive cuts in its greenhouse gas emissions as it prepares to launch Phase 2 of its planned 48 million ton per year LNG expansion. In its latest Sustainability Report published on Wednesday, QP said its goals include reducing the emissions intensity of Qatar s LNG facilities by 25 and of its upstream facilities by at least 15 . The company is also aiming to reduce gas flaring intensity across its upstream facilities by more than 75 and has raised its carbon capture and storage ambitions from 5 million tons yr to 7 million tons yr by 2027. About 2.2 million tons yr of the carbon capture goal will come from the 32 million ton yr Phase 1 of the LNG expansion, also known as the North Fie

In [5]:
from langchain_huggingface import HuggingFaceEndpointEmbeddings


# Read the API token from the local secrets file, which is expected to hold a
# line of the form `api_token=<value>`. If several lines match, the last wins.
token = None
with open('../secrets.txt', 'r') as f:
    for line in f:
        if line.startswith('api_token'):
            # Split on the first '=' only, since the value itself may contain '='.
            key, sep, value = line.partition('=')
            if sep:
                token = value.strip()

# Fail here with a clear message instead of a NameError further down.
if token is None:
    raise ValueError("No 'api_token=<value>' line found in ../secrets.txt")

# Embedding client; `model` is given as the URL of the serving endpoint.
# NOTE(review): confirm that the endpoint honours `normalize_embeddings` --
# which keys are accepted depends on the installed client/server versions.
bge_m3_embed = HuggingFaceEndpointEmbeddings(
    model='http://100.67.185.22:8080',
    huggingfacehub_api_token=token,
    model_kwargs={"normalize_embeddings": True}
)

In [6]:
# Smoke test: embed a short query to confirm the endpoint returns a vector.
bge_m3_embed.embed_query('Hello, world! This is a test.')

[-0.017497413,
 0.015319829,
 -0.047388826,
 0.007870958,
 -0.01882123,
 -0.028452482,
 -0.0046765287,
 -0.00033455167,
 0.023272326,
 0.0027867316,
 -0.02931584,
 0.006585512,
 0.02601589,
 0.004391141,
 0.04009824,
 -0.01325736,
 0.043052845,
 -0.026898434,
 -0.035263427,
 -0.036759917,
 -0.061279316,
 -0.016480567,
 0.015684359,
 -0.0039378772,
 -0.0153582,
 0.048616715,
 -0.034361694,
 -0.008355399,
 0.026821692,
 -0.013007945,
 0.043091215,
 0.012758531,
 -0.00972718,
 -0.03720119,
 -0.032903578,
 -0.027243778,
 0.013218989,
 -0.01319021,
 -0.029757114,
 0.00493074,
 -0.0018646161,
 -0.020624692,
 0.031253602,
 -0.052952696,
 -0.015847437,
 -0.013573925,
 -0.039522666,
 -0.026955992,
 -0.042784244,
 -0.015310236,
 0.017372705,
 0.029968157,
 0.04896206,
 0.011482677,
 0.008643185,
 -0.006892484,
 0.014427692,
 -0.015415758,
 -0.058171224,
 0.0005596846,
 -0.03887035,
 0.028989684,
 -0.0022327427,
 0.013967233,
 0.0019821287,
 0.13015619,
 0.026265305,
 0.003443844,
 -0.02018342,
 

In [7]:
from src.vectorstorage import EmbeddingVectorStorage

def get_col_name_vectordb(embeddings, text_splitter):
    """Build the vector-store group name for an embedding/splitter pair.

    The name is the fixed prefix ``bge_m3_embed_`` followed by the class name
    of ``text_splitter``. ``embeddings`` is accepted but not used.
    """
    splitter_name = text_splitter.__class__.__name__
    return "bge_m3_embed_" + splitter_name

# Vector store bound to the embedding client. The `group` value is the name
# built from the splitter class.
# NOTE(review): presumably `group` names the underlying collection -- confirm
# against src.vectorstorage.
bge_m3_vectordb = EmbeddingVectorStorage(
    method_of_embedding=bge_m3_embed,
    group=get_col_name_vectordb(bge_m3_embed, recursive_text_splitter),
)

# Display the group name that was passed to the store.
get_col_name_vectordb(bge_m3_embed, recursive_text_splitter)

'bge_m3_embed_RecursiveCharacterTextSplitter'

In [8]:
# Add all chunks to the vector store with progress output enabled.
# NOTE(review): long-running -- the recorded run took roughly three minutes.
bge_m3_vectordb.include_documents(documents, should_verbose=True)

100%|██████████| 2/2 [03:09<00:00, 94.76s/it] 


In [9]:
# Query the store with a sample question; the recorded result is a list of
# tuples whose first element is a Document (presumably paired with a score).
bge_m3_vectordb.search_similar_w_scores("The company is also aiming to reduce gas flaring?")

[(Document(metadata={'date': '2021-12-13', 'domain': 'naturalgasintel', 'origin_doc_id': '955ad75cdc5396c0057252d76df2c445', 'title': 'ExxonMobil Envisions Carbon-Neutral Permian Operations by 2030', 'url': 'https://www.naturalgasintel.com/exxonmobil-envisions-carbon-neutral-permian-operations-by-2030/'}, page_content='their natural gas prices off of NGI s Mexico Gas Price Index. Read the analysis of our fifth survey of active players in Mexico, showcasing market driven insights now. , The company plans to electrify its operations with low carbon power, which may include wind, solar, hydrogen, natural gas with carbon capture and storage, or other emerging technologies, the firm said. ExxonMobil plans to expand its methane detection programs utilizing satellite surveillance and a network of ground based sensors for continuous monitoring, and aerial flyovers that identify leaks for rapid repairs. , By the end of this year, ExxonMobil plans to reduce flaring volumes across its Permian ope