### DB Function to insert esg_text_table

In [1]:
import os
# Step the working directory up one level; the relative path "./files/..." used
# when loading the CSV is resolved against this directory (presumably the
# `db.scripts` imports rely on it too — confirm against the project layout).
# NOTE: not idempotent — each re-run of this cell climbs one more directory.
os.chdir('..') #go to dsa3101 folder as main

In [None]:
import psycopg2
import pandas as pd
from db.scripts.db_esg_text import insert_esg_text
# Load the labelled CSV into `df`. Later cells read the columns
# company, year, country, industry, esg_text and labels from this frame.
df = pd.read_csv("./files/labeled_pdfs_1003.csv")

In [8]:
from tqdm import tqdm

#### Batch prepare esg_text and batch insertion

In [None]:
def batch_data_prepare_esg_text(df, batch_size):
    """Group the rows of ``df`` into batches of insert-ready tuples.

    Every row is converted to a
    ``(company, year, country, industry, esg_text, labels)`` tuple, and the
    tuples are collected into lists of at most ``batch_size`` items, in row
    order.

    Args:
        df: DataFrame providing the columns ``company``, ``year``,
            ``country``, ``industry``, ``esg_text`` and ``labels``.
        batch_size: Number of rows after which a batch is closed
            (e.g. 100-200).

    Returns:
        A list of batches, each batch being a non-empty list of tuples.
        The last batch may be shorter than ``batch_size``. An empty ``df``
        yields an empty list.
    """
    batches = []  # completed batches
    batch_data = []  # batch currently being filled

    # batch data preparation
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Prepare batches", unit="document", leave=True, ncols=100):
        # One tuple per row, in the column order listed in the docstring
        batch_data.append((
            row['company'],
            row['year'],
            row['country'],
            row['industry'],
            row['esg_text'],
            row['labels']
        ))

        if len(batch_data) >= batch_size:
            batches.append(batch_data)
            batch_data = []  # start a fresh batch

    # Keep the leftover rows, but only when there are some: appending
    # unconditionally would add an empty batch whenever len(df) is an exact
    # multiple of batch_size (or df is empty).
    if batch_data:
        batches.append(batch_data)
    return batches

# Split the whole DataFrame into batches of 200 rows; `batch` is the list of
# batches consumed by the parallel insertion cell.
batch = batch_data_prepare_esg_text(df,200)

In [None]:
from concurrent.futures import ProcessPoolExecutor
from db.scripts.db_esg_text_batch import insert_esg_text_batch
# Fan the prepared batches out to worker processes, one
# insert_esg_text_batch call per batch.
with ProcessPoolExecutor() as executor:
    insert_results = executor.map(insert_esg_text_batch, batch)
    # Iterating the progress bar drains the map, so every batch is processed
    # and any worker exception is re-raised here.
    for _ in tqdm(insert_results, total=len(batch), desc='Insert batches into DB', unit='batch', ncols=100):
        pass


#### Single (non-batched) esg_text insert, for small data

In [None]:
# Non-batched path: hands the whole DataFrame to insert_esg_text in one call.
# NOTE(review): presumably writes the same rows as the batch insertion cell —
# run only one of the two unless the insert is idempotent; confirm in db_esg_text.
insert_esg_text(df)

### Insert into vectorDB after chromaDB

In [None]:
# Assume the ChromaDB ingestion below has already been run.
# We store the ids, documents and metadatas in a DB and load them into the client later.


#  client = chromadb.PersistentClient(path="./chromadb_1003")  # Stores DB in ./chroma_db
# collection = client.get_or_create_collection(name="dsa3101")
# logging.basicConfig(level=logging.WARNING)

# for index, row in tqdm(df.iterrows(), total=len(df), desc="Adding documents", unit="document", leave=True, ncols=100):
#     doc_text = row["esg_text"]  
#     doc_company = row["company"]  
#     doc_year = row["year"]  
#     doc_industry = row["industry"]
#     doc_id = f"doc_{index}"  

#     collection.add(
#         ids=[doc_id], 
#         documents=[doc_text],  
#         metadatas=[{"company": doc_company, "year": doc_year}] 
#     )

In [None]:
from tqdm import tqdm
from db.scripts.db_esg_vectorDB_batch import insert_esg_vectorDB 
import json
from concurrent.futures import ProcessPoolExecutor #Parallel Processing to speed up

In [None]:
# Row-by-row insertion: one insert_esg_vectorDB call per DataFrame row.
# The metadata JSON carries company and year only; industry is read from the
# row but not stored.
progress = tqdm(df.iterrows(), total=len(df), desc="Adding documents", unit="document", leave=True, ncols=100)
for index, row in progress:
    doc_text, doc_company, doc_year, doc_industry = (
        row[column] for column in ("esg_text", "company", "year", "industry")
    )
    doc_id = f"doc_{index}"

    metadata_json = json.dumps({"company": doc_company, "year": doc_year})
    insert_esg_vectorDB(doc_id, doc_text, metadatas=metadata_json)

In [15]:
import psycopg2
import json
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor

def insert_esg_vectorDB_batch(batch_data):
    """Insert one batch of documents into the ``esg_vectorDB`` table.

    Opens a new PostgreSQL connection, inserts every row of ``batch_data``
    with a single ``executemany`` call, commits, and always releases the
    cursor and the connection, including when the insert or commit fails.

    Connection settings are read from the environment variables ``PGHOST``,
    ``PGDATABASE``, ``PGUSER`` and ``PGPASSWORD``; when a variable is not set
    the previous hard-coded local development value is used.

    Args:
        batch_data: Sequence of ``(doc_id, doc_text, metadatas)`` tuples,
            one per row to insert.

    Raises:
        Any exception raised by psycopg2 while connecting, inserting or
        committing is propagated to the caller after cleanup.
    """
    import os  # local import keeps this cell self-contained

    conn = psycopg2.connect(
        host=os.environ.get("PGHOST", "localhost"),
        database=os.environ.get("PGDATABASE", "postgres"),
        user=os.environ.get("PGUSER", "postgres"),
        password=os.environ.get("PGPASSWORD", "123")
    )
    try:
        cur = conn.cursor()
        try:
            # Insert multiple rows using executemany
            insert_query = '''
        INSERT INTO esg_vectorDB (doc_id, doc_text, metadatas)
        VALUES (%s, %s, %s)
    '''
            cur.executemany(insert_query, batch_data)

            # Commit only after the whole batch went through
            conn.commit()
        finally:
            cur.close()
    finally:
        # Closing without a commit discards the pending transaction, so a
        # failed batch is not left half-written and the connection is not leaked.
        conn.close()

def process_batch(batch_data):
    """Insert one batch by forwarding it to insert_esg_vectorDB_batch.

    Used as the callable handed to ``executor.map`` in this cell, so it is
    called once per batch. Returns None.

    Args:
        batch_data: One batch of ``(doc_id, doc_text, metadatas)`` tuples.
    """
    insert_esg_vectorDB_batch(batch_data)

# Number of documents written per executemany call
batch_size = 100  # Adjust this as necessary

# Build one (doc_id, doc_text, metadatas-as-JSON) record per DataFrame row
records = []
for index, row in tqdm(df.iterrows(), total=len(df), desc="Preparing batches", unit="document", leave=True, ncols=100):
    doc_text = row["esg_text"]

    # Metadata is serialised to a JSON string before being stored
    metadatas = json.dumps({
        "company": row["company"],
        "year": row["year"],
        "industry": row["industry"],
    })

    records.append((f"doc_{index}", doc_text, metadatas))

# Cut the records into consecutive chunks of batch_size; the last chunk holds
# whatever is left over and is simply absent when nothing remains
batches = [
    records[start:start + batch_size]
    for start in range(0, len(records), batch_size)
]

# Parallel processing of batches: one process_batch call per chunk
with ProcessPoolExecutor() as executor:
    insert_progress = tqdm(executor.map(process_batch, batches), total=len(batches), desc="Inserting batches", unit="batch", ncols=100)
    # Draining the iterator waits for every batch and surfaces worker errors
    for _ in insert_progress:
        pass

print("Parallel batch processing completed.")


Preparing batches:   4%|█                             | 2254/63903 [00:00<00:05, 11526.63document/s]

Preparing batches: 100%|█████████████████████████████| 63903/63903 [00:04<00:00, 15182.37document/s]
Inserting batches: 100%|██████████████████████████████████████| 640/640 [00:05<00:00, 107.09batch/s]

Parallel batch processing completed.





### Checking Format of chromaDB results

In [5]:
# Show which fields the Chroma query result exposes.
# NOTE: in this notebook `results` is only assigned by the collection.query(...)
# cell further down, so on a fresh kernel that cell has to run before this one.
results.keys()

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'included'])

In [12]:
# Pull the ids, documents and metadatas fields out of the Chroma query result.
# NOTE: `results` is assigned by the collection.query(...) cell, which must
# have been executed before this one.
doc_id, doc_documents, doc_metadatas = (
    results[field] for field in ('ids', 'documents', 'metadatas')
)


In [13]:
# Natural-language question sent to the Chroma collection
query = "Retrieve percentage of reduction in Greenhouse gas emissions during the reporting year in the company. This can be in a) Total reduction, b) Scope 1 reduction and c) Scope 2 reduction"

# Metadata filter: both conditions must hold (year is matched as the float 2022.0)
company_year_filter = {
    "$and": [
        {"company": "Apple"},
        {"year": 2022.0},
    ]
}

# NOTE: `collection` is only created in the commented-out ChromaDB cell above;
# it must exist in the kernel before this cell is run.
results = collection.query(
    query_texts=[query],
    where=company_year_filter,
    n_results=5,
)

In [14]:
results

{'ids': [['doc_51558', 'doc_51413', 'doc_51407', 'doc_51406', 'doc_51420']],
 'embeddings': None,
 'documents': [['—> Continue reading on page 13  Reduced overall  emissions by 40%  In fiscal year 2021, our environmental  initiatives avoided over 23 million metric  tons of emissions across all scopes, and  we reduced our carbon footprint by  40 percent compared with fiscal year  2015.',
   'Without the methodology  change, these emissions would have increased by 14 percent, which reflects  the growth in our business.',
   'In fiscal year 2017, we started calculating scope 3 emissions not listed in  this table.',
   "Beginning in FY2021, we're accounting for scope 2 emissions from the  purchase of district heating, chilled water, and steam.",
   'When using the  same level of data granularity and model as 2021, our product use carbon  emissions in 2021 would have been about 2.5 percent lower.']],
 'uris': None,
 'data': None,
 'metadatas': [[{'company': 'Apple', 'year': 2022.0},
   {'co