Set the working directory

In [None]:
import os

# Move up to the project root (the folder that contains the `db` package) so the
# `db.scripts.*` imports and the relative data paths used below resolve.
# The check keeps the cell idempotent: re-running it no longer climbs one more
# directory level each time.
if not os.path.isdir(os.path.join('db', 'scripts')):
    os.chdir('..')

#### To create the database, run db.py

In [8]:
from concurrent.futures import ProcessPoolExecutor
from db.scripts.db_esg_text_batch import insert_esg_text_batch
from tqdm import tqdm
from db.scripts.db_esg_vectorDB_batch import insert_esg_vectorDB_batch
from concurrent.futures import ProcessPoolExecutor #Parallel Processing to speed up
import json
from db.scripts.db_esg_vectorDB_batch import insert_esg_vectorDB_batch
from rapidfuzz import process, fuzz
import pandas as pd
import yahooquery as yq
import yfinance as yf
from db.scripts.db_insert_stocks import insert_stocks
from db.scripts.db_insert_roa_roe import insert_roa_roe
from dotenv import load_dotenv
import psycopg2
from db.scripts.get_ticker_symbol import get_ticker_symbol
from db.scripts.get_roa_roe import get_roa_roe
from db.scripts.get_stocks import get_stocks

### To switch DB insertion from the local database to Supabase, open the db scripts, uncomment the Supabase lines (including `conn`) and comment out the local-DB lines (including `conn`)


### The batch functions must be run through a ProcessPoolExecutor, otherwise they raise an error. If you don't want to use batching, use the non-batch scripts instead.

### Insert into esg_text_table with Batch Processing

In [4]:
# Load the labelled PDF data from CSV; this frame is the input to the batch preparation below.
# TODO: automate this so the frame comes straight from the esg_bert step instead of a hard-coded CSV.
df = pd.read_csv("./files/labeled_pdfs_1603.csv")

In [16]:
# Split df into batches for insertion; 200 is presumably the batch size (the analogous
# batch_data_prepare_esg_rag_table call later in this notebook documents its second argument that way) — confirm.
# NOTE(review): batch_data_prepare_esg_text is not imported anywhere in this notebook, so this cell
# raises NameError on a fresh kernel. It probably lives under db.scripts like the rag-table
# variant — confirm the module path and add the import to the import cell.
batch = batch_data_prepare_esg_text(df,200)

Prepare batches: 100%|███████████████████████████████| 63903/63903 [00:03<00:00, 18062.75document/s]


#### DB insertion needs the ProcessPoolExecutor for batch processing

In [6]:
from concurrent.futures import ProcessPoolExecutor
from db.scripts.db_esg_text_batch import insert_esg_text_batch

# Run insert_esg_text_batch once per element of `batch`, spread across worker processes.
with ProcessPoolExecutor() as pool:
    insert_results = pool.map(insert_esg_text_batch, batch)
    progress = tqdm(
        insert_results,
        total=len(batch),
        desc='Insert batches into DB',
        unit='batch',
        ncols=100,
    )
    # map() is lazy: consuming the iterator waits for every batch and drives the progress bar.
    list(progress)


Insert batches into DB: 100%|██████████████████████████████████| 320/320 [00:04<00:00, 73.66batch/s]


#### ChromaDB (WIP): waiting for the complete code before I can write the DB insertion script; in the meantime we can persist the data

In [None]:
# NOTE(review): HfApi and load_dataset are imported but not used in this cell.
from huggingface_hub import HfApi
from datasets import load_dataset
from huggingface_hub import snapshot_download

# Hugging Face dataset repo to pull (use a Hugging Face key, or pull our own copy from GitHub instead).
dataset = 'alexxtm/3101_proj_chromaDB'


# Download a snapshot of the dataset repo into ./test.
snapshot_download(local_dir="./test", repo_id=dataset, repo_type='dataset')
import chromadb
# Open (or create) a persistent Chroma store under ./chromatest.
# NOTE(review): the snapshot above is downloaded to ./test, not ./chromatest — confirm that
# the client is really meant to point at a different directory.
client = chromadb.PersistentClient(path="./chromatest")
collection = client.get_or_create_collection(name="dsa3101")

### Getting Financial Data

##### Loading the company_ticker table into a DataFrame

In [None]:
# Read the connection settings for the local database from .env.
load_dotenv('.env')
db_name = os.getenv('db_name')
db_user = os.getenv('db_user')
db_port = os.getenv('db_port')
db_host = os.getenv('db_host')
db_password = os.getenv('db_password')

# Pass the settings as keyword arguments instead of formatting a DSN string by hand:
# values containing spaces, quotes or backslashes (typical for passwords) would
# corrupt a hand-built DSN, whereas psycopg2 escapes keyword values itself.
# `conn` is deliberately left open — later cells reuse it.
conn = psycopg2.connect(
    dbname=db_name,
    user=db_user,
    password=db_password,
    host=db_host,
    port=db_port,
)

query = 'SELECT * FROM company_ticker'
# The cursor is only needed for this one query, so release it as soon as the rows
# and column names have been read.
with conn.cursor() as cur:
    cur.execute(query)
    data = cur.fetchall()
    columns = [desc[0] for desc in cur.description]

df = pd.DataFrame(data, columns=columns)
df  # the company_ticker table as a DataFrame

Unnamed: 0,symbol,company_name
0,A,"Agilent Technologies, Inc. Common Stock"
1,AA,Alcoa Corporation Common Stock
2,AAA,Alternative Access First Priority CLO Bond ETF
3,AAAU,Goldman Sachs Physical Gold ETF Shares
4,AACT,Ares Acquisition Corporation II Class A Ordina...
...,...,...
14113,ZYME,Zymeworks Inc. - Common Stock
14114,ZYXI,"Zynex, Inc. - Common Stock"
14115,ZZZ,Cyber Hornet S&P 500 and Bitcoin 75/25 Strateg...
14116,TRUE,"TrueCar, Inc. - Common Stock"


#### Getting the unique companies from our esg_text_table

In [27]:
# Collect every distinct value of the `company` column in esg_text_table.
query = 'SELECT DISTINCT company FROM esg_text_table'
cur = conn.cursor()
cur.execute(query)
res = cur.fetchall()
# Each fetched row is a 1-tuple (company,), so unpack it straight into the name.
result_list = [company for (company,) in res]
result_list

['Applied Materials',
 'Soltec',
 'Pfizer',
 'MPMaterials',
 'SASOL',
 'Morgan Stanley',
 'Infosys',
 'Infopulse',
 'petrobras',
 'Citibank',
 'TechnologyOne',
 'Marvell',
 'Mencast',
 'Genex',
 'Bioceres',
 'DBS',
 'Lenovo',
 'DataDog',
 'SPX Flow',
 'Thong Guan',
 'Apple',
 'RioTinto',
 'Enel',
 'National Bank of Kuwait-Egypt',
 'NorthVolt',
 'Nordson',
 'WiseTech Global',
 'Capgemini',
 'Sea',
 'ANZ',
 '3M',
 'Razer',
 'Itau',
 'Dangote Cement',
 'WEG',
 'LG Electronics',
 'Origin',
 'N-iX',
 'IBM',
 'Bank Of China',
 'Johnson Controls',
 'Hanwha Solutions']

#### Getting the ticker symbols

In [28]:
# Look up a ticker for every company name, using df (the company_ticker frame) as the lookup table.
symbols = [get_ticker_symbol(company_name, df) for company_name in result_list]

In [29]:
# Show the resolved tickers; the None entries visible in the output are filtered out in the next step.
symbols

['AMAT',
 '7ST.SG',
 'PFE',
 'MP',
 'SSL',
 'MS',
 'INFY',
 None,
 'PBR',
 'C',
 'REW',
 'MRVL',
 '5NF.SI',
 '9820.T',
 'BIOX',
 'D05.SI',
 'LNVGF',
 'DDOG',
 'FLOW',
 '7034.KL',
 'AAPL',
 'RIO',
 'ENIC',
 'EGS60171C013-EGP.CA',
 None,
 'NDSN',
 'WTC.AX',
 'CGEMY',
 'SEA',
 'ANZ.NZ',
 'MMM',
 None,
 'ITUB',
 None,
 'WGNR',
 '066570.KS',
 'ORGN',
 '0P0000A2DS.SW',
 'IBM',
 '3988.HK',
 'JCI',
 '009830.KS']

#### Keeping only the non-private companies (no information is available for private companies)

In [30]:
# Pair each company name with its ticker and keep only the rows that actually have one.
company_ticker = (
    pd.DataFrame({'symbol': symbols, 'name': result_list})
    .dropna(subset=['symbol'])
)

#### Past 10 yrs stock prices for companies

In [38]:
# Fetch the stock data for every company with a ticker and insert it;
# companies for which get_stocks returns None are skipped.
for ticker, company in zip(company_ticker['symbol'], company_ticker['name']):
    stocks = get_stocks(ticker, company)  # a DataFrame, or None
    if stocks is not None:
        insert_stocks(stocks)

#### Past ROA-ROE

In [41]:
# Fetch the ROA/ROE data for every company with a ticker and insert it;
# companies for which get_roa_roe returns None are skipped.
for ticker, company in zip(company_ticker['symbol'], company_ticker['name']):
    roa_roe = get_roa_roe(ticker, company)
    if roa_roe is not None:
        insert_roa_roe(roa_roe)

#### esg_Rag_table

In [1]:
import os

# Step up to the project root only when we are not already there. An unconditional
# os.chdir('..') here moves one level too far whenever the working directory was
# already changed earlier in the session (e.g. on Restart & Run All), which breaks
# the `db.scripts.*` imports and the relative CSV path below.
if not os.path.isdir(os.path.join('db', 'scripts')):
    os.chdir('..')

from tqdm import tqdm
from db.scripts.db_esg_rag_table import insert_esg_rag_table
from db.scripts.batch_data_prepare_esg_rag_table import batch_data_prepare_esg_rag_table
import pandas as pd
from db.scripts.db_esg_rag_table_batch import insert_esg_rag_table_batch

# Example input: the CSV provided with the project.
df = pd.read_csv('./csv/esg_rag_data.csv')
batch_rag = batch_data_prepare_esg_rag_table(df, 100)  # 100 is the batch size
batch_rag[0][0]  # first prepared record of the first batch

Prepare batches: 100%|█████████████████████████████████████| 69/69 [00:00<00:00, 14820.86document/s]


('DBS',
 2023,
 'Total Greenhouse Gas Emissions',
 "{'Total Greenhouse Gas Emissions': 0.46554}",
 0.46554)

In [2]:
# Fifth record of the first batch: an example whose score is missing (the last field is None in the output).
batch_rag[0][4]

('UOBGROUP',
 2023,
 'Total Energy consumption',
 "{'Total Energy consumption': <NA>}",
 None)

In [2]:
from concurrent.futures import ProcessPoolExecutor

# Run insert_esg_rag_table_batch once per element of `batch_rag`, spread across worker processes.
with ProcessPoolExecutor() as pool:
    insert_results = pool.map(insert_esg_rag_table_batch, batch_rag)
    progress = tqdm(
        insert_results,
        total=len(batch_rag),
        desc='Insert batches into DB',
        unit='batch',
        ncols=100,
    )
    # map() is lazy: consuming the iterator waits for every batch and drives the progress bar.
    list(progress)

Insert batches into DB: 100%|██████████████████████████████████████| 1/1 [00:00<00:00,  1.58batch/s]


Single insertion (alternative to the batch insert above)

In [2]:
# Non-batched alternative: hand the whole DataFrame to insert_esg_rag_table. The output shows it
# printing each row as it goes — presumably one insert per row; confirm in db/scripts/db_esg_rag_table.py.
# NOTE(review): running this after the batched insert above loads the same CSV a second time —
# check whether the table or script guards against duplicate rows.
insert_esg_rag_table(df)

company                                                     DBS
year                                                       2023
topic                            Total Greenhouse Gas Emissions
extracted_values    {'Total Greenhouse Gas Emissions': 0.46554}
final_score                                             0.46554
Name: 0, dtype: object
company                                            UOBGROUP
year                                                   2023
topic                        Total Greenhouse Gas Emissions
extracted_values    {'Total Greenhouse Gas Emissions': 0.5}
final_score                                             0.5
Name: 1, dtype: object
company                                                OCBC
year                                                   2023
topic                        Total Greenhouse Gas Emissions
extracted_values    {'Total Greenhouse Gas Emissions': 0.0}
final_score                                             0.0
Name: 2, dtype: object
company    

##### misc

In [11]:
## Clean the all_company CSV: drop duplicate rows and renumber the index.

df = pd.read_csv('csv/all_company.csv')
# drop_duplicates() and reset_index() return new frames, so the result has to be assigned;
# previously it was discarded and the unchanged frame was written back out.
# drop=True keeps the old index from being added as an extra 'index' column.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column (a previously saved index). If its
# values are unique per row, no two rows are ever fully identical and nothing gets dropped —
# consider de-duplicating on ['symbol', 'company_name'] instead.
df = df.drop_duplicates().reset_index(drop=True)
# The output path is unchanged: the cleaned file is written to the working directory, not back into csv/.
df.to_csv('all_company.csv', index=False)

In [12]:
# Display the company/ticker frame produced by the previous cell.
df

Unnamed: 0,symbol,company_name
0,A,"Agilent Technologies, Inc. Common Stock"
1,AA,Alcoa Corporation Common Stock
2,AAA,Alternative Access First Priority CLO Bond ETF
3,AAAU,Goldman Sachs Physical Gold ETF Shares
4,AACT,Ares Acquisition Corporation II Class A Ordina...
...,...,...
14113,ZYME,Zymeworks Inc. - Common Stock
14114,ZYXI,"Zynex, Inc. - Common Stock"
14115,ZZZ,Cyber Hornet S&P 500 and Bitcoin 75/25 Strateg...
14116,TRUE,"TrueCar, Inc. - Common Stock"
