#### DB Function to insert esg_text_table

In [59]:
import os
os.chdir('..') #go to dsa3101 folder as main

In [60]:
import psycopg2
import pandas as pd
from db.scripts.db_esg_text import insert_esg_text
df = pd.read_csv("./files/labeled_pdfs_1003.csv")

In [61]:
from tqdm import tqdm

#### Batch prepare esg_text and batch insertion

In [4]:
def batch_data_prepare_esg_text(df, batch_size):
    batch_data = [] #batch of data to append
    batches = [] #index of batches
    
    #batch data_preparation
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Prepare batches", unit="document", leave=True, ncols=100):
        batch_data.append((
            row['company'],
            int(row['year']),
            row['country'],
            row['industry'],
            row['esg_text'],
            row['labels']
        )) #appends a row to batch_data in tuple format for batch format

        if len(batch_data) >= batch_size: #eg 100-200?
            batches.append(batch_data)
            batch_data = [] #reset batch
    
    # Append leftovers as above code doesnt account for it
    batches.append(batch_data)
    return batches

batch = batch_data_prepare_esg_text(df,200)

Prepare batches:   0%|                                              | 0/63903 [00:00<?, ?document/s]

Prepare batches: 100%|███████████████████████████████| 63903/63903 [00:03<00:00, 19185.97document/s]


#### Insert into SupaBase

In [6]:
from concurrent.futures import ProcessPoolExecutor
from db.scripts.db_esg_text_batch import insert_esg_text_batch
with ProcessPoolExecutor() as executor: #allows for parallel processing
    list(tqdm(executor.map(insert_esg_text_batch,batch), total=len(batch), desc='Insert batches into DB', unit='batch', ncols=100))


Insert batches into DB: 100%|██████████████████████████████████| 320/320 [01:31<00:00,  3.49batch/s]


#### Single ESG_Text_Insert(Small Data)

In [None]:
insert_esg_text(df)

#### Insert into vectorDB in chromaDB format

In [None]:
# ASSUME THIS OCCURS
# WE STORE THE IDS, DOCUMENTS, METADATAS INTO A DB AND LOAD IT LATER TO THE CLIENT


#  client = chromadb.PersistentClient(path="./chromadb_1003")  # Stores DB in ./chroma_db
# collection = client.get_or_create_collection(name="dsa3101")
# logging.basicConfig(level=logging.WARNING)

# for index, row in tqdm(df.iterrows(), total=len(df), desc="Adding documents", unit="document", leave=True, ncols=100):
#     doc_text = row["esg_text"]  
#     doc_company = row["company"]  
#     doc_year = row["year"]  
#     doc_industry = row["industry"]
#     doc_id = f"doc_{index}"  

#     collection.add(
#         ids=[doc_id], 
#         documents=[doc_text],  
#         metadatas=[{"company": doc_company, "year": doc_year}] 
#     )

In [4]:
from tqdm import tqdm
from db.scripts.db_esg_vectorDB_batch import insert_esg_vectorDB_batch
import json
from concurrent.futures import ProcessPoolExecutor #Parallel Processing to speed up
import json

In [None]:
def batch_data_prepare_chromaDB(df, batch_size):
    batch_data = [] #batch of data to append
    batches = [] #index of batches
    
    #batch data_preparation, same as batch_data_prepare_esg
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Preparing batches", unit="document", leave=True, ncols=100):
        doc_text = row["esg_text"]
        doc_company = row["company"]
        doc_year = int(row["year"])
        doc_id = f"doc_{index}"

        metadatas = json.dumps({
            "company": doc_company,
            "year": doc_year,
        })

        batch_data.append((doc_id, doc_text, metadatas))

        if len(batch_data) >= batch_size:
            batches.append(batch_data)
            batch_data = []

    if batch_data:
        batches.append(batch_data)
    return batches

batch = batch_data_prepare_chromaDB(df,200)

Preparing batches: 100%|█████████████████████████████| 63903/63903 [00:03<00:00, 18363.03document/s]


#### insert into vectorDB in a batch

In [65]:
from db.scripts.db_esg_vectorDB_batch import insert_esg_vectorDB_batch

In [66]:
with ProcessPoolExecutor() as executor:
    list(tqdm(executor.map(insert_esg_vectorDB_batch, batch), total=len(batch), desc="Inserting batches", unit="batch", ncols=100))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

#### pgVector search

In [8]:
from dotenv import load_dotenv
import os
import psycopg2
from tqdm import tqdm

In [9]:
load_dotenv('.env')
#Get DB Params for Local DB
db_name = os.getenv('db_name')
db_user = os.getenv('db_user')
db_port = os.getenv('db_port')
db_host = os.getenv('db_host')
db_password = os.getenv('db_password')
conn = psycopg2.connect(f"dbname={db_name} user={db_user} password={db_password} host={db_host} port={db_port}")
cur = conn.cursor()

In [15]:
query = "Retrieve percentage of reduction in Greenhouse gas emissions during the reporting year in the company. This can be in a) Total reduction, b) Scope 1 reduction and c) Scope 2 reduction"

In [1]:
#embedding model
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [2]:
from langchain_postgres import PGVector

vector_store = PGVector(
    embeddings=embeddings,
    collection_name="test",
    connection="postgresql+psycopg2://postgres:123@localhost:5432/postgres",
)

In [10]:
from langchain_core.documents import Document

#### batch prepare pgVector ##Not as good as chromaDB

In [11]:
def batch_data_prepare_pgVector(df, batch_size):
    batch_data = [] #batch of data to append
    batches = [] #index of batches
    
    #batch data_preparation, same as batch_data_prepare_esg
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Preparing batches", unit="document", leave=True, ncols=100):
        doc_text = row["esg_text"]
        doc_company = row["company"]
        doc_year = int(row["year"])
        doc_id = index

        metadatas = {
            "id": doc_id,
            "company": doc_company,
            "year": doc_year,
        }

        batch_data.append(Document(page_content=doc_text, metadata=metadatas))

        if len(batch_data) >= batch_size:
            batches.append(batch_data)
            batch_data = []

    if batch_data:
        batches.append(batch_data)
    return batches

batch = batch_data_prepare_pgVector(df,200)

Preparing batches: 100%|█████████████████████████████| 63903/63903 [00:03<00:00, 16506.63document/s]


In [12]:
def process_batch_vector(batch):
    vector_store = PGVector(
        embeddings=embeddings,
        collection_name="test",
        connection="postgresql+psycopg2://postgres:123@localhost:5432/postgres",
    )

    vector_store.add_documents(batch)


In [13]:
for b in batch:
    process_batch_vector(b)

In [None]:
from concurrent.futures import ThreadPoolExecutor
## not faster than using CPU for processing but its faster to do this way than chromaDB which took 40mins
with ThreadPoolExecutor() as executor: #allows for parallel processing in cpu
    list(tqdm(executor.map(process_batch_vector,batch), total=len(batch), desc='Insert batches into DB', unit='batch', ncols=100))

Insert batches into DB:  19%|██████▌                            | 60/320 [10:50<46:58, 10.84s/batch]


In [43]:
results = vector_store.search(
    query=query, 
    filter={"company": "Apple", "year": 2022},
    search_type='similarity'
)

In [44]:
results

[Document(id='0725c4e9-3e49-48df-a26e-c2e84bcd6b07', metadata={'id': 51401, 'year': 2022, 'company': 'Apple'}, page_content='2,780  Scope 3 (gross emissions)* 23,130,000  Business travel®  Employee commute®  Corporate carbon offsets’  Product life  cycle emissions®  (metric tons COze) Manufacturing  (purchased goods  and services)  Product transportation  (upstream and downstream)  Product use  (use of sold products)  End-of-life treatment  Product carbon offsets?'),
 Document(id='343ea235-a9ba-433a-8168-df9f555ea4b6', metadata={'id': 51420, 'year': 2022, 'company': 'Apple'}, page_content='When using the  same level of data granularity and model as 2021, our product use carbon  emissions in 2021 would have been about 2.5 percent lower.'),
 Document(id='988b4cb1-99a0-460e-824d-229241719863', metadata={'id': 51596, 'year': 2022, 'company': 'Apple'}, page_content='Scope 3 greenhouse gas  emissions related to our products, calculated Customers Communities Governance Appendix  using life cy

#### chromaDB getting from huggingFace download

In [32]:
from huggingface_hub import HfApi
from datasets import load_dataset
from huggingface_hub import snapshot_download

dataset = 'alexxtm/3101_proj_chromaDB'


snapshot_download(local_dir="./test", repo_id=dataset, repo_type='dataset')


Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

header.bin:   0%|          | 0.00/100 [00:00<?, ?B/s]

length.bin:   0%|          | 0.00/252k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/2.51k [00:00<?, ?B/s]

link_lists.bin:   0%|          | 0.00/531k [00:00<?, ?B/s]

data_level0.bin:   0%|          | 0.00/106M [00:00<?, ?B/s]

chroma.sqlite3:   0%|          | 0.00/107M [00:00<?, ?B/s]

index_metadata.pickle:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

'/home/shiro/dsa3101_v2/dsa3101/test'

In [38]:
import chromadb
client = chromadb.PersistentClient(path="./chromatest")  # Stores DB in ./chroma_db
collection = client.get_or_create_collection(name="dsa3101")

#### chromaDB

In [None]:
query = "Retrieve percentage of reduction in Greenhouse gas emissions during the reporting year in the company. This can be in a) Total reduction, b) Scope 1 reduction and c) Scope 2 reduction"
results = collection.query(
    query_texts=[query],
   where={
        "$and": [
            {"company": "Apple"},
            {"year": 2022}
        ]
    },
    n_results=5
)

In [40]:
results

{'ids': [['doc_51558', 'doc_51413', 'doc_51407', 'doc_51406', 'doc_51420']],
 'embeddings': None,
 'documents': [['—> Continue reading on page 13  Reduced overall  emissions by 40%  In fiscal year 2021, our environmental  initiatives avoided over 23 million metric  tons of emissions across all scopes, and  we reduced our carbon footprint by  40 percent compared with fiscal year  2015.',
   'Without the methodology  change, these emissions would have increased by 14 percent, which reflects  the growth in our business.',
   'In fiscal year 2017, we started calculating scope 3 emissions not listed in  this table.',
   "Beginning in FY2021, we're accounting for scope 2 emissions from the  purchase of district heating, chilled water, and steam.",
   'When using the  same level of data granularity and model as 2021, our product use carbon  emissions in 2021 would have been about 2.5 percent lower.']],
 'uris': None,
 'data': None,
 'metadatas': [[{'company': 'Apple', 'year': 2022.0},
   {'co

In [17]:
pd.read_csv('country_regions.csv',index_col=0)

Unnamed: 0,country,region,subregion
0,Afghanistan,Asia,Southern Asia
1,Åland Islands,Europe,Northern Europe
2,Albania,Europe,Southern Europe
3,Algeria,Africa,Northern Africa
4,American Samoa,Oceania,Polynesia
...,...,...,...
244,Wallis and Futuna,Oceania,Polynesia
245,Western Sahara,Africa,Northern Africa
246,Yemen,Asia,Western Asia
247,Zambia,Africa,Sub-Saharan Africa


#### getting alpha vantage api

In [62]:
from dotenv import load_dotenv
load_dotenv('.env')

#Get DB Params for Local DB
# db_name = os.getenv('db_name')
# db_user = os.getenv('db_user')
# db_port = os.getenv('db_port')
# db_host = os.getenv('db_host')
# db_password = os.getenv('db_password')
# conn = psycopg2.connect(f"dbname={db_name} user={db_user} password={db_password} host={db_host} port={db_port}")

## SupaBase DB ##
db_url = os.getenv('DATABASE_URL')
conn = psycopg2.connect(db_url)

In [67]:
query = 'SELECT DISTINCT company FROM esg_text_table'
cur = conn.cursor()
cur.execute(query)
res = list(cur.fetchall())
result_list = [row[0] for row in res]
result_list

['ApplieMaterials',
 'LG',
 'Soltec',
 'Pfizer',
 'MPMaterials',
 'SASOL',
 'Infosys',
 'Infopulse',
 'petrobras',
 'Citibank',
 'Marvell',
 'TechnologyOne',
 'Mencast',
 'Genex',
 'Bioceres',
 'DBS',
 'DataDog',
 'Lenovo',
 'Thong Guan',
 'Apple',
 'RioTinto',
 'JohnsonControl',
 'Morgan_Stanley',
 'Enel',
 'NationalBankofKuwait-Egypt',
 'Nordson',
 'Capgemini',
 'Sea',
 'Hanwha_Solutions',
 'ANZ',
 '3M',
 'Razer',
 'SPX_Flow',
 'Itau',
 'WiseTechGlobal',
 'BankofChina',
 'WEG',
 'NorthVold',
 'Origin',
 'N-iX',
 'IBM',
 'DangoteCemente']

#### Symbol Ticker Search ==> Company Name needs to be accurate

In [None]:
pip install rapidfuzz

In [None]:
import yahooquery as yq
def get_ticker_symbol(company_name):
    ## fuzzy matching => we collect names from the database and fuzzy match, if nothing exist then we use yq.search


    #clean company name
    company_name = company_name.replace('-', ' ')
    company_name = company_name.replace('_', ' ')
    data = yq.search(company_name)
    if data['quotes'] == []: #empty
        return None
    symbol = data['quotes'][0]['symbol'] ##get first result
    print(symbol)
    return symbol

In [70]:
symbols = []
for i in result_list:
    if get_ticker_symbol(i) != None:
        symbols.append(get_ticker_symbol(i))

LGND
LGND
7ST.SG
7ST.SG
PFE
PFE
SSL
SSL
INFY
INFY
PBR
PBR
C
C
MRVL
MRVL
5NF.SI
5NF.SI
9820.T
9820.T
BIOX
BIOX
D05.SI
D05.SI
DDOG
DDOG
0992.HK
0992.HK
7034.KL
7034.KL
AAPL
AAPL
MS
MS
ENEL.MI
ENEL.MI
NDSN
NDSN
CAP.PA
CAP.PA
SEA
SEA
009830.KS
009830.KS
ANZ.AX
ANZ.AX
MMM
MMM
FLOW
FLOW
ITUB
ITUB
WGNR
WGNR
ORGN
ORGN
0P0000A2DS.SW
0P0000A2DS.SW
IBM
IBM


In [71]:
symbols

['LGND',
 '7ST.SG',
 'PFE',
 'SSL',
 'INFY',
 'PBR',
 'C',
 'MRVL',
 '5NF.SI',
 '9820.T',
 'BIOX',
 'D05.SI',
 'DDOG',
 '0992.HK',
 '7034.KL',
 'AAPL',
 'MS',
 'ENEL.MI',
 'NDSN',
 'CAP.PA',
 'SEA',
 '009830.KS',
 'ANZ.AX',
 'MMM',
 'FLOW',
 'ITUB',
 'WGNR',
 'ORGN',
 '0P0000A2DS.SW',
 'IBM']

### get alpha vantage roa roe etc

In [None]:
import requests

def get_financial_data(company_name):
    load_dotenv()
    alpha_api = os.getenv('ALPHA_API_KEY')

    #get company ticker symbol, api#1
    url = f'https://www.alphavantage.co/query?function=SYMBOL_SEARCH&keywords={company_name}&apikey={alpha_api}'
    r = requests.get(url)
    data = r.json()
    if data['bestMatches'] == []: #if empty
        return None #dont continue no data means we cant scrape anything
    ticker_symbol = data['bestMatches'][0]['1. symbol']

    #getting the

In [77]:
import requests
load_dotenv()
alpha_api = os.getenv('ALPHA_API_KEY')
company = 'Infosys' #etc
url = f'https://www.alphavantage.co/query?function=SYMBOL_SEARCH&keywords={company}&apikey={alpha_api}'

r4 = requests.get(url)
data4 = r4.json()
data4

{'bestMatches': [{'1. symbol': 'INFY',
   '2. name': 'Infosys Ltd',
   '3. type': 'Equity',
   '4. region': 'United States',
   '5. marketOpen': '09:30',
   '6. marketClose': '16:00',
   '7. timezone': 'UTC-04',
   '8. currency': 'USD',
   '9. matchScore': '0.7778'},
  {'1. symbol': 'I1FO34.SAO',
   '2. name': 'Infosys Limited',
   '3. type': 'Equity',
   '4. region': 'Brazil/Sao Paolo',
   '5. marketOpen': '10:00',
   '6. marketClose': '17:30',
   '7. timezone': 'UTC-03',
   '8. currency': 'BRL',
   '9. matchScore': '0.6364'},
  {'1. symbol': 'INFY.BSE',
   '2. name': 'Infosys Limited',
   '3. type': 'Equity',
   '4. region': 'India/Bombay',
   '5. marketOpen': '09:15',
   '6. marketClose': '15:30',
   '7. timezone': 'UTC+5.5',
   '8. currency': 'INR',
   '9. matchScore': '0.6364'},
  {'1. symbol': 'IOY.FRK',
   '2. name': 'Infosys Limited',
   '3. type': 'Equity',
   '4. region': 'Frankfurt',
   '5. marketOpen': '08:00',
   '6. marketClose': '20:00',
   '7. timezone': 'UTC+02',
   '8

In [None]:
import requests
load_dotenv()
alpha_api = os.getenv('ALPHA_API_KEY')
company = 'Infosys' #etc
url = 'https://www.alphavantage.co/query?function=SYMBOL_SEARCH&keywords={company}&apikey={alpha_api}'\

r = requests.get(url)
data = r.json()

print(data)

{'symbol': 'INFY', 'annualReports': [{'fiscalDateEnding': '2024-03-31', 'reportedCurrency': 'USD', 'totalAssets': '16523000000', 'totalCurrentAssets': '10722000000', 'cashAndCashEquivalentsAtCarryingValue': '1773000000', 'cashAndShortTermInvestments': '1773000000', 'inventory': '43000000', 'currentNetReceivables': 'None', 'totalNonCurrentAssets': '5801000000', 'propertyPlantEquipment': 'None', 'accumulatedDepreciationAmortizationPPE': 'None', 'intangibleAssets': '167000000', 'intangibleAssetsExcludingGoodwill': '167000000', 'goodwill': '875000000', 'investments': 'None', 'longTermInvestments': '1538000000', 'shortTermInvestments': '1660000000', 'otherCurrentAssets': '853000000', 'otherNonCurrentAssets': 'None', 'totalLiabilities': '5918000000', 'totalCurrentLiabilities': '4651000000', 'currentAccountsPayable': 'None', 'deferredRevenue': 'None', 'currentDebt': 'None', 'shortTermDebt': '235810558', 'totalNonCurrentLiabilities': '1267000000', 'capitalLeaseObligations': '1002000000', 'long

In [54]:
url = 'https://www.alphavantage.co/query?function=BALANCE_SHEET&symbol=INFY&apikey={alpha_api}'
r = requests.get(url)
data = r.json()

print(data)

{'symbol': 'INFY', 'annualReports': [{'fiscalDateEnding': '2024-03-31', 'reportedCurrency': 'USD', 'totalAssets': '16523000000', 'totalCurrentAssets': '10722000000', 'cashAndCashEquivalentsAtCarryingValue': '1773000000', 'cashAndShortTermInvestments': '1773000000', 'inventory': '43000000', 'currentNetReceivables': 'None', 'totalNonCurrentAssets': '5801000000', 'propertyPlantEquipment': 'None', 'accumulatedDepreciationAmortizationPPE': 'None', 'intangibleAssets': '167000000', 'intangibleAssetsExcludingGoodwill': '167000000', 'goodwill': '875000000', 'investments': 'None', 'longTermInvestments': '1538000000', 'shortTermInvestments': '1660000000', 'otherCurrentAssets': '853000000', 'otherNonCurrentAssets': 'None', 'totalLiabilities': '5918000000', 'totalCurrentLiabilities': '4651000000', 'currentAccountsPayable': 'None', 'deferredRevenue': 'None', 'currentDebt': 'None', 'shortTermDebt': '235810558', 'totalNonCurrentLiabilities': '1267000000', 'capitalLeaseObligations': '1002000000', 'long

In [61]:
url2 = 'https://www.alphavantage.co/query?function=INCOME_STATEMENT&symbol=INFY&apikey={alpha_api}'
r2 = requests.get(url2)
data2 = r2.json()

print(data2)

{'symbol': 'INFY', 'annualReports': [{'fiscalDateEnding': '2024-03-31', 'reportedCurrency': 'USD', 'grossProfit': '5466000000', 'totalRevenue': '18562000000', 'costOfRevenue': '13096000000', 'costofGoodsAndServicesSold': '13096000000', 'operatingIncome': '3834000000', 'sellingGeneralAndAdministrative': '790000000', 'researchAndDevelopment': '135037609', 'operatingExpenses': '1632000000', 'investmentIncomeNet': 'None', 'netInterestIncome': '431000000', 'interestIncome': '487000000', 'interestExpense': '56000000', 'nonInterestIncome': 'None', 'otherNonOperatingIncome': 'None', 'depreciation': 'None', 'depreciationAndAmortization': '372000000', 'incomeBeforeTax': '4346000000', 'incomeTaxExpense': '1177000000', 'interestAndDebtExpense': 'None', 'netIncomeFromContinuingOperations': '3169000000', 'comprehensiveIncomeNetOfTax': 'None', 'ebit': '3834000000', 'ebitda': '4206000000', 'netIncome': '3167000000'}, {'fiscalDateEnding': '2023-03-31', 'reportedCurrency': 'USD', 'grossProfit': '5503000

In [58]:
data

{'symbol': 'INFY',
 'annualReports': [{'fiscalDateEnding': '2024-03-31',
   'reportedCurrency': 'USD',
   'totalAssets': '16523000000',
   'totalCurrentAssets': '10722000000',
   'cashAndCashEquivalentsAtCarryingValue': '1773000000',
   'cashAndShortTermInvestments': '1773000000',
   'inventory': '43000000',
   'currentNetReceivables': 'None',
   'totalNonCurrentAssets': '5801000000',
   'propertyPlantEquipment': 'None',
   'accumulatedDepreciationAmortizationPPE': 'None',
   'intangibleAssets': '167000000',
   'intangibleAssetsExcludingGoodwill': '167000000',
   'goodwill': '875000000',
   'investments': 'None',
   'longTermInvestments': '1538000000',
   'shortTermInvestments': '1660000000',
   'otherCurrentAssets': '853000000',
   'otherNonCurrentAssets': 'None',
   'totalLiabilities': '5918000000',
   'totalCurrentLiabilities': '4651000000',
   'currentAccountsPayable': 'None',
   'deferredRevenue': 'None',
   'currentDebt': 'None',
   'shortTermDebt': '235810558',
   'totalNonCurre

In [62]:
data2

{'symbol': 'INFY',
 'annualReports': [{'fiscalDateEnding': '2024-03-31',
   'reportedCurrency': 'USD',
   'grossProfit': '5466000000',
   'totalRevenue': '18562000000',
   'costOfRevenue': '13096000000',
   'costofGoodsAndServicesSold': '13096000000',
   'operatingIncome': '3834000000',
   'sellingGeneralAndAdministrative': '790000000',
   'researchAndDevelopment': '135037609',
   'operatingExpenses': '1632000000',
   'investmentIncomeNet': 'None',
   'netInterestIncome': '431000000',
   'interestIncome': '487000000',
   'interestExpense': '56000000',
   'nonInterestIncome': 'None',
   'otherNonOperatingIncome': 'None',
   'depreciation': 'None',
   'depreciationAndAmortization': '372000000',
   'incomeBeforeTax': '4346000000',
   'incomeTaxExpense': '1177000000',
   'interestAndDebtExpense': 'None',
   'netIncomeFromContinuingOperations': '3169000000',
   'comprehensiveIncomeNetOfTax': 'None',
   'ebit': '3834000000',
   'ebitda': '4206000000',
   'netIncome': '3167000000'},
  {'fisca

In [93]:
test = [
    {'fiscalDateEnding': i['fiscalDateEnding'], 'totalAssets': int(i['totalAssets']), 'totalShareholderEquity': int(i['totalShareholderEquity'])}
    for i in data['annualReports']
]

In [91]:
test2 = [
    int(i['netIncome'])
    for i in data2['annualReports']
]

In [97]:
df4 =pd.DataFrame(test2)

In [103]:
df4[0]

0     3167000000
1     2981000000
2     2963000000
3     2613000000
4     2331000000
5     2199000000
6     2486000000
7     2140000000
8     2052000000
9     2013000000
10    1751000000
11    1725000000
12    1716000000
13    1499000000
14    1313000000
15    1281000000
16    1155000000
17     850000000
18     555000000
19     419000000
Name: 0, dtype: int64

In [98]:
df3 = pd.DataFrame(test)

In [105]:
df3['totalAssets'] / df4[0]

0     5.217240
1     5.136531
2     5.249747
3     5.673555
4     5.259545
5     5.571623
6     4.929606
7     6.006542
8     5.544834
9     5.273224
10    5.438035
11    4.950145
12    4.392191
13    4.676451
14    4.683930
15    3.416081
16    3.889177
17    3.615294
18    3.722523
19    3.470167
dtype: float64

In [92]:
test2 #net incomes

[3167000000,
 2981000000,
 2963000000,
 2613000000,
 2331000000,
 2199000000,
 2486000000,
 2140000000,
 2052000000,
 2013000000,
 1751000000,
 1725000000,
 1716000000,
 1499000000,
 1313000000,
 1281000000,
 1155000000,
 850000000,
 555000000,
 419000000]

In [38]:
import yfinance as yf
import pandas as pddata

stock = yf.Ticker('INFY')

In [None]:
#monthly_stock_prices
monthly_stock = stock.history(period="5y", interval="1mo")["Close"]

In [50]:
#quarterly financial reports
income_statements = stock.quarterly_financials
balance_sheets = stock.quarterly_balancesheet

In [53]:
balance_sheets

Unnamed: 0,2024-12-31,2024-09-30,2024-06-30,2024-03-31,2023-12-31,2023-09-30,2023-06-30
Treasury Shares Number,10187113.0,10237261.0,10246512.0,10916829.0,11249465.0,,
Ordinary Shares Number,4142082081.0,4141909556.0,4141781963.0,4139950635.0,4139198089.0,,
Share Issued,4152269194.0,4152146817.0,4152028475.0,4150867464.0,4150447554.0,,
Total Debt,667000000.0,756000000.0,740000000.0,1002000000.0,802000000.0,,
Tangible Book Value,8799000000.0,9185000000.0,8901000000.0,9517000000.0,8542000000.0,,
...,...,...,...,...,...,...,...
Cash Cash Equivalents And Short Term Investments,3596000000.0,3488000000.0,3022000000.0,3433000000.0,2598000000.0,,
Other Short Term Investments,933000000.0,887000000.0,1051000000.0,1660000000.0,958000000.0,,
Cash And Cash Equivalents,2663000000.0,2601000000.0,1971000000.0,1773000000.0,1640000000.0,,
Cash Equivalents,,,,0.0,,1000000.0,238000000.0


In [40]:
data

{'explains': [],
 'count': 15,
 'quotes': [{'exchange': 'NYQ',
   'shortname': 'Petroleo Brasileiro S.A.- Petro',
   'quoteType': 'EQUITY',
   'symbol': 'PBR',
   'index': 'quotes',
   'score': 20524.0,
   'typeDisp': 'Equity',
   'longname': 'Petróleo Brasileiro S.A. - Petrobras',
   'exchDisp': 'NYSE',
   'sector': 'Energy',
   'sectorDisp': 'Energy',
   'industry': 'Oil & Gas Integrated',
   'industryDisp': 'Oil & Gas Integrated',
   'dispSecIndFlag': True,
   'isYahooFinance': True},
  {'exchange': 'NYQ',
   'shortname': 'Petroleo Brasileiro S.A.- Petro',
   'quoteType': 'EQUITY',
   'symbol': 'PBR-A',
   'index': 'quotes',
   'score': 20160.0,
   'typeDisp': 'Equity',
   'longname': 'Petróleo Brasileiro S.A. - Petrobras',
   'exchDisp': 'NYSE',
   'sector': 'Energy',
   'sectorDisp': 'Energy',
   'industry': 'Oil & Gas Integrated',
   'industryDisp': 'Oil & Gas Integrated',
   'isYahooFinance': True},
  {'exchange': 'SAO',
   'shortname': 'PETROBRAS   PN      N2',
   'quoteType': 