In [1]:
import pandas as pd
import yfinance as yf
from datetime import datetime
from gdeltdoc import GdeltDoc, Filters, near, repeat
from newspaper import Article, ArticleException
import feedparser
import requests
from urllib.parse import urlencode

In [2]:
def get_articles_google(company:str, query:str, topic:str, timeout:float = 30) -> pd.DataFrame:
    """Search the Google News RSS feed for one company/query pair.

    Parameters
    ----------
    company : str
        Company name; combined with ``query`` to form the search terms.
    query : str
        Query string appended to the company name.
    topic : str
        Label copied unchanged into the ``topic`` column of every row.
    timeout : float, default 30
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the crawl indefinitely.

    Returns
    -------
    pd.DataFrame
        One row per feed entry with columns ``title``, ``url``, ``language``,
        ``domain``, ``date`` (the entry's ``published`` string), ``query``,
        ``company``, ``topic`` and ``company_query``. Empty when the feed has
        no entries.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (including timeouts).
    """
    # Build the search terms once so the query sent to Google is exactly the
    # string recorded in `company_query`. Previously the URL used
    # `company + query` without the separating space.
    search_terms = company + " " + query
    url = "https://news.google.com/rss/search?" + urlencode({'q': search_terms})
    resp = requests.get(url, timeout=timeout)
    news = feedparser.parse(resp.text)

    all_news = []
    for news_ent in news.entries:
        all_news.append([
            news_ent.title,
            news_ent.links[0].href,           # first link of the entry
            news_ent.title_detail.language,
            news_ent.source.href,             # stored in the `domain` column
            news_ent.published,
            query,
            company,
            topic,
            search_terms,
        ])

    columns = ['title', 'url', 'language', 'domain', 'date', 'query', 'company', 'topic', 'company_query']
    return pd.DataFrame(all_news, columns=columns)

def crawl_google(companies:list, esg_dict:dict) -> pd.DataFrame:
    """Crawl Google News for every (company, topic, query) combination.

    Parameters
    ----------
    companies : list
        Company names to search for.
    esg_dict : dict
        Maps a topic to a list of query strings; each query is searched
        for each company.

    Returns
    -------
    pd.DataFrame
        All per-query results stacked into one frame with a fresh
        0..n-1 index. ``date`` is parsed from the RSS format and stored as a
        string, and a constant ``crawl_source`` column set to ``'google'`` is
        added. An empty frame with the same columns is returned when there is
        nothing to crawl.

    Raises
    ------
    ValueError
        If a ``date`` value does not match ``'%a, %d %b %Y %H:%M:%S %Z'``.
    """
    print("GOOGLE CRAWL STARTED")
    frames = [
        get_articles_google(company=company, query=query, topic=topic)
        for company in companies
        for topic, query_list in esg_dict.items()
        for query in query_list
    ]

    # pd.concat raises ValueError on an empty list, so return a correctly
    # shaped empty frame when there are no companies or no queries.
    if not frames:
        return pd.DataFrame(columns=['title', 'url', 'language', 'domain', 'date', 'query',
                                     'company', 'topic', 'company_query', 'crawl_source'])

    # ignore_index avoids repeating each per-query frame's 0..k index labels
    # in the combined frame.
    all_news_df = pd.concat(frames, ignore_index=True)

    parsed_dates = [datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')
                    for date in all_news_df['date'].values.tolist()]
    all_news_df['date'] = parsed_dates
    all_news_df['date'] = all_news_df['date'].astype(str)
    all_news_df['crawl_source'] = 'google'

    return all_news_df

In [4]:
# Project-local taxonomy module; its `esg_dict` is passed to crawl_google,
# which iterates it as {topic: [query, ...]}.
from Static import taxonomy_new
esg_queries = taxonomy_new.esg_dict

In [5]:
# Companies to crawl, grouped by sector. Each name is used verbatim as the
# search term and stored in the `company` column, so spelling matters.

# Oil & gas
# Chevron
# Exxon
# British Petroleum
# Shell

# Technology
# Apple
# Microsoft
# Tesla
# Google

# Banking
# Bank of America
# Goldman Sachs
# Jp Morgan
# Morgan Stanley

companies_oil = ['Chevron', 'Exxon', 'British Petroleum', 'Shell']
companies_tech = ['Apple', 'Microsoft', 'Tesla', 'Google']  # was misspelled 'Micorsoft'
companies_bank = ['Bank of America', 'Goldman Sachs', 'Jp Morgan', 'Morgan Stanley']

In [None]:
# Run the Google News crawl once per sector. Each call issues one HTTP
# request per (company, query) pair, so this cell needs network access.
oil_news = crawl_google(companies_oil, esg_queries)
tech_news = crawl_google(companies_tech, esg_queries)
bank_news = crawl_google(companies_bank, esg_queries)

In [None]:
# Write each sector's crawl results to CSV in the current working directory
# (the DataFrame index is written as the first column).
# NOTE(review): later cells read './data/news/<sector>_news.csv', not these
# paths — confirm whether the files are moved by hand or the paths should match.
oil_news.to_csv('oil_news.csv')
tech_news.to_csv('tech_news.csv')
bank_news.to_csv('bank_news.csv')

## Get text from news

In [2]:
def get_article(url):
    """Download and parse the page at ``url`` with newspaper's ``Article``.

    Returns the ``Article`` object after ``download()`` and ``parse()`` have
    been called on it; any exception they raise propagates to the caller.
    """
    fetched = Article(url)
    # parse() operates on the downloaded content, so download() must run first.
    for step in (fetched.download, fetched.parse):
        step()
    return fetched

In [5]:
def process_for_label(df):
    """Fetch the article text behind every URL in ``df['url']``.

    URLs whose download raises ``ArticleException`` are reported on stdout and
    skipped; articles with empty text are skipped silently. Returns a new
    DataFrame with a single ``text`` column holding the collected texts, in
    the order the URLs were visited.
    """
    # Disabled sampling step, kept for reference:
    # good_values = df['query'].value_counts()[df['query'].value_counts() > 2].index.tolist()
    # df_label = df[df['query'].isin(good_values)].groupby('query').apply(lambda x: x.sample(n=2))
    collected = []
    for link in df['url'].values.tolist():
        try:
            body = get_article(link).text
        except ArticleException:
            print(f"  Couldn't download article at url {link}")
            continue
        # Keep only articles that actually yielded some text.
        if body:
            collected.append(body)
    return pd.DataFrame(collected, columns=['text'])

In [6]:
# Load the crawled oil-sector articles, fetch the text behind each URL, and
# save the non-empty texts to a CSV with a single `text` column.
df = pd.read_csv('./data/news/oil_news.csv')
save_df = process_for_label(df)
save_df.to_csv('./data/news/oil_label_test.csv')

  Couldn't download article at url https://news.google.com/rss/articles/CBMiigFodHRwczovL3RoZWhpbGwuY29tL3BvbGljeS9lbmVyZ3ktZW52aXJvbm1lbnQvMzc4MTkzMy1jb250cm92ZXJzaWFsLWFjdGl2aXN0LXN0ZXZlLWRvbnppZ2VyLWlzLWEtZm9say1oZXJvLXRvLXRoZS1sZWZ0LWEtZnJhdWQtdG8tYmlnLW9pbC_SAY4BaHR0cHM6Ly90aGVoaWxsLmNvbS9wb2xpY3kvZW5lcmd5LWVudmlyb25tZW50LzM3ODE5MzMtY29udHJvdmVyc2lhbC1hY3RpdmlzdC1zdGV2ZS1kb256aWdlci1pcy1hLWZvbGstaGVyby10by10aGUtbGVmdC1hLWZyYXVkLXRvLWJpZy1vaWwvYW1wLw?oc=5
  Couldn't download article at url https://news.google.com/rss/articles/CBMikAFodHRwczovL3RoZWhpbGwuY29tL2hvbWVuZXdzL2hvdXNlLzM3NzkzMDUtcmVwb3J0LWRldGFpbGluZy1ncmVlbndhc2hpbmctYWRzLW9uLWdvb2dsZS1wcm9tcHRzLWRlbW9jcmF0aWMtbGF3bWFrZXJzLXRvLXdyaXRlLXNlYXJjaC1lbmdpbmVzLWNlby_SAZQBaHR0cHM6Ly90aGVoaWxsLmNvbS9ob21lbmV3cy9ob3VzZS8zNzc5MzA1LXJlcG9ydC1kZXRhaWxpbmctZ3JlZW53YXNoaW5nLWFkcy1vbi1nb29nbGUtcHJvbXB0cy1kZW1vY3JhdGljLWxhd21ha2Vycy10by13cml0ZS1zZWFyY2gtZW5naW5lcy1jZW8vYW1wLw?oc=5
  Couldn't download article at url https://news.google.c



  Couldn't download article at url https://news.google.com/rss/articles/CBMiVmh0dHBzOi8vd3d3LnZhbmd1YXJkbmdyLmNvbS8yMDE2LzAzL2VrcGFuLWNyaXNpcy1mb3JnaXZlLXVzLWNvbW11bml0eS1oZWFkLWJlZ3Mtb2tvd2Ev0gFaaHR0cHM6Ly93d3cudmFuZ3VhcmRuZ3IuY29tLzIwMTYvMDMvZWtwYW4tY3Jpc2lzLWZvcmdpdmUtdXMtY29tbXVuaXR5LWhlYWQtYmVncy1va293YS9hbXAv?oc=5




  Couldn't download article at url https://news.google.com/rss/articles/CBMiYGh0dHBzOi8vd3d3LmFhLmNvbS50ci9lbi9hc2lhLXBhY2lmaWMvdXMtZmlybS1jaGV2cm9uLW9mZmVyaW5nLWxpZmVsaW5lLXRvLW15YW5tYXItanVudGEvMjIyNjYzONIBAA?oc=5
  Couldn't download article at url https://news.google.com/rss/articles/CBMicWh0dHBzOi8vdGhlaGlsbC5jb20vaGlsbHR2L3Jpc2luZy81NTczODgtc3RldmVuLWRvbnppZ2VyLXNheXMtY2hldnJvbi1sYXd5ZXItd29ya2luZy1mb3ItbnktdGltZXMtaXMtY29uZmxpY3Qtb2Yv0gEA?oc=5
  Couldn't download article at url https://news.google.com/rss/articles/CBMiU2h0dHBzOi8vd3d3LnN1bmhlcmFsZC5jb20vbmV3cy9sb2NhbC9jb3VudGllcy9oYXJyaXNvbi1jb3VudHkvYXJ0aWNsZTIzMjE1MTk0Ny5odG1s0gEA?oc=5
  Couldn't download article at url https://news.google.com/rss/articles/CBMiTGh0dHBzOi8vd3d3LmNvdXJ0aG91c2VuZXdzLmNvbS9jaGV2cm9uLW1vbnRlcmV5LWNvdW50eS1pbi1zdGFuZG9mZi1vdmVyLW9pbC_SAQA?oc=5
  Couldn't download article at url https://news.google.com/rss/articles/CBMiqwFodHRwczovL3d3dy53c2ouY29tL2FydGljbGVzL3NoYXJlaG9sZGVycy1zdGFuZC11cC1mb3ItcHJvZml



  Couldn't download article at url https://news.google.com/rss/articles/CBMiswFodHRwczovL3d3dy55b3Jrc2hpcmVwb3N0LmNvLnVrL3Nwb3J0L2Zvb3RiYWxsL21hbmNoZXN0ZXItdW5pdGVkL3doby1pcy1zaXItamltLXJhdGNsaWZmZS10aGUteW9ya3NoaXJlLWVkdWNhdGVkLWJpbGxpb25haXJlLXdoby1pcy1pbi10aGUtcnVubmluZy10by1idXktbWFuY2hlc3Rlci11bml0ZWQtMzk5MTgzOdIBAA?oc=5
  Couldn't download article at url https://news.google.com/rss/articles/CBMiUmh0dHBzOi8vd3d3LmJhcnJvbnMuY29tL2FydGljbGVzL2V4eG9uLW1vYmlsLXN0b2NrLXByaWNlLWFuYWx5c3Qtd3JvbmctNTE2NDI2MTEwMTbSAQA?oc=5
  Couldn't download article at url https://news.google.com/rss/articles/CBMikwFodHRwczovL3d3dy5uYXRpb25hbHdvcmxkLmNvbS9uZXdzL3Blb3BsZS9ob3ctZGlkLXNpci1qaW0tcmF0Y2xpZmZlLW1ha2UtaGlzLW1vbmV5LWluZW9zLWNlby1uZXQtd29ydGgtMjAyMi1tYW4tdXRkLWxpbmtzLXJpY2hlc3QtcGVyc29uLXVrLTM4MTA4ODDSAQA?oc=5
  Couldn't download article at url https://news.google.com/rss/articles/CBMimAFodHRwczovL3d3dy50aGVtZXJjdXJ5LmNvbS5hdS9saWZlc3R5bGUvdGFzc2llLWZpbG1tYWtlcnMtZG9jby1hLXBsYXN0aWMtb2NlYW4taGlnaG

In [6]:
# Load the crawled tech-sector articles, fetch the text behind each URL, and
# save the non-empty texts to a CSV with a single `text` column.
# Note: rebinds the notebook-level names `df` and `save_df`.
df = pd.read_csv('./data/news/tech_news.csv')
save_df = process_for_label(df)
save_df.to_csv('./data/news/tech_label.csv')

  Couldn't download article at url https://news.google.com/rss/articles/CBMiTGh0dHBzOi8vc2NyZWVucmFudC5jb20vZ29vZ2xlLWFjY291bnQtaW5hY3RpdmUtZW1haWwtd2FybmluZy1zY2FtLWV4cGxhaW5lZC_SAQA?oc=5
  Couldn't download article at url https://news.google.com/rss/articles/CBMiaWh0dHBzOi8vbmV3cy5zb2Z0cGVkaWEuY29tL25ld3MvR29vZ2xlLURlYnV0cy1HbG9iYWwtRGVmb3Jlc3RhdGlvbi1NYXAtdGhlLUZpcnN0LW9mLUl0cy1LaW5kLTQwMDUxNi5zaHRtbNIBAA?oc=5
  Couldn't download article at url https://news.google.com/rss/articles/CBMicmh0dHBzOi8vd3d3Lndzai5jb20vYXJ0aWNsZXMvdGVjaC1jeWJlci1jb21wYW5pZXMtbGF1bmNoLXNlY3VyaXR5LXN0YW5kYXJkLXRvLW1vbml0b3ItaGFja2luZy1hdHRlbXB0cy0xMTY2MDEyMzgwMtIBdmh0dHBzOi8vd3d3Lndzai5jb20vYW1wL2FydGljbGVzL3RlY2gtY3liZXItY29tcGFuaWVzLWxhdW5jaC1zZWN1cml0eS1zdGFuZGFyZC10by1tb25pdG9yLWhhY2tpbmctYXR0ZW1wdHMtMTE2NjAxMjM4MDI?oc=5


In [7]:
# Load the crawled bank-sector articles, fetch the text behind each URL, and
# save the non-empty texts to a CSV with a single `text` column.
# Note: rebinds the notebook-level names `df` and `save_df`.
df = pd.read_csv('./data/news/bank_news.csv')
save_df = process_for_label(df)
save_df.to_csv('./data/news/bank_label.csv')

  Couldn't download article at url https://news.google.com/rss/articles/CBMiZ2h0dHBzOi8vdGhlaGlsbC5jb20vcG9saWN5L2VuZXJneS1lbnZpcm9ubWVudC81NzQ0MjEtMTEtYmlyZHMtMi1maXNoLWFtb25nLTIzLXNwZWNpZXMtZGVjbGFyZWQtZXh0aW5jdC_SAWtodHRwczovL3RoZWhpbGwuY29tL3BvbGljeS9lbmVyZ3ktZW52aXJvbm1lbnQvNTc0NDIxLTExLWJpcmRzLTItZmlzaC1hbW9uZy0yMy1zcGVjaWVzLWRlY2xhcmVkLWV4dGluY3QvYW1wLw?oc=5
  Couldn't download article at url https://news.google.com/rss/articles/CBMidWh0dHBzOi8vd3d3LmZvcmJlcy5jb20vZm9yYmVzLzIwMTAvMDkyNy9wb2xpdGljcy1zb2NpYWxpc20tY2FwaXRhbGlzbS1wcml2YXRlLWVudGVycHJpc2VzLW9iYW1hLWJ1c2luZXNzLXByb2JsZW0uaHRtbNIBAA?oc=5
  Couldn't download article at url https://news.google.com/rss/articles/CBMihgFodHRwczovL3d3dy5mb3JiZXMuY29tL3NpdGVzL2ppbWNsYXNoLzIwMjIvMDgvMTIvcm9nZXItZGFsdHJleS1vbi1rbmlnaHRob29kLWZlYXItZGVhdGgtbXIta2liYmxld2hpdGUtbWFyaW8tYW5kcmV0dGktaGlzLW93bi1lcGl0YXBoL9IBigFodHRwczovL3d3dy5mb3JiZXMuY29tL3NpdGVzL2ppbWNsYXNoLzIwMjIvMDgvMTIvcm9nZXItZGFsdHJleS1vbi1rbmlnaHRob29kLWZlYXItZGVhdGgtbXIta2li

In [2]:
# Raw pasted text: several runs of numbered footnote-style entries, each
# followed by one or more reference URLs. Some URLs contain stray spaces or
# are broken across lines, and some entries carry page notes ("p. 101").
# NOTE(review): provenance of this list is not recorded here — TODO confirm.
Chevron_news = """
1 	
https://www.ifrs.org/news-and-events/news/2019/11/nick-anderson-ifrs-standards-and-climate-related-disclosures/

2 	
https://www.fasb.org/cs/BlobServer?blobkey=id&blobnocache=true&blobwhere=1175836268408&blobheader=application%2Fpdf& blobheadername2=Content-Length&blobheadername1=Content-Disposition&blobheadervalue2=333644&blobheadervalue1=filename%
3DFASB_Staff_ESG_Educational_Paper_FINAL.pdf&blob col=urldata&blobtable=MungoBlobs

3 	
https://www.iaasb.org/news-events/2020-10/iaasb-issues-staff-audit-practice-alert-climate-related-risks

4 	
https://www.unpri.org/download?ac=11558

5 	
See e.g. https://www.gfanzero.com/press/amount-of-finance-committed-to-achieving-1-5c-now-at-scale-needed-to-deliver- the-transition/

6 	
https://www.iea.org/reports/world-energy-outlook-2021 p. 101

7 	
https://carbontracker.org/reports/flying-blind-the-glaring-absence-of-climate-risks-in-financial-reporting/

8 	
executive-summary-climate-resiliency-report.pdf, p. 3

9 	
https://carbontracker.org/reports/flying-blind-the-glaring-absence-of-climate-risks-in-financial-reporting/

10 	
https://www.sec.gov/Archives/edgar/data/34088/000121465921004380/cg421210px14a6g.htm, p. 6


1 	
https://www.unep.org/news-and-stories/press-release/methane-observatory-launched-boost-action-powerful-climate-warming

2 	
https://www.epa.gov/ghgemissions/overview-greenhouse-gases

3 	
https://www.edf.org/climate/methane-studies

4 	
https://www.science.org/doi/full/10.1126/science.aar7204, https://www.seas.harvard.edu/news/2021/03/oil-and-natural-gas-production-emit-more-methane-previously-thought

5 	
https://business.edf.org/files/Investors-Guide-to-the-OGMP_09.17.21_FINAL.pdf

6 	
https://www.ceres.org/news-center/press-releases/major-investors-demand-ambitious-methane-regulations-us

7 	
https://ec.europa.eu/commission/presscorner/detail/en/IP_21_4785

8 	
http://ogmpartnership.com/partners

9 	
https://www.ceres.org/sites/default/files/reports/2021-06/OilandGas_BenchmarkingReport_FINAL.pdf

https://www.spglobal.com/marketintelligence/en/news-insights/latest-news-headlines/racial-inequity-a-systemic-risk-8211-state-street- global-advisors-ceo-62047105

2 	
https://www.forbes.com/sites/forbeshumanresourcescouncil/2021/05/19/15-key-benefits-of-dei-to-communicate-with-team- members/?sh=78cbb835195c

3 	
https://b8f65cb373b1b7b15feb-c70d8ead6ced550b4d987d7c03fcdd1d.ssl.cf3.rackcdn.com/cms/reports/documents/000/002/327/original/Carbon-Majors-Report- 2017.pdf?1499691240

4 	
https://e360.yale.edu/features/unequal-impact-the-deep-links-between-inequality-and-climate-change; https://blog.ucsusa.org/kathy- mulvey/six-ways-chevron-imperils-climate-human-rights-and-racial-justice/

5 	
https://www.scientificamerican.com/article/pollution-poverty-people-color-living-industry/

6 	
https://www.theguardian.com/environment/2019/oct/09/richmond-chevron-california-city-polluter-fossil-fuel; https://ej4all.org/life-at- the-fenceline

7 	
https://www.eastbaytimes.com/2014/10/10/chevron-unleashes-campaign-spending-to-influence-richmond-election/

8 	
https://stopthemoneypipeline.com/wall-street-and-fossil-fuel-companies-are-funding-police-violence/

9 	
https://www.mercurynews.com/2021/02/15/richmond-pays-135k-to-settle-brutality-lawsuit-against-controversial-cop-officer- involved-is-on-admin-leave-for-an-unrelated-investigation/; https://eastbayexpress.com/more-questions-than-answers-in-richmond-police- shooting-1/

10 	
https://www.theguardian.com/us-news/2020/jul/27/fossil-fuels-oil-gas-industry-police-foundations

11 	
https://www.globalwitness.org/en/campaigns/oil-gas-and-mining/chevron-stop-funding-racism/

12 	
https://chevronsglobaldestruction.com/chevrons_global_destruction_report.pdf

13 	
https://www.business-humanrights.org/en/latest-news/the-case-of-chevron-in-the-ecuadorian-amazon-the-ruling-of-the- supreme- court-of-canada-closes-the-doors-to-end-impunity/

14 	
https://ejatlas.org/print/indonesia-against-chevron

15 	
https://sustainable-economy.org/wp-content/uploads/2018/02/Chevron-HR-in-Niger-Delta-Chad-Cam.pdf

16 	
https://www.npr.org/templates/story/story.php?storyId=121853115

17 	
https://insideclimatenews.org/news/20062020/chevron-black-lives-matter-twitter/

18 	
https://chevronsglobaldestruction.com/chevrons_global_destruction_report.pdf

 	
https://docs.house.gov/meetings/GO/GO00/20211028/114185/HHRG-117-GO00-20211028-SD018.pdf
"""

# Break the raw text into lines for inspection; the list is only displayed
# as the cell output and is not bound to a name.
Chevron_news.split('\n')

['',
 '1 \t',
 'https://www.ifrs.org/news-and-events/news/2019/11/nick-anderson-ifrs-standards-and-climate-related-disclosures/',
 '',
 '2 \t',
 'https://www.fasb.org/cs/BlobServer?blobkey=id&blobnocache=true&blobwhere=1175836268408&blobheader=application%2Fpdf& blobheadername2=Content-Length&blobheadername1=Content-Disposition&blobheadervalue2=333644&blobheadervalue1=filename%',
 '3DFASB_Staff_ESG_Educational_Paper_FINAL.pdf&blob col=urldata&blobtable=MungoBlobs',
 '',
 '3 \t',
 'https://www.iaasb.org/news-events/2020-10/iaasb-issues-staff-audit-practice-alert-climate-related-risks',
 '',
 '4 \t',
 'https://www.unpri.org/download?ac=11558',
 '',
 '5 \t',
 'See e.g. https://www.gfanzero.com/press/amount-of-finance-committed-to-achieving-1-5c-now-at-scale-needed-to-deliver- the-transition/',
 '',
 '6 \t',
 'https://www.iea.org/reports/world-energy-outlook-2021 p. 101',
 '',
 '7 \t',
 'https://carbontracker.org/reports/flying-blind-the-glaring-absence-of-climate-risks-in-financial-report