In [None]:
!pip install openai



In [None]:
## imports
import ast  # for parsing stringified dicts read from CSV
import os  # for reading the API key from the environment
import pickle  # for saving the embeddings cache
import random  # for generating run IDs
from pathlib import Path  # for building portable file paths
from typing import Iterable, List, Tuple  # for type hints

import numpy as np  # for manipulating arrays
import openai  # for the embeddings API
import pandas as pd  # for manipulating data in dataframes
import plotly.express as px  # for plots
import torch  # for matrix optimization
from openai.embeddings_utils import get_embedding, cosine_similarity  # for embeddings
from sklearn.model_selection import train_test_split  # for splitting train & test data


In [None]:
# input parameters
embedding_cache_path = "snli_embedding_cache.pkl"  # embeddings will be saved/loaded here
# NOTE(review): this engine differs from the one the inline note recommends, and a
# later cell passes "text-embedding-ada-002" to openai.Embedding.create directly --
# confirm which engine is intended before relying on this constant.
default_embedding_engine = "babbage-similarity"  # text-embedding-ada-002 is recommended
# row cap applied by process_input_data (via DataFrame.head)
num_pairs_to_embed = 1000  # 1000 is arbitrary
local_dataset_path = "data/snli_1.0_train_2k.csv"  # download from: https://nlp.stanford.edu/projects/snli/


In [None]:
def process_input_data(df: pd.DataFrame) -> pd.DataFrame:
    """Convert a raw SNLI-style frame into labelled text pairs.

    You can customize this to preprocess your own dataset.

    Args:
        df: Frame with ``gold_label``, ``sentence1`` and ``sentence2`` columns.
            The frame passed in is left unmodified.

    Returns:
        A new frame with 3 columns: ``text_1``, ``text_2`` and ``label``
        (1 for similar, -1 for dissimilar), limited to the first
        ``num_pairs_to_embed`` rows.
    """
    label_codes = {"entailment": 1, "contradiction": -1}
    # Only "entailment" rows pass this filter, so every label in the output is 1;
    # the "contradiction" code only matters if the filter is widened.
    keep = df["gold_label"].isin(["entailment"])
    # Everything below works on a fresh selection, so no column is ever written
    # into the caller's frame or into a filtered slice of it.
    pairs = (
        df.loc[keep, ["sentence1", "sentence2", "gold_label"]]
        .rename(columns={"sentence1": "text_1", "sentence2": "text_2", "gold_label": "label"})
        .assign(label=lambda frame: frame["label"].apply(label_codes.__getitem__))
    )
    return pairs.head(num_pairs_to_embed)


In [None]:
# sample = pd.read_csv("/Users/ruiqizhou/Downloads/annotation_10.csv")

In [None]:
# Resolve the file under the current user's home directory instead of a
# machine-specific absolute path; only the first 1000 rows are read.
sample = pd.read_csv(Path.home() / "Downloads" / "annotation_1.csv", nrows=1000)

In [None]:
sample

Unnamed: 0,title & content,sentiment,summary,description,Ticker,Sector,Industry,Company,news_embedding
0,"This 4th of July, Amazon is stacked with deals...","{'positive': 0.093311146, 'negative': 0.021053...","This 4th of July, Amazon is offering deals on ...",You'll wear these flowy maxi dresses 'til the ...,AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[-0.015432381243255985, -0.0034410926217465602..."
1,Investors Spurn Dividend-Paying Stocks as AI B...,"{'positive': 0.12919189, 'negative': 0.7902378...",Shares of companies with chunky dividends were...,The shares have suffered their worst first-hal...,AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[-0.024306956033299715, -0.02670285239435737, ..."
2,'Haven’t had a speeding ticket in years!' This...,"{'positive': 0.038932458, 'negative': 0.191211...",The Uniden DFR1 radar detector is available at...,"If you're feeling fast and furious, this gadge...",AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[-0.006583285561824122, 0.006980889960853821, ..."
3,A flight attendant shares the 10 items she alw...,"{'positive': 0.09565405, 'negative': 0.0176123...","A flight attendant Sakkara Barnwell, a former ...",Yahoo talked to a travel pro to get her clever...,AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[0.01807667332348691, 0.006251572691275958, 0...."
4,"Run, don't walk: A podiatrist points us to the...","{'positive': 0.13780414, 'negative': 0.0154630...","This is the 4th of July weekend, when Amazon i...",These supportive sneakers are just what the do...,AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[0.015015614536495827, -0.006239550785772387, ..."
...,...,...,...,...,...,...,...,...,...
995,"2023 has barely started and Amazon, Salesforce...","{'positive': 0.006924565, 'negative': 0.968069...","\n\n• Amazon will be cutting over 18,000 jobs,...",The rash of layoffs come after tech companies ...,AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[-0.021191236326306515, -0.040097916151262285,..."
996,Save up to 49% off cleaning products this Janu...,"{'positive': 0.024229906, 'negative': 0.814925...",Save up to 49% off cleaning products this Janu...,SHOPPING: Get a heads start on sprucing up you...,AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[0.013427899243692174, 0.011660044102867324, -..."
997,AP News Digest 3:20 am - Here are the AP’s lat...,"{'positive': 0.02699591, 'negative': 0.1537630...","Eight family members, five of them children, w...","Here are the AP’s latest coverage plans, top s...",AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[-0.003439105008929281, 0.010130104979765835, ..."
998,"Amazon to cut 18,000 jobs - live updates - Ama...","{'positive': 0.009658997, 'negative': 0.971661...",The majority of job losses will be in in Amazo...,Amazon will extend its job cutting plans this ...,AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[-0.01570868787498086, -0.019182098087241917, ..."


In [None]:
# Resolve the file under the current user's home directory instead of a
# machine-specific absolute path.
topics = pd.read_csv(Path.home() / "Downloads" / "SASB_embeddings.csv")

In [None]:
topics

Unnamed: 0,Sector,Industry,SASB,SASB_embedding
0,Health Care,Medical Equipment & Supplies,{'Product Safety': 'Information on product saf...,"{'Product Safety': [0.009879209524762098, -0.0..."
1,Transportation,Airlines,{'Competitive Behaviour': 'The Airlines indust...,{'Competitive Behaviour': [8.349528820836388e-...
2,Consumer Goods,Multiline and Specialty Retailers & Distributors,{'Workforce Diversity & Inclusion': 'The Multi...,{'Workforce Diversity & Inclusion': [-0.022194...
3,Technology & Communications,Hardware,{'Supply Chain Management': 'Entities in the H...,{'Supply Chain Management': [-0.00200234759469...
4,Health Care,Biotechnology & Pharmaceuticals,"{'Employee Recruitment, Development & Retentio...","{'Employee Recruitment, Development & Retentio..."
...,...,...,...,...
62,Extractives & Minerals Processing,Iron & Steel Producers,{'Greenhouse Gas Emissions': 'Iron and steel p...,{'Greenhouse Gas Emissions': [-0.0024575652239...
63,Infrastructure,Engineering & Construction Services,{'Climate Impacts of Business Mix': 'Engineeri...,{'Climate Impacts of Business Mix': [0.0138161...
64,Infrastructure,Waste Management,{'Greenhouse Gas Emissions': 'Landfills are a ...,{'Greenhouse Gas Emissions': [-0.0001698792302...
65,Consumer Goods,Appliance Manufacturing,{'Product Safety': 'Product safety is of utmos...,"{'Product Safety': [0.009562687448008114, -0.0..."


In [None]:
# Convert the string representation of dictionaries in the 'SASB' column to actual dictionaries.
# Only str cells are parsed: ast.literal_eval raises ValueError on a value that is
# already a dict, so without this guard the cell could not be run a second time.
topics['SASB'] = topics['SASB'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

# Check the type of the first element in the 'SASB' column to confirm the conversion
type(topics.loc[0, 'SASB'])

dict

In [None]:
# Attach the SASB topics of the matching Sector/Industry to each article
# (default inner join: rows without a match on both keys are dropped).
merged_table = sample.merge(topics, on=['Sector', 'Industry'])

In [None]:
merged_table

Unnamed: 0,title & content,sentiment,summary,description,Ticker,Sector,Industry,Company,news_embedding,SASB,SASB_embedding
0,"This 4th of July, Amazon is stacked with deals...","{'positive': 0.093311146, 'negative': 0.021053...","This 4th of July, Amazon is offering deals on ...",You'll wear these flowy maxi dresses 'til the ...,AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[-0.015432381243255985, -0.0034410926217465602...",{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...
1,Investors Spurn Dividend-Paying Stocks as AI B...,"{'positive': 0.12919189, 'negative': 0.7902378...",Shares of companies with chunky dividends were...,The shares have suffered their worst first-hal...,AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[-0.024306956033299715, -0.02670285239435737, ...",{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...
2,'Haven’t had a speeding ticket in years!' This...,"{'positive': 0.038932458, 'negative': 0.191211...",The Uniden DFR1 radar detector is available at...,"If you're feeling fast and furious, this gadge...",AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[-0.006583285561824122, 0.006980889960853821, ...",{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...
3,A flight attendant shares the 10 items she alw...,"{'positive': 0.09565405, 'negative': 0.0176123...","A flight attendant Sakkara Barnwell, a former ...",Yahoo talked to a travel pro to get her clever...,AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[0.01807667332348691, 0.006251572691275958, 0....",{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...
4,"Run, don't walk: A podiatrist points us to the...","{'positive': 0.13780414, 'negative': 0.0154630...","This is the 4th of July weekend, when Amazon i...",These supportive sneakers are just what the do...,AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[0.015015614536495827, -0.006239550785772387, ...",{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...
...,...,...,...,...,...,...,...,...,...,...,...
995,"2023 has barely started and Amazon, Salesforce...","{'positive': 0.006924565, 'negative': 0.968069...","\n\n• Amazon will be cutting over 18,000 jobs,...",The rash of layoffs come after tech companies ...,AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[-0.021191236326306515, -0.040097916151262285,...",{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...
996,Save up to 49% off cleaning products this Janu...,"{'positive': 0.024229906, 'negative': 0.814925...",Save up to 49% off cleaning products this Janu...,SHOPPING: Get a heads start on sprucing up you...,AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[0.013427899243692174, 0.011660044102867324, -...",{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...
997,AP News Digest 3:20 am - Here are the AP’s lat...,"{'positive': 0.02699591, 'negative': 0.1537630...","Eight family members, five of them children, w...","Here are the AP’s latest coverage plans, top s...",AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[-0.003439105008929281, 0.010130104979765835, ...",{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...
998,"Amazon to cut 18,000 jobs - live updates - Ama...","{'positive': 0.009658997, 'negative': 0.971661...",The majority of job losses will be in in Amazo...,Amazon will extend its job cutting plans this ...,AMZN,Consumer Goods,E-Commerce,Amazon.com Inc,"[-0.01570868787498086, -0.019182098087241917, ...",{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...


In [None]:
merged_table.loc[0,'SASB_embedding']

"{'Competitive Behaviour': [8.349528820836388e-05, -0.014206097499907923, 0.012499265406737183, -0.030171545304434517, -0.04230318602500391, 0.0027374967541469153, -0.007293426451545461, 0.01769853946908259, -0.024171372941290382, -0.03153701135150014, -0.00407342148189606, 0.04324850824313122, -0.0034661828634507015, 0.004358987335791376, 0.0038370906945335856, -0.006594281661558153, 0.03311254838171234, 0.001106979328423451, 0.010674268036300527, -0.007142437175709266, -0.03534455979539467, 0.004441046633725143, -0.038443118438338864, -0.02696795210123961, -0.0006474474811834694, 0.029672626486630757, 0.015006995800706649, -0.03263988727264868, 0.012912844342829173, 0.01909026508753347, 0.006354668660603168, 0.012729031068422698, -0.007319685557269427, 0.00476928382546021, -0.025536838988356007, -0.009466355692256123, 0.009413837480808191, 0.007096484322768936, 0.022517059059567576, 0.006761682238187555, 0.005629264764891894, 0.01544026941533758, -0.026560938057993935, 0.018381273423

In [None]:
merged_table.loc[0,'news_embedding']

'[-0.013338859064795797, -0.011341818499310643, 0.01203045330552243, -0.009048663924073148, -0.02160936573235543, -0.0020194221093831504, -0.03539583797998243, -0.010398387808972126, 0.012395429305779844, -0.024198632454700136, 0.008277392866610143, 0.022311772936668233, -0.010329524887144486, 0.000880591970319205, 0.016306876979466627, -0.005977351626660858, 0.008862731893096621, -0.03751683199102185, 0.025534584872820665, -0.025933993730975748, -0.019626097564971294, 0.023496225250387335, -0.008972913871872436, 0.006004896888524171, -0.018221281293700563, -0.003394970404874737, 0.019102733398616817, -0.01492960643572049, -0.004310855134858019, 0.002482529007247351, -0.0013256223046375218, -0.008022597913805976, -0.027807080850906638, 0.019598550906124133, -0.03153948083002227, -0.01166547637394245, 0.002425716700927458, -0.013517904896721816, 0.03264129875513529, -0.011796316949869785, -0.008318710060913188, 0.012905019211388176, -0.004878978663718229, 0.006931111149913854, -0.019515

In [None]:
# Resolve the workbook under the current user's home directory instead of a
# machine-specific absolute path.
df = pd.read_excel(Path.home() / "Downloads" / "ANNOTATAION_4_UPDATE.xlsx")

In [None]:
df

Unnamed: 0,index,title & content,sentiment,summary,description,Ticker,Sector,Industry,Company,SASB,...,GPT's Label ->,GPT_ESG_or_not,GPT_firm_or_not,GPT_sentiment,GPT_topics,Human Label ->,ESG_or_not,firm_or_not,sentiment.1,topics
0,12024,"Delays Won‚Äôt Hurt Japan‚Äôs First Casino, Os...","{'positive': 0.026093118, 'negative': 0.923414...",Osaka Governor Hirofumi Yoshimura said that th...,Years of delay to plans for Japan‚Äôs first ca...,MGM,Services,Casinos & Gaming,MGM Resorts International,{'Internal Controls on Money Laundering': 'By ...,...,,Minor focus,Major focus,Neutral,Responsible Gaming,,No focus,,,
1,20675,MetLife (MET) Could Be a Great Choice - Gettin...,"{'positive': 0.8553927, 'negative': 0.01334850...",MetLife (MET) is a Finance stock that has seen...,Dividends are one of the best benefits to bein...,MET,Financials,Insurance,Metlife Inc,{'Financed Emissions': 'Entities participating...,...,,No focus,No focus,Neutral,,,No focus,,,
2,33685,New York Cements Itself as the Gold Mining Cap...,"{'positive': 0.46477953, 'negative': 0.0338994...","This week, top-five producer AngloGold Ashanti...",(Bloomberg) -- The momentum has been building ...,NEM,Extractives & Minerals Processing,Metals & Mining,Newmont Corp,{'Tailings Storage Facilities Management': 'Th...,...,,Minor focus,Major focus,Positive,"Community Relations, Business Ethics & Transpa...",,No focus,,,
3,12072,"Shareholders v. Tesla, Nasdaq's diversity rule...","{'positive': 0.02043453, 'negative': 0.6323841...",\n\nThe case is In re Tesla Inc Securities Lit...,Some of the biggest securities cases of 2023 a...,NDAQ,Financials,Security & Commodity Exchanges,Nasdaq Inc,{'Managing Conflicts of Interest': 'Security a...,...,,Major focus,Major focus,Negative,"Managing Conflicts of Interest, Promoting Tran...",,Major focus,Major focus,Positive,
4,28164,"CFOs Boost Currency Protections, Extend Hedge ...","{'positive': 0.031100325, 'negative': 0.955758...","CFOs Boost Currency Protections, Extend Hedge ...","Coca-Cola, Kimberly-Clark and Prologis are amo...",KO,Food & Beverage,Non-Alcoholic Beverages,Coca-Cola Co,{'Water Management': 'Water management relates...,...,,No focus,No focus,Neutral,,,No focus,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600,21471,ONEOK shares drop on concerns over Magellan pi...,"{'positive': 0.5536193, 'negative': 0.27749515...",Shares of gas pipeline operator ONEOK Inc fell...,Shares of gas pipeline operator ONEOK Inc fell...,OKE,Extractives & Minerals Processing,Oil & Gas - Midstream,ONEOK Inc,{'Greenhouse Gas Emissions': 'The midstream in...,...,,Minor focus,Minor focus,Negative,"Competitive Behaviour, None",,,,,
601,58375,"The Zacks Analyst Blog Atmos Energy, TopBuild,...","{'positive': 0.4398147, 'negative': 0.5154524,...",The Zacks Analyst Blog has highlighted four st...,"Atmos Energy, TopBuild, Axcelis Technologies a...",ATO,Infrastructure,Gas Utilities & Distributors,Atmos Energy Corp,{'Integrity of Gas Delivery Infrastructure': '...,...,,Minor focus,Minor focus,Positive,"Energy Affordability, End-Use Efficiency",,,,,
602,5490,Biden admin works on 'green' natural gas as U....,"{'positive': 0.21691072, 'negative': 0.0136983...",The United States is seeking to sustain its li...,The Biden administration is holding talks with...,WMB,Extractives & Minerals Processing,Oil & Gas - Midstream,The Williams Companies Inc,{'Greenhouse Gas Emissions': 'The midstream in...,...,,Major focus,Minor focus,Neutral,"Greenhouse Gas Emissions, Operational Safety, ...",,,,,
603,23700,Medtronic Executive Vice President and CFO Kar...,"{'positive': 0.1169801, 'negative': 0.01293078...",Medtronic Executive Vice President and CFO Kar...,"Medtronic plc (NYSE: MDT), a global leader in ...",MDT,Health Care,Medical Equipment & Supplies,Medtronic plc,{'Product Safety': 'Information on product saf...,...,,No focus,No focus,Neutral,,,,,,


In [None]:
# Expose the 'title & content' text under the shorter column name 'content'.
df = df.assign(content=df['title & content'])

In [None]:
# Keep only the columns used by the rest of the analysis, in this order.
df = df.loc[
    :,
    ['content', 'sentiment', 'Sector', 'Industry', 'max_cosine_similarities', 'ESG_or_not'],
]

In [None]:
df

Unnamed: 0,content,sentiment,Sector,Industry,max_cosine_similarities,ESG_or_not
0,"Delays Won‚Äôt Hurt Japan‚Äôs First Casino, Os...","{'positive': 0.026093118, 'negative': 0.923414...",Services,Casinos & Gaming,0.797453,No focus
1,MetLife (MET) Could Be a Great Choice - Gettin...,"{'positive': 0.8553927, 'negative': 0.01334850...",Financials,Insurance,0.784913,No focus
2,New York Cements Itself as the Gold Mining Cap...,"{'positive': 0.46477953, 'negative': 0.0338994...",Extractives & Minerals Processing,Metals & Mining,0.771926,No focus
3,"Shareholders v. Tesla, Nasdaq's diversity rule...","{'positive': 0.02043453, 'negative': 0.6323841...",Financials,Security & Commodity Exchanges,0.797389,Major focus
4,"CFOs Boost Currency Protections, Extend Hedge ...","{'positive': 0.031100325, 'negative': 0.955758...",Food & Beverage,Non-Alcoholic Beverages,0.776955,No focus
...,...,...,...,...,...,...
600,ONEOK shares drop on concerns over Magellan pi...,"{'positive': 0.5536193, 'negative': 0.27749515...",Extractives & Minerals Processing,Oil & Gas - Midstream,0.783972,
601,"The Zacks Analyst Blog Atmos Energy, TopBuild,...","{'positive': 0.4398147, 'negative': 0.5154524,...",Infrastructure,Gas Utilities & Distributors,0.752961,
602,Biden admin works on 'green' natural gas as U....,"{'positive': 0.21691072, 'negative': 0.0136983...",Extractives & Minerals Processing,Oil & Gas - Midstream,0.812575,
603,Medtronic Executive Vice President and CFO Kar...,"{'positive': 0.1169801, 'negative': 0.01293078...",Health Care,Medical Equipment & Supplies,0.781460,


In [None]:
# Join the SASB topic columns onto the annotated articles by Sector and Industry
# (default inner join).
df = df.merge(topics, on=['Sector', 'Industry'])

In [None]:
df

Unnamed: 0,content,sentiment,Sector,Industry,max_cosine_similarities,ESG_or_not,SASB,SASB_embedding
0,"Delays Won‚Äôt Hurt Japan‚Äôs First Casino, Os...","{'positive': 0.026093118, 'negative': 0.923414...",Services,Casinos & Gaming,0.797453,No focus,{'Internal Controls on Money Laundering': 'By ...,{'Internal Controls on Money Laundering': [0.0...
1,Anti-Casino Rally Held On Eve Of Legislature V...,"{'positive': 0.036242466, 'negative': 0.826487...",Services,Casinos & Gaming,0.778137,Major focus,{'Internal Controls on Money Laundering': 'By ...,{'Internal Controls on Money Laundering': [0.0...
2,Las Vegas Strip Brings Back Superstar Singer -...,"{'positive': 0.08273419, 'negative': 0.0216881...",Services,Casinos & Gaming,0.793890,No,{'Internal Controls on Money Laundering': 'By ...,{'Internal Controls on Money Laundering': [0.0...
3,Las Vegas Strip Casinos Accused of Illegally P...,"{'positive': 0.018532103, 'negative': 0.873039...",Services,Casinos & Gaming,0.787558,No,{'Internal Controls on Money Laundering': 'By ...,{'Internal Controls on Money Laundering': [0.0...
4,Blackjack On Broadway: Caesars Palace Casino M...,"{'positive': 0.11247342, 'negative': 0.0635238...",Services,Casinos & Gaming,0.796483,No,{'Internal Controls on Money Laundering': 'By ...,{'Internal Controls on Money Laundering': [0.0...
...,...,...,...,...,...,...,...,...
600,"Monday Afternoon Update: 'Digital Fentanyl,' G...","{'positive': 0.04458833, 'negative': 0.4345133...",Services,Leisure Facilities,0.735942,No,{'Customer Safety': 'Leisure facility entities...,"{'Customer Safety': [0.012655746936361898, -0...."
601,Live Nation exec will face lawmakers about Tay...,"{'positive': 0.0100895455, 'negative': 0.93815...",Services,Leisure Facilities,0.740271,Major,{'Customer Safety': 'Leisure facility entities...,"{'Customer Safety': [0.012655746936361898, -0...."
602,Live Nation posts 73% jump in revenue and reco...,"{'positive': 0.57705677, 'negative': 0.2456571...",Services,Leisure Facilities,0.746974,Minor,{'Customer Safety': 'Leisure facility entities...,"{'Customer Safety': [0.012655746936361898, -0...."
603,Report: DOJ Probes Live Nation After Swift Tic...,"{'positive': 0.018843884, 'negative': 0.824175...",Services,Leisure Facilities,0.740032,No focus,{'Customer Safety': 'Leisure facility entities...,"{'Customer Safety': [0.012655746936361898, -0...."


In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
df

Unnamed: 0,content,sentiment,Sector,Industry,max_cosine_similarities,ESG_or_not,SASB,SASB_embedding
0,"Delays Won‚Äôt Hurt Japan‚Äôs First Casino, Os...","{'positive': 0.026093118, 'negative': 0.923414...",Services,Casinos & Gaming,0.797453,No focus,{'Internal Controls on Money Laundering': 'By ...,{'Internal Controls on Money Laundering': [0.0...
1,Anti-Casino Rally Held On Eve Of Legislature V...,"{'positive': 0.036242466, 'negative': 0.826487...",Services,Casinos & Gaming,0.778137,Major focus,{'Internal Controls on Money Laundering': 'By ...,{'Internal Controls on Money Laundering': [0.0...
2,Las Vegas Strip Brings Back Superstar Singer -...,"{'positive': 0.08273419, 'negative': 0.0216881...",Services,Casinos & Gaming,0.793890,No,{'Internal Controls on Money Laundering': 'By ...,{'Internal Controls on Money Laundering': [0.0...
3,Las Vegas Strip Casinos Accused of Illegally P...,"{'positive': 0.018532103, 'negative': 0.873039...",Services,Casinos & Gaming,0.787558,No,{'Internal Controls on Money Laundering': 'By ...,{'Internal Controls on Money Laundering': [0.0...
4,Blackjack On Broadway: Caesars Palace Casino M...,"{'positive': 0.11247342, 'negative': 0.0635238...",Services,Casinos & Gaming,0.796483,No,{'Internal Controls on Money Laundering': 'By ...,{'Internal Controls on Money Laundering': [0.0...
...,...,...,...,...,...,...,...,...
599,Steel Dynamics (STLD) Stock Sinks As Market Ga...,"{'positive': 0.06161075, 'negative': 0.8832018...",Extractives & Minerals Processing,Iron & Steel Producers,0.752282,No focus,{'Greenhouse Gas Emissions': 'Iron and steel p...,{'Greenhouse Gas Emissions': [-0.0024575652239...
600,"Monday Afternoon Update: 'Digital Fentanyl,' G...","{'positive': 0.04458833, 'negative': 0.4345133...",Services,Leisure Facilities,0.735942,No,{'Customer Safety': 'Leisure facility entities...,"{'Customer Safety': [0.012655746936361898, -0...."
601,Live Nation exec will face lawmakers about Tay...,"{'positive': 0.0100895455, 'negative': 0.93815...",Services,Leisure Facilities,0.740271,Major,{'Customer Safety': 'Leisure facility entities...,"{'Customer Safety': [0.012655746936361898, -0...."
602,Live Nation posts 73% jump in revenue and reco...,"{'positive': 0.57705677, 'negative': 0.2456571...",Services,Leisure Facilities,0.746974,Minor,{'Customer Safety': 'Leisure facility entities...,"{'Customer Safety': [0.012655746936361898, -0...."


In [None]:
# split data into train and test sets
test_fraction = 0.5  # 0.5 is fairly arbitrary
random_seed = 123  # random seed is arbitrary, but is helpful in reproducibility
train_df, test_df = train_test_split(
    df, test_size=test_fraction, random_state=random_seed
)
# Tag each split with its name. `assign` builds new frames, so nothing is written
# into the objects returned by train_test_split (writing into them with
# `.loc[:, "dataset"] = ...` can trigger pandas' SettingWithCopyWarning).
train_df = train_df.assign(dataset="train")
test_df = test_df.assign(dataset="test")

In [None]:
# Stack the tagged train and test rows back into one frame (row labels are kept).
df = pd.concat([train_df, test_df], axis=0)

In [None]:
df

Unnamed: 0,content,sentiment,Sector,Industry,max_cosine_similarities,ESG_or_not,SASB,SASB_embedding,dataset
87,More millennials are turning 40 ‚Äî and they'r...,"{'positive': 0.06741135, 'negative': 0.1715823...",Services,Hotels & Lodging,0.785040,No focus,{'Water Management': 'Hotel buildings require ...,"{'Water Management': [0.013346040463976059, -0...",train
53,"FedEx Parks Planes, Maersk Cancels Sails: Worl...","{'positive': 0.0276921, 'negative': 0.91408736...",Transportation,Air Freight & Logistics,0.806254,Minor focus,{'Greenhouse Gas Emissions': 'Air Freight & Lo...,{'Greenhouse Gas Emissions': [0.00950879266272...,train
301,You could be buying shoplifted stuff on Amazon...,"{'positive': 0.034610316, 'negative': 0.476638...",Consumer Goods,Multiline and Specialty Retailers & Distributors,0.803166,Major focus,{'Workforce Diversity & Inclusion': 'The Multi...,{'Workforce Diversity & Inclusion': [-0.022194...,train
184,Marketers: Turn Impressions Into Intent This H...,"{'positive': 0.09345539, 'negative': 0.0399913...",Consumer Goods,E-Commerce,0.802793,Minor,{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...,train
419,Royal Caribbean Inks Sustainable Shipbuilding ...,"{'positive': 0.30869853, 'negative': 0.0087451...",Transportation,Cruise Lines,0.792948,Major,{'Customer Health & Safety': 'Cruise lines off...,{'Customer Health & Safety': [0.02446571152799...,train
...,...,...,...,...,...,...,...,...,...
521,Waste Management (WM) Stock Moves -0.91%: What...,"{'positive': 0.6840206, 'negative': 0.23210213...",Infrastructure,Waste Management,0.805994,No,{'Greenhouse Gas Emissions': 'Landfills are a ...,{'Greenhouse Gas Emissions': [-0.0001698792302...,test
19,Newmont Corporation (NEM) Stock Moves -0.58%: ...,"{'positive': 0.07588659, 'negative': 0.8466317...",Extractives & Minerals Processing,Metals & Mining,0.756432,No,{'Tailings Storage Facilities Management': 'Th...,{'Tailings Storage Facilities Management': [0....,test
29,"CFOs Boost Currency Protections, Extend Hedge ...","{'positive': 0.031100325, 'negative': 0.955758...",Food & Beverage,Non-Alcoholic Beverages,0.776955,No focus,{'Water Management': 'Water management relates...,"{'Water Management': [0.007515041336335117, -0...",test
578,"5,000 GM Employees Reportedly Take Buyouts In ...","{'positive': 0.013290622, 'negative': 0.963200...",Transportation,Automobiles,0.811529,Major focus,{'Product Safety': 'Driving is a risky activit...,"{'Product Safety': [-0.00859951505673047, -0.0...",test


In [None]:
# check that training and test sets are balanced:
# overlaid similarity histograms per label, one facet row per split
fig = px.histogram(
    df,
    x="max_cosine_similarities",
    color="ESG_or_not",
    barmode="overlay",
    width=500,
    facet_row="dataset",
)
fig.show()

In [None]:
def replace_values(x):
    """Collapse a raw ESG-focus annotation into 'No' or 'Focus'.

    Args:
        x: Annotation string, e.g. 'No focus', 'No', 'Minor focus' or 'Major'.

    Returns:
        'No' when the first word of ``x`` is "no" (case-insensitive),
        otherwise 'Focus'.

    Note:
        A plain substring test (``'no' in x.lower()``) is not used because it
        also matches 'Minor ...' ("minor" contains the letters "no"), which
        would label minor-focus annotations as 'No'.
    """
    # Compare the first whole word instead of searching for a substring.
    first_word = x.strip().lower().split(maxsplit=1)[:1]
    return 'No' if first_word == ['no'] else 'Focus'

# Normalise every annotation in 'ESG_or_not' with replace_values.
df['ESG_or_not'] = df['ESG_or_not'].map(replace_values)


In [None]:
df

Unnamed: 0,content,sentiment,Sector,Industry,max_cosine_similarities,ESG_or_not,SASB,SASB_embedding,dataset
87,More millennials are turning 40 ‚Äî and they'r...,"{'positive': 0.06741135, 'negative': 0.1715823...",Services,Hotels & Lodging,0.785040,No,{'Water Management': 'Hotel buildings require ...,"{'Water Management': [0.013346040463976059, -0...",train
53,"FedEx Parks Planes, Maersk Cancels Sails: Worl...","{'positive': 0.0276921, 'negative': 0.91408736...",Transportation,Air Freight & Logistics,0.806254,No,{'Greenhouse Gas Emissions': 'Air Freight & Lo...,{'Greenhouse Gas Emissions': [0.00950879266272...,train
301,You could be buying shoplifted stuff on Amazon...,"{'positive': 0.034610316, 'negative': 0.476638...",Consumer Goods,Multiline and Specialty Retailers & Distributors,0.803166,Focus,{'Workforce Diversity & Inclusion': 'The Multi...,{'Workforce Diversity & Inclusion': [-0.022194...,train
184,Marketers: Turn Impressions Into Intent This H...,"{'positive': 0.09345539, 'negative': 0.0399913...",Consumer Goods,E-Commerce,0.802793,No,{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...,train
419,Royal Caribbean Inks Sustainable Shipbuilding ...,"{'positive': 0.30869853, 'negative': 0.0087451...",Transportation,Cruise Lines,0.792948,Focus,{'Customer Health & Safety': 'Cruise lines off...,{'Customer Health & Safety': [0.02446571152799...,train
...,...,...,...,...,...,...,...,...,...
521,Waste Management (WM) Stock Moves -0.91%: What...,"{'positive': 0.6840206, 'negative': 0.23210213...",Infrastructure,Waste Management,0.805994,No,{'Greenhouse Gas Emissions': 'Landfills are a ...,{'Greenhouse Gas Emissions': [-0.0001698792302...,test
19,Newmont Corporation (NEM) Stock Moves -0.58%: ...,"{'positive': 0.07588659, 'negative': 0.8466317...",Extractives & Minerals Processing,Metals & Mining,0.756432,No,{'Tailings Storage Facilities Management': 'Th...,{'Tailings Storage Facilities Management': [0....,test
29,"CFOs Boost Currency Protections, Extend Hedge ...","{'positive': 0.031100325, 'negative': 0.955758...",Food & Beverage,Non-Alcoholic Beverages,0.776955,No,{'Water Management': 'Water management relates...,"{'Water Management': [0.007515041336335117, -0...",test
578,"5,000 GM Employees Reportedly Take Buyouts In ...","{'positive': 0.013290622, 'negative': 0.963200...",Transportation,Automobiles,0.811529,Focus,{'Product Safety': 'Driving is a risky activit...,"{'Product Safety': [-0.00859951505673047, -0.0...",test


In [None]:
# Similarity histogram per label, one facet row per split.
# barmode is not set here, so plotly's default bar mode applies.
fig = px.histogram(
    df,
    x="max_cosine_similarities",
    color="ESG_or_not",
    width=500,
    facet_row="dataset",
)
fig.show()

In [None]:
# calculate accuracy (and its standard error) of predicting label=1 if similarity>x
# x is optimized by sweeping from -1 to 1 in steps of 0.001
def accuracy_and_se(
    cosine_similarity: Iterable[float], labeled_similarity: Iterable[int]
) -> Tuple[float, float]:
    """Return the best threshold-classifier accuracy and its standard error.

    For each threshold t in -1.000, -0.999, ..., 0.999 an item is predicted 1 when
    its similarity is strictly greater than t and -1 otherwise. The accuracy
    returned is the highest one found over all thresholds.

    Args:
        cosine_similarity: Similarity score of each item.
        labeled_similarity: Label of each item (1 or -1), in the same order.
            If the two inputs differ in length, only the leading pairs that
            both provide are scored.

    Returns:
        ``(accuracy, standard_error)``, where ``standard_error`` is the binomial
        standard error ``sqrt(a * (1 - a) / n)`` and ``n`` is the number of
        similarity scores.

    Raises:
        ValueError: If there is no (similarity, label) pair to evaluate.
    """
    # Materialise the inputs once so they can be scanned for every threshold
    # (this also lets one-shot iterators be passed in).
    scores = list(cosine_similarity)
    pairs = list(zip(scores, labeled_similarity))
    if not pairs:
        raise ValueError("accuracy_and_se needs at least one (similarity, label) pair")

    total = len(pairs)  # loop-invariant, so counted once rather than per threshold
    a = 0.0  # accuracies are never negative, so 0.0 is a safe starting maximum
    for threshold_thousandths in range(-1000, 1000):
        threshold = threshold_thousandths / 1000
        correct = sum(1 for cs, ls in pairs if (1 if cs > threshold else -1) == ls)
        a = max(a, correct / total)

    n = len(scores)
    standard_error = (a * (1 - a) / n) ** 0.5  # standard error of binomial
    return a, standard_error

In [None]:
# Encode the cleaned labels numerically: 'Focus' -> 1, 'No' -> -1.
df = df.assign(ESG_or_not=df['ESG_or_not'].map({'Focus': 1, 'No': -1}))


In [None]:
df

Unnamed: 0,content,sentiment,Sector,Industry,max_cosine_similarities,ESG_or_not,SASB,SASB_embedding,dataset
87,More millennials are turning 40 ‚Äî and they'r...,"{'positive': 0.06741135, 'negative': 0.1715823...",Services,Hotels & Lodging,0.785040,-1,{'Water Management': 'Hotel buildings require ...,"{'Water Management': [0.013346040463976059, -0...",train
53,"FedEx Parks Planes, Maersk Cancels Sails: Worl...","{'positive': 0.0276921, 'negative': 0.91408736...",Transportation,Air Freight & Logistics,0.806254,-1,{'Greenhouse Gas Emissions': 'Air Freight & Lo...,{'Greenhouse Gas Emissions': [0.00950879266272...,train
301,You could be buying shoplifted stuff on Amazon...,"{'positive': 0.034610316, 'negative': 0.476638...",Consumer Goods,Multiline and Specialty Retailers & Distributors,0.803166,1,{'Workforce Diversity & Inclusion': 'The Multi...,{'Workforce Diversity & Inclusion': [-0.022194...,train
184,Marketers: Turn Impressions Into Intent This H...,"{'positive': 0.09345539, 'negative': 0.0399913...",Consumer Goods,E-Commerce,0.802793,-1,{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...,train
419,Royal Caribbean Inks Sustainable Shipbuilding ...,"{'positive': 0.30869853, 'negative': 0.0087451...",Transportation,Cruise Lines,0.792948,1,{'Customer Health & Safety': 'Cruise lines off...,{'Customer Health & Safety': [0.02446571152799...,train
...,...,...,...,...,...,...,...,...,...
521,Waste Management (WM) Stock Moves -0.91%: What...,"{'positive': 0.6840206, 'negative': 0.23210213...",Infrastructure,Waste Management,0.805994,-1,{'Greenhouse Gas Emissions': 'Landfills are a ...,{'Greenhouse Gas Emissions': [-0.0001698792302...,test
19,Newmont Corporation (NEM) Stock Moves -0.58%: ...,"{'positive': 0.07588659, 'negative': 0.8466317...",Extractives & Minerals Processing,Metals & Mining,0.756432,-1,{'Tailings Storage Facilities Management': 'Th...,{'Tailings Storage Facilities Management': [0....,test
29,"CFOs Boost Currency Protections, Extend Hedge ...","{'positive': 0.031100325, 'negative': 0.955758...",Food & Beverage,Non-Alcoholic Beverages,0.776955,-1,{'Water Management': 'Water management relates...,"{'Water Management': [0.007515041336335117, -0...",test
578,"5,000 GM Employees Reportedly Take Buyouts In ...","{'positive': 0.013290622, 'negative': 0.963200...",Transportation,Automobiles,0.811529,1,{'Product Safety': 'Driving is a risky activit...,"{'Product Safety': [-0.00859951505673047, -0.0...",test


In [None]:
# Print the best-threshold accuracy of each split with a 1.96 * SE margin.
for dataset in ("train", "test"):
    data = df.loc[df["dataset"] == dataset]
    a, se = accuracy_and_se(
        data["max_cosine_similarities"],
        data["ESG_or_not"],
    )
    print(f"{dataset} accuracy: {a:0.1%} ± {1.96 * se:0.1%}")

train accuracy: 64.3% ± 5.6%
test accuracy: 65.5% ± 5.6%


In [None]:
import os

import openai

# Never hardcode the API key in the notebook: it ends up in version control and in
# every shared copy of the file. Read it from the environment instead; a missing
# variable raises KeyError here rather than failing later on the first API call.
# NOTE(review): the literal key that used to be committed in this cell should be
# treated as leaked and revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
# Collect the 'content' value of every row, in row order, as a plain Python list
# so it can be passed as the input of the embedding request.
news_texts = df['content'].tolist()

In [None]:
# Request embeddings for all news texts in a single openai.Embedding.create call.
# NOTE(review): the whole list is sent in one request; presumably the API limits
# the number of inputs per call — confirm before running on a larger dataset.
embeddings = openai.Embedding.create(
    input=news_texts,
    engine="text-embedding-ada-002"
)

In [None]:
# Pull the raw vector out of each item of the response's 'data' list.
# NOTE(review): assumes the items come back in the same order as news_texts — confirm.
Embeddings = [item['embedding'] for item in embeddings['data']]

# Store one vector per row; this relies on len(Embeddings) == len(df).
df['embedding'] = Embeddings

In [None]:
# Display the frame to check the newly added 'embedding' column.
df

Unnamed: 0,content,sentiment,Sector,Industry,max_cosine_similarities,ESG_or_not,SASB,SASB_embedding,dataset,embedding
87,More millennials are turning 40 ‚Äî and they'r...,"{'positive': 0.06741135, 'negative': 0.1715823...",Services,Hotels & Lodging,0.785040,-1,{'Water Management': 'Hotel buildings require ...,"{'Water Management': [0.013346040463976059, -0...",train,"[0.0034837801940739155, -0.018379943445324898,..."
53,"FedEx Parks Planes, Maersk Cancels Sails: Worl...","{'positive': 0.0276921, 'negative': 0.91408736...",Transportation,Air Freight & Logistics,0.806254,-1,{'Greenhouse Gas Emissions': 'Air Freight & Lo...,{'Greenhouse Gas Emissions': [0.00950879266272...,train,"[-0.016028370708227158, -0.038418304175138474,..."
301,You could be buying shoplifted stuff on Amazon...,"{'positive': 0.034610316, 'negative': 0.476638...",Consumer Goods,Multiline and Specialty Retailers & Distributors,0.803166,1,{'Workforce Diversity & Inclusion': 'The Multi...,{'Workforce Diversity & Inclusion': [-0.022194...,train,"[0.006538767833262682, -0.0335729718208313, -0..."
184,Marketers: Turn Impressions Into Intent This H...,"{'positive': 0.09345539, 'negative': 0.0399913...",Consumer Goods,E-Commerce,0.802793,-1,{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...,train,"[-0.012135321274399757, -0.015672283247113228,..."
419,Royal Caribbean Inks Sustainable Shipbuilding ...,"{'positive': 0.30869853, 'negative': 0.0087451...",Transportation,Cruise Lines,0.792948,1,{'Customer Health & Safety': 'Cruise lines off...,{'Customer Health & Safety': [0.02446571152799...,train,"[0.00059090880677104, -0.040115032345056534, 0..."
...,...,...,...,...,...,...,...,...,...,...
521,Waste Management (WM) Stock Moves -0.91%: What...,"{'positive': 0.6840206, 'negative': 0.23210213...",Infrastructure,Waste Management,0.805994,-1,{'Greenhouse Gas Emissions': 'Landfills are a ...,{'Greenhouse Gas Emissions': [-0.0001698792302...,test,"[-0.005872064735740423, -0.031545888632535934,..."
19,Newmont Corporation (NEM) Stock Moves -0.58%: ...,"{'positive': 0.07588659, 'negative': 0.8466317...",Extractives & Minerals Processing,Metals & Mining,0.756432,-1,{'Tailings Storage Facilities Management': 'Th...,{'Tailings Storage Facilities Management': [0....,test,"[-0.013726205565035343, -0.023092789575457573,..."
29,"CFOs Boost Currency Protections, Extend Hedge ...","{'positive': 0.031100325, 'negative': 0.955758...",Food & Beverage,Non-Alcoholic Beverages,0.776955,-1,{'Water Management': 'Water management relates...,"{'Water Management': [0.007515041336335117, -0...",test,"[-0.008213700726628304, -0.018072837963700294,..."
578,"5,000 GM Employees Reportedly Take Buyouts In ...","{'positive': 0.013290622, 'negative': 0.963200...",Transportation,Automobiles,0.811529,1,{'Product Safety': 'Driving is a risky activit...,"{'Product Safety': [-0.00859951505673047, -0.0...",test,"[-0.02998894639313221, -0.019518667832016945, ..."


In [None]:
import ast

def compute_similarity(news_embedding, sasb_string):
    """Score one news embedding against every SASB topic embedding.

    Args:
        news_embedding: Embedding vector of the news article.
        sasb_string: String holding a Python dict literal that maps each topic
            name to that topic's embedding vector.

    Returns:
        Dict mapping each topic name to the cosine similarity between
        ``news_embedding`` and that topic's embedding.
    """
    # The column stores the dict as text, so parse it back into a real dict first.
    topic_embeddings = ast.literal_eval(sasb_string)
    return {
        topic: cosine_similarity(news_embedding, topic_vector)
        for topic, topic_vector in topic_embeddings.items()
    }

In [None]:
# For every row, compare the article embedding with each SASB topic embedding of
# that row and keep the resulting {topic: similarity} dict.
df['cosine_similarities'] = df.apply(lambda row: compute_similarity(row['embedding'], row['SASB_embedding']), axis=1)

In [None]:
# Display the frame to check the newly added 'cosine_similarities' column.
df

Unnamed: 0,content,sentiment,Sector,Industry,max_cosine_similarities,ESG_or_not,SASB,SASB_embedding,dataset,embedding,cosine_similarities
87,More millennials are turning 40 ‚Äî and they'r...,"{'positive': 0.06741135, 'negative': 0.1715823...",Services,Hotels & Lodging,0.785040,-1,{'Water Management': 'Hotel buildings require ...,"{'Water Management': [0.013346040463976059, -0...",train,"[0.0034837801940739155, -0.018379943445324898,...","{'Water Management': 0.7505384757725996, 'Clim..."
53,"FedEx Parks Planes, Maersk Cancels Sails: Worl...","{'positive': 0.0276921, 'negative': 0.91408736...",Transportation,Air Freight & Logistics,0.806254,-1,{'Greenhouse Gas Emissions': 'Air Freight & Lo...,{'Greenhouse Gas Emissions': [0.00950879266272...,train,"[-0.016028370708227158, -0.038418304175138474,...",{'Greenhouse Gas Emissions': 0.794454891784717...
301,You could be buying shoplifted stuff on Amazon...,"{'positive': 0.034610316, 'negative': 0.476638...",Consumer Goods,Multiline and Specialty Retailers & Distributors,0.803166,1,{'Workforce Diversity & Inclusion': 'The Multi...,{'Workforce Diversity & Inclusion': [-0.022194...,train,"[0.006538767833262682, -0.0335729718208313, -0...",{'Workforce Diversity & Inclusion': 0.76499455...
184,Marketers: Turn Impressions Into Intent This H...,"{'positive': 0.09345539, 'negative': 0.0399913...",Consumer Goods,E-Commerce,0.802793,-1,{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...,train,"[-0.012135321274399757, -0.015672283247113228,...",{'Product Packaging & Distribution': 0.7775707...
419,Royal Caribbean Inks Sustainable Shipbuilding ...,"{'positive': 0.30869853, 'negative': 0.0087451...",Transportation,Cruise Lines,0.792948,1,{'Customer Health & Safety': 'Cruise lines off...,{'Customer Health & Safety': [0.02446571152799...,train,"[0.00059090880677104, -0.040115032345056534, 0...",{'Customer Health & Safety': 0.787090176529222...
...,...,...,...,...,...,...,...,...,...,...,...
521,Waste Management (WM) Stock Moves -0.91%: What...,"{'positive': 0.6840206, 'negative': 0.23210213...",Infrastructure,Waste Management,0.805994,-1,{'Greenhouse Gas Emissions': 'Landfills are a ...,{'Greenhouse Gas Emissions': [-0.0001698792302...,test,"[-0.005872064735740423, -0.031545888632535934,...",{'Greenhouse Gas Emissions': 0.743747592175264...
19,Newmont Corporation (NEM) Stock Moves -0.58%: ...,"{'positive': 0.07588659, 'negative': 0.8466317...",Extractives & Minerals Processing,Metals & Mining,0.756432,-1,{'Tailings Storage Facilities Management': 'Th...,{'Tailings Storage Facilities Management': [0....,test,"[-0.013726205565035343, -0.023092789575457573,...",{'Tailings Storage Facilities Management': 0.7...
29,"CFOs Boost Currency Protections, Extend Hedge ...","{'positive': 0.031100325, 'negative': 0.955758...",Food & Beverage,Non-Alcoholic Beverages,0.776955,-1,{'Water Management': 'Water management relates...,"{'Water Management': [0.007515041336335117, -0...",test,"[-0.008213700726628304, -0.018072837963700294,...","{'Water Management': 0.7251327993457838, 'Prod..."
578,"5,000 GM Employees Reportedly Take Buyouts In ...","{'positive': 0.013290622, 'negative': 0.963200...",Transportation,Automobiles,0.811529,1,{'Product Safety': 'Driving is a risky activit...,"{'Product Safety': [-0.00859951505673047, -0.0...",test,"[-0.02998894639313221, -0.019518667832016945, ...","{'Product Safety': 0.742490300578744, 'Materia..."


In [None]:
# Debug aid: dump the full SASB_embedding value of the first row to inspect its structure.
print(df['SASB_embedding'].iloc[0])

{'Water Management': [0.013346040463976059, -0.014947034242274502, 0.01824202553272061, -0.00801161015269346, 0.01573092285614651, 0.021244717511281177, -0.009466455206371696, -0.002859865980016756, 0.00016628575198718293, -0.03648405106544122, -0.004809623527419138, 0.02403483138940294, 0.003839726980187406, 0.008762284078656165, -0.005962205980426125, -0.004560506383180152, 0.03308276793080801, -0.0062777543631288395, 0.027024238982915894, -0.012070560685472454, -0.028884313659900317, 0.004723262917416289, -0.01537219416844237, 0.014880603003810772, -0.020354538915867203, -0.008117900134235426, 0.01092129932872737, -0.028166856284492038, -0.005859237560807344, -0.02169644993283454, 0.012661798707799645, -0.017325274441921144, -0.030106648447632937, -0.0031239301529100805, -0.005683194778878461, -0.016740677680795197, 0.005291250006281176, 0.0014606574377978528, 0.007267580281899689, -0.022865641592441303, 0.015491770397677082, 0.007174576548050468, 0.010629001879486961, 0.01822873928

In [None]:
def find_max_embedding(sasb_embeddings, cosine_similarities):
    """Return the entry of ``sasb_embeddings`` that belongs to the most similar topic.

    Args:
        sasb_embeddings: Dict keyed by topic name (the values may be embeddings,
            descriptions, or anything else), or the string form of such a dict.
        cosine_similarities: Dict mapping topic name to similarity score, or the
            string form of such a dict.

    Returns:
        The value stored in ``sasb_embeddings`` under the topic with the highest
        similarity, or ``None`` when that topic is missing from ``sasb_embeddings``
        or when ``cosine_similarities`` is empty.
    """
    import ast  # local import so this cell does not depend on another cell's import

    def _as_dict(value):
        # Dict-valued columns are stored as text in this notebook (compute_similarity
        # parses SASB_embedding the same way), so accept either representation.
        return ast.literal_eval(value) if isinstance(value, str) else value

    sasb_embeddings = _as_dict(sasb_embeddings)
    cosine_similarities = _as_dict(cosine_similarities)

    # max() raises ValueError on an empty mapping; there is simply no best topic then.
    if not cosine_similarities:
        return None

    # Find the name of the topic with the highest similarity.
    max_similarity_topic = max(cosine_similarities, key=cosine_similarities.get)

    # Look up that topic's entry (embedding, description, ...).
    return sasb_embeddings.get(max_similarity_topic, None)

# Apply find_max_embedding row by row.
# 'max_embedding' receives the entry of SASB_embedding for the most similar topic;
# 'max_embedding_word' receives the entry of SASB for that same topic.
df['max_embedding'] = df.apply(lambda row: find_max_embedding(row['SASB_embedding'], row['cosine_similarities']), axis=1)
df['max_embedding_word'] = df.apply(lambda row: find_max_embedding(row['SASB'], row['cosine_similarities']), axis=1)



In [None]:
# Display the frame to check the 'max_embedding' and 'max_embedding_word' columns.
df

Unnamed: 0,content,sentiment,Sector,Industry,max_cosine_similarities,ESG_or_not,SASB,SASB_embedding,dataset,embedding,cosine_similarities,max_embedding,max_embedding_word
87,More millennials are turning 40 ‚Äî and they'r...,"{'positive': 0.06741135, 'negative': 0.1715823...",Services,Hotels & Lodging,0.785040,-1,{'Water Management': 'Hotel buildings require ...,"{'Water Management': [0.013346040463976059, -0...",train,"[0.0034837801940739155, -0.018379943445324898,...","{'Water Management': 0.7505384757725996, 'Clim...","[0.008788215686639065, -0.005378414729180266, ...",The Hotels & Lodging industry is highly relian...
53,"FedEx Parks Planes, Maersk Cancels Sails: Worl...","{'positive': 0.0276921, 'negative': 0.91408736...",Transportation,Air Freight & Logistics,0.806254,-1,{'Greenhouse Gas Emissions': 'Air Freight & Lo...,{'Greenhouse Gas Emissions': [0.00950879266272...,train,"[-0.016028370708227158, -0.038418304175138474,...",{'Greenhouse Gas Emissions': 0.794454891784717...,"[-0.002340704698821943, -0.040607881330244414,...",Employees in the Air Freight & Logistics indus...
301,You could be buying shoplifted stuff on Amazon...,"{'positive': 0.034610316, 'negative': 0.476638...",Consumer Goods,Multiline and Specialty Retailers & Distributors,0.803166,1,{'Workforce Diversity & Inclusion': 'The Multi...,{'Workforce Diversity & Inclusion': [-0.022194...,train,"[0.006538767833262682, -0.0335729718208313, -0...",{'Workforce Diversity & Inclusion': 0.76499455...,"[-0.006217272626085497, -0.0009317641472651714...",Consumers trust retail entities with their fin...
184,Marketers: Turn Impressions Into Intent This H...,"{'positive': 0.09345539, 'negative': 0.0399913...",Consumer Goods,E-Commerce,0.802793,-1,{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...,train,"[-0.012135321274399757, -0.015672283247113228,...",{'Product Packaging & Distribution': 0.7775707...,"[0.009207524693889902, -0.012759092727629052, ...",The business model of entities in the E-Commer...
419,Royal Caribbean Inks Sustainable Shipbuilding ...,"{'positive': 0.30869853, 'negative': 0.0087451...",Transportation,Cruise Lines,0.792948,1,{'Customer Health & Safety': 'Cruise lines off...,{'Customer Health & Safety': [0.02446571152799...,train,"[0.00059090880677104, -0.040115032345056534, 0...",{'Customer Health & Safety': 0.787090176529222...,"[0.0242732587449533, -0.027872304145332892, 0....",Cruise entities operate a uniquely transitory ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
521,Waste Management (WM) Stock Moves -0.91%: What...,"{'positive': 0.6840206, 'negative': 0.23210213...",Infrastructure,Waste Management,0.805994,-1,{'Greenhouse Gas Emissions': 'Landfills are a ...,{'Greenhouse Gas Emissions': [-0.0001698792302...,test,"[-0.005872064735740423, -0.031545888632535934,...",{'Greenhouse Gas Emissions': 0.743747592175264...,"[-0.0011502468158345717, -0.032458606820530075...",Organised labour plays an important role in th...
19,Newmont Corporation (NEM) Stock Moves -0.58%: ...,"{'positive': 0.07588659, 'negative': 0.8466317...",Extractives & Minerals Processing,Metals & Mining,0.756432,-1,{'Tailings Storage Facilities Management': 'Th...,{'Tailings Storage Facilities Management': [0....,test,"[-0.013726205565035343, -0.023092789575457573,...",{'Tailings Storage Facilities Management': 0.7...,"[-0.00513920216254887, -0.02769421873927176, 0...",Metals and mining entities face inherent tensi...
29,"CFOs Boost Currency Protections, Extend Hedge ...","{'positive': 0.031100325, 'negative': 0.955758...",Food & Beverage,Non-Alcoholic Beverages,0.776955,-1,{'Water Management': 'Water management relates...,"{'Water Management': [0.007515041336335117, -0...",test,"[-0.008213700726628304, -0.018072837963700294,...","{'Water Management': 0.7251327993457838, 'Prod...","[0.016293765295825713, -0.047365905599416926, ...",Entities in the Non-Alcoholic Beverages indust...
578,"5,000 GM Employees Reportedly Take Buyouts In ...","{'positive': 0.013290622, 'negative': 0.963200...",Transportation,Automobiles,0.811529,1,{'Product Safety': 'Driving is a risky activit...,"{'Product Safety': [-0.00859951505673047, -0.0...",test,"[-0.02998894639313221, -0.019518667832016945, ...","{'Product Safety': 0.742490300578744, 'Materia...","[-0.009876224631166941, -0.024663868011668925,...",Many workers in the Automobiles industry are c...


In [None]:
# Display the frame again.
# NOTE(review): duplicates the previous display cell; candidate for removal.
df

Unnamed: 0,content,sentiment,Sector,Industry,max_cosine_similarities,ESG_or_not,SASB,SASB_embedding,dataset,embedding,cosine_similarities,max_embedding
87,More millennials are turning 40 ‚Äî and they'r...,"{'positive': 0.06741135, 'negative': 0.1715823...",Services,Hotels & Lodging,0.785040,-1,{'Water Management': 'Hotel buildings require ...,"{'Water Management': [0.013346040463976059, -0...",train,"[0.0034837801940739155, -0.018379943445324898,...","{'Water Management': 0.7505384757725996, 'Clim...","[0.008788215686639065, -0.005378414729180266, ..."
53,"FedEx Parks Planes, Maersk Cancels Sails: Worl...","{'positive': 0.0276921, 'negative': 0.91408736...",Transportation,Air Freight & Logistics,0.806254,-1,{'Greenhouse Gas Emissions': 'Air Freight & Lo...,{'Greenhouse Gas Emissions': [0.00950879266272...,train,"[-0.016028370708227158, -0.038418304175138474,...",{'Greenhouse Gas Emissions': 0.794454891784717...,"[-0.002340704698821943, -0.040607881330244414,..."
301,You could be buying shoplifted stuff on Amazon...,"{'positive': 0.034610316, 'negative': 0.476638...",Consumer Goods,Multiline and Specialty Retailers & Distributors,0.803166,1,{'Workforce Diversity & Inclusion': 'The Multi...,{'Workforce Diversity & Inclusion': [-0.022194...,train,"[0.006538767833262682, -0.0335729718208313, -0...",{'Workforce Diversity & Inclusion': 0.76499455...,"[-0.006217272626085497, -0.0009317641472651714..."
184,Marketers: Turn Impressions Into Intent This H...,"{'positive': 0.09345539, 'negative': 0.0399913...",Consumer Goods,E-Commerce,0.802793,-1,{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...,train,"[-0.012135321274399757, -0.015672283247113228,...",{'Product Packaging & Distribution': 0.7775707...,"[0.009207524693889902, -0.012759092727629052, ..."
419,Royal Caribbean Inks Sustainable Shipbuilding ...,"{'positive': 0.30869853, 'negative': 0.0087451...",Transportation,Cruise Lines,0.792948,1,{'Customer Health & Safety': 'Cruise lines off...,{'Customer Health & Safety': [0.02446571152799...,train,"[0.00059090880677104, -0.040115032345056534, 0...",{'Customer Health & Safety': 0.787090176529222...,"[0.0242732587449533, -0.027872304145332892, 0...."
...,...,...,...,...,...,...,...,...,...,...,...,...
521,Waste Management (WM) Stock Moves -0.91%: What...,"{'positive': 0.6840206, 'negative': 0.23210213...",Infrastructure,Waste Management,0.805994,-1,{'Greenhouse Gas Emissions': 'Landfills are a ...,{'Greenhouse Gas Emissions': [-0.0001698792302...,test,"[-0.005872064735740423, -0.031545888632535934,...",{'Greenhouse Gas Emissions': 0.743747592175264...,"[-0.0011502468158345717, -0.032458606820530075..."
19,Newmont Corporation (NEM) Stock Moves -0.58%: ...,"{'positive': 0.07588659, 'negative': 0.8466317...",Extractives & Minerals Processing,Metals & Mining,0.756432,-1,{'Tailings Storage Facilities Management': 'Th...,{'Tailings Storage Facilities Management': [0....,test,"[-0.013726205565035343, -0.023092789575457573,...",{'Tailings Storage Facilities Management': 0.7...,"[-0.00513920216254887, -0.02769421873927176, 0..."
29,"CFOs Boost Currency Protections, Extend Hedge ...","{'positive': 0.031100325, 'negative': 0.955758...",Food & Beverage,Non-Alcoholic Beverages,0.776955,-1,{'Water Management': 'Water management relates...,"{'Water Management': [0.007515041336335117, -0...",test,"[-0.008213700726628304, -0.018072837963700294,...","{'Water Management': 0.7251327993457838, 'Prod...","[0.016293765295825713, -0.047365905599416926, ..."
578,"5,000 GM Employees Reportedly Take Buyouts In ...","{'positive': 0.013290622, 'negative': 0.963200...",Transportation,Automobiles,0.811529,1,{'Product Safety': 'Driving is a risky activit...,"{'Product Safety': [-0.00859951505673047, -0.0...",test,"[-0.02998894639313221, -0.019518667832016945, ...","{'Product Safety': 0.742490300578744, 'Materia...","[-0.009876224631166941, -0.024663868011668925,..."


In [None]:
# Persist the enriched frame without its index.
# NOTE(review): CSV stores list- and dict-valued cells as text, so those columns
# must be parsed again after reloading. The absolute, user-specific path also makes
# this cell non-portable — consider a configurable data directory.
df.to_csv('/Users/ruiqizhou/Downloads/annotation4_after2word.csv', index=False)


In [None]:
# Reload the saved frame. The file was written with index=False, so rows get a
# fresh default index, and any column that held lists or dicts comes back as text.
df = pd.read_csv('/Users/ruiqizhou/Downloads/annotation4_after2word.csv')

In [None]:
# Display the reloaded frame.
df

Unnamed: 0,content,sentiment,Sector,Industry,max_cosine_similarities,ESG_or_not,SASB,SASB_embedding,dataset,embedding,cosine_similarities,max_embedding,max_embedding_word
0,More millennials are turning 40 ‚Äî and they'r...,"{'positive': 0.06741135, 'negative': 0.1715823...",Services,Hotels & Lodging,0.785040,-1,{'Water Management': 'Hotel buildings require ...,"{'Water Management': [0.013346040463976059, -0...",train,"[0.0034837801940739155, -0.018379943445324898,...","{'Water Management': 0.7505384757725996, 'Clim...","[0.008788215686639065, -0.005378414729180266, ...",The Hotels & Lodging industry is highly relian...
1,"FedEx Parks Planes, Maersk Cancels Sails: Worl...","{'positive': 0.0276921, 'negative': 0.91408736...",Transportation,Air Freight & Logistics,0.806254,-1,{'Greenhouse Gas Emissions': 'Air Freight & Lo...,{'Greenhouse Gas Emissions': [0.00950879266272...,train,"[-0.016028370708227158, -0.038418304175138474,...",{'Greenhouse Gas Emissions': 0.794454891784717...,"[-0.002340704698821943, -0.040607881330244414,...",Employees in the Air Freight & Logistics indus...
2,You could be buying shoplifted stuff on Amazon...,"{'positive': 0.034610316, 'negative': 0.476638...",Consumer Goods,Multiline and Specialty Retailers & Distributors,0.803166,1,{'Workforce Diversity & Inclusion': 'The Multi...,{'Workforce Diversity & Inclusion': [-0.022194...,train,"[0.006538767833262682, -0.0335729718208313, -0...",{'Workforce Diversity & Inclusion': 0.76499455...,"[-0.006217272626085497, -0.0009317641472651714...",Consumers trust retail entities with their fin...
3,Marketers: Turn Impressions Into Intent This H...,"{'positive': 0.09345539, 'negative': 0.0399913...",Consumer Goods,E-Commerce,0.802793,-1,{'Product Packaging & Distribution': 'A signif...,{'Product Packaging & Distribution': [0.017690...,train,"[-0.012135321274399757, -0.015672283247113228,...",{'Product Packaging & Distribution': 0.7775707...,"[0.009207524693889902, -0.012759092727629052, ...",The business model of entities in the E-Commer...
4,Royal Caribbean Inks Sustainable Shipbuilding ...,"{'positive': 0.30869853, 'negative': 0.0087451...",Transportation,Cruise Lines,0.792948,1,{'Customer Health & Safety': 'Cruise lines off...,{'Customer Health & Safety': [0.02446571152799...,train,"[0.00059090880677104, -0.040115032345056534, 0...",{'Customer Health & Safety': 0.787090176529222...,"[0.0242732587449533, -0.027872304145332892, 0....",Cruise entities operate a uniquely transitory ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
550,Waste Management (WM) Stock Moves -0.91%: What...,"{'positive': 0.6840206, 'negative': 0.23210213...",Infrastructure,Waste Management,0.805994,-1,{'Greenhouse Gas Emissions': 'Landfills are a ...,{'Greenhouse Gas Emissions': [-0.0001698792302...,test,"[-0.005872064735740423, -0.031545888632535934,...",{'Greenhouse Gas Emissions': 0.743747592175264...,"[-0.0011502468158345717, -0.032458606820530075...",Organised labour plays an important role in th...
551,Newmont Corporation (NEM) Stock Moves -0.58%: ...,"{'positive': 0.07588659, 'negative': 0.8466317...",Extractives & Minerals Processing,Metals & Mining,0.756432,-1,{'Tailings Storage Facilities Management': 'Th...,{'Tailings Storage Facilities Management': [0....,test,"[-0.013726205565035343, -0.023092789575457573,...",{'Tailings Storage Facilities Management': 0.7...,"[-0.00513920216254887, -0.02769421873927176, 0...",Metals and mining entities face inherent tensi...
552,"CFOs Boost Currency Protections, Extend Hedge ...","{'positive': 0.031100325, 'negative': 0.955758...",Food & Beverage,Non-Alcoholic Beverages,0.776955,-1,{'Water Management': 'Water management relates...,"{'Water Management': [0.007515041336335117, -0...",test,"[-0.008213700726628304, -0.018072837963700294,...","{'Water Management': 0.7251327993457838, 'Prod...","[0.016293765295825713, -0.047365905599416926, ...",Entities in the Non-Alcoholic Beverages indust...
553,"5,000 GM Employees Reportedly Take Buyouts In ...","{'positive': 0.013290622, 'negative': 0.963200...",Transportation,Automobiles,0.811529,1,{'Product Safety': 'Driving is a risky activit...,"{'Product Safety': [-0.00859951505673047, -0.0...",test,"[-0.02998894639313221, -0.019518667832016945, ...","{'Product Safety': 0.742490300578744, 'Materia...","[-0.009876224631166941, -0.024663868011668925,...",Many workers in the Automobiles industry are c...


In [None]:
import json
import numpy as np
import pandas as pd

# Assumes df is the DataFrame and 'embedding' is the column that needs converting.
def convert_embedding(embedding_str):
    """Convert a stored embedding into a float64 NumPy array.

    Args:
        embedding_str: Usually the JSON text of a list of numbers (what a CSV
            round trip leaves in the column). A list, tuple or ndarray is also
            accepted and returned as a float64 array, which makes re-running the
            conversion cell on an already-converted column harmless.

    Returns:
        ``np.ndarray`` with dtype float64, or ``np.nan`` when the value is
        missing (None / NaN), is not valid JSON, or cannot be cast to floats.
    """
    # Already a sequence of numbers: nothing to parse.
    if isinstance(embedding_str, (list, tuple, np.ndarray)):
        return np.asarray(embedding_str, dtype=np.float64)

    # Missing cells come back from read_csv as float NaN; json.loads would raise
    # TypeError on them, which the original handler did not catch.
    if not isinstance(embedding_str, (str, bytes, bytearray)):
        return np.nan

    try:
        # Parse the text into a list of numbers.
        embedding_list = json.loads(embedding_str)
        return np.array(embedding_list, dtype=np.float64)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; the cast to float64 can raise either.
        return np.nan  # or return another default value if that suits the caller better

# Apply the conversion function to every value of the 'embedding' column.
df['embedding'] = df['embedding'].apply(convert_embedding)


In [None]:
# Apply the same conversion to every value of the 'max_embedding' column.
df['max_embedding'] = df['max_embedding'].apply(convert_embedding)

embeddings

In [None]:
def embedding_multiplied_by_matrix(
    embedding: List[float], matrix: torch.tensor
) -> np.array:
    """Project a single embedding through ``matrix``.

    Args:
        embedding: The embedding vector to project.
        matrix: Projection matrix; the embedding is multiplied on its left.

    Returns:
        The projected embedding as a NumPy array, detached from autograd.
    """
    projected = torch.tensor(embedding).float() @ matrix
    return projected.detach().numpy()


# compute custom embeddings and new cosine similarities
def apply_matrix_to_embeddings_dataframe(matrix: torch.tensor, df: pd.DataFrame):
    """Add projected embeddings and their cosine similarity to ``df`` in place.

    Writes three columns: ``embedding_custom`` and ``max_embedding_custom`` (the
    ``embedding`` / ``max_embedding`` columns projected through ``matrix``) and
    ``cosine_similarity_custom`` (the cosine similarity of the two projections).
    Returns nothing.
    """

    def project(vector):
        return embedding_multiplied_by_matrix(vector, matrix)

    def similarity_of_projections(row):
        return cosine_similarity(
            row["embedding_custom"], row["max_embedding_custom"]
        )

    for column in ("embedding", "max_embedding"):
        df[f"{column}_custom"] = df[column].apply(project)

    df["cosine_similarity_custom"] = df.apply(similarity_of_projections, axis=1)


In [None]:
def optimize_matrix(
    modified_embedding_length: int = 2048,  # in my brief experimentation, bigger was better (2048 is length of babbage encoding)
    batch_size: int = 100,
    max_epochs: int = 100,
    learning_rate: float = 100.0,  # seemed to work best when similar to batch size - feel free to try a range of values
    dropout_fraction: float = 0.0,  # in my testing, dropout helped by a couple percentage points (definitely not necessary)
    df: pd.DataFrame = df,
    print_progress: bool = True,
    save_results: bool = True,
) -> pd.DataFrame:
    """Train a projection matrix on the training rows and return per-epoch results.

    The matrix is optimized with plain gradient descent so that the cosine
    similarity between the projected ``embedding`` and ``max_embedding`` columns
    matches the ``ESG_or_not`` label (mean squared error).

    Args:
        modified_embedding_length: Number of columns of the projection matrix,
            i.e. the length of the projected embeddings.
        batch_size: Mini-batch size of the training loader.
        max_epochs: Number of passes over the training rows.
        learning_rate: Step size of the gradient update.
        dropout_fraction: Dropout probability applied to the input embeddings
            during training only.
        df: Frame with ``embedding``, ``max_embedding``, ``ESG_or_not`` and
            ``dataset`` ("train"/"test") columns. The default is bound to the
            notebook-level ``df`` at the time this function is defined.
        print_progress: Print train/test accuracy after every epoch.
        save_results: Write the results to ``{run_id}_optimization_results.csv``.

    Returns:
        DataFrame with one row per (epoch, split) holding the loss, accuracy,
        run id, hyperparameters and the matrix snapshot of that epoch.

    Side effects:
        ``df`` gains/overwrites the ``embedding_custom``, ``max_embedding_custom``
        and ``cosine_similarity_custom`` columns on every epoch.
    """
    run_id = random.randint(0, 2 ** 31 - 1)  # (range is arbitrary)

    # convert from dataframe to torch tensors
    # e is for embedding, s for similarity label
    def tensors_from_dataframe(
        df: pd.DataFrame,
        embedding_column_1: str,
        embedding_column_2: str,
        similarity_label_column: str,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        e1 = np.stack(np.array(df[embedding_column_1].values))
        e2 = np.stack(np.array(df[embedding_column_2].values))
        s = np.stack(np.array(df[similarity_label_column].astype("float").values))

        e1 = torch.from_numpy(e1).float()
        e2 = torch.from_numpy(e2).float()
        s = torch.from_numpy(s).float()

        return e1, e2, s

    e1_train, e2_train, s_train = tensors_from_dataframe(
        df[df["dataset"] == "train"], "embedding", "max_embedding", "ESG_or_not"
    )
    e1_test, e2_test, s_test = tensors_from_dataframe(
        df[df["dataset"] == "test"], "embedding", "max_embedding", "ESG_or_not"
    )

    # create dataset and loader
    train_dataset = torch.utils.data.TensorDataset(e1_train, e2_train, s_train)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True
    )

    # define model (similarity of projected embeddings)
    def model(embedding_1, embedding_2, matrix, dropout_fraction=dropout_fraction):
        e1 = torch.nn.functional.dropout(embedding_1, p=dropout_fraction)
        e2 = torch.nn.functional.dropout(embedding_2, p=dropout_fraction)
        modified_embedding_1 = e1 @ matrix  # @ is matrix multiplication
        modified_embedding_2 = e2 @ matrix
        similarity = torch.nn.functional.cosine_similarity(
            modified_embedding_1, modified_embedding_2
        )
        return similarity

    # define loss function to minimize
    def mse_loss(predictions, targets):
        difference = predictions - targets
        return torch.sum(difference * difference) / difference.numel()

    # initialize projection matrix
    embedding_length = len(df["embedding"].values[0])
    matrix = torch.randn(
        embedding_length, modified_embedding_length, requires_grad=True
    )

    epochs, types, losses, accuracies, matrices = [], [], [], [], []
    for epoch in range(1, 1 + max_epochs):
        # iterate through training dataloader
        for batch_e1, batch_e2, actual_similarity in train_loader:
            # generate prediction
            predicted_similarity = model(batch_e1, batch_e2, matrix)
            # get loss and perform backpropagation
            loss = mse_loss(predicted_similarity, actual_similarity)
            loss.backward()
            # update the weights
            with torch.no_grad():
                matrix -= matrix.grad * learning_rate
                # set gradients to zero
                matrix.grad.zero_()

        # calculate test loss; functional dropout is active by default, so it is
        # switched off here to keep the evaluation deterministic, and no autograd
        # graph is needed for it
        with torch.no_grad():
            test_predictions = model(e1_test, e2_test, matrix, dropout_fraction=0.0)
            test_loss = mse_loss(test_predictions, s_test)

        # compute custom embeddings and new cosine similarities
        apply_matrix_to_embeddings_dataframe(matrix, df)

        # matrix is updated in place by the training loop and .detach().numpy()
        # shares its storage, so take a real copy to get a per-epoch snapshot
        # (otherwise every saved matrix ends up equal to the final one)
        matrix_snapshot = matrix.detach().clone().numpy()

        # calculate train and test accuracy
        for split in ["train", "test"]:
            split_rows = df[df["dataset"] == split]
            # score the similarities produced by the current matrix; the original
            # "max_cosine_similarities" column does not depend on the matrix, so
            # using it would report the same accuracy for every epoch
            accuracy, se = accuracy_and_se(
                split_rows["cosine_similarity_custom"], split_rows["ESG_or_not"]
            )

            # record results of each epoch
            epochs.append(epoch)
            types.append(split)
            # the train loss is the loss of the last mini-batch of the epoch
            losses.append(loss.item() if split == "train" else test_loss.item())
            accuracies.append(accuracy)
            matrices.append(matrix_snapshot)

            # optionally print accuracies
            if print_progress is True:
                print(
                    f"Epoch {epoch}/{max_epochs}: {split} accuracy: {accuracy:0.1%} ± {1.96 * se:0.1%}"
                )

    data = pd.DataFrame(
        {"epoch": epochs, "type": types, "loss": losses, "accuracy": accuracies}
    )
    data["run_id"] = run_id
    data["modified_embedding_length"] = modified_embedding_length
    data["batch_size"] = batch_size
    data["max_epochs"] = max_epochs
    data["learning_rate"] = learning_rate
    data["dropout_fraction"] = dropout_fraction
    data[
        "matrix"
    ] = matrices  # saving every single matrix can get big; feel free to delete/change
    if save_results is True:
        data.to_csv(f"{run_id}_optimization_results.csv", index=False)

    return data



In [None]:
# example hyperparameter search
# I recommend starting with max_epochs=10 while initially exploring
# Runs optimize_matrix once per (batch_size, learning_rate) pair, with the epoch
# count and dropout held fixed, and collects the per-epoch result frames in `results`.
# Nothing is written to disk because save_results=False.
results = []
max_epochs = 30
dropout_fraction = 0.2
for batch_size, learning_rate in [(10, 10), (100, 100), (1000, 1000)]:
    result = optimize_matrix(
        batch_size=batch_size,
        learning_rate=learning_rate,
        max_epochs=max_epochs,
        dropout_fraction=dropout_fraction,
        save_results=False,
    )
    results.append(result)