In [3]:
import os
import requests
import time
import pandas as pd
import numpy as np
import torch 
from torch import nn
from torch.optim import AdamW  
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd 
from transformers import AutoModelForSequenceClassification, AutoModelForCausalLM, AutoTokenizer, pipeline, BertTokenizer, BertModel
from transformers import Trainer, TrainingArguments

from sklearn.model_selection import train_test_split
from tqdm import tqdm
torch.set_default_device("cpu")
import random
import json

In [4]:
pwd

'/Users/chienshiyun/Documents/_DSA3101/proj/dsa3101/notebooks'

# Data Cleaning
1. Remove duplicate data 
2. Handle missing values

In [None]:
# Read the Pfizer 2022 OCR extract (one JSON document) into its own DataFrame
# and preview the first rows.
pfizer_path = '../dataset_json/Health/Na/Pfizer_2022_ocr.json'
with open(pfizer_path, 'r', encoding='utf-8') as fh:
    pfizer_data = json.load(fh)

pfizer_df = pd.DataFrame(pfizer_data)
pfizer_df.head(5)

In [None]:
folder_path = "../dataset_json/Tech/AsiaPac/"

# os.listdir returns entries in arbitrary order; sorting keeps the load order
# (and therefore the row order of the later concat / drop_duplicates) stable
# between runs and machines.
json_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))

# File name -> DataFrame built from that file's JSON records.
dfs = {}

for file in json_files:
    file_path = os.path.join(folder_path, file)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)  # Load JSON file

    # Only a top-level JSON list is turned into a frame; anything else is
    # reported and skipped.
    if not isinstance(data, list):
        print(f"Skipping {file}: Unsupported format")
        continue

    df = pd.DataFrame(data)
    dfs[file] = df

# Add the separately loaded Pfizer report to the same collection.
dfs['pfizer_2022_ocr.json'] = pfizer_df

In [None]:
folder_path = "../dataset_json/Tech/Na/"

# Sorted for a reproducible load order (os.listdir order is arbitrary).
json_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))

# Frames are added to the `dfs` dict created when the AsiaPac folder was loaded.
for file in json_files:
    file_path = os.path.join(folder_path, file)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)  # Load JSON file

    # Only a top-level JSON list is turned into a frame; anything else is
    # reported and skipped.
    if not isinstance(data, list):
        print(f"Skipping {file}: Unsupported format")
        continue

    df = pd.DataFrame(data)
    dfs[file] = df

In [None]:
# Number of per-file DataFrames collected in `dfs` so far.
len(dfs)

In [None]:
# For every loaded frame, print how many of its rows are exact duplicates.
# Other inspections used while exploring (df.info(), df.describe(include="all"),
# df.columns, df.isnull().sum()) are intentionally left disabled.
for df in dfs.values():
    print('df info is')
    duplicate_count = df.duplicated().sum()
    print(duplicate_count)

In [None]:
# Stack every per-file frame into one table with a fresh 0..n-1 index.
df_combined = pd.concat(list(dfs.values()), ignore_index=True)

# Rows that contain at least one missing value (only counted here, not removed).
has_missing = df_combined.isnull().any(axis=1)
missing_rows = df_combined[has_missing]
print(len(missing_rows))

# Drop exact duplicate rows, then show which Python types occur in the text column.
df_combined = df_combined.drop_duplicates()
df_combined["esg_text"].apply(type).value_counts()

In [None]:
# Same de-duplication and type check as the previous cell; re-running it on an
# already de-duplicated frame leaves the rows unchanged.
df_combined = df_combined.drop_duplicates()
df_combined["esg_text"].apply(type).value_counts()

# Data labeling

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint used for labelling; the alternative is kept for reference.
model_name = "nbroad/ESG-BERT"  # "nlpaueb/sec-bert-esg"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

# Replaces the in-memory df_combined built in the cleaning section with the
# version saved to CSV.
df_combined = pd.read_csv('../combined_pdfs_2602.csv')


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [11]:
# Human-readable ESG topics, looked up by output position in classify_text
# (candidate_labels[i] is the name given to class index i).
# NOTE(review): the order is assumed to match the label ids of the loaded
# checkpoint -- confirm against model.config.id2label before trusting the names.
candidate_labels = [
    # Environment
    "Environment - Energy efficiency",
    "Environment - Waste & Pollutants Generation",
    "Environment - Water Usage",
    "Environment - Climate Strategy",
    "Environment - Decarbonisation/Carbon emissions",
    "Environment - Strategy",
    # Social
    "Social - Labor Practices",
    "Social - Human Rights",
    "Social - Human Capital Management",
    "Social - Occupational Health & Safety",
    "Social - Financial Inclusion",
    "Social - Community investment",
    "Social - Customer Relations",
    # Leading space removed: it leaked into the assigned label strings.
    "Social - Privacy Protection",
    "Social - Gender and Ethnic Diversity",
    # Governance
    "Governance - Transparency & Reporting",
    "Corporate Governance",
    "Governance - Materiality",
    "Governance - Risk & Crisis Management",
    "Governance - Business Ethics",
    "Governance - Policy Influence",
    "Governance - Tax Strategy",
    "Governance - Shareholder rights",
    "Governance - Information Security/ Cybersecurity & System Availability",
    "Governance - Sustainable Finance",
    "Governance - Board Diversity",
]

# def classify_text(text):
#     if pd.isna(text): 
#         return None

#     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    
#     with torch.no_grad():
#         outputs = model(**inputs)

#     probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
#     predicted_class = torch.argmax(probabilities, dim=1).item()
    
#     return candidate_labels[predicted_class]

# df_combined["labels"] = df_combined["esg_text"].apply(classify_text)
# print(df_combined.head())


# Minimum softmax probability a class needs before its label is assigned.
THRESHOLD = 0.6


def classify_text(text):
    """Label one passage with the notebook-level model and tokenizer.

    Returns:
        None when ``text`` is missing (``pd.isna``);
        ``["Error"]`` when the number of model outputs differs from the number
        of candidate labels (a warning is printed);
        otherwise the list of candidate labels whose softmax probability is
        above ``THRESHOLD``, or ``["No Label"]`` when none is. Because the
        probabilities sum to 1, at most one label can pass while the threshold
        is above 0.5.
    """
    if pd.isna(text):
        return None

    # Inputs longer than 512 tokens are truncated by the tokenizer.
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().cpu().numpy()

    if len(probabilities) != len(candidate_labels):
        print(f"Warning: Mismatch! Probabilities: {len(probabilities)}, Labels: {len(candidate_labels)}")
        return ["Error"]

    assigned_labels = [
        label
        for label, prob in zip(candidate_labels, probabilities)
        if prob > THRESHOLD
    ]
    return assigned_labels or ["No Label"]

# Register tqdm's pandas integration; it adds `progress_apply`, which behaves
# like `apply` but shows a progress bar. Plain `apply` ignores the registration.
tqdm.pandas()

# Label every passage (one model forward pass per row).
df_combined["labels"] = df_combined["esg_text"].progress_apply(classify_text)
print(df_combined.head())


                                            esg_text      labels
0  AOD  MIOLS  YALNNOD  GWNY 766  AGOD  MIOLS  YA...  [No Label]
1  This is the 18th consecutive year we have publ...  [No Label]
2  Clearly, the past year presented our world wit...  [No Label]
3  It is clear that Al will be a major priority a...  [No Label]
4  We are committed to accelerating the deploymen...  [No Label]


In [7]:
# Preview the first five rows of df_combined.
df_combined.head(5)


Unnamed: 0,esg_text
0,AOD MIOLS YALNNOD GWNY 766 AGOD MIOLS YA...
1,This is the 18th consecutive year we have publ...
2,"Clearly, the past year presented our world wit..."
3,It is clear that Al will be a major priority a...
4,We are committed to accelerating the deploymen...


In [12]:
# How often each assigned label list occurs across all rows.
df_combined['labels'].value_counts()

labels
[No Label]                                                                  4385
[Social - Human Capital Management]                                          752
[Governance - Transparency & Reporting]                                      564
[Social - Customer Relations]                                                505
[Governance - Materiality]                                                   463
[Environment - Energy efficiency]                                            409
[Governance - Board Diversity]                                               384
[Governance - Sustainable Finance]                                           378
[Social - Financial Inclusion]                                               308
[ Social - Privacy Protection]                                               294
[Social - Occupational Health & Safety]                                      291
[Governance - Business Ethics]                                               275
[Environment - Waste 

In [13]:
# Persist the labelled frame without the index column.
# NOTE(review): classify_text stores Python lists in 'labels'; CSV keeps only
# their string form -- confirm whoever reads this file parses them back.
df_combined.to_csv('../labeled_pdfs_2602.csv', index=False)

## Data transformation
The labelled data is imbalanced across the three pillars:

- Environment: 588
- Social: 220
- Governance: 161

To even out the classes I considered two approaches: resampling and data augmentation.

## First method: resampling 
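As the dataset is already very small, I do not want to undersample and shrink it further, so I prefer oversampling. Note that the code below resamples Social and Governance with replacement to the size of the larger of the two, and still undersamples Environment down to that same size.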

In [None]:
from sklearn.utils import resample

# NOTE(review): the labelling section stores lists such as ['No Label'] in
# 'labels', while these filters compare against the plain strings
# 'Environment' / 'Social' / 'Governance' -- confirm df_combined holds
# pillar-level string labels at this point.
_pillar = df_combined['labels']
df_env = df_combined[_pillar.eq('Environment')]
df_soc = df_combined[_pillar.eq('Social')]
df_gov = df_combined[_pillar.eq('Governance')]

# Every class is brought to the size of the larger of the two minority classes.
target_size = max(len(df_soc), len(df_gov))

# Social and Governance: sample with replacement up to the target size.
df_soc_oversampled = resample(df_soc, replace=True, n_samples=target_size, random_state=42)
df_gov_oversampled = resample(df_gov, replace=True, n_samples=target_size, random_state=42)

# Environment: sample without replacement down to the target size.
df_env_undersampled = resample(df_env, replace=False, n_samples=target_size, random_state=42)

# Concatenate the three balanced parts and shuffle the rows with a fixed seed.
df_balanced = (
    pd.concat([df_env_undersampled, df_soc_oversampled, df_gov_oversampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# New class distribution (each class came out at 200+ rows when this was run).
print(df_balanced['labels'].value_counts())

# Model Training with finBERT 
I decided to use FinBERT because it was built by further training the BERT language model on a large financial corpus. Even though I am not using it for financial sentiment classification, ESG topics are discussed extensively in the financial sector, so a model pre-trained on financial text seemed a good fit for this ESG classification task.
I also found an ESGBERT model family on Hugging Face (https://huggingface.co/ESGBERT) but did not have the time to try it. FinBERT is better known and more widely used, so I expected it to still produce accurate results.
Next time I would like to compare the results of BERT, FinBERT and ESGBERT.

In [None]:
# Load the labelled dataset used for fine-tuning.
# NOTE(review): the labelling section writes '../labeled_pdfs_2602.csv'; this
# reads a differently named file from the working directory -- confirm it is
# the intended input.
df = pd.read_csv('labeled_combined_pdfs_2602.csv')

In [None]:
# Fix the PyTorch RNG seed so weight initialisation and shuffling are repeatable.
torch.manual_seed(42)

# Pick the best available backend: Apple-silicon MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    # Only touch the MPS allocator when the backend exists; calling
    # torch.mps.empty_cache() on a build without MPS raises an error.
    torch.mps.empty_cache()
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")


In [None]:
# Topics grouped by pillar. A topic's numeric code is pillar * 10 + its 1-based
# position in the list (1 = Environment, 2 = Social, 3 = Governance).
# NOTE(review): these codes are not contiguous 0-based class indices, which is
# what nn.CrossEntropyLoss expects as targets -- confirm they are remapped
# before training.
_PILLAR_TOPICS = {
    1: [
        "Environment,Energy",
        "Environment,Waste & Pollutants",
        "Environment,Water",
        "Environment,Climate Strategy",
        "Environment,Decarbonization",
        "Environment,Strategy",
    ],
    2: [
        "Social,Labor Practices",
        "Social,Human Rights",
        "Social,Human Capital Management",
        "Social,Occupational Health & Safety",
        "Social,Financial Inclusion",
        "Social,Customer Relations",
        "Social,Privacy Protection",
    ],
    3: [
        "Governance,Transparency & Reporting",
        "Corporate Governance",
        "Governance,Materiality",
        "Governance,Risk & Crisis Management",
        "Governance,Business Ethics",
        "Governance,Policy Influence",
        "Governance,Tax Strategy",
        "Governance,Information Security/ Cybersecurity & System Availability",
        "Governance,Sustainable Finance",
    ],
}

# Topic string -> numeric code, in the same order as the lists above.
label_map = {
    topic: pillar * 10 + position
    for pillar, topics in _PILLAR_TOPICS.items()
    for position, topic in enumerate(topics, start=1)
}

# Translate each text label into its numeric code.
df['label'] = df['labels'].map(label_map)

# Series.map yields NaN for any value that is not a key of label_map. Fail here
# with a readable message instead of later, when NewsDataset calls int() on the
# labels and hits a NaN.
_unmapped = df.loc[df['label'].isna(), 'labels']
if not _unmapped.empty:
    raise ValueError(
        f"{len(_unmapped)} rows have labels missing from label_map, "
        f"e.g. {_unmapped.unique()[:5].tolist()}"
    )

In [None]:
# Preview the first five rows of the training frame.
df.head(5)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the frames shown earlier in this notebook expose the text under
# 'esg_text' -- confirm the CSV loaded into df really has 'combined_text'.
text_values = df['combined_text'].values
label_values = df['label'].values

train_texts, val_texts, train_labels, val_labels = train_test_split(
    text_values, label_values, test_size=0.2, random_state=42
)

In [None]:
# Hugging Face checkpoint that is fine-tuned below.
MODEL_NAME = "yiyanghkust/finbert-tone"
# Rebinds the notebook-level `tokenizer` (previously the ESG-BERT tokenizer
# used for labelling) to this checkpoint's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  

In [None]:
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of documents; every element is coerced with ``str``.
        labels: Sequence of class ids, same length as ``texts``; every element
            is coerced with ``int`` (a missing/NaN label raises ``ValueError``).
        tokenizer: Callable Hugging Face-style tokenizer. It is called with
            ``padding='max_length'``, ``truncation=True`` and
            ``return_tensors='pt'``, and must return ``input_ids`` and
            ``attention_mask`` tensors with a leading batch dimension of 1.
        max_length: Token length every sample is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = [str(text) for text in texts]
        self.labels = [int(label) for label in labels]
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels differ in length: "
                f"{len(self.texts)} vs {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of (text, label) pairs."""
        return len(self.texts)

    def _encode(self, position):
        """Tokenize the sample at ``position`` and return its tensors."""
        encoding = self.tokenizer(
            self.texts[position],
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            # squeeze(0) drops the batch dimension the tokenizer adds.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[position], dtype=torch.long)
        }

    def __getitem__(self, idx):
        """Return one sample, or a stacked batch for a list/tuple of indices.

        Accepts Python ints, numpy integers, index tensors, and lists/tuples of
        those. Any other index type raises ``TypeError`` instead of silently
        returning ``None``.
        """
        # A 0-d tensor becomes an int, a 1-d tensor becomes a list of ints.
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if isinstance(idx, (list, tuple)):
            samples = [self[i] for i in idx]
            return {
                key: torch.stack([sample[key] for sample in samples])
                for key in ('input_ids', 'attention_mask', 'labels')
            }

        if isinstance(idx, (int, np.integer)):
            return self._encode(int(idx))

        raise TypeError(
            f"NewsDataset indices must be integers or lists of integers, "
            f"not {type(idx).__name__}"
        )

In [None]:
# Wrap the train / validation splits in tokenizing datasets.
train_dataset = NewsDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = NewsDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# Batches of 32 with default collation; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [None]:
class ESGClassifier(nn.Module):
    """
    ESG text classifier: a pretrained BERT encoder followed by dropout and a
    linear classification head applied to the pooled output.

    Args:
        n_classes: Number of output classes (size of the linear head).
        model_name: Checkpoint passed to ``BertModel.from_pretrained``. When
            ``None`` (the default) the notebook-level ``MODEL_NAME`` is used.
    """
    def __init__(self, n_classes=3, model_name=None):
        super().__init__()
        # Resolved at call time so MODEL_NAME stays the default checkpoint.
        checkpoint = MODEL_NAME if model_name is None else model_name
        self.bert = BertModel.from_pretrained(checkpoint)
        self.dropout = nn.Dropout(p=0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores (logits) for a batch of token ids."""
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs.pooler_output
        output = self.dropout(pooled_output)
        return self.classifier(output)

In [None]:
# Build the classifier with its default head size and move it to the device.
model = ESGClassifier().to(device)

# AdamW with two parameter groups: a small learning rate for the pretrained
# encoder and a larger one for the freshly initialised linear head.
param_groups = [
    {'params': model.bert.parameters(), 'lr': 2e-5},
    {'params': model.classifier.parameters(), 'lr': 1e-3},
]
optimizer = AdamW(param_groups)

In [None]:
# Training function
def train_epoch(model, data_loader, optimizer, device):
    """
    Train the model for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        optimizer: Optimizer updating the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        The mean cross-entropy loss over the batches, as a float.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.train()
    loss_fn = nn.CrossEntropyLoss()  # built once, not once per batch
    total_loss = 0.0
    num_batches = 0

    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("train_epoch received an empty data_loader")

    return total_loss / num_batches

In [None]:
def evaluate(model, data_loader, device):
    """
    Evaluate the model on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, average_loss)`` as Python floats, where accuracy is a
        percentage in [0, 100] and average_loss is the mean per-batch
        cross-entropy loss.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.eval()
    loss_fn = nn.CrossEntropyLoss()  # built once, not once per batch
    total_loss = 0.0
    num_batches = 0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, labels)

            # Predicted class = index of the largest logit.
            predictions = outputs.argmax(dim=1)

            total_loss += loss.item()
            # .item() keeps the running count a plain int on the host.
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.shape[0]
            num_batches += 1

    if num_batches == 0 or total_predictions == 0:
        raise ValueError("evaluate received an empty data_loader")

    accuracy = 100.0 * correct_predictions / total_predictions
    average_loss = total_loss / num_batches

    return accuracy, average_loss

In [None]:
# Main training loop
def train_model(model, train_loader, val_loader, optimizer, device, epochs=3, save_path='best_model.pt'):
    """
    Train for ``epochs`` epochs, validating after each one and checkpointing
    the weights with the best validation accuracy seen so far.

    Args:
        model: Model to train; its ``state_dict()`` is what gets saved.
        train_loader: Batches passed to ``train_epoch``.
        val_loader: Batches passed to ``evaluate``.
        optimizer: Optimizer passed to ``train_epoch``.
        device: Device passed to both helpers.
        epochs: Number of passes over ``train_loader``.
        save_path: File the best ``state_dict`` is written to.

    Relies on the 2-tuple ``(accuracy, loss)`` form of ``evaluate``.
    """
    # Start below any reachable accuracy so the first epoch is always saved,
    # even when validation accuracy is exactly 0.
    best_accuracy = float('-inf')

    for epoch in range(epochs):
        print(f'\nEpoch {epoch + 1}/{epochs}')

        # Train one epoch
        train_loss = train_epoch(model, train_loader, optimizer, device)

        # Evaluate
        val_accuracy, val_loss = evaluate(model, val_loader, device)

        # Print metrics
        print(f'Training Loss: {train_loss:.4f}')
        print(f'Validation Loss: {val_loss:.4f}')
        print(f'Validation Accuracy: {val_accuracy:.4f}')

        # Save best model
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), save_path)
            print('Best model saved!')

In [None]:
%%time

# Fine-tune the classifier with train_model's default number of epochs and
# report the wall-clock time of the whole cell.
train_model(model, train_loader, val_loader, optimizer, device)

# Evaluation 
Check class distribution and use F1-score.

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# Named differently from `evaluate` on purpose: train_model unpacks
# `evaluate(...)` as (accuracy, loss), so redefining that name with a 4-tuple
# return would break any later training run in the same kernel.
def evaluate_metrics(model, data_loader, device):
    """
    Compute accuracy and weighted precision / recall / F1 on ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, precision, recall, f1)``; the last three are averaged
        with ``average='weighted'``. The four values are also printed.
    """
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, predictions = torch.max(outputs, dim=1)  # Get predicted class

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predictions.cpu().numpy())

    # Compute metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-Score: {f1:.4f}')

    return accuracy, precision, recall, f1


evaluate_metrics(model, val_loader, device)

In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np


def evaluate_auc(model, data_loader, device, num_classes=3):
    """
    Compute and print the AUC-ROC of the model's softmax probabilities.

    True labels are one-hot encoded by indexing ``np.eye(num_classes)``, so
    every label id must be a valid row index of that matrix. Returns the score
    produced by ``roc_auc_score``.
    """
    model.eval()
    true_labels, class_probs = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            # Turn logits into per-class probabilities.
            batch_probs = torch.nn.functional.softmax(logits, dim=1)

            true_labels.extend(labels.cpu().numpy())
            class_probs.extend(batch_probs.cpu().numpy())

    one_hot_truth = np.eye(num_classes)[true_labels]
    auc_score = roc_auc_score(one_hot_truth, class_probs, multi_class="ovr")
    print(f"AUC-ROC Score: {auc_score:.4f}")
    return auc_score


evaluate_auc(model, val_loader, device, num_classes=3)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Display names for the classes; position i names class id i.
class_names = ["Environmental", "Social", "Governance"]


# Named differently from `evaluate` on purpose: train_model unpacks
# `evaluate(...)` as (accuracy, loss), so redefining that name here would
# break any later training run in the same kernel.
def collect_predictions(model, data_loader, device):
    """
    Run the model over ``data_loader`` and gather labels and predictions.

    Returns:
        ``(all_labels, all_preds)``: two lists of equal length holding the
        true class id and the predicted class id for every sample.
    """
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, predictions = torch.max(outputs, dim=1)  # Get predicted class

            all_labels.extend(labels.cpu().numpy())  # Convert tensor to numpy
            all_preds.extend(predictions.cpu().numpy())

    return all_labels, all_preds  # Return both true labels & predictions


def plot_confusion_matrix(all_labels, all_preds, class_names):
    """
    Draw a confusion-matrix heatmap with one row/column per entry of
    ``class_names``.

    Class ids are taken to be 0..len(class_names)-1 in the order of
    ``class_names``. Passing them explicitly keeps the matrix the same size as
    the tick labels even when a class is absent from the evaluated data.
    """
    class_ids = list(range(len(class_names)))
    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.show()


all_labels, all_preds = collect_predictions(model, val_loader, device)
plot_confusion_matrix(all_labels, all_preds, class_names)

# Insert classified ESG data into PostgreSQL

In [None]:
# CREATE TABLE ESG_Scores (
#     id SERIAL PRIMARY KEY,
#     company_name TEXT NOT NULL,
#     year INT NOT NULL,
#     carbon_emissions_reduction NUMERIC, 
#     renewable_energy_usage NUMERIC, 
#     waste_management NUMERIC,
#     workforce_diversity NUMERIC, 
#     employee_rights NUMERIC, 
#     product_safety NUMERIC,
#     board_independence NUMERIC, 
#     transparency NUMERIC, 
#     executive_pay_equity NUMERIC, 
#     anti_corruption_policies NUMERIC,
#     overall_esg_score NUMERIC,  
#     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
# );

In [None]:
# import psycopg2
# from datetime import datetime

# # PostgreSQL connection
# conn = psycopg2.connect(
#     dbname="your_database",
#     user="your_user",
#     password="your_password",
#     host="localhost",
#     port="5432"
# )
# cursor = conn.cursor()

# # Example extracted ESG data
# esg_data = {
#     "company_name": "Pfizer",
#     "year": 2023,
#     "carbon_emissions_reduction": 7.5,
#     "renewable_energy_usage": 8.0,
#     "waste_management": 7.0,
#     "workforce_diversity": 6.5,
#     "employee_rights": 8.0,
#     "product_safety": 7.5,
#     "board_independence": 8.5,
#     "transparency": 8.0,
#     "executive_pay_equity": 6.0,
#     "anti_corruption_policies": 7.5,
#     "overall_esg_score": 7.43
# }

# # SQL INSERT query
# query = """
# INSERT INTO ESG_Scores (company_name, year, carbon_emissions_reduction, renewable_energy_usage,
#                         waste_management, workforce_diversity, employee_rights, product_safety,
#                         board_independence, transparency, executive_pay_equity, anti_corruption_policies, 
#                         overall_esg_score, created_at) 
# VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
# """

# # Execute query
# cursor.execute(query, (
#     esg_data["company_name"], esg_data["year"], esg_data["carbon_emissions_reduction"],
#     esg_data["renewable_energy_usage"], esg_data["waste_management"], esg_data["workforce_diversity"],
#     esg_data["employee_rights"], esg_data["product_safety"], esg_data["board_independence"],
#     esg_data["transparency"], esg_data["executive_pay_equity"], esg_data["anti_corruption_policies"],
#     esg_data["overall_esg_score"], datetime.now()
# ))

# # Commit and close
# conn.commit()
# cursor.close()
# conn.close()

# print("ESG data inserted successfully!")
