In [None]:
import os
import requests
import time
import pandas as pd
import numpy as np
import torch 
from torch import nn
from torch.optim import AdamW  
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd 
from transformers import AutoModelForSequenceClassification, AutoModelForCausalLM, AutoTokenizer, pipeline, BertTokenizer, BertModel
from transformers import Trainer, TrainingArguments
import spacy
from spacy import displacy

from sklearn.model_selection import train_test_split
from tqdm import tqdm
torch.set_default_device("cpu")
import random
import json

In [None]:
# Load the Pfizer 2022 JSON file and turn its records into a DataFrame.
pfizer_path = '../dataset_json/Health/Na/Pfizer_2022_ocr.json'
with open(pfizer_path, mode='r', encoding='utf-8') as f:
    pfizer_data = json.load(f)
pfizer_df = pd.DataFrame(pfizer_data)

# Preview the first five rows.
pfizer_df.head(5)

In [None]:
# Collect every JSON file in the Tech/AsiaPac folder into `dfs`, keyed by file name.
folder_path = "../dataset_json/Tech/AsiaPac/"

json_files = [name for name in os.listdir(folder_path) if name.endswith('.json')]
dfs = {}

for file in json_files:
    file_path = os.path.join(folder_path, file)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Only a top-level JSON list is converted to rows; anything else is skipped.
    if not isinstance(data, list):
        print(f"Skipping {file}: Unsupported format")
        continue

    df = pd.DataFrame(data)
    dfs[file] = df

# The Pfizer frame loaded earlier joins the same collection.
dfs['pfizer_2022_ocr.json'] = pfizer_df

In [None]:
# Add every JSON file in the Tech/Na folder to the existing `dfs` mapping.
folder_path = "../dataset_json/Tech/Na/"

json_files = [name for name in os.listdir(folder_path) if name.endswith('.json')]
for file in json_files:
    file_path = os.path.join(folder_path, file)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Only a top-level JSON list is converted to rows; anything else is skipped.
    if not isinstance(data, list):
        print(f"Skipping {file}: Unsupported format")
        continue

    df = pd.DataFrame(data)
    dfs[file] = df

In [None]:
# Number of per-file DataFrames collected in `dfs` so far.
len(dfs)

In [None]:
# For each loaded frame, print how many rows are exact duplicates of an earlier row.
for frame in dfs.values():
    duplicate_count = frame.duplicated().sum()
    print('df info is')
    print(duplicate_count)

In [None]:
# Stack all per-file frames into one table with a fresh 0..n-1 index.
df_combined = pd.concat(list(dfs.values()), ignore_index=True)

# Count the rows that have at least one missing value.
has_missing = df_combined.isnull().any(axis=1)
missing_rows = df_combined[has_missing]
print(len(missing_rows))

# Drop exact duplicate rows in place, then show which Python types occur in `esg_text`.
df_combined.drop_duplicates(inplace=True)
df_combined["esg_text"].apply(type).value_counts()

In [None]:
# Same de-duplication and type check as the last two lines of the previous
# cell; on a frame that was already de-duplicated this drop removes nothing.
df_combined.drop_duplicates(inplace=True)
df_combined["esg_text"].apply(type).value_counts()

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint used for labelling (commented-out alternative: "nlpaueb/sec-bert-esg").
model_name = "nbroad/ESG-BERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


In [None]:
# Fine-grained ESG labels, grouped by pillar. The order matters: `classify_text`
# pairs position i of this list with position i of the model's output.
_environment_labels = [
    "Environment - Energy efficiency",
    "Environment - Waste & Pollutants Generation",
    "Environment - Water Usage",
    "Environment - Climate Strategy",
    "Environment - Decarbonisation/Carbon emissions",
    "Environment - Strategy",
]
_social_labels = [
    "Social - Labor Practices",
    "Social - Human Rights",
    "Social - Human Capital Management",
    "Social - Occupational Health & Safety",
    "Social - Financial Inclusion",
    "Social - Community investment",
    "Social - Customer Relations",
    "Social - Privacy Protection",
    "Social - Gender and Ethnic Diversity",
]
_governance_labels = [
    "Governance - Transparency & Reporting",
    "Corporate Governance",
    "Governance - Materiality",
    "Governance - Risk & Crisis Management",
    "Governance - Business Ethics",
    "Governance - Policy Influence",
    "Governance - Tax Strategy",
    "Governance - Shareholder rights",
    "Governance - Information Security/ Cybersecurity & System Availability",
    "Governance - Sustainable Finance",
    "Governance - Board Diversity",
]
candidate_labels = _environment_labels + _social_labels + _governance_labels

THRESHOLD=0.5

def classify_text(text, threshold=None):
    """Assign ESG labels to one passage using the globally loaded classifier.

    Relies on the module-level ``tokenizer``, ``model``, ``device`` and
    ``candidate_labels``.

    Args:
        text: Passage to classify. Any value that is not a ``str`` (``None``,
            NaN, lists, numbers, ...) is treated as missing.
        threshold: Softmax probability a class must exceed to be assigned.
            ``None`` (the default) uses the current module-level ``THRESHOLD``.

    Returns:
        ``None`` for missing input; ``["Error"]`` when the number of model
        outputs differs from ``len(candidate_labels)``; otherwise the list of
        labels whose probability exceeds the threshold, or ``["No Label"]``
        when none does.
    """
    # A type check replaces `pd.isna(text)`: that call returns an array for
    # list/array cells (ambiguous truth value) and lets other non-strings
    # crash inside the tokenizer.
    if not isinstance(text, str):
        return None

    if threshold is None:
        threshold = THRESHOLD

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax over the class axis, then drop only the leading axis
    # (a single passage is tokenized per call).
    probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze(0).cpu().numpy()

    if len(probabilities) != len(candidate_labels):
        print(f"Warning: Mismatch! Probabilities: {len(probabilities)}, Labels: {len(candidate_labels)}")
        return ["Error"]

    # NOTE(review): labels are matched to model outputs purely by position.
    # Confirm that the order of `candidate_labels` agrees with the checkpoint's
    # own `model.config.id2label` mapping; otherwise the names are mislabelled.
    # Softmax probabilities sum to 1, so with a threshold >= 0.5 at most one
    # label can be selected.
    assigned_labels = [label for label, prob in zip(candidate_labels, probabilities) if prob > threshold]
    return assigned_labels if assigned_labels else ["No Label"]

# Register `progress_apply` on pandas objects so the per-row inference below
# reports progress (the registration was previously unused because plain
# `.apply` was called).
tqdm.pandas()

df_combined["labels"] = df_combined["esg_text"].progress_apply(classify_text)
print(df_combined.head())


In [None]:
# Preview the first five rows, including the `labels` column added above.
df_combined.head(5)


In [None]:
# Each `labels` cell is a list (or None) returned by `classify_text`; lists are
# unhashable, so `value_counts` cannot count them directly. Explode to one
# label per row first, then count.
df_combined['labels'].explode().value_counts()

In [None]:
# Write the labelled table to CSV, leaving out the DataFrame index.
df_combined.to_csv('../labeled_pdfs_2802.csv', index=False)

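## First method: resampling
Because the dataset is already very small, undersampling would shrink it even further, so oversampling is the preferred way to balance the classes. Note that the cell below still reduces the Environment class to the size of the larger of the other two classes, while Social and Governance are oversampled up to that size.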

In [None]:
from sklearn.utils import resample

PILLARS = ("Environment", "Social", "Governance")


def _pillar_of(labels):
    """Return the ESG pillar named in one `labels` cell, or None if there is none.

    Accepts a plain string (e.g. 'Environment') or a list/tuple of label
    strings such as the lists produced by `classify_text`. The first label that
    mentions a pillar decides the result; "No Label", "Error", None and NaN
    map to None.
    """
    if isinstance(labels, str):
        labels = [labels]
    if not isinstance(labels, (list, tuple)):
        return None
    for label in labels:
        for pillar in PILLARS:
            if pillar in str(label):
                return pillar
    return None


# `labels` holds per-row lists of fine-grained labels (for example
# ["Environment - Water Usage"]), so comparing the column with the bare string
# 'Environment' never matches. Derive the pillar of each row first.
pillars = df_combined['labels'].map(_pillar_of)

df_env = df_combined[pillars == 'Environment']
df_soc = df_combined[pillars == 'Social']
df_gov = df_combined[pillars == 'Governance']

if min(len(df_env), len(df_soc), len(df_gov)) == 0:
    raise ValueError(
        "Cannot balance the dataset: at least one ESG pillar has no rows "
        f"(Environment={len(df_env)}, Social={len(df_soc)}, Governance={len(df_gov)})"
    )

target_size = max(len(df_soc), len(df_gov))  # Balance to the highest minority class

# Oversample Social & Governance
df_soc_oversampled = resample(df_soc, replace=True, n_samples=target_size, random_state=42)
df_gov_oversampled = resample(df_gov, replace=True, n_samples=target_size, random_state=42)

# Undersample Environment. Sampling without replacement fails when more rows
# are requested than exist, so fall back to replacement if Environment turns
# out to be smaller than the target.
df_env_undersampled = resample(
    df_env,
    replace=len(df_env) < target_size,
    n_samples=target_size,
    random_state=42,
)

# Combine and shuffle balanced dataset
df_balanced = pd.concat([df_env_undersampled, df_soc_oversampled, df_gov_oversampled])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Check new distribution per pillar (an earlier run reported 200+ rows for each).
print(df_balanced['labels'].map(_pillar_of).value_counts())

## Model training

In [None]:
# Training data for fine-tuning; later cells read its `esg_text` and `label` columns.
# NOTE(review): this file is not written anywhere in the visible notebook (the
# labelling step saves '../labeled_pdfs_2802.csv') — confirm how it is produced.
df = pd.read_csv('../files/label_map_2802.csv')

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
_split = train_test_split(
    df['esg_text'].values,
    df['label'].values,
    test_size=0.2,
    random_state=42,
)
train_texts, val_texts, train_labels, val_labels = _split

In [None]:
# Number of training examples after the 80/20 split.
len(train_texts)

In [None]:
# Checkpoint used for fine-tuning. Rebinding `tokenizer` here replaces the one
# loaded earlier from `model_name` (the same checkpoint string).
MODEL_NAME = "nbroad/ESG-BERT"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  

In [None]:
class NewsDataset(Dataset):
    """Text-classification dataset that tokenizes each passage on access.

    Args:
        texts: Iterable of passages; every element is coerced to ``str``.
        labels: Iterable of class labels. Each may be an int-like value, a
            string that starts with the class id (``"2"``, ``"2 - ..."``), or a
            list/tuple whose first element is one of those.
        tokenizer: Callable Hugging Face-style tokenizer.
        max_length: Length every encoded passage is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = list(map(str, texts))
        # The previous code stringified the labels and then ignored that result,
        # indexing the raw values instead, which crashed on integer labels.
        self.labels = [self._to_class_id(label) for label in labels]
        self.tokenizer = tokenizer
        self.max_length = max_length

    @staticmethod
    def _to_class_id(label):
        """Convert one raw label into an integer class id."""
        if isinstance(label, (list, tuple)):
            label = label[0]
        try:
            return int(label)
        except (TypeError, ValueError):
            pass
        # Fall back to the leading run of digits, e.g. "2 - Governance" -> 2.
        leading_digits = ""
        for char in str(label).strip():
            if char not in "0123456789":
                break
            leading_digits += char
        if not leading_digits:
            raise ValueError(f"Cannot read a class id from label {label!r}")
        return int(leading_digits)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return one encoded example, or a stacked batch for a list/tuple of indices.

        Returns a dict with ``input_ids``, ``attention_mask`` and ``labels``
        tensors. Index types that cannot be converted with ``int()`` (such as
        slices) raise ``TypeError`` instead of silently returning ``None``.
        """
        # A list/tuple of indices yields a stacked batch.
        if isinstance(idx, (list, tuple)):
            batch = [self[i] for i in idx]
            return {
                key: torch.stack([item[key] for item in batch])
                for key in ('input_ids', 'attention_mask', 'labels')
            }

        # Accept Python ints as well as NumPy integers and 0-d tensors.
        idx = int(idx)
        text = self.texts[idx]
        label = self.labels[idx]

        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),  # Remove the batch dimension
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Create datasets
# Wrap the train/validation splits; each item is tokenized when it is accessed.
train_dataset = NewsDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = NewsDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# Batches of 32 using the default collate function; only the training loader is shuffled.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [None]:
class ESGClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        n_classes: Number of output classes produced by the linear head.
    """

    def __init__(self, n_classes=3):
        super().__init__()
        encoder = BertModel.from_pretrained(MODEL_NAME)
        self.bert = encoder
        self.dropout = nn.Dropout(p=0.3)
        self.classifier = nn.Linear(encoder.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification head's raw scores for the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularised = self.dropout(encoded.pooler_output)
        return self.classifier(regularised)

In [None]:
# Initialize model
# Build the classifier and move it to the selected device.
model = ESGClassifier().to(device)

# Two parameter groups: a small learning rate for the pretrained encoder and a
# larger one for the newly created linear head.
_param_groups = [
    {'params': model.bert.parameters(), 'lr': 2e-5},
    {'params': model.classifier.parameters(), 'lr': 1e-3},
]
optimizer = AdamW(_param_groups)

In [None]:
# Training function
def train_epoch(model, data_loader, optimizer, device):
    """
    Trains the model for one epoch and returns the average loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output is passed to cross-entropy together with the labels.
        data_loader: Iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors. It does
            not need to support ``len()``.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to.

    Returns:
        Mean of the per-batch losses as a Python float.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.train()
    loss_fn = nn.CrossEntropyLoss()  # built once instead of once per batch
    total_loss = 0.0
    num_batches = 0

    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Counting batches avoids requiring len(data_loader) and replaces the
    # ZeroDivisionError on an empty loader with an explicit message.
    if num_batches == 0:
        raise ValueError("train_epoch() received an empty data_loader")

    return total_loss / num_batches

In [None]:
def evaluate(model, data_loader, device):
    """
    Evaluates the model on the provided data loader.
    Returns accuracy and average loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            returning one row of class scores per example.
        data_loader: Iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        device: Device every batch tensor is moved to.

    Returns:
        ``(accuracy, average_loss)`` as Python floats, where ``accuracy`` is a
        percentage (0-100) and ``average_loss`` is the mean per-batch
        cross-entropy.

    Raises:
        ValueError: If ``data_loader`` yields no examples.
    """
    model.eval()
    loss_fn = nn.CrossEntropyLoss()  # built once instead of once per batch
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    num_batches = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = outputs.argmax(dim=1)

            total_loss += loss_fn(outputs, labels).item()
            # .item() keeps the counter a plain int, so the returned accuracy is
            # a float rather than a 0-d tensor.
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.shape[0]
            num_batches += 1

    if total_predictions == 0:
        raise ValueError("evaluate() received an empty data_loader")

    accuracy = 100.0 * correct_predictions / total_predictions
    average_loss = total_loss / num_batches

    return accuracy, average_loss

In [None]:
# Main training loop
def train_model(model, train_loader, val_loader, optimizer, device, epochs=3,
                save_path="esgbert_model_full.pth"):
    """
    Main training loop that handles the entire training process.

    For each epoch: trains with ``train_epoch``, scores the validation loader
    with ``evaluate`` (expected to return ``(accuracy, loss)``), prints the
    metrics, and saves the model whenever validation accuracy improves.

    Args:
        model: Model to train.
        train_loader: Loader passed to ``train_epoch``.
        val_loader: Loader passed to ``evaluate``.
        optimizer: Optimizer passed to ``train_epoch``.
        device: Device passed to both helpers.
        epochs: Number of passes over ``train_loader``.
        save_path: Destination of the best checkpoint.

    Returns:
        The best validation accuracy seen, as a float (0.0 if no epoch
        improved on the initial value).
    """
    best_accuracy = 0.0

    for epoch in range(epochs):
        print(f'\nEpoch {epoch + 1}/{epochs}')

        # Train one epoch
        train_loss = train_epoch(model, train_loader, optimizer, device)

        # Evaluate; float() also accepts a 0-d tensor accuracy.
        val_accuracy, val_loss = evaluate(model, val_loader, device)
        val_accuracy = float(val_accuracy)

        # Print metrics
        print(f'Training Loss: {train_loss:.4f}')
        print(f'Validation Loss: {val_loss:.4f}')
        print(f'Validation Accuracy: {val_accuracy:.4f}')

        # Save best model
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            # NOTE(review): this pickles the whole module object, so loading it
            # needs the same class definition to be importable. Saving
            # `model.state_dict()` is the lighter alternative.
            torch.save(model, save_path)
            print('ESG-BERT model saved!')

    return best_accuracy

In [None]:
%%time

# Fine-tune with train_model's default of 3 epochs; train_model saves the model
# whenever validation accuracy improves.
train_model(model, train_loader, val_loader, optimizer, device)

# Evaluation 
Check class distribution and use F1-score.

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate_metrics(model, data_loader, device):
    """
    Evaluates the model and returns accuracy, precision, recall, and F1-score.

    Deliberately not named ``evaluate``: ``train_model`` unpacks the
    ``(accuracy, loss)`` pair of the earlier ``evaluate``, and redefining that
    name with a different return shape breaks any later re-run of training.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            returning one row of class scores per example.
        data_loader: Iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        device: Device the model inputs are moved to.

    Returns:
        ``(accuracy, precision, recall, f1)``; precision, recall and F1 are
        support-weighted averages over the classes.
    """
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = outputs.argmax(dim=1)  # Get predicted class

            # Labels are only needed on the host for scikit-learn.
            all_labels.extend(batch['labels'].cpu().numpy())
            all_preds.extend(predictions.cpu().numpy())

    # Compute metrics; zero_division=0 scores a class with no predicted or true
    # samples as 0 without emitting a warning.
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='weighted', zero_division=0
    )

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-Score: {f1:.4f}')

    return accuracy, precision, recall, f1
evaluate_metrics(model, val_loader, device)

In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np

def evaluate_auc(model, data_loader, device, num_classes=3):
    """Compute, print and return the one-vs-rest ROC AUC over ``data_loader``.

    Model outputs are turned into probabilities with a softmax over the class
    axis, and the true class ids are one-hot encoded with ``num_classes``
    columns before being handed to ``roc_auc_score``.
    """
    model.eval()
    true_ids, class_probs = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            batch_probs = torch.nn.functional.softmax(logits, dim=1)

            true_ids.extend(targets.cpu().numpy())
            class_probs.extend(batch_probs.cpu().numpy())

    # Row i of the identity matrix is the one-hot vector for class i.
    one_hot_targets = np.eye(num_classes)[true_ids]
    auc_score = roc_auc_score(one_hot_targets, class_probs, multi_class="ovr")
    print(f"AUC-ROC Score: {auc_score:.4f}")
    return auc_score

# One-vs-rest AUC-ROC of the fine-tuned model on the validation loader (3 classes).
evaluate_auc(model, val_loader, device, num_classes=3)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Display names for class ids 0, 1, 2.
# NOTE(review): assumes id 0 = Environmental, 1 = Social, 2 = Governance — confirm
# against the label encoding of the training CSV.
class_names = ["Environmental", "Social", "Governance"]


def collect_predictions(model, data_loader, device):
    """
    Runs the model over the data loader and returns the true and predicted class ids.

    Deliberately not named ``evaluate``: ``train_model`` unpacks the
    ``(accuracy, loss)`` pair of the earlier ``evaluate``, and redefining that
    name with a different return shape breaks any later re-run of training.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            returning one row of class scores per example.
        data_loader: Iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        device: Device the model inputs are moved to.

    Returns:
        ``(all_labels, all_preds)``: two equally long lists of integer class ids.
    """
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = outputs.argmax(dim=1)  # Get predicted class

            all_labels.extend(batch['labels'].cpu().tolist())
            all_preds.extend(predictions.cpu().tolist())

    return all_labels, all_preds  # Return both true labels & predictions


def plot_confusion_matrix(all_labels, all_preds, class_names):
    """Draw a labelled confusion-matrix heatmap (rows: true class, columns: predicted).

    Args:
        all_labels: True integer class ids.
        all_preds: Predicted integer class ids.
        class_names: Display name for class ids ``0 .. len(class_names) - 1``.
    """
    # Fixing `labels` keeps the matrix len(class_names) x len(class_names) even
    # when a class never occurs in this split, so the tick labels always line up.
    cm = confusion_matrix(all_labels, all_preds, labels=list(range(len(class_names))))

    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    ax.set_title("Confusion Matrix")
    plt.show()


all_labels, all_preds = collect_predictions(model, val_loader, device)
plot_confusion_matrix(all_labels, all_preds, class_names)