In [16]:
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
import math
import numpy as np
import time
import torch
import pandas as pd
import re
import os
from sklearn.model_selection import train_test_split
from transformers import set_seed
from transformers import AutoTokenizer
from torch.optim import AdamW
from typing import Tuple
from tqdm import tqdm
set_seed(777)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [17]:
from transformers import AutoModelForSequenceClassification
from torch.nn import LogSoftmax
class EsgBert(torch.nn.Module):

    def __init__(self):

        super(EsgBert, self).__init__()

        self.bert = AutoModelForSequenceClassification.from_pretrained('nbroad/ESG-BERT')

    def forward(self, input_ids, attention_mask):

        output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        logits = output[0]
        return logits



In [18]:
class EsgDataset():
    def __init__(self, encodings):
        self.encodings = encodings
        
    def __getitem__(self, idx):
        input_ids =  {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return input_ids

    def __len__(self):
        return len(self.encodings.input_ids)

In [19]:
class EsgBertPredict():
    def __init__(self, csv_source, csv_output, nation, hyper_parameters) -> None:
        self.CSV_SOURCE = csv_source
        self.CSV_OUTPUT = csv_output
        self.NATION = nation
        self.HP = hyper_parameters
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.df = None

    def _read_csv(self):
        self.df = pd.read_csv(f'{self.CSV_SOURCE}/american/4_apple_1_72.csv')
        x = self.df['paragraph']
        return x
    
    def _tokenizer(self, x):
        tokenizer = AutoTokenizer.from_pretrained('nbroad/ESG-BERT')
        inference_encodings = tokenizer(x.to_list(), truncation=True, padding=True)
        return inference_encodings
    
    def _get_dataset(self, inference_encodings):
        dataset = EsgDataset(inference_encodings)
        return dataset
    
    def _get_dataloader(self, inference) -> DataLoader:
        inference_loader = DataLoader(inference, batch_size=self.HP['batch_size'], shuffle=True)
        return inference_loader
    
    def _get_model(self) -> Tuple[EsgBert, torch.optim.Optimizer]:
        model = EsgBert().to(device)
        optim = AdamW(model.parameters(), lr=1e-5)
        return model, optim
    
    @torch.no_grad()
    def inference(self, model: EsgBert, optim: torch.optim.Optimizer, inference_loader):
        count = 0
        model.eval()
        # collect output
        output_label = []
        loop = tqdm(inference_loader, leave=True)
        for batch_id, batch in enumerate(loop):
            # reset
            optim.zero_grad()
            inputs = batch
            input_ids = inputs['input_ids'].to(self.device)
            attention_mask = inputs['attention_mask'].to(self.device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            outputs = torch.argmax(outputs, dim=1)
            #  collect output into list
            outputs = outputs.cpu().tolist()
            output_label.extend(outputs)
            if batch_id % 50 == 0 and batch_id != 0:
                print(f'Epoch {batch_id}, count is {count}')
        self._gen_csv(output_label)
        
    def _gen_csv(self, output_label):
        self.df['esgbert'] = output_label
        self.df.to_csv(f'{self.CSV_OUTPUT}/{self.NATION}/test.csv', index=False)
        
    def main(self):
        x = self._read_csv()
        inference_encodings = self._tokenizer(x)
        inference_dataset = self._get_dataset(inference_encodings)
        inference_loader = self._get_dataloader(inference_dataset)
        model, optim = self._get_model()
        self.inference(model, optim, inference_loader)

In [20]:
HYPER_PARAMETERS = {
    'batch_size': 8,
    'lr': 1e-5,
    'epochs': 3,
}

In [21]:
CSV_SOURCE = '../../data/csv_source'
CSV_OUTPUT = '../../data/csv_output'
NATION = ''

In [22]:
esgbert = EsgBertPredict(CSV_SOURCE, CSV_OUTPUT, NATION, HYPER_PARAMETERS)
esgbert.main()

 68%|██████▊   | 53/78 [00:05<00:01, 17.78it/s]

Epoch 50, count is 0


100%|██████████| 78/78 [00:06<00:00, 12.03it/s]
