<a href="https://colab.research.google.com/github/sohomghosh/Finsim4_ESG/blob/main/FinSim4_ESG_task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import pickle
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import os
import torch
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer, BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,classification_report,confusion_matrix,precision_recall_curve,auc
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')
import numpy as np
import os

In [None]:
# Prefer the GPU when torch can see one; otherwise everything runs on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the training sentences for task 2.
path = "/content/FinSim4_ESG/data/"
task2_df = pd.read_json(path + '/sentences/Sustainability_sentences_train.json')

# Features stay a one-column DataFrame; labels become a plain array.
X = task2_df[['sentence']]
y = task2_df['label'].values

# Fixed-seed 80/20 split so the validation rows are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Re-attach the labels to each split; assign() works on a copy, so X_train / X_test stay untouched.
train_df = X_train.assign(label=y_train)
val_df = X_test.assign(label=y_test)

# Integer class id assigned to each label string.
my_dict = {"sustainable": 0, "unsustainable": 1}


def update_cat(x):
    """
    Translate a label string into its integer class id using my_dict.

    x: label string; must be one of the keys of my_dict.

    Returns the integer stored in my_dict for x.
    Raises KeyError, naming the offending value and the known labels, when x is not a key of my_dict.
    """
    try:
        return my_dict[x]
    except KeyError:
        # Same exception type as a plain lookup, but with enough context to locate the bad row.
        raise KeyError(
            f"Unknown label {x!r}; expected one of {sorted(my_dict)}"
        ) from None


# Add the integer class id column to both splits and renumber their rows from 0.
for frame in (train_df, val_df):
    frame['label_text'] = frame['label'].apply(update_cat)
    frame.reset_index(drop=True, inplace=True)

In [None]:
# Hyper-parameters and shared names used by the training cells below.
MAX_LEN = 128  # token length every sentence is padded / truncated to
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 256
EPOCHS = 60
LEARNING_RATE = 2e-5
model_name_details = "ROBERTA"

# Columns read by the dataset wrapper: the raw text and the integer class id.
text_col_name = 'sentence'
category_col = 'label_text'

# Tokenizer belonging to the "roberta-base" checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Directory that receives the per-epoch checkpoints.
PATH = './task2_roberta/'

In [None]:
class Triage(Dataset):
    """
    This is a subclass of torch's Dataset class. It turns one dataframe row into the ids, mask and target tensors required for model training.

    Each item is a dict with the keys "ids" (token ids), "mask" (attention mask) and "targets" (value of the category column), all torch.long tensors.
    """

    def __init__(self, dataframe, tokenizer, max_len, text_col_name, category_col):
        """
        dataframe: table holding the text and category columns.
        tokenizer: object whose encode_plus(...) returns a mapping with "input_ids" and "attention_mask".
        max_len: length every encoded text is padded / truncated to.
        text_col_name: name of the column containing the raw text.
        category_col: name of the column containing the integer class id.
        """
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_col_name = text_col_name
        self.category_col = category_col

    def __getitem__(self, index):
        """
        Return the encoded example stored at row position index.
        """
        # Positional lookup: a dataframe whose index labels are not 0..n-1 (e.g. a split that was
        # never reset) must still be addressable by the positions a DataLoader hands in.
        title = str(self.data[self.text_col_name].iloc[index])
        # Collapse every run of whitespace into a single space before tokenizing.
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            add_special_tokens=True,
            max_length=self.max_len,
            # padding="max_length" is the supported spelling of the deprecated pad_to_max_length=True.
            padding="max_length",
            truncation=True,
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "targets": torch.tensor(
                self.data[self.category_col].iloc[index], dtype=torch.long
            ),
        }

    def __len__(self):
        """
        Return the number of rows the dataframe had when the dataset was built.
        """
        return self.len

In [None]:
# Wrap both splits so they yield ids / mask / targets tensors.
training_set = Triage(
    dataframe=train_df,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    text_col_name=text_col_name,
    category_col=category_col,
)
validation_set = Triage(
    dataframe=val_df,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    text_col_name=text_col_name,
    category_col=category_col,
)

In [None]:
# Loader settings: training batches are reshuffled on every pass, validation order stays fixed.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

# Batch iterators consumed by the training and validation functions.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(validation_set, **test_params)

In [None]:
class BERTClass(torch.nn.Module):
    """
    Classifier made of the pretrained "roberta-base" encoder followed by a small feed-forward head. The encoder is fine-tuned together with the head.
    """

    def __init__(self, num_class):
        """
        num_class: number of output scores produced per example.
        """
        super().__init__()
        self.num_class = num_class
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, self.num_class)
        # Free-form store; the training cell keeps its per-epoch metrics here.
        self.history = {}

    def forward(self, input_ids, attention_mask):
        """
        Return one row of num_class unnormalized scores for every example in the batch.
        """
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # Take the first element of the encoder output and keep only its first sequence position.
        first_position = encoder_out[0][:, 0]
        hidden = torch.relu(self.pre_classifier(first_position))
        return self.classifier(self.dropout(hidden))

In [None]:
# Build the classifier with one output per known label and place it on the selected device.
model = BERTClass(num_class=len(my_dict)).to(device)

In [None]:
# Cross-entropy over the class scores, optimized with Adam across every model parameter.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def calcuate_accu(big_idx, targets):
    """
    Count the positions at which the predicted class ids equal the ground-truth ids.

    big_idx: tensor of predicted class ids.
    targets: tensor of ground-truth class ids.

    Returns the number of matching positions as a plain Python number.
    """
    matches = torch.eq(big_idx, targets)
    return matches.sum().item()

In [None]:
def train(epoch):
    """
    Run one pass over training_loader, updating the module-level model after every batch.

    epoch: epoch number, used only in the printed summary.

    Returns (epoch_loss, epoch_accu): the loss averaged over batches and the accuracy as a percentage of all examples seen.
    """
    running_loss = 0
    hits = 0
    steps_done = 0
    examples_seen = 0
    model.train()
    for step, batch in enumerate(training_loader):
        ids = batch["ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.long)

        logits = model(ids, mask)
        loss = loss_function(logits, targets)
        running_loss += loss.item()

        # torch.max along dim=1 returns (values, indices); the indices are the predicted class ids.
        _, predicted = torch.max(logits.data, dim=1)
        hits += calcuate_accu(predicted, targets)

        steps_done += 1
        examples_seen += targets.size(0)

        # Running averages since the start of the epoch, reported every 250 batches.
        if step % 250 == 0:
            print(f"Training Loss per 250 steps: {running_loss / steps_done}")
            print(f"Training Accuracy per 250 steps: {(hits * 100) / examples_seen}")

        # Clear the previous step's gradients, back-propagate this batch, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = running_loss / steps_done
    epoch_accu = (hits * 100) / examples_seen
    print(f"The Total Accuracy for Epoch {epoch}: {epoch_accu}")
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return epoch_loss, epoch_accu

In [None]:
def valid(model, testing_loader):
    """
    Evaluate the model on a data loader without updating any weights.

    model: network called as model(ids, mask); expected to return one row of class scores per example.
    testing_loader: iterable of batches, each a dict with "ids", "mask" and "targets" tensors.

    Returns (epoch_loss, epoch_accu): the loss averaged over batches and the accuracy as a percentage of all examples seen.
    Raises ZeroDivisionError when testing_loader yields no batches.
    """
    model.eval()
    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    with torch.no_grad():
        for data in testing_loader:
            ids = data["ids"].to(device, dtype=torch.long)
            mask = data["mask"].to(device, dtype=torch.long)
            targets = data["targets"].to(device, dtype=torch.long)
            # Use the scores exactly as returned. Squeezing them would collapse a final batch
            # holding a single example to 1-D, and the dim=1 reduction below would then fail.
            outputs = model(ids, mask)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            # torch.max along dim=1 returns (values, indices); the indices are the predicted class ids.
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_loss, epoch_accu

In [None]:
# Make sure the checkpoint directory is there before the first save.
if not os.path.exists(PATH):
    os.makedirs(PATH)

# Per-epoch metric curves are kept on the model object itself.
for metric_name in ("train_acc", "val_acc", "train_loss", "val_loss"):
    model.history[metric_name] = []

# Train for EPOCHS passes; validate, record metrics and write a checkpoint after each one.
for epoch in range(EPOCHS):
    print("Epoch number : ", epoch)
    train_loss, train_accu = train(epoch)
    val_loss, val_accu = valid(model, val_loader)

    epoch_metrics = {
        "train_acc": train_accu,
        "train_loss": train_loss,
        "val_acc": val_accu,
        "val_loss": val_loss,
    }
    for metric_name, metric_value in epoch_metrics.items():
        model.history[metric_name].append(metric_value)

    # The file name carries the epoch number and that epoch's validation loss.
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    }
    torch.save(checkpoint, f"{PATH}/epoch_{epoch}_{val_loss}.bin")

In [None]:
def scoring_data_prep(dataset = training_set):
    """
    Collect every example of a dataset into two stacked tensors plus a list of targets.

    dataset: indexable object with a length whose items are dicts holding "ids", "mask" and "targets" tensors. Defaults to training_set.

    Returns (out_stack, mask_stack, target_list): the token ids and attention masks as torch.long tensors with MAX_LEN columns placed on device, and the targets as a list of Python numbers. An empty dataset yields two tensors with zero rows and an empty list.
    """
    ids_rows = []
    mask_rows = []
    target_list = []
    # Index explicitly instead of iterating the dataset: only __len__ and __getitem__ are relied on.
    for i in range(len(dataset)):
        rec = dataset[i]
        ids_rows.append(rec['ids'].reshape(-1, MAX_LEN))
        mask_rows.append(rec['mask'].reshape(-1, MAX_LEN))
        target_list.append(rec['targets'].item())

    if not ids_rows:
        # torch.cat rejects an empty list, so build the zero-row result directly.
        out_stack = torch.empty((0, MAX_LEN), dtype=torch.long, device=device)
        mask_stack = torch.empty((0, MAX_LEN), dtype=torch.long, device=device)
        return out_stack, mask_stack, target_list

    # Concatenate and move to the device once, after all rows have been gathered.
    out_stack = torch.cat(ids_rows, dim=0).to(device, dtype=torch.long)
    mask_stack = torch.cat(mask_rows, dim=0).to(device, dtype=torch.long)
    return out_stack, mask_stack, target_list

In [None]:
# Score the validation set in fixed-size slices to bound memory use per forward pass.
out_stack, mask_stack, target_list = scoring_data_prep(dataset=validation_set)

batch_size = 500
combined_output = []
n = 0
model.eval()
with torch.no_grad():
    for start in range(0, len(target_list), batch_size):
        n = start + batch_size
        output = model(out_stack[start:n, :], mask_stack[start:n, :])
        combined_output.append(output)
        # Progress marker: the (exclusive) end offset of the slice just scored.
        print(n)
    combined_output = torch.cat(combined_output, dim=0)
    # Class ids for each row ordered from highest to lowest score.
    preds = torch.argsort(combined_output, dim=1, descending=True)

In [None]:
# Bring the ranked class ids back to the CPU and keep the top-ranked one per example.
preds = preds.to('cpu')
actual_predictions = preds[:, 0].tolist()

# Per-class precision / recall / F1, followed by overall accuracy.
print(classification_report(target_list, actual_predictions))

print(accuracy_score(target_list, actual_predictions))