## import package

In [1]:
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
import math
import numpy as np
import time
import torch
import pandas as pd
import re
import os
import random
from sklearn.model_selection import train_test_split
from transformers import set_seed
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
set_seed(777)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
READ_DIRECTORY = './25company/25_english_gpt_label_Tino.csv'

## Data process

In [2]:
df = pd.read_csv(READ_DIRECTORY, dtype={'label': object})

In [4]:
import ast
df['label'] = df['label'].apply(ast.literal_eval)

In [5]:
def convert_onehot(row):
    y = []
    for i in range(27):
        if i in row:
            y.append(1)
        else:
            y.append(0)
    return y

In [6]:
df['label'] = df['label'].apply(convert_onehot)

In [7]:
df

Unnamed: 0,paragraph,label
0,7 at the intersection of science and sustainab...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ..."
1,43 advancing with an intentional focus on incl...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
2,87 reporting disclosure tracking our progress ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,14our sustainability strategy 16protectingourc...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
4,64 investing in our communities 66 advancing s...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
...,...,...
9206,we incorporate respect for human rights into t...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
9207,conducting a review of the human rights implic...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
9208,we publish an annual uk modern slavery act tra...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9209,as a sponsor of major sports bodies and events...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ..."


In [8]:
x = df['paragraph']
y = df['label']

In [None]:
x, y = SMOTE(random_state=42).fit_resample(x, y)

## Tokenizer

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('nbroad/ESG-BERT')

In [None]:
train_encodings = tokenizer(x.to_list(), truncation=True, padding=True)

In [None]:
tokenizer.decode(train_encodings['input_ids'][0])

## Dataset

In [None]:
class qrDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, y):
        self.encodings = encodings
        self.y = y
    def __getitem__(self, idx):
        input_ids =  {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        label = torch.tensor(self.y[idx])
        return input_ids, label

    def __len__(self):
        return len(self.encodings.input_ids)

In [None]:
x_dataset = qrDataset(train_encodings, y)

In [None]:
train, test= train_test_split(x_dataset, test_size=0.2, random_state=45)
valid, test= train_test_split(test, test_size=0.5, random_state=42)

In [None]:
X_re, y_re = SMOTE(random_state=42).fit_resample(train, y_train)

In [None]:
next(iter(train))

## Model

In [None]:
from transformers import AutoModelForSequenceClassification
from torch.nn import LogSoftmax
class myModel(torch.nn.Module):

    def __init__(self):

        super(myModel, self).__init__()

        self.bert = AutoModelForSequenceClassification.from_pretrained('nbroad/ESG-BERT')
        self.fc = nn.Linear(26, 27)

    def forward(self, input_ids, attention_mask):

        output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        logits = output[0]
        out = self.fc(logits)
        return out



In [None]:
from transformers import AdamW
from tqdm import tqdm

# Set GPU / CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Put model on device
model = myModel().to(device)

optim = AdamW(model.parameters(), lr=1e-5)

## Training

In [None]:
# Pack data into dataloader by batch
batch_size = 8
train_loader = DataLoader(train, batch_size=batch_size, shuffle=True)

In [None]:
training_epoch = 1

In [None]:
class_weight = torch.FloatTensor([1.40989, 2.56608, 2.54729, 2.54357, 10.41796, 4.25379, 6.10456, 7.04372, 0.73193,
           3.36518, 1.34659, 2.20228, 2.16393, 1.64287, 7.24917, 4.57842, 41.42381, 1.22006,
           6.1477, 4.09365, 4.03666, 44.61026, 11.99862, 3.74151, 2.25363, 1.53422, 4.46103]
                                ).to(device)
loss_fct = nn.BCELoss(weight=class_weight)
criterion = nn.Sigmoid()

In [None]:
# loss_fct = nn.BCELoss()
# criterion = LogSoftmax(dim=1)
# loss_fct = CrossEntropyLoss()

In [None]:
for epoch in range(training_epoch):
    model.train()
    running_loss = 0.0

    loop = tqdm(train_loader, leave=True)
    for batch_id, batch in enumerate(loop):
        # reset
        optim.zero_grad()
        inputs, y = batch

        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        y = y.to(device)

        # model output
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        outputs = criterion(outputs)
        loss = loss_fct(outputs, y.float())

        # calculate loss
        loss.backward()
        # update parameters
        optim.step()

        running_loss += loss.item()
        if batch_id % 50 == 0 and batch_id != 0:
            print(f'Epoch {epoch} Batch {batch_id} Loss {running_loss / 50:.4f}')
            running_loss = 0.0

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

In [None]:
torch.save(model.state_dict(), './model/' + 'GPT_Multi_lr1e-5')

## test dataset & dataloader

In [None]:
batch_size = 8
test_loader = DataLoader(test, batch_size=batch_size, shuffle=False)

## Test import setup

In [None]:
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
import math
import numpy as np
import time
import torch
import pandas as pd
import re
import os
from sklearn.model_selection import train_test_split
from transformers import set_seed
set_seed(777)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('nbroad/ESG-BERT')
class qrDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, y):
        self.encodings = encodings
        self.y = y
    def __getitem__(self, idx):
        input_ids =  {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        label = torch.tensor(self.y[idx])
        return input_ids, label

    def __len__(self):
        return len(self.encodings.input_ids)
from transformers import AutoModelForSequenceClassification
from torch.nn import LogSoftmax
class myModel(torch.nn.Module):

    def __init__(self):

        super(myModel, self).__init__()

        self.bert = AutoModelForSequenceClassification.from_pretrained('nbroad/ESG-BERT')
        self.fc = nn.Linear(26, 27)

    def forward(self, input_ids, attention_mask):

        output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        logits = output[0]
        out = self.fc(logits)
        return out
from transformers import AdamW
from tqdm import tqdm

# Set GPU / CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Put model on device
model = myModel().to(device)

optim = AdamW(model.parameters(), lr=1e-5)


## test predict

In [None]:
apple = pd.read_csv('./apple/apple_more_than_10.csv')

In [None]:
test_apple_x, test_apple_y = apple['paragraph'], apple['result']
test_encodings = tokenizer(test_apple_x.to_list(), truncation=True, padding=True)
x_dataset = qrDataset(test_encodings, test_apple_y)

In [None]:
# Pack data into dataloader by batch
batch_size = 8
test_loader = DataLoader(x_dataset, batch_size=batch_size, shuffle=False)

In [None]:
model.load_state_dict(torch.load('./model/GPT_Multi_lr1e-5'))

In [None]:
loss_fct = nn.BCELoss()
criterion = nn.Sigmoid()
sigmoid = nn.Sigmoid()

In [None]:
count = 0
model.eval()
# collect output
output_label = []
loop = tqdm(test_loader, leave=True)
for batch_id, batch in enumerate(loop):
    # reset
    optim.zero_grad()
    inputs, y = batch

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    y = y.to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    # print(outputs)
    # outputs = torch.argmax(outputs, dim=1)
    outputs = sigmoid(outputs)
    print(outputs)
    threshold = 0.28
    predicted_labels = (outputs > threshold).int()
    predicted_labels = predicted_labels.cpu().tolist()
    output_label.extend(predicted_labels)
    if batch_id % 50 == 0 and batch_id != 0:
        print(f'Epoch {batch_id}, count is {count}')
test_len = len(test_loader.dataset)
print(count / test_len)

## csv output

In [None]:
apple['gpt_tune'] = output_label

In [None]:
def convert_onehot(row):
    y = []
    for i in range(27):
        if row[i]==1:
            y.append(i)
    if(len(y)==0):
        y.append(26)
    return y

In [None]:
apple['x'] = apple['gpt_tune'].apply(convert_onehot)

In [None]:
apple

In [None]:
apple.to_csv('./apple/apple_multilabel_v2.csv', index=False)

# Appendix

In [None]:
for epoch in range(training_epoch):
    model.train()
    running_loss = 0.0

    loop = tqdm(train_loader, leave=True)
    for batch_id, batch in enumerate(loop):
        # reset
        optim.zero_grad()
        inputs, y = batch

        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        y = y.to(device)

        # model output
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # print(outputs)
        outputs = criterion(outputs)
        # print(outputs)
        # outputs = torch.argmax(outputs, dim=1)
        # print((outputs.dtype))
        # print((y.dtype))
        # print(outputs)
        # print(outputs.device)
        # print(y.device)
        # print(outputs.size(), y.size())
        loss = loss_fct(outputs, y.float())

        # calculate loss
        loss.backward()
        # update parameters
        optim.step()

        running_loss += loss.item()
        if batch_id % 50 == 0 and batch_id != 0:
            print(f'Epoch {epoch} Batch {batch_id} Loss {running_loss / 50:.4f}')
            running_loss = 0.0

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

In [None]:
count = 0
model.eval()
# collect output
output_label = []
loop = tqdm(test_loader, leave=True)
for batch_id, batch in enumerate(loop):
    # reset
    optim.zero_grad()
    inputs, y = batch

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    y = y.to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    outputs = torch.argmax(outputs, dim=1)
    count += torch.sum(torch.eq(outputs, y)).item()
    outputs = outputs.cpu().tolist()
    output_label.extend(outputs)
    if batch_id % 50 == 0 and batch_id != 0:
        print(f'Epoch {batch_id}, count is {count}')
test_len = len(test_loader.dataset)
print(count / test_len)

In [None]:
from tqdm import tqdm
for i in range(10):
    criterion = LogSoftmax(dim=1)
    count = 0
    model.eval()
    # collect output
    output_label = []
    loop = tqdm(test_loader, leave=True)
    for batch_id, batch in enumerate(loop):
        # reset
        optim.zero_grad()
        inputs, y = batch

        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        y = y.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        outputs = torch.argmax(outputs, dim=1)
        count += torch.sum(torch.eq(outputs, y)).item()
        #  collect output into list
        outputs = outputs.cpu().tolist()
        output_label.extend(outputs)
        if batch_id % 50 == 0 and batch_id != 0:
            print(f'Epoch {batch_id}, count is {count}')
    test_len = len(test_loader.dataset)
    print(count / test_len)
    apple['gpt_tune'] = output_label
    apple.to_csv(f'apple_3&gpt_label{i}.csv', index=False)

In [5]:
def create_dataset(n_sample=1000):
    ''' 
    Create a unevenly distributed sample data set multilabel  
    classification using make_classification function
    
    args
    nsample: int, Number of sample to be created
    
    return
    X: pandas.DataFrame, feature vector dataframe with 10 features 
    y: pandas.DataFrame, target vector dataframe with 5 labels
    '''
    X, y = make_classification(n_classes=5, class_sep=2,
                               weights=[0.1,0.025, 0.205, 0.008, 0.9], n_informative=3, n_redundant=1, flip_y=0,
                               n_features=10, n_clusters_per_class=1, n_samples=1000, random_state=10)
    y = pd.get_dummies(y, prefix='class')
    return pd.DataFrame(X), y

def get_tail_label(df: pd.DataFrame, ql=[0.05, 1.]) -> list:
    """
    Find the underrepresented targets.
    Underrepresented targets are those which are observed less than the median occurance.
    Targets beyond a quantile limit are filtered.
    """
    irlbl = df.sum(axis=0)
    irlbl = irlbl[(irlbl > irlbl.quantile(ql[0])) & ((irlbl < irlbl.quantile(ql[1])))]  # Filtering
    irlbl = irlbl.max() / irlbl
    threshold_irlbl = irlbl.median()
    tail_label = irlbl[irlbl > threshold_irlbl].index.tolist()
    return tail_label

def get_minority_samples(X: pd.DataFrame, y: pd.DataFrame, ql=[0.05, 1.]):
    """
    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    """
    tail_labels = get_tail_label(y, ql=ql)
    index = y[y[tail_labels].apply(lambda x: (x == 1).any(), axis=1)].index.tolist()
    
    X_sub = X[X.index.isin(index)].reset_index(drop = True)
    y_sub = y[y.index.isin(index)].reset_index(drop = True)
    return X_sub, y_sub

def nearest_neighbour(X: pd.DataFrame, neigh) -> list:
    """
    Give index of 10 nearest neighbor of all the instance
    
    args
    X: np.array, array whose nearest neighbor has to find
    
    return
    indices: list of list, index of 5 NN of each element in X
    """
    nbs = NearestNeighbors(n_neighbors=neigh, metric='euclidean', algorithm='kd_tree').fit(X)
    euclidean, indices = nbs.kneighbors(X)
    return indices

def MLSMOTE(X, y, n_sample, neigh=5):
    """
    Give the augmented data using MLSMOTE algorithm
    
    args
    X: pandas.DataFrame, input vector DataFrame
    y: pandas.DataFrame, feature vector dataframe
    n_sample: int, number of newly generated sample
    
    return
    new_X: pandas.DataFrame, augmented feature vector data
    target: pandas.DataFrame, augmented target vector data
    """
    indices2 = nearest_neighbour(X, neigh=5)
    n = len(indices2)
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0, n-1)
        neighbor = random.choice(indices2[reference, 1:])
        all_point = indices2[reference]
        nn_df = y[y.index.isin(all_point)]
        ser = nn_df.sum(axis = 0, skipna = True)
        target[i] = np.array([1 if val > 0 else 0 for val in ser])
        ratio = random.random()
        gap = X.loc[reference,:] - X.loc[neighbor,:]
        new_X[i] = np.array(X.loc[reference,:] + ratio * gap)
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    return new_X, target

In [6]:
X, y = create_dataset()  # Creating a Dataframe
X_sub, y_sub = get_minority_samples(X, y)  # Getting minority samples of that datframe
X_res, y_res = MLSMOTE(X_sub, y_sub, 100, 5)  # Applying MLSMOTE to augment the dataframe

In [9]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.700263,0.602477,-2.478509,0.433484,1.109780,-0.246891,0.525389,-1.902413,1.991422,-1.848972
1,2.267874,-0.350847,2.187252,-0.769722,0.601022,-0.234049,-1.844444,2.092082,2.299990,-2.585919
2,-1.207028,0.230167,-2.035403,-0.401339,0.595955,2.066251,-2.311038,-1.804673,2.197500,-2.106087
3,-1.326205,-1.417870,2.085587,-0.453313,0.457689,0.868153,0.784653,2.348514,2.546447,-2.829462
4,-0.781795,1.495530,-2.194913,-0.264658,0.906237,-0.633937,-0.907535,-2.706511,1.255595,-1.120307
...,...,...,...,...,...,...,...,...,...,...
995,-2.287050,-0.298746,1.586469,-0.464742,1.103363,1.050799,0.960210,-2.157864,2.600376,-2.887021
996,-0.027245,1.842409,1.793464,0.013021,0.980886,-0.037242,-0.309943,-2.104098,-2.168505,2.055803
997,0.833553,-0.130508,-1.954786,-0.074340,-0.410230,0.837448,-1.627014,-1.333642,2.935449,-2.876682
998,-1.088050,-0.213819,-2.207270,-1.106803,-1.096291,0.722919,0.833781,-2.433188,1.347741,-1.211900


In [10]:
y

Unnamed: 0,class_0,class_1,class_2,class_3,class_4
0,False,False,False,False,True
1,False,False,True,False,False
2,False,False,False,False,True
3,False,False,True,False,False
4,False,False,False,False,True
...,...,...,...,...,...
995,True,False,False,False,False
996,False,True,False,False,False
997,False,False,False,False,True
998,False,False,False,False,True


In [7]:
y_res.head()

Unnamed: 0,class_0,class_1,class_2,class_3,class_4
0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0


In [8]:
X_res.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.89467,-0.527497,1.001114,1.724863,-0.626351,1.223244,1.477706,-4.627311,-3.022126,2.993959
1,1.788839,2.414019,3.049652,-0.826022,-1.512151,-2.539101,-0.141751,0.425306,-2.580201,2.388638
2,3.024502,1.609134,2.260266,-1.272302,-1.932562,0.33251,-0.584202,-2.847786,-1.622824,1.433473
3,-0.787481,0.806771,1.828351,0.054367,1.385933,1.723669,0.887199,-4.042022,-2.782947,2.66995
4,-2.437923,-1.684294,1.848106,0.473394,3.015928,-0.143267,0.825287,-3.345422,-4.139679,4.087703
