## Import packages

In [2]:
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
import math
import numpy as np
import time
import torch
import pandas as pd
import re
import os
from sklearn.model_selection import train_test_split
from transformers import set_seed
# Fix the random seed through transformers' set_seed helper so runs are repeatable.
set_seed(777)
# Restrict this process to GPU 0.
# NOTE(review): torch is already imported at this point; presumably the variable
# still takes effect because CUDA has not been initialised yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## Tokenizer

In [3]:
from transformers import AutoTokenizer

# Load the tokenizer for the 'nbroad/ESG-BERT' checkpoint (the same checkpoint
# name the model class loads its backbone from).
tokenizer = AutoTokenizer.from_pretrained('nbroad/ESG-BERT')

## Dataset

In [4]:
class qrDataset(torch.utils.data.Dataset):
    """Map-style dataset over a tokenizer's batched output.

    Args:
        encodings: mapping from field name (it must contain ``'input_ids'``)
            to a per-sample sequence of values. Anything that supports
            ``.items()`` and ``['input_ids']`` works, including a plain dict.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict holding one tensor per field."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of samples, measured on the ``'input_ids'`` field."""
        # Key lookup (not attribute access) so that plain dicts work as well as
        # tokenizer outputs; __getitem__ already treats encodings as a mapping.
        return len(self.encodings['input_ids'])

## Model

In [5]:
from transformers import AutoModelForSequenceClassification
from torch.nn import LogSoftmax
class myModel(torch.nn.Module):
    """Pretrained 'nbroad/ESG-BERT' sequence classifier followed by a linear layer.

    The backbone's first output (26 features, per the linear layer's input
    size) is projected to 27 output scores.
    """

    def __init__(self):
        super().__init__()
        # The attribute names `bert` and `fc` become the state_dict key prefixes.
        self.bert = AutoModelForSequenceClassification.from_pretrained('nbroad/ESG-BERT')
        self.fc = nn.Linear(26, 27)

    def forward(self, input_ids, attention_mask):
        """Run the backbone on the batch and pass its first output through ``fc``.

        Args:
            input_ids: token id tensor forwarded to the backbone.
            attention_mask: attention mask tensor forwarded to the backbone.

        Returns:
            The output of the linear layer applied to element 0 of the
            backbone's output.
        """
        backbone_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        return self.fc(backbone_output[0])

In [6]:
# Number of paragraphs sent through the model per forward pass.
batch_size = 8
# NOTE(review): newer transformers releases may no longer export AdamW
# (torch.optim.AdamW is the usual replacement) — confirm for the pinned version.
from transformers import AdamW
from tqdm import tqdm

# Set GPU / CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Put model on device
model = myModel().to(device)

optim = AdamW(model.parameters(), lr=1e-5)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU run can still be loaded when this notebook falls back to the CPU.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(
    torch.load('./model/GPT_Tino_downsampling_lr1e-5', map_location=device)
)



<All keys matched successfully>

## Test prediction

In [6]:
# Report files to label, one entry per line in their numbered order.
file_list = [
    '1_dow_1_85.pdf',
    '2_agilent_1_106.pdf',
    '3_amazon_1_79.pdf',
    '4_apple_1_72.pdf',
    '5_boeing_1_66.pdf',
    '6_bxp_1_65.pdf',
    '7_charles_1_50.pdf',
    '8_cisco_1_56.pdf',
    '9_citigroup_1_137.pdf',
    '10_cme_1_34.pdf',
    '11_colgate_1_84.pdf',
    '12_corning_1_71.pdf',
    '13_expeditor_1_37.pdf',
    '14_eei_1_80.pdf',
    '15_itt_1_44.pdf',
    '16_fedex_1_34.pdf',
    '17_firstscolar_1_57.pdf',
    '18_google_1_14.pdf',
    '19_intel_1_86.pdf',
    '20_jpmorgan_1_61.pdf',
    '21_microsoft_1_89.pdf',
    '22_rockwell_1_58.pdf',
    '23_ibm_1_49.pdf',
    '24_traveler_1_147.pdf',
    '25_visa_1_52.pdf',
]

In [7]:
# Replace file_list with the sorted directory listing of 'taiwan_xml'.
# NOTE(review): this overrides the hard-coded list assigned earlier, while the
# prediction loop reads from './xml_american/' — confirm which input set is intended.
file_list = sorted(os.listdir('taiwan_xml'))

In [7]:
# Label every paragraph of every file in file_list and write one CSV per file.
# Make sure the output directory exists before the first to_csv call.
os.makedirs('./predict_american_xml', exist_ok=True)

for file in file_list:
    # Remove only a trailing '.pdf'. str.strip('.pdf') would instead strip any
    # of the characters '.', 'p', 'd', 'f' from BOTH ends of the name, which
    # corrupts names that begin or end with one of those characters.
    if file.endswith('.pdf'):
        file = file[:-len('.pdf')]
    df = pd.read_csv(f'./xml_american/{file}.csv')
    test_encodings = tokenizer(df['paragraph'].to_list(), truncation=True, padding=True)
    x_dataset = qrDataset(test_encodings)
    # shuffle=False keeps predictions aligned with the rows of df.
    test_loader = DataLoader(x_dataset, batch_size=batch_size, shuffle=False)

    # NOTE(review): `criterion` is assigned but not used in this cell; kept in
    # case another cell reads it — confirm and remove.
    criterion = LogSoftmax(dim=1)
    model.eval()
    # collect output
    output_label = []
    loop = tqdm(test_loader, leave=True)
    # Inference only: no_grad avoids building the autograd graph, so no
    # optimizer/gradient bookkeeping is needed inside the loop.
    with torch.no_grad():
        for batch_id, batch in enumerate(loop):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest score per row.
            outputs = torch.argmax(outputs, dim=1)
            #  collect output into list
            output_label.extend(outputs.cpu().tolist())
            if batch_id % 50 == 0 and batch_id != 0:
                print(f'Epoch {batch_id}')
    df['label'] = output_label
    df.to_csv(f'./predict_american_xml/{file}.csv', index=False)

 75%|███████▍  | 53/71 [00:03<00:01, 16.55it/s]

Epoch 50


100%|██████████| 71/71 [00:04<00:00, 15.68it/s]
 93%|█████████▎| 54/58 [00:03<00:00, 16.96it/s]

Epoch 50


100%|██████████| 58/58 [00:03<00:00, 16.96it/s]
 43%|████▎     | 52/121 [00:04<00:06, 10.66it/s]

Epoch 50


 84%|████████▍ | 102/121 [00:09<00:01, 10.66it/s]

Epoch 100


100%|██████████| 121/121 [00:11<00:00, 10.70it/s]
 54%|█████▍    | 52/96 [00:03<00:03, 13.10it/s]

Epoch 50


100%|██████████| 96/96 [00:07<00:00, 13.10it/s]
 73%|███████▎  | 52/71 [00:06<00:02,  8.24it/s]

Epoch 50


100%|██████████| 71/71 [00:08<00:00,  8.33it/s]
100%|██████████| 30/30 [00:03<00:00,  9.55it/s]
100%|██████████| 47/47 [00:02<00:00, 16.41it/s]
100%|██████████| 48/48 [00:03<00:00, 15.86it/s]
100%|██████████| 53/53 [00:03<00:00, 16.43it/s]


Epoch 50


100%|██████████| 27/27 [00:01<00:00, 20.48it/s]
100%|██████████| 29/29 [00:01<00:00, 15.27it/s]
 70%|███████   | 52/74 [00:04<00:01, 12.12it/s]

Epoch 50


100%|██████████| 74/74 [00:06<00:00, 12.12it/s]
100%|██████████| 35/35 [00:02<00:00, 15.24it/s]
100%|██████████| 28/28 [00:02<00:00, 12.45it/s]
100%|██████████| 31/31 [00:01<00:00, 16.74it/s]
100%|██████████| 29/29 [00:01<00:00, 16.64it/s]
100%|██████████| 20/20 [00:02<00:00,  6.78it/s]
100%|██████████| 10/10 [00:00<00:00, 24.97it/s]
 51%|█████     | 52/102 [00:04<00:04, 10.79it/s]

Epoch 50


100%|██████████| 102/102 [00:09<00:00, 10.86it/s]


Epoch 100


 79%|███████▉  | 52/66 [00:04<00:01, 11.86it/s]

Epoch 50


100%|██████████| 66/66 [00:05<00:00, 11.85it/s]
 56%|█████▋    | 54/96 [00:03<00:02, 16.51it/s]

Epoch 50


100%|██████████| 96/96 [00:05<00:00, 16.56it/s]
100%|██████████| 51/51 [00:02<00:00, 23.40it/s]


Epoch 50


100%|██████████| 39/39 [00:02<00:00, 16.46it/s]
 65%|██████▌   | 52/80 [00:07<00:04,  6.64it/s]

Epoch 50


100%|██████████| 80/80 [00:11<00:00,  6.72it/s]
100%|██████████| 50/50 [00:03<00:00, 12.75it/s]


In [None]:
from tqdm import tqdm

# Evaluate on test_loader, print the accuracy and export the predictions.
# NOTE(review): this cell expects `test_loader` to yield (inputs, y) pairs and
# `apple` to be a DataFrame; neither is defined that way in the visible cells —
# confirm where they come from before running on a fresh kernel.
for i in range(1):
    # NOTE(review): `criterion` is assigned but not used in this cell.
    criterion = LogSoftmax(dim=1)
    count = 0
    model.eval()
    # collect output
    output_label = []
    loop = tqdm(test_loader, leave=True)
    # Inference only: no_grad avoids building the autograd graph, so no
    # optimizer/gradient bookkeeping is needed inside the loop.
    with torch.no_grad():
        for batch_id, batch in enumerate(loop):
            inputs, y = batch

            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            y = y.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest score per row.
            outputs = torch.argmax(outputs, dim=1)
            # Running total of predictions that equal the reference label.
            count += torch.sum(torch.eq(outputs, y)).item()
            #  collect output into list
            output_label.extend(outputs.cpu().tolist())
            if batch_id % 50 == 0 and batch_id != 0:
                print(f'Epoch {batch_id}, count is {count}')
    test_len = len(test_loader.dataset)
    # Fraction of samples whose prediction matched the label.
    print(count / test_len)
    apple['gpt_tune'] = output_label
    apple.to_csv(f'apple_3&gpt_label{i}.csv', index=False)

In [20]:
# Evaluate on test_loader and print the fraction of correct predictions.
# NOTE(review): this cell expects `test_loader` to yield (inputs, y) pairs; no
# visible cell builds such a loader — confirm where it comes from.
# NOTE(review): `criterion` is assigned but not used in this cell.
criterion = LogSoftmax(dim=1)
count = 0
model.eval()
# collect output
output_label = []
loop = tqdm(test_loader, leave=True)
# Inference only: no_grad avoids building the autograd graph, so no
# optimizer/gradient bookkeeping is needed inside the loop.
with torch.no_grad():
    for batch_id, batch in enumerate(loop):
        inputs, y = batch

        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        y = y.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest score per row.
        outputs = torch.argmax(outputs, dim=1)
        # Running total of predictions that equal the reference label.
        count += torch.sum(torch.eq(outputs, y)).item()
        #  collect output into list
        output_label.extend(outputs.cpu().tolist())
        if batch_id % 50 == 0 and batch_id != 0:
            print(f'Epoch {batch_id}, count is {count}')
test_len = len(test_loader.dataset)
print(count / test_len)

 52%|█████▏    | 53/102 [00:04<00:03, 13.29it/s]

Epoch 50, count is 185


100%|██████████| 102/102 [00:07<00:00, 13.27it/s]

Epoch 100, count is 360
0.44772447724477243





## CSV output

In [21]:
# Store the collected predictions as the 'gpt_tune' column.
# NOTE(review): `apple` is not defined in any visible cell — presumably a
# DataFrame whose row order matches test_loader; confirm.
apple['gpt_tune'] = output_label

In [22]:
# Write the frame to 'apple_3&gpt_label.csv' without the index column.
apple.to_csv('apple_3&gpt_label.csv', index=False)