In [None]:
import urllib.request
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer,BertForSequenceClassification
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Fetch the finance sentiment CSV into the working directory, then read it
# into a DataFrame.
DATA_URL = "https://raw.githubusercontent.com/ukairia777/finance_sentiment_corpus/main/finance_data.csv"
DATA_FILE = "finance_data.csv"

urllib.request.urlretrieve(DATA_URL, filename=DATA_FILE)
data = pd.read_csv(DATA_FILE)

In [None]:
# Mapping from the textual sentiment label to the integer class id.
LABEL_TO_ID = {'neutral': 0, 'positive': 1, 'negative': 2}

# Build the cleaned frame without in-place mutation so the cell can be re-run:
# - `replace` leaves already-encoded labels untouched on a second run,
# - `astype('int64')` makes the integer dtype explicit instead of relying on
#   `replace` downcasting the column (it raises if an unexpected label remains),
# - `errors='ignore'` tolerates the 'sentence' column having been dropped already.
data = (
    data
    .assign(labels=data['labels'].replace(LABEL_TO_ID).astype('int64'))
    .drop(columns=['sentence'], errors='ignore')
    .drop_duplicates(subset=['kor_sentence'])
)

In [None]:
# Features are the Korean sentences; targets are the encoded labels.
X_data, y_data = data['kor_sentence'], data['labels']

# Stratified 80/20 split with a fixed seed.
split_parts = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=0,
    stratify=y_data,
)

# Give every split a fresh 0..n-1 index.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True) for part in split_parts
)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item for classification.

    Args:
        text: Sequence of sentences (list, pandas Series, ...). Each item is
            passed through ``str()`` before tokenization.
        label: Sequence of integer class ids aligned with ``text``.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_token_type_ids`` keyword arguments and
            returning a mapping with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` (presumably a Hugging Face tokenizer).
        max_len: Fixed length every encoded example is padded or truncated to.
    """

    def __init__(self,text, label,tokenizer, max_len):
        self.tokenizer = tokenizer
        # Materialize as lists so __getitem__ indexes by position; a pandas
        # Series whose index is not 0..n-1 would otherwise raise KeyError.
        self.text = list(text)
        self.labels = list(label)
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, index):
        """Encode example ``index`` and return a dict of 1-D ``torch.long`` tensors
        (``input_ids``, ``attention_mask``, ``token_type_ids``) plus a scalar
        ``labels`` tensor."""
        text = str(self.text[index])
        # truncation=True is required: without it a sentence longer than
        # max_len yields a longer tensor and the default collate function
        # cannot stack the batch. padding='max_length' replaces the
        # deprecated pad_to_max_length=True.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            # Class ids are integers; emit them as long so they can be fed to
            # a cross-entropy loss directly (casting to long later is a no-op).
            'labels': torch.tensor(int(self.labels[index]), dtype=torch.long),
            }



# The datasets need a tokenizer at construction time. Load it here so this
# cell works on a fresh kernel run top to bottom instead of depending on a
# later cell having been executed first.
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

# Fixed token length every sentence is padded/truncated to.
MAX_LEN = 128

training_set = CustomDataset(X_train, y_train, tokenizer, MAX_LEN)
testing_set = CustomDataset(X_test, y_test, tokenizer, MAX_LEN)

train_params = {'batch_size': 16,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation does not benefit from shuffling; keep the order deterministic.
test_params = {'batch_size': 16,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Checkpoint shared by the classifier and its tokenizer.
PRETRAINED_NAME = "klue/bert-base"

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=3)
model.to(device)

In [None]:
# Loss and optimizer for fine-tuning.
loss_fn = nn.CrossEntropyLoss()
LEARNING_RATE = 3e-5
EPOCH = 4
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

# Number of batches per pass. len(DataLoader) includes the final partial
# batch, which floor-dividing the dataset size by the batch size would miss.
train_steps = len(training_loader)
val_steps = len(testing_loader)

In [None]:
# Fine-tune for EPOCH passes over the training loader, reporting the mean
# per-example loss and the accuracy of each epoch.
for epoch in range(EPOCH):
    model.train()
    num_correct = 0
    running_loss = 0.0
    # The loop variable is `batch`, not `data`, so the DataFrame named `data`
    # is not overwritten by the last batch.
    for batch in tqdm(training_loader):

        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        label = batch['labels'].to(device, dtype=torch.long)

        optimizer.zero_grad()

        output = model(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids)
        logits = output[0]
        loss = loss_fn(logits, label)
        loss.backward()
        optimizer.step()

        # loss is the batch mean; weight it by the batch size so the epoch
        # figure is a true per-example mean even with a short last batch.
        running_loss += loss.item() * label.size(0)
        num_correct += (logits.argmax(dim=1) == label).sum().item()

    num_examples = len(training_loader.dataset)
    print('EPOCH ', epoch+1)
    print("Training Losses: {}".format(running_loss / num_examples))
    print("Training Accuracy:{}".format(num_correct / num_examples))



In [None]:
# Evaluate on the held-out loader: mean per-example loss and accuracy.
model.eval()
num_correct = 0
running_loss = 0.0
# No gradients are needed for evaluation; disabling them avoids building the
# autograd graph for every batch.
with torch.no_grad():
    for batch in tqdm(testing_loader):

        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        label = batch['labels'].to(device, dtype=torch.long)

        output = model(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids)
        logits = output[0]
        loss = loss_fn(logits, label)

        # Weight the batch-mean loss by batch size to get a per-example mean.
        running_loss += loss.item() * label.size(0)
        num_correct += (logits.argmax(dim=1) == label).sum().item()

num_examples = len(testing_loader.dataset)
print("Validation Losses: {}".format(running_loss / num_examples))
print("Validation Accuracy:{}".format(num_correct / num_examples))

