In [1]:
import torch
import torch.nn as nn
from transformers import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import os
os.environ['TORCH_USE_CUDA_DSA'] = '1'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
tokenizer = AutoTokenizer.from_pretrained("monologg/kobert")

In [None]:
import pandas as pd

train = pd.read_csv("sentimental_analysis/sentimental_analysis/train.csv")
test = pd.read_csv("sentimental_analysis/sentimental_analysis/test (1).csv")

train_new_row = train.columns
test_new_row = test.columns

train.columns = ['y_1', 'y_2', 'x']
new_index = len(train)
train.loc[new_index] = train_new_row

test.columns = ['y_1', 'y_2', 'x']
new_index = len(test)
test.loc[new_index] = test_new_row

train.dropna(inplace = True)
test.dropna(inplace= True)

train.reset_index(drop=True)
test.reset_index(drop=True)

train_texts = list(train['x'])
train_labels = list(train['y_1'])
val_texts = list(test['x'])
val_labels = list(test['y_1'])

train_labels[-1] = 1
val_labels[-1] = 0

In [None]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

In [None]:
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels)
)

val_dataset = TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(val_labels)
)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("monologg/kobert")
model.to(device)

In [None]:
train_loader = DataLoader(train_dataset, batch_size=8, sampler=RandomSampler(train_dataset))
val_loader = DataLoader(val_dataset, batch_size=8, sampler=SequentialSampler(val_dataset))

In [None]:
epochs = 5
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * epochs
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr = 3e-5,
    total_steps = total_steps *2
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
from tqdm import tqdm

for epoch in range(epochs):
    model.train()
    total_loss = 0

    with tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", unit="batch") as t:
        for batch in t:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(input_ids, attention_mask = attention_mask)
            logits = outputs.logits

            loss = loss_fn(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            t.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} - Training lss: {avg_loss}")

In [None]:
model.eval()
val_loss, val_accuracy = 0, 0
num_val_steps = 0

for batch in val_loader:
    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    labels = batch[2].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits

    val_loss += loss_fn(logits, labels).item()
    val_accuracy += (logits.argmax(dim=1) == labels).sum().item()
    num_val_steps += 1

avg_val_loss = val_loss / num_val_steps
val_accuracy = val_accuracy / len(val_dataset)
print(f"Epoch {epoch+1}/{epochs} - Validation loss: {avg_val_loss}, Validation accuracy: {val_accuracy}")

In [None]:
def inference(input_text):
    inputs = tokenizer.encode_plus(
        input_text,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predicted_label = logits.argmax(dim=1).item()

    return predicted_label

In [None]:
inference("좋아용")

**Multi label classificaiton**

In [4]:
import pandas as pd

train = pd.read_csv("sentimental_analysis/sentimental_analysis/train.csv")
test = pd.read_csv("sentimental_analysis/sentimental_analysis/test (1).csv")

train_new_row = train.columns
test_new_row = test.columns

train.columns = ['y_1', 'y_2', 'x']
new_index = len(train)
train.loc[new_index] = train_new_row

test.columns = ['y_1', 'y_2', 'x']
new_index = len(test)
test.loc[new_index] = test_new_row

train.dropna(inplace = True)
test.dropna(inplace= True)

train.reset_index(drop=True)
test.reset_index(drop=True)

train_texts = list(train['x'])
train_labels = list(train['y_2'])
val_texts = list(test['x'])
val_labels = list(test['y_2'])

train_labels[-1] = 1
val_labels[-1] = 0

for i in range(len(train_labels)):
    y = train_labels[i]
    train_labels[i] = y - 1

for i in range(len(val_labels)):
    y = val_labels[i]
    val_labels[i] = y - 1


In [5]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=64)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=64)

In [6]:
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels)
)

val_dataset = TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(val_labels)
)

In [7]:
model2 = AutoModelForSequenceClassification.from_pretrained("monologg/kobert", num_labels=5)
model2.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [8]:
train_loader = DataLoader(train_dataset, batch_size=16, sampler=RandomSampler(train_dataset))
val_loader = DataLoader(val_dataset, batch_size=16, sampler=SequentialSampler(val_dataset))

In [9]:
epochs = 2
optimizer = AdamW(model2.parameters(), lr=2e-5)
total_steps = len(train_loader) * epochs
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr = 3e-5,
    total_steps = total_steps *2
)

loss_fn = nn.CrossEntropyLoss().to(device)



In [10]:
from tqdm import tqdm

gradient_accumulation_steps = 4
progress_bar = tqdm(range(total_steps), desc="Training progress")
model2.zero_grad()
for step, batch in enumerate(train_loader):
    model2.train()
    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    labels = batch[2].to(device).long()

    # Forward pass
    outputs = model2(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    
    # Calculate loss
    loss = loss_fn(logits, labels)
    
    # Accumulate gradients
    loss = loss / gradient_accumulation_steps
    loss.backward()
    
    # Update optimizer and scheduler after gradient accumulation steps
    if (step + 1) % gradient_accumulation_steps == 0:
        torch.nn.utils.clip_grad_norm_(model2.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        model2.zero_grad()
    
    # Update progress bar
    progress_bar.set_postfix({"loss": loss.item()})
    progress_bar.update(1)

progress_bar.close()

Training progress:  50%|█████     | 12313/24626 [28:42<28:42,  7.15it/s, loss=0.271]  


In [11]:
model2.save_pretrained("multi_label")

In [12]:
from sklearn.metrics import mean_squared_error
import numpy as np

model2.eval()
eval_predictions, eval_labels = [], []
for batch in val_loader:
    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    labels = batch[2].to(device)
    
    with torch.no_grad():
        outputs = model2(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    eval_predictions.extend(logits.cpu().numpy())
    eval_labels.extend(labels.cpu().numpy())

eval_predictions = torch.tensor(eval_predictions).softmax(dim=1).numpy()
eval_labels = torch.tensor(eval_labels).numpy()

# Map probabilities to labels
eval_predictions = np.argmax(eval_predictions, axis=1)

# Calculate RMSE
rmse = mean_squared_error(eval_labels, eval_predictions, squared=False)

  eval_predictions = torch.tensor(eval_predictions).softmax(dim=1).numpy()


In [13]:
print(rmse)

1.7926702615558352


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

model2.eval()
eval_loss, eval_predictions, eval_labels = [], [], []
for batch in val_loader:
    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    labels = batch[2].to(device)
    
    with torch.no_grad():
        outputs = model2(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    eval_predictions.extend(logits.cpu().numpy())
    eval_labels.extend(labels.cpu().numpy())
    loss = loss_fn(logits, labels)
    eval_loss.append(loss.item())

eval_loss = sum(eval_loss) / len(eval_loss)
eval_predictions = torch.sigmoid(torch.tensor(eval_predictions)).numpy()
eval_labels = torch.tensor(eval_labels).numpy()

# Calculate evaluation metrics
eval_predictions_binary = (eval_predictions > 0.5).astype(int)
precision = precision_score(eval_labels, eval_predictions_binary, average='micro')
recall = recall_score(eval_labels, eval_predictions_binary, average='micro')
f1 = f1_score(eval_labels, eval_predictions_binary, average='micro')
auc_roc = roc_auc_score(eval_labels, eval_predictions, average='micro')

print(f"Evaluation loss: {eval_loss}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"AUC-ROC: {auc_roc}")