In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [9]:
# Load dataset
data = pd.read_csv('./datasets/PRDECT-ID Dataset.csv')

sentiment_labels = {'Positive': 0, 'Negative': 1}

# Proses data untuk sentimen analisis
data['Sentiment'] = data['Sentiment'].apply(lambda x: sentiment_labels[x])

# Bagi dataset menjadi train dan validation
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

# Tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Fungsi untuk tokenisasi dan encoding input
def tokenize_data(data, max_length=128):
    input_ids = []
    attention_masks = []

    for review in data['Customer Review']:
        encoded_dict = tokenizer.encode_plus(
            review,                      # Review untuk encode
            add_special_tokens = True,   # Tambahkan '[CLS]' dan '[SEP]'
            max_length = max_length,     # Panjang maksimum sequence
            truncation=True,  # Aktifkan truncation
            padding='max_length',     # Padding
            return_attention_mask = True, # Buat attention mask
            return_tensors = 'pt',       # Return PyTorch tensors
        )
        
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert lists ke tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(data['Sentiment'].values)

    return input_ids, attention_masks, labels

# Tokenisasi data train dan validation
train_input_ids, train_attention_masks, train_labels = tokenize_data(train_data)
val_input_ids, val_attention_masks, val_labels = tokenize_data(val_data)

In [11]:
class MultiTaskBERT(nn.Module):
    def __init__(self, model_name, num_labels_sentiment, num_labels_adv_dis):
        super(MultiTaskBERT, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.sentiment_classifier = nn.Linear(self.bert.config.hidden_size, num_labels_sentiment)
        self.adv_dis_classifier = nn.Linear(self.bert.config.hidden_size, num_labels_adv_dis)
        
    def forward(self, input_ids, attention_mask=None, task=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        
        if task == 'sentiment':
            logits = self.sentiment_classifier(pooled_output)
        elif task == 'adv_dis':
            logits = self.adv_dis_classifier(pooled_output)
        
        return logits

# Inisialisasi model
model_name = 'bert-base-multilingual-cased'
num_labels_sentiment = 2  # Positive, Negative, Neutral
num_labels_adv_dis = 2  # Advantages, Disadvantages

model = MultiTaskBERT(model_name, num_labels_sentiment, num_labels_adv_dis)

In [12]:
# Hyperparameters
batch_size = 16
epochs = 3
learning_rate = 2e-5

# Optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)

# Dataset
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

# Training loop
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in train_dataloader:
        b_input_ids, b_attention_mask, b_labels = batch
        optimizer.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_attention_mask, task='sentiment')
        loss = nn.CrossEntropyLoss()(outputs, b_labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss}')

# Evaluation function
def evaluate(model, dataloader):
    model.eval()
    total_accuracy = 0
    total_steps = 0
    
    for batch in dataloader:
        b_input_ids, b_attention_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_attention_mask, task='sentiment')
        
        logits = outputs
        predictions = torch.argmax(logits, dim=1)
        accuracy = (predictions == b_labels).float().mean()
        total_accuracy += accuracy.item()
        total_steps += 1
    
    return total_accuracy / total_steps

# Evaluasi model pada data validasi
val_accuracy = evaluate(model, val_dataloader)
print(f'Validation Accuracy: {val_accuracy}')

Epoch 1/3, Training Loss: 0.2830592979229742
Epoch 2/3, Training Loss: 0.10371719092807707
Epoch 3/3, Training Loss: 0.06216702035932318
Validation Accuracy: 0.9650735294117647
