In [1]:
import torch
print("PyTorch Version:", torch.__version__)
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

PyTorch Version: 2.6.0+cu124
CUDA Available: True
CUDA Version: 12.4
GPU Name: NVIDIA GeForce RTX 4060 Laptop GPU


{'Negatif', 'Positif', 'Netral'}


In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from sklearn.preprocessing import LabelEncoder

In [18]:
# Load dataset
train_df = pd.read_csv("./dataset/rev/cleaned-trainDataset-HgSekar.csv")
valid_df = pd.read_csv("./dataset/rev/cleaned-valDataset-chatGPT.csv")

In [20]:
print(set(valid_df['label']) - set(train_df['label']))  # Cek label di validasi yang tidak ada di train

set()


In [23]:
valid_df.head()

# Label Encoding (Negatif = 0, Netral = 1, Positif = 2)
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['label'])
valid_df['label'] = label_encoder.transform(valid_df['label'])

# Load tokenizer IndoBERT
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

# Dataset Class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Tokenisasi menggunakan IndoBERT
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Buat Dataset & DataLoader
train_dataset = SentimentDataset(train_df['comment'], train_df['label'], tokenizer)
valid_dataset = SentimentDataset(valid_df['comment'], valid_df['label'], tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False)


In [24]:
import torch.nn as nn
from transformers import AutoModel

class IndoBERT_BiLSTM(nn.Module):
    def __init__(self, bert_model="indobenchmark/indobert-base-p1", lstm_hidden=128, num_classes=3):
        super(IndoBERT_BiLSTM, self).__init__()
        
        # Load IndoBERT sebagai feature extractor
        self.bert = AutoModel.from_pretrained(bert_model)
        self.bert.requires_grad_(False)  # Freeze semua layer IndoBERT
        
        # BiLSTM Layer
        self.lstm = nn.LSTM(input_size=768, hidden_size=lstm_hidden, num_layers=2, 
                            batch_first=True, bidirectional=True, dropout=0.3)
        
        # Batch Normalization untuk stabilitas training
        self.batch_norm = nn.BatchNorm1d(lstm_hidden * 2)
        
        # Fully Connected Layer
        self.fc = nn.Linear(lstm_hidden * 2, num_classes)
        self.dropout = nn.Dropout(0.3)
    
    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        
        # Ambil vektor CLS sebagai representasi kalimat
        bert_embedding = bert_output.last_hidden_state[:, 0, :]
        
        # Masukkan ke BiLSTM
        lstm_out, _ = self.lstm(bert_embedding.unsqueeze(1))
        lstm_out = lstm_out[:, -1, :]  # Ambil output terakhir BiLSTM
        
        # Normalisasi + Dropout
        lstm_out = self.batch_norm(lstm_out)
        lstm_out = self.dropout(lstm_out)
        
        # Klasifikasi
        output = self.fc(lstm_out)
        return output

In [25]:
import torch.optim as optim
from sklearn.metrics import accuracy_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Inisialisasi model
model = IndoBERT_BiLSTM().to(device)

# Loss dan optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

# Training Loop dengan Early Stopping
epochs = 15
best_valid_loss = float('inf')
patience = 3  # Early stopping jika tidak ada perbaikan setelah 3 epoch
patience_counter = 0

for epoch in range(epochs):
    model.train()
    total_loss, total_acc = 0, 0
    
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_acc += (outputs.argmax(dim=1) == labels).sum().item()
    
    train_loss = total_loss / len(train_loader)
    train_acc = total_acc / len(train_df)
    
    # Evaluasi
    model.eval()
    valid_loss, valid_acc = 0, 0
    with torch.no_grad():
        for batch in valid_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            valid_loss += loss.item()
            valid_acc += (outputs.argmax(dim=1) == labels).sum().item()
    
    valid_loss /= len(valid_loader)
    valid_acc /= len(valid_df)
    
    print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f} | Valid Loss={valid_loss:.4f}, Valid Acc={valid_acc:.4f}")
    
    # Early Stopping
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "best_model.pth")
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break


Epoch 1: Train Loss=0.8556, Train Acc=0.6225 | Valid Loss=0.7428, Valid Acc=0.5948
Epoch 2: Train Loss=0.8075, Train Acc=0.6480 | Valid Loss=0.7722, Valid Acc=0.5260
Epoch 3: Train Loss=0.7910, Train Acc=0.6587 | Valid Loss=0.6385, Valid Acc=0.6629
Epoch 4: Train Loss=0.7841, Train Acc=0.6609 | Valid Loss=0.8138, Valid Acc=0.5091
Epoch 5: Train Loss=0.7793, Train Acc=0.6650 | Valid Loss=0.6678, Valid Acc=0.6050
Epoch 6: Train Loss=0.7746, Train Acc=0.6655 | Valid Loss=0.7713, Valid Acc=0.5234
Early stopping triggered.


In [26]:
model.load_state_dict(torch.load("best_model.pth"))
model.eval()

def predict(text, model, tokenizer):
    encoding = tokenizer(text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        output = model(input_ids, attention_mask)
    
    return label_encoder.inverse_transform([torch.argmax(output).item()])[0]

# Contoh prediksi
print(predict("Produk ini luar biasa!", model, tokenizer))


2


In [27]:
print(predict("Produk ini jelek!", model, tokenizer))

0


In [None]:
print(predict("Produk ini", model, tokenizer))

0
