In [5]:
import pandas as pd
import random

random.seed(42)

# ==============================
# KOMPONEN BAHASA
# ==============================

subjek = [
    "Aplikasi ini", "Aplikasi mobile ini", "Sistem aplikasi",
    "Platform digital ini", "Layanan aplikasi", "Aplikasi yang digunakan"
]

objek = [
    "pengguna", "pengguna baru", "pengguna lama",
    "saya", "tim kerja", "pengguna aplikasi"
]

konteks = [
    "dalam penggunaan sehari-hari",
    "pada versi terbaru",
    "saat digunakan secara rutin",
    "di perangkat Android",
    "di perangkat iOS",
    "secara umum",
    "dalam kondisi normal",
    ""
]

positif = [
    "sangat membantu",
    "berjalan dengan sangat baik",
    "memberikan pengalaman yang memuaskan",
    "memudahkan pekerjaan",
    "memiliki performa yang stabil",
    "sangat berguna",
    "mudah digunakan",
    "responsif saat dijalankan",
    "meningkatkan produktivitas",
    "berfungsi dengan optimal"
]

negatif = [
    "sering mengalami error",
    "berjalan sangat lambat",
    "sulit untuk digunakan",
    "sering mengalami gangguan",
    "tidak berjalan dengan baik",
    "mengecewakan pengguna",
    "tidak responsif",
    "memiliki banyak bug",
    "menghambat pekerjaan",
    "bermasalah saat dijalankan"
]

netral = [
    "dapat digunakan",
    "tersedia untuk pengguna",
    "memiliki beberapa fitur",
    "menyediakan menu utama",
    "digunakan untuk kebutuhan tertentu",
    "berjalan sesuai fungsi dasar",
    "tersedia di berbagai platform",
    "memiliki tampilan standar",
    "digunakan oleh banyak pengguna",
    "menyediakan layanan digital"
]

# ==============================
# FUNGSI PEMBENTUK KALIMAT
# ==============================

def buat_kalimat(predikat_list):
    return f"{random.choice(subjek)} {random.choice(predikat_list)} bagi {random.choice(objek)} {random.choice(konteks)}".strip()

# ==============================
# DATA LATIH (300)
# ==============================

train_data = []

for _ in range(100):
    train_data.append([buat_kalimat(positif), "Positif"])
    train_data.append([buat_kalimat(negatif), "Negatif"])
    train_data.append([buat_kalimat(netral),  "Netral"])

df_train = pd.DataFrame(train_data, columns=["kalimat", "label"])
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)

df_train.to_csv("data_latih_300.csv", index=False)

# ==============================
# DATA UJI (400)
# ==============================

test_sentences = []
semua_predikat = positif + negatif + netral

for _ in range(400):
    kalimat = f"{random.choice(subjek)} {random.choice(semua_predikat)} bagi {random.choice(objek)} {random.choice(konteks)}"
    test_sentences.append(kalimat.strip())

df_test = pd.DataFrame(test_sentences, columns=["kalimat"])
df_test.to_csv("data_uji_400.csv", index=False)

# ==============================
# CEK HASIL
# ==============================

print("âœ… Dataset berhasil dibuat\n")
print("Data Latih:")
print(df_train['label'].value_counts(), "\n")
print("Contoh Data Latih:")
print(df_train.head(5), "\n")

print("Data Uji:")
print("Jumlah:", len(df_test))
print("Contoh Data Uji:")
print(df_test.head(5))


âœ… Dataset berhasil dibuat

Data Latih:
label
Netral     100
Positif    100
Negatif    100
Name: count, dtype: int64 

Contoh Data Latih:
                                             kalimat    label
0  Sistem aplikasi memiliki beberapa fitur bagi p...   Netral
1  Aplikasi ini memiliki tampilan standar bagi pe...   Netral
2  Layanan aplikasi tersedia di berbagai platform...   Netral
3  Sistem aplikasi berjalan dengan sangat baik ba...  Positif
4  Aplikasi ini tersedia untuk pengguna bagi peng...   Netral 

Data Uji:
Jumlah: 400
Contoh Data Uji:
                                             kalimat
0  Aplikasi yang digunakan memiliki tampilan stan...
1  Platform digital ini memberikan pengalaman yan...
2  Aplikasi yang digunakan menyediakan menu utama...
3  Layanan aplikasi berjalan sangat lambat bagi p...
4  Aplikasi ini meningkatkan produktivitas bagi t...


In [6]:
# ==========================================
# TRANSFORMER SENTIMENT CLASSIFICATION
# FINAL PIPELINE - COLAB READY
# ==========================================

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import re
import os

# ==========================================
# 1. LOAD DATA
# ==========================================

df_train = pd.read_csv("data_latih_300.csv")
df_test  = pd.read_csv("data_uji_400.csv")

# ==========================================
# 2. PREPROCESSING
# ==========================================

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

df_train['cleaned'] = df_train['kalimat'].apply(clean_text)
df_test['cleaned']  = df_test['kalimat'].apply(clean_text)

# ==========================================
# 3. TOKENIZATION (VOCAB FROM TRAINING DATA)
# ==========================================

all_words = ' '.join(df_train['cleaned']).split()
vocab = sorted(set(all_words))
word_to_idx = {w: i + 1 for i, w in enumerate(vocab)}

MAX_LEN = 15

def tokenize(text):
    tokens = [word_to_idx.get(w, 0) for w in text.split()]
    return (tokens + [0] * MAX_LEN)[:MAX_LEN]

X_train = np.array(df_train['cleaned'].apply(tokenize).tolist())
X_test  = np.array(df_test['cleaned'].apply(tokenize).tolist())

# ==========================================
# 4. LABEL ENCODING
# ==========================================

le = LabelEncoder()
y_train = le.fit_transform(df_train['label'])

# ==========================================
# 5. DATASET & DATALOADER
# ==========================================

class TextDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.LongTensor(X)
        self.y = torch.LongTensor(y) if y is not None else None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (self.X[idx], self.y[idx]) if self.y is not None else self.X[idx]

train_loader = DataLoader(TextDataset(X_train, y_train), batch_size=32, shuffle=True)

# ==========================================
# 6. TRANSFORMER MODEL
# ==========================================

class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Parameter(torch.zeros(1, MAX_LEN, embed_dim))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x) + self.pos_embed
        x = self.encoder(x)
        x = x.mean(dim=1)
        return self.fc(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TransformerClassifier(
    vocab_size=len(word_to_idx) + 1,
    embed_dim=64,
    num_heads=8,
    num_layers=2,
    num_classes=len(le.classes_)
).to(device)

# ==========================================
# 7. TRAINING
# ==========================================

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

EPOCHS = 30
model.train()

for epoch in range(EPOCHS):
    total_loss = 0
    for Xb, yb in train_loader:
        Xb, yb = Xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(Xb), yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {total_loss/len(train_loader):.4f}")

# ==========================================
# 8. INFERENCE (400 TEST SENTENCES)
# ==========================================

model.eval()
with torch.no_grad():
    preds = torch.argmax(
        model(torch.LongTensor(X_test).to(device)),
        dim=1
    )

df_test['hasil_label_transformer'] = le.inverse_transform(preds.cpu().numpy())

# ==========================================
# 9. SAVE OUTPUT
# ==========================================

df_test.to_csv("hasil_pelabelan_400_kalimat.csv", index=False)

print("\nâœ… PROSES SELESAI")
print("ðŸ“„ File dihasilkan:")
print("- hasil_pelabelan_400_kalimat.csv")


Epoch 1/30 | Loss: 1.3930
Epoch 2/30 | Loss: 1.1298
Epoch 3/30 | Loss: 1.0842
Epoch 4/30 | Loss: 1.0657
Epoch 5/30 | Loss: 0.9943
Epoch 6/30 | Loss: 0.7450
Epoch 7/30 | Loss: 0.4195
Epoch 8/30 | Loss: 0.2376
Epoch 9/30 | Loss: 0.1226
Epoch 10/30 | Loss: 0.0572
Epoch 11/30 | Loss: 0.0226
Epoch 12/30 | Loss: 0.0103
Epoch 13/30 | Loss: 0.0071
Epoch 14/30 | Loss: 0.0059
Epoch 15/30 | Loss: 0.0054
Epoch 16/30 | Loss: 0.0049
Epoch 17/30 | Loss: 0.0047
Epoch 18/30 | Loss: 0.0042
Epoch 19/30 | Loss: 0.0040
Epoch 20/30 | Loss: 0.0037
Epoch 21/30 | Loss: 0.0035
Epoch 22/30 | Loss: 0.0034
Epoch 23/30 | Loss: 0.0033
Epoch 24/30 | Loss: 0.0031
Epoch 25/30 | Loss: 0.0030
Epoch 26/30 | Loss: 0.0027
Epoch 27/30 | Loss: 0.0026
Epoch 28/30 | Loss: 0.0025
Epoch 29/30 | Loss: 0.0024
Epoch 30/30 | Loss: 0.0023

âœ… PROSES SELESAI
ðŸ“„ File dihasilkan:
- hasil_pelabelan_400_kalimat.csv


In [7]:
from google.colab import files
files.download("hasil_pelabelan_400_kalimat.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
from google.colab import files
files.download('data_latih_300.csv')
files.download('data_uji_400.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>