In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from gensim.models import Word2Vec
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# 2. Load data & label mapping
df = pd.read_csv('Tokopedia_preprocessed.csv')
sent_map   = {'Netral':0, 'Positif':1, 'Negatif':2}
aspect_map = {'Pelayanan':0, 'Kualitas':1, 'Harga':2}
# Series.map() yields NaN for any value that is missing from the mapping.
# Fail fast here with a readable message instead of letting NaN reach the
# stratified split and the integer label tensors further down.
for src_col, dst_col, label_map in (('Sentimen', 'sent_lbl', sent_map),
                                    ('Aspek', 'aspect_lbl', aspect_map)):
    mapped = df[src_col].map(label_map)
    if mapped.isna().any():
        unknown = sorted(df.loc[mapped.isna(), src_col].astype(str).unique())
        raise ValueError(
            f"Column '{src_col}' contains values with no label mapping: {unknown}"
        )
    df[dst_col] = mapped.astype('int64')

# 3. Tokenize for Word2Vec (whitespace split; astype(str) guards non-string cells)
sentences = [s.split() for s in df['clean_content'].astype(str)]

# 4. Train Word2Vec on our own corpus
# NOTE(review): the embeddings are fitted on every row of df, including the
# rows that end up in test_df below. If a strictly held-out evaluation is
# required, fit Word2Vec on the training split only -- confirm intent.
w2v = Word2Vec(sentences,
               vector_size=100,
               window=5,
               min_count=1,
               workers=4,
               epochs=10)

# 5. Build the vocabulary and the embedding matrix
# Indices 0 and 1 are reserved for padding and unknown tokens; every Word2Vec
# word gets the next free index, in the order of w2v.wv.index_to_key.
vocab = {'<pad>':0, '<unk>':1}
for word in w2v.wv.index_to_key:
    if word not in vocab:
        vocab[word] = len(vocab)
vocab_size = len(vocab)
emb_dim    = w2v.vector_size

# Rows default to zero, so the <pad> row stays all-zero unless the corpus
# itself contains the literal token '<pad>'.
embedding_matrix = np.zeros((vocab_size, emb_dim), dtype=np.float32)
# idx=1 (<unk>): random vector. A dedicated seeded generator is used so the
# row is identical on every run and the global NumPy RNG state is untouched.
embedding_matrix[1] = np.random.default_rng(42).normal(size=(emb_dim,))
for w, i in vocab.items():
    if w in w2v.wv:
        embedding_matrix[i] = w2v.wv[w]

emb_tensor = torch.tensor(embedding_matrix)

# 6. Split train/test 80:20, stratified on the sentiment label
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['sent_lbl']
)

# 7. Dataset class
MAX_LEN = 128
class W2VDataset(Dataset):
    """Fixed-length token-id sequences with sentiment and aspect labels.

    Args:
        df_subset: DataFrame with the columns ``clean_content`` (text that is
            tokenized by whitespace), ``sent_lbl`` and ``aspect_lbl``
            (integer class ids).
        token_to_idx: Mapping from token to embedding index. Defaults to the
            module-level ``vocab``.
        max_len: Length every sequence is truncated or padded to. Defaults to
            the module-level ``MAX_LEN``.

    Raises:
        ValueError: If either label column contains a missing value.

    Each item is a tuple ``(token_ids, sentiment_label, aspect_label)`` of
    ``torch.long`` tensors; ``token_ids`` has shape ``(max_len,)``.
    """

    PAD_IDX = 0  # index used to right-pad short sequences
    UNK_IDX = 1  # index used for tokens missing from the mapping

    def __init__(self, df_subset, token_to_idx=None, max_len=None):
        # A NaN label cannot be represented as an integer class id, so
        # reject it here with a clear message.
        if df_subset[['sent_lbl', 'aspect_lbl']].isna().any().any():
            raise ValueError("sent_lbl / aspect_lbl contain missing values")
        # Missing or non-string cells would break .split() in __getitem__;
        # treat a missing text as empty (it becomes an all-padding sequence).
        self.texts = ['' if pd.isna(t) else str(t)
                      for t in df_subset['clean_content']]
        self.sent  = df_subset['sent_lbl'].astype('int64').tolist()
        self.asp   = df_subset['aspect_lbl'].astype('int64').tolist()
        self.token_to_idx = vocab if token_to_idx is None else token_to_idx
        self.max_len = MAX_LEN if max_len is None else max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        # Truncate first so only the tokens that are kept get looked up.
        toks = self.texts[i].split()[:self.max_len]
        idxs = [self.token_to_idx.get(t, self.UNK_IDX) for t in toks]
        idxs += [self.PAD_IDX] * (self.max_len - len(idxs))
        return (torch.tensor(idxs, dtype=torch.long),
                torch.tensor(self.sent[i], dtype=torch.long),
                torch.tensor(self.asp[i], dtype=torch.long))

# Wrap both splits as datasets. Only the training loader shuffles; the test
# loader keeps the DataFrame order so predictions line up with the labels.
train_ds, test_ds = W2VDataset(train_df), W2VDataset(test_df)
train_loader = DataLoader(dataset=train_ds, batch_size=32, shuffle=True)
test_loader  = DataLoader(dataset=test_ds, batch_size=32, shuffle=False)

# 8. CNN-only model with Word2Vec embeddings
class W2VCnnMulti(nn.Module):
    """Single-convolution text CNN with two classification heads.

    Pipeline: embedding -> Conv1d -> ReLU -> global max pool -> dropout ->
    one linear head for sentiment and one for aspect.

    Args:
        embeddings: Float tensor of shape ``(vocab_size, emb_dim)`` used to
            initialise the embedding layer. Defaults to the module-level
            ``emb_tensor``.
        num_filters: Number of convolution output channels.
        kernel_size: Convolution window width, in tokens.
        dropout: Dropout probability applied to the pooled features.
        n_sent: Number of sentiment classes.
        n_asp: Number of aspect classes.
    """

    def __init__(self, embeddings=None, num_filters=300, kernel_size=5,
                 dropout=0.5, n_sent=3, n_asp=3):
        super().__init__()
        if embeddings is None:
            embeddings = emb_tensor
        # from_pretrained() wraps the tensor it is given as the layer weight
        # without copying. With freeze=False, training would then modify the
        # caller's tensor in place, and a second model built from it would
        # start from already-trained embeddings. Clone to keep it pristine.
        self.embed = nn.Embedding.from_pretrained(embeddings.clone(),
                                                  freeze=False,
                                                  padding_idx=0)
        self.conv  = nn.Conv1d(in_channels=self.embed.embedding_dim,
                               out_channels=num_filters,
                               kernel_size=kernel_size)
        self.pool  = nn.AdaptiveMaxPool1d(1)
        self.drop  = nn.Dropout(dropout)
        self.fc_sent = nn.Linear(num_filters, n_sent)
        self.fc_asp  = nn.Linear(num_filters, n_asp)

    def forward(self, x):
        """Return ``(sentiment_logits, aspect_logits)`` for token ids ``x``.

        ``x`` has shape ``(B, L)``; both outputs have one row per sample.
        """
        e = self.embed(x)             # (B, L, emb_dim)
        e = e.permute(0, 2, 1)        # (B, emb_dim, L): Conv1d wants channels first
        c = torch.relu(self.conv(e))  # (B, num_filters, L - kernel_size + 1)
        p = self.pool(c).squeeze(-1)  # (B, num_filters)
        d = self.drop(p)
        return self.fc_sent(d), self.fc_asp(d)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model_w2v = W2VCnnMulti()
model_w2v = model_w2v.to(device)
# One cross-entropy criterion is shared by both classification heads.
criterion = nn.CrossEntropyLoss()
# NOTE(review): weight_decay here is applied to every parameter, including
# the fine-tuned embedding table -- confirm that 0.01 is intended.
optimizer = optim.Adam(params=model_w2v.parameters(),
                       lr=1e-3,
                       weight_decay=0.01)

# 9. Training (10 epochs)
for epoch in range(1, 10 + 1):
    model_w2v.train()
    tot_loss = corr_s = corr_a = 0
    for ids, sl, al in train_loader:
        # Move the whole batch to the model's device.
        ids, sl, al = (t.to(device) for t in (ids, sl, al))
        optimizer.zero_grad()
        out_s, out_a = model_w2v(ids)
        # Joint objective: unweighted sum of the two heads' losses.
        loss = criterion(out_s, sl) + criterion(out_a, al)
        loss.backward()
        optimizer.step()
        # The criterion averages over the batch; re-weight by the batch size
        # so that dividing by the dataset size gives a per-sample mean.
        tot_loss += loss.item() * len(ids)
        corr_s += out_s.argmax(dim=1).eq(sl).sum().item()
        corr_a += out_a.argmax(dim=1).eq(al).sum().item()
    n = len(train_ds)
    print(f"[W2V-CNN] Epoch {epoch} — Loss: {tot_loss/n:.4f} "
          f"Sent Acc: {corr_s/n:.4f} Asp Acc: {corr_a/n:.4f}")

# 10. Evaluation on the 20% test set
model_w2v.eval()  # disables dropout for inference
# Predictions (p) and ground truth (t) for sentiment (s) and aspect (a).
all_sp,all_st,all_ap,all_at = [],[],[],[]
with torch.no_grad():
    for ids, sl, al in test_loader:
        # Only the inputs go to the device; the labels are just collected.
        ids = ids.to(device)
        ps, pa = model_w2v(ids)
        all_sp += ps.argmax(1).cpu().tolist()
        all_ap += pa.argmax(1).cpu().tolist()
        all_st += sl.tolist()
        all_at += al.tolist()

# labels= pins the report to the three class ids. Without it the label set is
# inferred from the data, and classification_report raises when a class is
# absent from both truth and predictions, because target_names no longer
# matches. The split is stratified on sentiment only, so the aspect task in
# particular is not guaranteed to contain every class.
print("=== Word2Vec-CNN Sentimen ===")
print(classification_report(all_st, all_sp,
                            labels=[0, 1, 2],
                            target_names=['Netral','Positif','Negatif'],
                            digits=4))
print(f"Sentiment Accuracy: {accuracy_score(all_st, all_sp)*100:.2f}%\n")

print("=== Word2Vec-CNN Aspek ===")
print(classification_report(all_at, all_ap,
                            labels=[0, 1, 2],
                            target_names=['Pelayanan','Kualitas','Harga'],
                            digits=4))
print(f"Aspect Accuracy   : {accuracy_score(all_at, all_ap)*100:.2f}%")


[W2V-CNN] Epoch 1 — Loss: 1.4960 Sent Acc: 0.5727 Asp Acc: 0.7386
[W2V-CNN] Epoch 2 — Loss: 1.1766 Sent Acc: 0.6660 Asp Acc: 0.8317
[W2V-CNN] Epoch 3 — Loss: 1.0754 Sent Acc: 0.6969 Asp Acc: 0.8499
[W2V-CNN] Epoch 4 — Loss: 1.0183 Sent Acc: 0.7170 Asp Acc: 0.8576
[W2V-CNN] Epoch 5 — Loss: 0.9797 Sent Acc: 0.7320 Asp Acc: 0.8686
[W2V-CNN] Epoch 6 — Loss: 0.9549 Sent Acc: 0.7440 Asp Acc: 0.8690
[W2V-CNN] Epoch 7 — Loss: 0.9305 Sent Acc: 0.7532 Asp Acc: 0.8756
[W2V-CNN] Epoch 8 — Loss: 0.8985 Sent Acc: 0.7758 Asp Acc: 0.8781
[W2V-CNN] Epoch 9 — Loss: 0.8471 Sent Acc: 0.7983 Asp Acc: 0.8832
[W2V-CNN] Epoch 10 — Loss: 0.8101 Sent Acc: 0.8159 Asp Acc: 0.8909
=== Word2Vec-CNN Sentimen ===
              precision    recall  f1-score   support

      Netral     0.7143    0.9091    0.8000       561
     Positif     0.9876    0.8318    0.9030       767
     Negatif     0.8219    0.7827    0.8018       672

    accuracy                         0.8370      2000
   macro avg     0.8413    0.8412    