In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# The `kagglehub` name imported here is also what the competition download
# cell further down calls, so this cell has to run first.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the 'final-datavidia-10-0' competition data. The return value is kept
# in `final_datavidia_10_0_path`, but the data-loading cell below reads from a
# hard-coded /kaggle/input path instead.
# NOTE(review): outside Kaggle those two locations may differ - confirm.
final_datavidia_10_0_path = kagglehub.competition_download('final-datavidia-10-0')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found,
# so the exact file locations can be copied into the loading cell.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/competitions/final-datavidia-10-0/sample_submission.csv
/kaggle/input/competitions/final-datavidia-10-0/yelp_academic_dataset_checkin.json
/kaggle/input/competitions/final-datavidia-10-0/yelp_academic_dataset_business.json
/kaggle/input/competitions/final-datavidia-10-0/yelp_academic_dataset_tip.json
/kaggle/input/competitions/final-datavidia-10-0/yelp_academic_dataset_user.json
/kaggle/input/competitions/final-datavidia-10-0/train.csv
/kaggle/input/competitions/final-datavidia-10-0/test.csv


In [None]:
# On Kaggle, enable internet access in Settings first (the install below
# needs to download packages).
!pip install transformers datasets torch -q

import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import cohen_kappa_score
import warnings
import random
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Set device: use the GPU when CUDA is available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using: {device}")

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU plus all CUDA devices), then switches cuDNN to deterministic mode
    with auto-tuning (benchmark) disabled.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic, cudnn.benchmark = True, False


set_seed(42)

Using: cuda


In [None]:
BASE_PATH = '/kaggle/input/competitions/final-datavidia-10-0/'  # adjust this path to where the competition files live

# Read the labelled training reviews and the unlabelled test reviews.
train_df = pd.read_csv(BASE_PATH + 'train.csv')
test_df = pd.read_csv(BASE_PATH + 'test.csv')

print(f"Train: {train_df.shape}")
print(f"Test: {test_df.shape}")
# Class balance of the target: number of training rows per star value.
print(train_df['stars'].value_counts().sort_index())

Train: (5592155, 9)
Test: (1398125, 8)
stars
1.0     855429
2.0     435751
3.0     553595
4.0    1162091
5.0    2585289
Name: count, dtype: int64


In [None]:
def add_labels(row):
    """Return one review's text followed by its vote counts.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``'text'``, ``'useful'``, ``'funny'`` and ``'cool'``.

    Returns:
        The text, a blank line, then
        ``(useful = <useful>, funny = <funny>, cool = <cool>)``.
    """
    return f"{row['text']}\n\n(useful = {row['useful']}, funny = {row['funny']}, cool = {row['cool']})"


def _add_labels_column(df):
    """Build the ``add_labels`` string for every row of ``df`` in one pass.

    ``df.apply(add_labels, axis=1)`` constructs a Series object per row,
    which is very slow on frames with millions of rows. Iterating the four
    needed columns in lock-step produces the same strings without that
    per-row overhead.

    Args:
        df: DataFrame with ``'text'``, ``'useful'``, ``'funny'`` and
            ``'cool'`` columns.

    Returns:
        A list of strings, one per row, in row order.
    """
    columns = (df['text'], df['useful'], df['funny'], df['cool'])
    return [
        f"{text}\n\n(useful = {useful}, funny = {funny}, cool = {cool})"
        for text, useful, funny, cool in zip(*columns)
    ]


# Lists are assigned positionally, so row order is preserved regardless of
# the frame's index.
train_df['text_with_labels'] = _add_labels_column(train_df)
test_df['text_with_labels'] = _add_labels_column(test_df)

In [None]:
# ===== OPTION A: ZERO-SHOT (fastest, good baseline) =====
from transformers import pipeline

# Load a model that has already been trained on 1-5 star reviews
classifier = pipeline(
    "text-classification",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    # device=0 selects the first GPU when CUDA is available; -1 selects the CPU
    device=0 if torch.cuda.is_available() else -1
)

def predict_stars_zeroshot(texts, batch_size=32, clf=None, progress_every=1000):
    """Predict star ratings with a ready-made text-classification pipeline.

    Args:
        texts: Sequence of review texts (anything sliceable; items are
            converted with ``str``).
        batch_size: Number of texts sent to the classifier per call.
        clf: Callable used as the classifier. It is called as
            ``clf(list_of_str, truncation=True, max_length=512)`` and must
            return one dict per text whose ``'label'`` starts with the star
            number (e.g. ``"4 stars"``). Defaults to the module-level
            ``classifier`` pipeline.
        progress_every: Print a progress line roughly every this many texts.

    Returns:
        A list of ``int`` star predictions, one per input text, in order.
    """
    model_fn = classifier if clf is None else clf
    total = len(texts)
    predictions = []
    next_report = 0

    for start in range(0, total, batch_size):
        # Do NOT slice the string here: the limit is 512 *tokens*, and the
        # tokenizer enforces it through truncation=True / max_length below.
        # Cutting to 512 characters would discard most of a long review.
        batch = [str(t) for t in texts[start:start + batch_size]]

        results = model_fn(batch, truncation=True, max_length=512)

        # Label format: "1 star", "2 stars", etc. - keep the leading number.
        predictions.extend(int(r['label'].split()[0]) for r in results)

        # `start` advances in steps of batch_size, so an exact
        # `start % progress_every == 0` test would almost never fire;
        # report whenever the next threshold has been reached instead.
        if start >= next_report:
            print(f"Progress: {start}/{total}")
            next_report = start + progress_every

    return predictions

# Try it on a small sample first (the first 100 test rows)
sample_texts = test_df['text_with_labels'].fillna('').head(100).tolist()
sample_preds = predict_stars_zeroshot(sample_texts)
print("Sample predictions:", pd.Series(sample_preds).value_counts())

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]



special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Progress: 0/100
Sample predictions: 5    38
4    24
1    16
2    13
3     9
Name: count, dtype: int64


In [None]:
# ===== OPTION B: FINE-TUNING (higher accuracy) =====
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

MODEL_NAME = "distilbert-base-uncased"  # faster than BERT
# Or: "nlptown/bert-base-multilingual-uncased-sentiment" for a better starting point

# Module-level tokenizer; YelpDataset below reads this name when encoding text.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class YelpDataset(Dataset):
    """Torch dataset that tokenizes review texts on access.

    Args:
        texts: Indexable sequence of review texts.
        labels: Optional indexable sequence of star ratings (1-5). When given,
            each item also carries a 0-indexed ``'labels'`` tensor.
        max_len: Token length every example is padded/truncated to.
        tokenizer: Optional tokenizer callable. When omitted, the
            module-level ``tokenizer`` is looked up at access time, which
            keeps existing ``YelpDataset(texts, labels)`` calls working.
    """

    def __init__(self, texts, labels=None, max_len=256, tokenizer=None):
        self.texts = texts
        self.labels = labels
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded example at ``idx``.

        Returns:
            Dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors of
            length ``max_len`` and, when labels were supplied, a scalar
            ``'labels'`` tensor holding ``label - 1``.
        """
        # Character cap applied before tokenization to bound tokenizer work;
        # the token-level limit is still enforced by truncation below.
        text = str(self.texts[idx])[:1000]

        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        encoding = encode(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # squeeze(0) drops only the leading batch dimension; a bare squeeze()
        # would also collapse the sequence dimension if it had size 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }

        if self.labels is not None:
            # Stars are 1-5; the classification head expects classes 0-4.
            item['labels'] = torch.tensor(self.labels[idx] - 1, dtype=torch.long)

        return item

# Sample training (use a subset first for a trial run)
SAMPLE_SIZE = 50000  # Start with 50k; increase if time allows

# The validation cell later excludes `train_sample.index`, so this name must
# stay available after training.
train_sample = train_df.sample(n=SAMPLE_SIZE, random_state=42)
texts_train = train_sample['text_with_labels'].fillna('').tolist()
labels_train = train_sample['stars'].astype(int).tolist()

train_dataset = YelpDataset(texts_train, labels_train)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=5  # 5 classes (1-5 stars)
)
model.to(device)

# Training
optimizer = AdamW(model.parameters(), lr=2e-5)

model.train()
for epoch in range(2):  # 2 epochs are enough to start with
    total_loss = 0
    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing `labels` makes the model return the loss on `outputs.loss`.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Log the current batch loss every 100 batches.
        if batch_idx % 100 == 0:
            print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}")

    print(f"Epoch {epoch+1} avg loss: {total_loss/len(train_loader):.4f}")

print("Training selesai!")

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 
pre_classifier.bias     | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch 1, Batch 0, Loss: 1.5953
Epoch 1, Batch 100, Loss: 0.9183
Epoch 1, Batch 200, Loss: 0.8973
Epoch 1, Batch 300, Loss: 1.0737
Epoch 1, Batch 400, Loss: 0.7437
Epoch 1, Batch 500, Loss: 0.5055
Epoch 1, Batch 600, Loss: 0.6333
Epoch 1, Batch 700, Loss: 0.8285
Epoch 1, Batch 800, Loss: 0.5183
Epoch 1, Batch 900, Loss: 0.5615
Epoch 1, Batch 1000, Loss: 0.6212
Epoch 1, Batch 1100, Loss: 0.5348
Epoch 1, Batch 1200, Loss: 1.0273
Epoch 1, Batch 1300, Loss: 0.5510
Epoch 1, Batch 1400, Loss: 0.7245
Epoch 1, Batch 1500, Loss: 0.7821
Epoch 1, Batch 1600, Loss: 0.6020
Epoch 1, Batch 1700, Loss: 0.4786
Epoch 1, Batch 1800, Loss: 0.7643
Epoch 1, Batch 1900, Loss: 0.5688
Epoch 1, Batch 2000, Loss: 0.6385
Epoch 1, Batch 2100, Loss: 0.6722
Epoch 1, Batch 2200, Loss: 0.8248
Epoch 1, Batch 2300, Loss: 0.5598
Epoch 1, Batch 2400, Loss: 1.0153
Epoch 1, Batch 2500, Loss: 0.8021
Epoch 1, Batch 2600, Loss: 0.5680
Epoch 1, Batch 2700, Loss: 0.6009
Epoch 1, Batch 2800, Loss: 0.5620
Epoch 1, Batch 2900, Loss:

In [None]:
# ============================================================
# VALIDATION - check QWK before submitting
# ============================================================
from sklearn.metrics import cohen_kappa_score

# Build a validation set from training rows that were NOT sampled for
# fine-tuning, so the score is measured on unseen reviews.
val_size = 5000
val_sample = train_df[~train_df.index.isin(train_sample.index)].sample(n=val_size, random_state=42)
# Use 'text_with_labels', the same input format the model was fine-tuned on.
# Validating on the raw 'text' column would feed the model inputs without the
# vote-count suffix it saw during training.
val_texts = val_sample['text_with_labels'].fillna('').tolist()
val_labels = val_sample['stars'].astype(int).tolist()

def evaluate_model(model, texts, labels, batch_size=32):
    """Score ``model`` on labelled texts with quadratic weighted kappa.

    Prints the QWK score followed by the predicted and the actual star
    distributions.

    Args:
        model: Sequence-classification model whose output exposes ``logits``.
        texts: List of review texts.
        labels: List of true star ratings (1-5), aligned with ``texts``.
        batch_size: Number of examples per forward pass.

    Returns:
        Tuple ``(qwk, predictions)`` where ``predictions`` holds one star
        value (1-5) per input text.
    """
    eval_loader = DataLoader(YelpDataset(texts, labels), batch_size=batch_size)
    predicted = []

    model.eval()
    with torch.no_grad():
        for encoded in eval_loader:
            logits = model(
                encoded['input_ids'].to(device),
                attention_mask=encoded['attention_mask'].to(device),
            ).logits
            # argmax yields class indices 0-4; shift back to stars 1-5
            stars = torch.argmax(logits, dim=1) + 1
            predicted.extend(stars.cpu().numpy())

    qwk = cohen_kappa_score(labels, predicted, weights='quadratic')
    print(f"QWK Score: {qwk:.4f}")

    # Show the predicted distribution, then the actual one
    report = (
        ("\nDistribusi Prediksi:", predicted),
        ("\nDistribusi Aktual:", labels),
    )
    for heading, values in report:
        print(heading)
        print(pd.Series(values).value_counts().sort_index())

    return qwk, predicted

qwk_score, val_preds = evaluate_model(model, val_texts, val_labels)

QWK Score: 0.9002

Distribusi Prediksi:
1     832
2     367
3     329
4     851
5    2621
Name: count, dtype: int64

Distribusi Aktual:
1     737
2     406
3     475
4    1041
5    2341
Name: count, dtype: int64


In [None]:
# ============================================================
# SAVE THE MODEL (important! make sure it does not get lost)
# ============================================================
# Model and tokenizer are written to the same directory under /kaggle/working.
model.save_pretrained('/kaggle/working/distilbert-yelp')
tokenizer.save_pretrained('/kaggle/working/distilbert-yelp')
print("Model tersimpan!")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model tersimpan!


In [None]:
# ============================================================
# PREDICT THE TEST SET & BUILD THE SUBMISSION
# ============================================================
def predict_test(model, test_texts, batch_size=32):
    """Predict a star rating (1-5) for every unlabelled text.

    Args:
        model: Sequence-classification model whose output exposes ``logits``.
        test_texts: List of review texts.
        batch_size: Number of examples per forward pass.

    Returns:
        List with one predicted star value per input text, in input order.
    """
    # No labels are passed, so items carry only input_ids / attention_mask
    unlabeled_loader = DataLoader(YelpDataset(test_texts), batch_size=batch_size)
    star_preds = []

    model.eval()
    with torch.no_grad():
        for step, encoded in enumerate(unlabeled_loader):
            logits = model(
                encoded['input_ids'].to(device),
                attention_mask=encoded['attention_mask'].to(device),
            ).logits
            # Class indices 0-4 -> stars 1-5
            star_preds.extend((torch.argmax(logits, dim=1) + 1).cpu().numpy())

            # Report progress every 500 batches
            if step % 500 == 0:
                print(f"Progress: {step*batch_size}/{len(test_texts)}")

    return star_preds

# Feed the model 'text_with_labels', the same input format it was fine-tuned
# on; the raw 'text' column lacks the vote-count suffix seen during training.
test_texts = test_df['text_with_labels'].fillna('').tolist()
test_preds = predict_test(model, test_texts)

Progress: 0/1398125
Progress: 16000/1398125
Progress: 32000/1398125
Progress: 48000/1398125
Progress: 64000/1398125
Progress: 80000/1398125
Progress: 96000/1398125


In [None]:
# Pair every test review_id with its predicted star rating.
submission = pd.DataFrame({
    'review_id': test_df['review_id'],
    'stars': test_preds
})
# index=False keeps the DataFrame index out of the CSV file.
submission.to_csv('/kaggle/working/submission.csv', index=False)
print("Submission siap!")
# Sanity check: number of predictions per star value, ordered by star.
print(submission['stars'].value_counts().sort_index())