In [12]:
# ========================
# 0. Install libraries
# ========================
!pip install --quiet numpy spacy thinc
!pip install --quiet torch torchvision torchaudio
!pip install --quiet transformers fugashi ipadic accelerate peft sentencepiece matplotlib seaborn tqdm
!pip install --quiet xgboost optuna ace_tools_open shap unidic-lite mecab-python3

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m588.8/588.8 kB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for unidic-lite (setup.py) ... [?25l[?25hdone


In [13]:
import torch
print("CUDA available:", torch.cuda.is_available())

CUDA available: True


In [14]:
import transformers
print(transformers.__version__)

4.52.4


In [15]:
!pip install certifi
!mkdir -p /usr/local/share/ca-certificates/
!cp /etc/ssl/certs/ca-certificates.crt /usr/local/share/ca-certificates/
!update-ca-certificates


Updating certificates in /etc/ssl/certs...
0 added, 0 removed; done.
Running hooks in /etc/ca-certificates/update.d...

done.
done.


In [None]:
# ========================
# 1. Imports & Setup
# ========================
from tqdm import tqdm
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from peft import get_peft_model, LoraConfig, TaskType
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, label_binarize
import optuna
import shap
import matplotlib.pyplot as plt
import seaborn as sns

try:
    from ace_tools_open import display_dataframe_to_user
except ImportError:
    def display_dataframe_to_user(*args, **kwargs):
        print("ace_tools not installed; displaying DataFrame head:")
        print(args[1].head())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ========================
# 2. Kansai-ben & Directness Detection
# ========================
kansaiben_keywords = ["〜やん", "〜やで", "〜せなあかん", "ちゃう", "ほんま", "めっちゃ", "〜せんと", "なんでやねん"]
def detect_kansaiben(text):
    return any(k in text for k in kansaiben_keywords)

def detect_directness(text):
    direct_phrases = ["最悪", "ありえない", "めっちゃ", "だめ", "良い", "良くない", "おすすめ", "絶対", "微妙"]
    return any(word in text for word in direct_phrases)

# ========================
# 3. Load & Prepare Data
# ========================
def load_jsts_json(url):
    df = pd.read_json(url, lines=True)
    df['text'] = df['sentence1'] + " " + df['sentence2']
    df['sentiment'] = df['label'].apply(lambda x: 0 if x < 2 else (1 if x <= 3 else 2))
    return df[['text', 'sentiment']]

df_train = load_jsts_json("https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/heads/main/datasets/jsts-v1.3/train-v1.3.json").sample(5000, random_state=42)
df_valid = load_jsts_json("https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/heads/main/datasets/jsts-v1.3/valid-v1.3.json").sample(500, random_state=42)
df_test = load_jsts_json("https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/heads/main/datasets/jsts-v1.3/test-v1.3.json").sample(300, random_state=42)

model_name = "cl-tohoku/bert-base-japanese-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_batch(texts):
    return tokenizer(list(texts), truncation=True, padding="max_length", max_length=128, return_tensors='pt')

class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.encodings = tokenize_batch(df['text'])
        self.labels = torch.tensor(df['sentiment'].values)
    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item
    def __len__(self):
        return len(self.labels)

train_ds = SimpleDataset(df_train)
valid_ds = SimpleDataset(df_valid)
test_ds = SimpleDataset(df_test)

# ========================
# 4. LoRA Model Init & Quick Finetune (for demonstration)
# ========================
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, r=8, lora_alpha=16, lora_dropout=0.1, bias="none")
model = get_peft_model(base_model, peft_config).to(device)

from torch.utils.data import DataLoader
from torch.optim import AdamW

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
optimizer = AdamW(model.parameters(), lr=2e-5)
model.train()
for epoch in range(1):
    loop = tqdm(train_loader, desc="Training")
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loop.set_postfix(loss=loss.item())

# ========================
# 5. Extract Transformer [CLS] Embeddings (All Sets)
# ========================
bert_encoder = AutoModel.from_pretrained(model_name).to(device)
bert_encoder.eval()

def extract_cls_embeddings(encoder, texts, tokenizer, device):
    embeddings = []
    with torch.no_grad():
        for text in tqdm(texts, desc="Extracting embeddings"):
            inputs = tokenizer(str(text), return_tensors="pt", truncation=True, padding="max_length", max_length=128).to(device)
            outputs = encoder(**{k: v for k, v in inputs.items()})
            cls_emb = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
            embeddings.append(cls_emb)
    return np.vstack(embeddings)

valid_embeddings = extract_cls_embeddings(bert_encoder, df_valid['text'], tokenizer, device)
test_embeddings = extract_cls_embeddings(bert_encoder, df_test['text'], tokenizer, device)

le = LabelEncoder()
y_valid = le.fit_transform(df_valid['sentiment'])
y_test = le.transform(df_test['sentiment'])

# ========================
# 6. Add Classical Features to Test Set
# ========================
df_test['length'] = df_test['text'].apply(len)
df_test['kansai_ben'] = df_test['text'].apply(detect_kansaiben).astype(int)
df_test['direct_tone'] = df_test['text'].apply(detect_directness).astype(int)
classic_feats_test = df_test[['length', 'kansai_ben', 'direct_tone']].values
combined_test_features = np.hstack([test_embeddings, classic_feats_test])

# Also add to valid set for tuning
df_valid['length'] = df_valid['text'].apply(len)
df_valid['kansai_ben'] = df_valid['text'].apply(detect_kansaiben).astype(int)
df_valid['direct_tone'] = df_valid['text'].apply(detect_directness).astype(int)
classic_feats_valid = df_valid[['length', 'kansai_ben', 'direct_tone']].values
combined_valid_features = np.hstack([valid_embeddings, classic_feats_valid])

# ========================
# 7. Optuna + K-Fold CV for XGBoost (validation only, with classic features)
# ========================
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 2.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 2.0),
        "use_label_encoder": False,
        "eval_metric": "mlogloss",
        "verbosity": 0,
        "tree_method": "gpu_hist",
    }
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scores = []
    for train_idx, valid_idx in skf.split(combined_valid_features, y_valid):
        X_tr, X_va = combined_valid_features[train_idx], combined_valid_features[valid_idx]
        y_tr, y_va = y_valid[train_idx], y_valid[valid_idx]
        clf = XGBClassifier(**params)
        clf.fit(X_tr, y_tr)
        preds = clf.predict(X_va)
        score = np.mean(preds == y_va)
        scores.append(score)
    return np.mean(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print("Best trial:", study.best_trial.params)

# ========================
# 8. Fit Final XGBoost on Validation, Evaluate on Test (with classic features)
# ========================
feat_names = [f'CLS_emb_{i}' for i in range(test_embeddings.shape[1])] + ['length', 'kansai_ben', 'direct_tone']
clf = XGBClassifier(**study.best_trial.params)
clf.fit(combined_valid_features, y_valid)
test_pred = clf.predict(combined_test_features)
df_test['xgb_pred'] = le.inverse_transform(test_pred)
test_pred_proba = clf.predict_proba(combined_test_features)

print("\nClassification Report (XGBoost + Optuna, Test Set):")
print(classification_report(df_test['sentiment'], df_test['xgb_pred']))

# ========================
# 9. Confusion Matrix (Test)
# ========================
plt.figure(figsize=(6,5))
cm = confusion_matrix(df_test['sentiment'], df_test['xgb_pred'])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix (Test Set)")
plt.show()

# ========================
# 10. AUC-ROC Curve (Test, One-vs-Rest)
# ========================
y_test_bin = label_binarize(df_test['sentiment'], classes=[0,1,2])
fpr = dict()
tpr = dict()
roc_auc = dict()
n_classes = y_test_bin.shape[1]
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], test_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

plt.figure(figsize=(7,5))
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f"Class {le.classes_[i]} (AUC = {roc_auc[i]:.2f})")
plt.plot([0,1],[0,1],'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("AUC-ROC Curve (Test Set, OvR)")
plt.legend()
plt.show()

# ========================
# 11. Feature Importance & SHAP (Test) with feature names
# ========================
plt.figure(figsize=(14,4))
imp = clf.feature_importances_
sorted_idx = np.argsort(imp)[::-1][:30] # Show top 30 features
plt.bar(np.array(feat_names)[sorted_idx], imp[sorted_idx])
plt.title("Feature Importance (Top 30, Test Set)")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

explainer = shap.Explainer(clf, combined_test_features)
shap_values = explainer(combined_test_features)
shap.summary_plot(shap_values, combined_test_features, feature_names=feat_names, plot_type="bar", max_display=30, show=True)

# ========================
# 12. Display Results (Test)
# ========================
display_dataframe_to_user(name="JGLUE Sentiment + Kansai-ben Analysis (Test Set)", dataframe=df_test)


Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 625/625 [01:10<00:00,  8.90it/s, loss=0.985]
Extracting embeddings: 100%|██████████| 500/500 [00:05<00:00, 99.06it/s]
Extracting embeddings: 100%|██████████| 300/300 [00:03<00:00, 98.71it/s]
[I 2025-06-13 13:35:48,846] A new study created in memory with name: no-name-5fca85c5-503e-4534-b614-d913730aa773
[I 2025-06-13 13:36:00,592] Trial 0 finished with value: 0.61 and parameters: {'n_estimators': 134, 'max_depth': 5, 'learning_rate': 0.025447318044836038, 'subsample': 0.7466105283296202, 'colsample_bytree': 0.5220742126422975, 'gamma': 4.006416053059103, 'reg_alpha': 0.11135123976281402, 'reg_lambda': 1.3594915915371013}. Best is trial 0 with value: 0.61.
[I