In [None]:
# ========================
# 0. Install libraries
# ========================
!pit install --quiet numpy spacy thinc
!pip install --quiet torch torchvision torchaudio
!pip install --quiet transformers fugashi ipadic accelerate peft sentencepiece matplotlib seaborn tqdm

In [None]:
import torch
# Report whether PyTorch can see a CUDA device in this runtime.
print("CUDA available:", torch.cuda.is_available())

CUDA available: False


In [None]:
import transformers
# Print the installed transformers version so the run can be reproduced.
print(transformers.__version__)

4.52.4


In [None]:
# Install certifi, copy the system CA bundle into the local CA directory,
# and rebuild the system trust store.
# NOTE(review): presumably a workaround for SSL verification failures during
# downloads; confirm it is still required in the target runtime.
!pip install certifi
!mkdir -p /usr/local/share/ca-certificates/
!cp /etc/ssl/certs/ca-certificates.crt /usr/local/share/ca-certificates/
!update-ca-certificates


Updating certificates in /etc/ssl/certs...
1 added, 0 removed; done.
Running hooks in /etc/ca-certificates/update.d...

Adding debian:ca-certificates.pem
done.
done.


In [None]:
# ========================
# 1. Imports & Setup
# ========================
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import shap
import torch
from ace_tools_open import display_dataframe_to_user
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from xgboost import XGBClassifier

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ========================
# 2. Kansai-ben & Directness Detection
# ========================
# Kansai-dialect markers. A leading "〜" is notation for "any preceding text",
# not a character that is expected to appear in the input.
kansaiben_keywords = ["〜やん", "〜やで", "〜せなあかん", "ちゃう", "ほんま", "めっちゃ", "〜せんと", "なんでやねん"]
def detect_kansaiben(text):
    """Return True if ``text`` contains any marker from ``kansaiben_keywords``.

    The leading placeholder wave dash (U+301C) or fullwidth tilde (U+FF5E) of a
    keyword is stripped before matching. Previously the placeholder was matched
    literally, so ordinary sentences such as "それはあかんやん" were never detected.

    Args:
        text: String to scan.

    Returns:
        bool: True when at least one marker occurs as a substring of ``text``.
    """
    markers = [keyword.lstrip("\u301c\uff5e") for keyword in kansaiben_keywords]
    # Skip markers that are empty after stripping: "" is a substring of every string.
    return any(bool(marker) and marker in text for marker in markers)

def detect_directness(text):
    """Tell whether ``text`` contains at least one blunt / strongly evaluative phrase.

    Matching is plain substring search against a fixed phrase list.
    """
    blunt_markers = ("最悪", "ありえない", "めっちゃ", "だめ", "良い", "良くない", "おすすめ", "絶対", "微妙")
    for marker in blunt_markers:
        if marker in text:
            return True
    return False

# ========================
# 3. Load & Prepare Data
# ========================
def load_jsts_json(url):
    """Read a JSON-lines file and return a two-column frame: ``text`` and ``sentiment``.

    ``text`` is ``sentence1`` and ``sentence2`` joined by a single space.
    ``sentiment`` buckets the numeric ``label`` column:
    below 2 -> 0, from 2 up to and including 3 -> 1, anything else -> 2.
    """
    def _bucket(score):
        # Same thresholds as a chained conditional, written as guard clauses.
        if score < 2:
            return 0
        if score <= 3:
            return 1
        return 2

    raw = pd.read_json(url, lines=True)
    joined_text = raw['sentence1'] + " " + raw['sentence2']
    bucketed = raw['label'].apply(_bucket)
    return raw.assign(text=joined_text, sentiment=bucketed)[['text', 'sentiment']]

# Download the JSTS train / validation files and keep fixed random subsets
# (500 and 100 rows, seed 42).
df_train = load_jsts_json("https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/heads/main/datasets/jsts-v1.3/train-v1.3.json").sample(500, random_state=42)
df_valid = load_jsts_json("https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/heads/main/datasets/jsts-v1.3/valid-v1.3.json").sample(100, random_state=42)

# Checkpoint name shared by the tokenizer here and the model loaded further down.
model_name = "cl-tohoku/bert-base-japanese-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_batch(texts):
    """Tokenize an iterable of texts with the notebook-level ``tokenizer``.

    Every sequence is truncated / padded to exactly 128 tokens and the result
    is returned as PyTorch tensors.
    """
    text_list = list(texts)
    encoded = tokenizer(
        text_list,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors='pt',
    )
    return encoded

class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes the whole frame once at construction.

    Reads the ``text`` and ``sentiment`` columns of ``df``. Each item is a dict
    holding one row of every tokenizer output plus a ``labels`` entry.
    """

    def __init__(self, df):
        self.encodings = tokenize_batch(df['text'])
        self.labels = torch.tensor(df['sentiment'].values)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = self.labels[idx]
        return sample

# Tokenize both subsets up front.
# NOTE(review): eval_ds is not referenced again in the visible part of the notebook.
train_ds = SimpleDataset(df_train)
eval_ds = SimpleDataset(df_valid)

# ========================
# 4. LoRA Model Init & Quick Finetune (for demonstration)
# ========================
# Load the checkpoint with a 3-way classification head (matches the 0/1/2 sentiment buckets).
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
# LoRA settings for sequence classification: rank 8, alpha 16, dropout 0.1, no bias training.
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, r=8, lora_alpha=16, lora_dropout=0.1, bias="none")
# Wrap the base model with the LoRA adapters and move it to the selected device.
model = get_peft_model(base_model, peft_config).to(device)

from torch.utils.data import DataLoader
from torch.optim import AdamW

# Batches of 8 examples, reshuffled on every pass.
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
optimizer = AdamW(model.parameters(), lr=2e-5)
model.train()
# A single pass over the training subset.
for epoch in range(1):
    loop = tqdm(train_loader, desc="Training")
    for batch in loop:
        # Move the batch tensors to the device the model was placed on.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Labels are passed in so the returned output carries a loss.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        # Show the current batch loss on the progress bar.
        loop.set_postfix(loss=loss.item())

# ========================
# 5. Extract Transformer [CLS] Embeddings (Validation Set)
# ========================
def extract_cls_embeddings(model, texts):
    """Return one [CLS] embedding per text, stacked into a 2-D NumPy array.

    Each text is tokenized on its own (truncated / padded to 128 tokens) with the
    notebook-level ``tokenizer`` and run through ``model`` in eval mode with
    gradients disabled. The embedding is the final-layer hidden state at
    position 0.

    Args:
        model: A Hugging Face model (optionally PEFT-wrapped) whose forward
            accepts ``output_hidden_states=True``.
        texts: Iterable of texts; each element is converted with ``str()``.

    Returns:
        np.ndarray: One row per input text, in input order.
    """
    model.eval()
    embeddings = []
    with torch.no_grad():
        for text in tqdm(texts, desc="Extracting embeddings"):
            inputs = tokenizer(str(text), return_tensors="pt", truncation=True, padding="max_length", max_length=128).to(device)
            # Call the model itself and request hidden states. On a PEFT model,
            # `model.base_model` is the adapter wrapper around the classification
            # model, so its output has logits but no `last_hidden_state`.
            outputs = model(**inputs, output_hidden_states=True)
            # hidden_states[-1] is the last encoder layer; index 0 is the [CLS] token.
            emb = outputs.hidden_states[-1][:, 0, :].squeeze(0).cpu().numpy()
            embeddings.append(emb)
    return np.vstack(embeddings)

# One embedding row per validation text, produced by the fine-tuned model.
valid_embeddings = extract_cls_embeddings(model, df_valid['text'])

# ========================
# 6. Optuna Hyperparameter Tuning for XGBoost
# ========================
# Encode the sentiment labels as integers; `le` is kept so predictions can be mapped back later.
le = LabelEncoder()
y_valid = le.fit_transform(df_valid['sentiment'])

def objective(trial):
    """Optuna objective: mean stratified 3-fold CV accuracy of an XGBoost classifier.

    Hyperparameters are sampled from ``trial``; the classifier is scored on the
    notebook-level ``valid_embeddings`` / ``y_valid``. Scoring uses held-out folds
    rather than the rows the model was fitted on, so the search does not simply
    reward memorizing the data.

    Args:
        trial: ``optuna.trial.Trial`` used to sample hyperparameters.

    Returns:
        float: Mean accuracy across the three held-out folds.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        # `use_label_encoder` is deprecated in XGBoost and is no longer passed.
        "eval_metric": "mlogloss",
        "verbosity": 0,
        # Fixed seed so row / column subsampling is repeatable across trials.
        "random_state": 42,
    }
    clf = XGBClassifier(**params)
    # Stratified folds keep every class in each training split.
    # NOTE(review): assumes each class has at least 3 rows; confirm for other samples.
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(clf, valid_embeddings, y_valid, cv=folds, scoring="accuracy")
    return float(scores.mean())

# Run 20 trials and keep the hyperparameters with the highest objective value.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print("Best trial:", study.best_trial.params)

# Use best params
best_params = study.best_trial.params

# Out-of-fold predictions: each row is predicted by a model that did not see it
# during fitting, so the report below is not computed on training rows.
# NOTE(review): assumes each class has at least 3 rows for the stratified split.
oof_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
oof_pred = cross_val_predict(XGBClassifier(**best_params), valid_embeddings, y_valid, cv=oof_folds)
df_valid['xgb_pred'] = le.inverse_transform(oof_pred)

# Final model fitted on all rows; used by the feature-importance and SHAP sections below.
clf = XGBClassifier(**best_params)
clf.fit(valid_embeddings, y_valid)

print("\nClassification Report (XGBoost + Optuna):")
print(classification_report(df_valid['sentiment'], df_valid['xgb_pred']))

# ========================
# 7. XGBoost Feature Importance
# ========================
# Bar chart with one bar per embedding dimension, showing the fitted classifier's importances.
plt.figure(figsize=(10,4))
plt.bar(range(valid_embeddings.shape[1]), clf.feature_importances_)
plt.title('XGBoost Feature Importance')
plt.xlabel('Embedding Dimension')
plt.ylabel('Importance')
plt.show()

# ========================
# 8. SHAP Analysis for XGBoost
# ========================
# Build a SHAP explainer for the fitted classifier; the validation embeddings are
# passed as the second argument (presumably as background data; confirm against the SHAP docs).
explainer = shap.Explainer(clf, valid_embeddings)
shap_values = explainer(valid_embeddings)

# Bar-style summary limited to 20 features.
# NOTE(review): the classifier is multi-class; verify that summary_plot accepts
# this explanation object as-is in the installed SHAP version.
shap.summary_plot(shap_values, valid_embeddings, show=True, plot_type="bar", max_display=20)

# ========================
# 9. Kansai-ben & Direct Tone Columns
# ========================
# Add boolean flag columns from the keyword-based detectors defined earlier in the notebook.
df_valid['kansai_ben'] = df_valid['text'].apply(detect_kansaiben)
df_valid['direct_tone'] = df_valid['text'].apply(detect_directness)

# ========================
# 10. Display Results
# ========================
# Show the validation frame, including the prediction and flag columns added above.
display_dataframe_to_user(name="JGLUE Sentiment + Kansai-ben Analysis", dataframe=df_valid)
