In [None]:
import torch
# Report whether PyTorch can see a CUDA device in this runtime.
print(f"CUDA available: {torch.cuda.is_available()}")

CUDA available: False


In [None]:
import transformers
# Record the installed Hugging Face Transformers version alongside the results.
print(f"{transformers.__version__}")

4.52.4


In [None]:
!pip install certifi
!mkdir -p /usr/local/share/ca-certificates/
!cp /etc/ssl/certs/ca-certificates.crt /usr/local/share/ca-certificates/
!update-ca-certificates


Updating certificates in /etc/ssl/certs...
1 added, 0 removed; done.
Running hooks in /etc/ca-certificates/update.d...

Adding debian:ca-certificates.pem
done.
done.


In [None]:
# ========================
# 1. Imports & Setup
# ========================
from tqdm import tqdm
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType
import matplotlib.pyplot as plt
import seaborn as sns

# Seed the RNGs so that dropout, adapter initialisation and shuffled batches
# are repeatable across "Restart & Run All". 42 matches the random_state
# used when sampling the dataset.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Prefer a CUDA device when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ========================
# 2. Kansai-ben & Directness Detection
# ========================
# Kansai-dialect markers. A leading "〜" is only a notational placeholder for
# "whatever precedes this ending"; it does not occur in real text and is
# stripped before matching.
kansaiben_keywords = ["〜やん", "〜やで", "〜せなあかん", "ちゃう", "ほんま", "めっちゃ", "〜せんと", "なんでやねん"]

def detect_kansaiben(text):
    """Return True when ``text`` contains any Kansai-dialect marker.

    This is a plain substring heuristic over ``kansaiben_keywords``, so short
    markers can also match inside unrelated standard-Japanese words.

    Args:
        text: The text to inspect. Non-string values are converted with
            ``str()`` so missing values do not raise.

    Returns:
        bool: True if at least one marker occurs in the text, else False.
    """
    haystack = str(text)
    for keyword in kansaiben_keywords:
        # Drop the placeholder (wave dash or full-width tilde) so "〜やで" matches "...やで".
        marker = keyword.lstrip("〜～")
        # Skip entries that were nothing but a placeholder; "" is a substring of everything.
        if marker and marker in haystack:
            return True
    return False

# Heuristic for "direct tone" detection
def detect_directness(text):
    """Return True when ``text`` contains a phrase typical of blunt, strong opinions.

    The check is a substring search over a fixed phrase list.
    """
    strong_opinion_markers = ("最悪", "ありえない", "めっちゃ", "だめ", "良い", "良くない", "おすすめ", "絶対", "微妙")
    for marker in strong_opinion_markers:
        if marker in text:
            return True
    return False

# ========================
# 3. Load JGLUE Dataset
# ========================
def load_jsts_json(url):
    """Load a JSON-lines JSTS file and reduce it to ``text`` / ``sentiment`` columns.

    ``text`` is ``sentence1`` and ``sentence2`` joined by a single space.
    ``sentiment`` buckets the numeric ``label``: below 2 -> 0, from 2 up to
    and including 3 -> 1, anything else -> 2.

    NOTE(review): the JSTS ``label`` appears to be a sentence-pair similarity
    score rather than a sentiment annotation -- confirm that treating the
    buckets as Negative/Neutral/Positive is intended.

    Args:
        url: Anything ``pandas.read_json`` accepts (URL, path or file-like).

    Returns:
        pandas.DataFrame with exactly the columns ``text`` and ``sentiment``.
    """
    def _bucket(score):
        if score < 2:
            return 0
        if score <= 3:
            return 1
        return 2

    frame = pd.read_json(url, lines=True)
    frame['text'] = frame['sentence1'] + " " + frame['sentence2']
    frame['sentiment'] = frame['label'].apply(_bucket)
    wanted_columns = ['text', 'sentiment']
    return frame[wanted_columns]

# Draw small fixed-seed subsets of the train and validation files so the
# notebook stays quick to run.
train_url = "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/heads/main/datasets/jsts-v1.3/train-v1.3.json"
valid_url = "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/heads/main/datasets/jsts-v1.3/valid-v1.3.json"

df_train = load_jsts_json(train_url).sample(n=500, random_state=42)
df_valid = load_jsts_json(valid_url).sample(n=100, random_state=42)

# ========================
# 4. Tokenization & Dataset
# ========================
# Checkpoint shared by the tokenizer and the classification model.
model_name = "cl-tohoku/bert-base-japanese-v2"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

def tokenize_batch(texts):
    """Tokenize an iterable of texts with the module-level ``tokenizer``.

    Every text is truncated or padded to exactly 128 tokens and the result is
    returned as PyTorch tensors.
    """
    batch = list(texts)
    encoded = tokenizer(
        batch,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors='pt',
    )
    return encoded

class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset over a frame with ``text`` and ``sentiment`` columns.

    All texts are tokenized once at construction time; each item is a dict of
    the per-row encoding tensors plus a ``labels`` entry.
    """

    def __init__(self, df):
        # Tokenize the whole column up front, then keep labels as one tensor.
        self.encodings = tokenize_batch(df['text'])
        label_values = df['sentiment'].values
        self.labels = torch.tensor(label_values)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample["labels"] = self.labels[idx]
        return sample

# Build the tokenized datasets for both splits (train first, then validation).
train_ds, eval_ds = (SimpleDataset(split) for split in (df_train, df_valid))

# ========================
# 5. LoRA Model Init
# ========================
# LoRA adapter settings for sequence classification.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Load the 3-class classifier, wrap it with the adapters, then move it to the device.
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
model = get_peft_model(base_model, peft_config)
model = model.to(device)

# ========================
# 6. Custom Training Loop with tqdm
# ========================
from torch.utils.data import DataLoader
from torch.optim import AdamW

train_loader = DataLoader(train_ds, shuffle=True, batch_size=8)
optimizer = AdamW(model.parameters(), lr=2e-5)
# NOTE(review): loss_fn is not referenced below; the loss is taken from
# outputs.loss, which the model returns because labels are passed in.
loss_fn = torch.nn.CrossEntropyLoss()

loss_history = []
model.train()

for epoch in range(1):  # 1 epoch
    loop = tqdm(train_loader, desc="Training")
    for batch in loop:
        # Move only the tensors the forward pass is given onto the target device.
        model_inputs = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        }

        outputs = model(**model_inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Record the per-batch loss and surface it on the progress bar.
        step_loss = loss.item()
        loss_history.append(step_loss)
        loop.set_postfix(loss=step_loss)

# ========================
# 7. Inference Function
# ========================
def predict_sentiment(text):
    """Classify one text and return "Negative", "Neutral" or "Positive".

    The input is converted with ``str()``, tokenized to a fixed length of 128,
    and run through the module-level ``model`` in eval mode without gradient
    tracking. The class with the highest softmax probability wins.
    """
    class_names = ("Negative", "Neutral", "Positive")
    encoded = tokenizer(
        str(text),
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128,
    ).to(device)

    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits
        probabilities = torch.softmax(logits, dim=-1)

    best_index = probabilities.argmax().item()
    return class_names[best_index]

# ========================
# 8. Merge & Analyze
# ========================
# Derived columns, computed from 'content' in this order for every frame below.
annotators = [
    ('predicted_sentiment', predict_sentiment),
    ('kansai_ben', detect_kansaiben),
    ('direct_tone', detect_directness),
]

# First 30 sampled training texts, tagged with their source.
df_amz = pd.DataFrame({'content': df_train['text'].iloc[:30]})
df_amz['source'] = 'JGLUE'

# Add mock external sources
df_x = pd.DataFrame({'content': ["これは良い商品やで", "最悪やんけ！"], 'source': ['X', 'X']})
df_fb = pd.DataFrame({'content': ["本当に素晴らしい", "ちょっとちゃうねん"], 'source': ['Facebook', 'Facebook']})

for df in [df_amz, df_x, df_fb]:
    for column, annotate in annotators:
        df[column] = df['content'].apply(annotate)

df_all = pd.concat([df_amz, df_x, df_fb], ignore_index=True)
print("\nSample Results:\n", df_all.head(10))

# ========================
# 9. Visualization
# ========================
# One count plot per analysed column, each split by source:
# (figure size, column on the x axis, title).
plot_specs = [
    ((8, 4), 'predicted_sentiment', "Sentiment Distribution by Source"),
    ((7, 3), 'direct_tone', "Direct Tone Presence by Source"),
    ((7, 3), 'kansai_ben', "Kansai-ben Detection by Source"),
]

for figure_size, column_name, plot_title in plot_specs:
    plt.figure(figsize=figure_size)
    sns.countplot(data=df_all, x=column_name, hue='source')
    plt.title(plot_title)
    plt.tight_layout()
    plt.show()


Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/517 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/236k [00:00<?, ?B/s]

ModuleNotFoundError: You need to install fugashi to use MecabTokenizer. See https://pypi.org/project/fugashi/ for installation.

In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import optuna
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
from ace_tools_open import display_dataframe_to_user

# Use the existing validated dataframe (df_valid from earlier)
# Build an annotated copy of the validation frame; df_valid itself is left untouched.
validation_texts = df_valid['text']
df_valid_sample = df_valid.assign(
    predicted=validation_texts.apply(predict_sentiment),
    kansai_ben=validation_texts.apply(detect_kansaiben),
    direct_tone=validation_texts.apply(detect_directness),
)

# Encode labels
# Fit the encoder on the ground-truth sentiment ids, then push the model's
# predictions (names -> ids) through the same encoder.
# NOTE(review): le.transform is expected to raise if the model predicts a
# class that never occurs in the sampled ground truth -- confirm that all
# three classes are present in the sample.
sentiment_names = ["Negative", "Neutral", "Positive"]
le = LabelEncoder()
df_valid_sample['sentiment_label'] = le.fit_transform(df_valid_sample['sentiment'])
predicted_ids = [sentiment_names.index(name) for name in df_valid_sample['predicted']]
df_valid_sample['predicted_label'] = le.transform(predicted_ids)

# Define Optuna objective for XGBoost
# Display names for the sentiment ids 0 / 1 / 2 (same order as predict_sentiment).
SENTIMENT_NAMES = ["Negative", "Neutral", "Positive"]


def build_text_classifier(params):
    """Return an unfitted TF-IDF -> XGBoost pipeline.

    The hyper-parameter search and the final model both use this helper so the
    tuned parameters are applied to exactly the pipeline they were tuned for.

    Args:
        params: dict of XGBClassifier keyword arguments; these override the
            defaults set here.

    Returns:
        sklearn.pipeline.Pipeline with steps ``tfidf`` and ``clf``.
    """
    # `use_label_encoder` is deliberately not passed: labels are already
    # integer-encoded and recent XGBoost releases no longer use the flag.
    clf_kwargs = {"eval_metric": "mlogloss", "random_state": 42, **params}
    return Pipeline([
        # Japanese is written without spaces, so the default word analyzer
        # would turn each sentence into a few very long, near-unique tokens.
        # Character n-grams give features that recur across sentences.
        ('tfidf', TfidfVectorizer(analyzer="char", ngram_range=(1, 3), max_features=3000)),
        ('clf', XGBClassifier(**clf_kwargs)),
    ])


# Split once, outside the objective: every trial sees the same data, and the
# hold-out rows are never used for fitting or tuning.
X_dev, X_holdout, y_dev, y_holdout = train_test_split(
    df_valid_sample['text'],
    df_valid_sample['sentiment_label'],
    test_size=0.2,
    random_state=42,
    stratify=df_valid_sample['sentiment_label'],
)
# Inner split of the development rows, used only to score Optuna trials.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_dev, y_dev, test_size=0.25, random_state=42, stratify=y_dev
)


def objective(trial):
    """Optuna objective: accuracy of one candidate on the inner validation split.

    Args:
        trial: optuna.trial.Trial used to sample the XGBoost hyper-parameters.

    Returns:
        float: accuracy of the fitted pipeline on (X_val, y_val).
    """
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3),
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }
    pipeline = build_text_classifier(params)
    pipeline.fit(X_fit, y_fit)
    return pipeline.score(X_val, y_val)


# Run Optuna optimization (seeded sampler so the search is repeatable)
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=15)

# Train final model with best parameters on all development rows
best_params = dict(study.best_params)
best_params["eval_metric"] = "mlogloss"
final_model = build_text_classifier(best_params)
final_model.fit(X_dev, y_dev)

# Predictions for every row are kept for display. Most of these rows were
# used for fitting, so this column must not be read as a quality estimate.
df_valid_sample['xgb_pred'] = final_model.predict(df_valid_sample['text'])

# Evaluate only on the rows the final model has never seen.
holdout_pred = final_model.predict(X_holdout)
report = classification_report(
    y_holdout,
    holdout_pred,
    # Pin the label set so the report keeps its shape even if a class is
    # missing from the small hold-out split.
    labels=list(range(len(le.classes_))),
    # le.classes_ holds the original sentiment ids; show readable names.
    target_names=[SENTIMENT_NAMES[int(class_id)] for class_id in le.classes_],
    zero_division=0,
    output_dict=True,
)
report_df = pd.DataFrame(report).transpose()

# Display results
results_title = "JGLUE Sentiment Validation with XGBoost"
display_dataframe_to_user(
    name=results_title,
    dataframe=df_valid_sample,
)
# The last expression is rendered as the cell output: first five report rows.
report_df.head(5)
