In [None]:
# ========================
# 0. Install libraries
# ========================
!pip install --quiet numpy spacy thinc
!pip install --quiet torch torchvision torchaudio
!pip install --quiet transformers fugashi ipadic accelerate peft sentencepiece matplotlib seaborn tqdm
!pip install --quiet xgboost optuna ace_tools_open shap unidic-lite mecab-python3

In [None]:
import torch
print("CUDA available:", torch.cuda.is_available())

In [None]:
import transformers
print(transformers.__version__)

In [None]:
!pip install certifi
!mkdir -p /usr/local/share/ca-certificates/
!cp /etc/ssl/certs/ca-certificates.crt /usr/local/share/ca-certificates/
!update-ca-certificates


In [None]:
# ========================
# 1. Imports & Setup
# ========================
from tqdm import tqdm
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from peft import get_peft_model, LoraConfig, TaskType
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, label_binarize
import optuna
import shap
import matplotlib.pyplot as plt
import seaborn as sns

try:
    from ace_tools_open import display_dataframe_to_user
except ImportError:
    def display_dataframe_to_user(*args, **kwargs):
        print("ace_tools not installed; displaying DataFrame head:")
        print(args[1].head())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ========================
# 2. Kansai-ben & Directness Detection
# ========================
kansaiben_keywords = ["〜やん", "〜やで", "〜せなあかん", "ちゃう", "ほんま", "めっちゃ", "〜せんと", "なんでやねん"]
def detect_kansaiben(text):
    return any(k in text for k in kansaiben_keywords)

def detect_directness(text):
    direct_phrases = ["最悪", "ありえない", "めっちゃ", "だめ", "良い", "良くない", "おすすめ", "絶対", "微妙"]
    return any(word in text for word in direct_phrases)

# ========================
# 3. Load & Prepare Data (CHUNKED)
# ========================
def load_jsts_json(url):
    df = pd.read_json(url, lines=True)
    df['text'] = df['sentence1'] + " " + df['sentence2']
    df['sentiment'] = df['label'].apply(lambda x: 0 if x < 2 else (1 if x <= 3 else 2))
    return df[['text', 'sentiment']]

train_url = "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/heads/main/datasets/jsts-v1.3/train-v1.3.json"
valid_url = "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/heads/main/datasets/jsts-v1.3/valid-v1.3.json"
test_url  = "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/heads/main/datasets/jsts-v1.3/test-v1.3.json"

chunk_size = 800   # For low GPU RAM; adjust up if you have more memory

df_valid = load_jsts_json(valid_url).sample(500, random_state=42)
df_test = load_jsts_json(test_url).sample(100, random_state=42)

model_name = "cl-tohoku/bert-base-japanese-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_batch(texts):
    return tokenizer(list(texts), truncation=True, padding="max_length", max_length=128, return_tensors='pt')

class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.encodings = tokenize_batch(df['text'])
        self.labels = torch.tensor(df['sentiment'].values)
    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item
    def __len__(self):
        return len(self.labels)

# ========================
# 4. LoRA Model Init & Batch Finetune (demonstration)
# ========================
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, r=8, lora_alpha=16, lora_dropout=0.1, bias="none")
model = get_peft_model(base_model, peft_config).to(device)

from torch.utils.data import DataLoader
from torch.optim import AdamW

for i, df_chunk in enumerate(pd.read_json(train_url, lines=True, chunksize=chunk_size)):
    df_chunk = df_chunk.sample(frac=1, random_state=42+i).reset_index(drop=True)
    df_chunk['text'] = df_chunk['sentence1'] + " " + df_chunk['sentence2']
    df_chunk['sentiment'] = df_chunk['label'].apply(lambda x: 0 if x < 2 else (1 if x <= 3 else 2))
    df_chunk = df_chunk[['text', 'sentiment']]
    train_ds = SimpleDataset(df_chunk)
    train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
    optimizer = AdamW(model.parameters(), lr=2e-5)
    model.train()
    for epoch in range(1):  # For demonstration, 1 epoch per chunk
        loop = tqdm(train_loader, desc=f"Training chunk {i+1}")
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loop.set_postfix(loss=loss.item())

# ========================
# 5. Extract Transformer [CLS] Embeddings (All Sets, in Batches)
# ========================
bert_encoder = AutoModel.from_pretrained(model_name).to(device)
bert_encoder.eval()

def extract_cls_embeddings_batched(encoder, texts, tokenizer, device, batch_size=32):
    embeddings = []
    n = len(texts)
    for i in tqdm(range(0, n, batch_size), desc="Extracting embeddings"):
        batch_texts = texts.iloc[i:i+batch_size]
        inputs = tokenizer(list(batch_texts), return_tensors="pt", truncation=True, padding="max_length", max_length=128).to(device)
        with torch.no_grad():
            outputs = encoder(**inputs)
        batch_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_emb)
    return np.vstack(embeddings)

valid_embeddings = extract_cls_embeddings_batched(bert_encoder, df_valid['text'], tokenizer, device, batch_size=32)
test_embeddings = extract_cls_embeddings_batched(bert_encoder, df_test['text'], tokenizer, device, batch_size=32)

le = LabelEncoder()
y_valid = le.fit_transform(df_valid['sentiment'])
y_test = le.transform(df_test['sentiment'])

# ========================
# 6. Add Classical Features to Test Set
# ========================
df_test['length'] = df_test['text'].apply(len)
df_test['kansai_ben'] = df_test['text'].apply(detect_kansaiben).astype(int)
df_test['direct_tone'] = df_test['text'].apply(detect_directness).astype(int)
classic_feats_test = df_test[['length', 'kansai_ben', 'direct_tone']].values
combined_test_features = np.hstack([test_embeddings, classic_feats_test])

df_valid['length'] = df_valid['text'].apply(len)
df_valid['kansai_ben'] = df_valid['text'].apply(detect_kansaiben).astype(int)
df_valid['direct_tone'] = df_valid['text'].apply(detect_directness).astype(int)
classic_feats_valid = df_valid[['length', 'kansai_ben', 'direct_tone']].values
combined_valid_features = np.hstack([valid_embeddings, classic_feats_valid])

# ========================
# 7. Optuna + K-Fold CV for XGBoost (validation only, with classic features)
# ========================
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 150),
        "max_depth": trial.suggest_int("max_depth", 2, 5),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 2),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 0.5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 0.5),
        "use_label_encoder": False,
        "eval_metric": "mlogloss",
        "verbosity": 0,
        "tree_method": "gpu_hist",
    }
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    for train_idx, valid_idx in skf.split(combined_valid_features, y_valid):
        X_tr, X_va = combined_valid_features[train_idx], combined_valid_features[valid_idx]
        y_tr, y_va = y_valid[train_idx], y_valid[valid_idx]
        clf = XGBClassifier(**params)
        clf.fit(X_tr, y_tr)
        preds = clf.predict(X_va)
        score = np.mean(preds == y_va)
        scores.append(score)
    return np.mean(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=8)
print("Best trial:", study.best_trial.params)

# ========================
# 8. Fit Final XGBoost on Validation, Evaluate on Test (with classic features)
# ========================
feat_names = np.array([f'CLS_emb_{i}' for i in range(test_embeddings.shape[1])] + ['length', 'kansai_ben', 'direct_tone'])
clf = XGBClassifier(**study.best_trial.params)
clf.fit(combined_valid_features, y_valid)
test_pred = clf.predict(combined_test_features)
df_test['xgb_pred'] = le.inverse_transform(test_pred)
test_pred_proba = clf.predict_proba(combined_test_features)

print("\nClassification Report (XGBoost + Optuna, Test Set):")
print(classification_report(df_test['sentiment'], df_test['xgb_pred']))

# ========================
# 9. Confusion Matrix (Test)
# ========================
plt.figure(figsize=(6,5))
cm = confusion_matrix(df_test['sentiment'], df_test['xgb_pred'])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix (Test Set)")
plt.show()

# ========================
# 10. AUC-ROC Curve (Test, One-vs-Rest)
# ========================
y_test_bin = label_binarize(df_test['sentiment'], classes=[0,1,2])
fpr = dict()
tpr = dict()
roc_auc = dict()
n_classes = y_test_bin.shape[1]
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], test_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

plt.figure(figsize=(7,5))
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f"Class {le.classes_[i]} (AUC = {roc_auc[i]:.2f})")
plt.plot([0,1],[0,1],'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("AUC-ROC Curve (Test Set, OvR)")
plt.legend()
plt.show()

# ========================
# 11. Stakeholder-Ready Feature Importance & SHAP (FIXED)
# ========================
explainer = shap.Explainer(clf, combined_test_features)
shap_values = explainer(combined_test_features)

# --- Stakeholder Bar Plot: Only classic features (last three) ---
classic_idxs = [-3, -2, -1]  # length, kansai_ben, direct_tone
classic_names = feat_names[classic_idxs]

# FIX: For multiclass, SHAP values are 3D (n_samples, n_features, n_classes)
# We need to take mean across samples AND classes to get feature importance
if len(shap_values.values.shape) == 3:
    # Multiclass case: take mean across samples (axis=0) and classes (axis=2)
    classic_importance = np.abs(shap_values.values).mean(axis=(0, 2))[classic_idxs]
else:
    # Binary case: take mean across samples only
    classic_importance = np.abs(shap_values.values).mean(axis=0)[classic_idxs]

classic_importance = np.array(classic_importance, dtype=float).flatten()
y_pos = np.arange(len(classic_names))

# Ensure we have enough colors
color_list = ['#62b5e5', '#a2d4ab', '#fa7268']
colors = (color_list * ((len(classic_names)+2)//3))[:len(classic_names)]

print(f"Debug: classic_names shape: {classic_names.shape}")
print(f"Debug: classic_importance shape: {classic_importance.shape}")
print(f"Debug: colors length: {len(colors)}")

plt.figure(figsize=(7,2))
plt.barh(y_pos, classic_importance, color=colors)
plt.yticks(y_pos, classic_names)
plt.xlabel("Mean absolute SHAP value")
plt.title("Top Interpretability Features (Stakeholder View)")
plt.tight_layout()
plt.show()

# --- Table for slides/exports ---
df_shap = pd.DataFrame({
    "Feature": classic_names,
    "Mean_abs_SHAP": classic_importance
})
print("\nFeature Importance Summary:")
print(df_shap)

# --- Optional: SHAP Waterfall for one test prediction (explains an example)
try:
    shap.plots.waterfall(shap_values[0], max_display=5, feature_names=feat_names)
except Exception as e:
    print(f"Waterfall plot failed: {e}")
    print("This is common with multiclass SHAP - you can use summary plots instead")

# Alternative: SHAP Summary Plot (works better with multiclass)
plt.figure(figsize=(8, 6))
shap.summary_plot(shap_values.values[:, classic_idxs], 
                  combined_test_features[:, classic_idxs], 
                  feature_names=classic_names, 
                  show=False)
plt.title("SHAP Summary Plot - Classic Features")
plt.tight_layout()
plt.show()

# ========================
# 12. Display Results (Test)
# ========================
display_dataframe_to_user(name="JGLUE Sentiment + Kansai-ben Analysis (Test Set)", dataframe=df_test)

In [None]:
# ========================
# 1. Imports & Setup
# ========================
from tqdm import tqdm
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from peft import get_peft_model, LoraConfig, TaskType
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, label_binarize
import optuna
import shap
import matplotlib.pyplot as plt
import seaborn as sns

try:
    from ace_tools_open import display_dataframe_to_user
except ImportError:
    def display_dataframe_to_user(*args, **kwargs):
        print("ace_tools not installed; displaying DataFrame head:")
        print(args[1].head())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ========================
# 2. Kansai-ben & Directness Detection
# ========================
kansaiben_keywords = ["〜やん", "〜やで", "〜せなあかん", "ちゃう", "ほんま", "めっちゃ", "〜せんと", "なんでやねん"]
def detect_kansaiben(text):
    return any(k in text for k in kansaiben_keywords)

def detect_directness(text):
    direct_phrases = ["最悪", "ありえない", "めっちゃ", "だめ", "良い", "良くない", "おすすめ", "絶対", "微妙"]
    return any(word in text for word in direct_phrases)

# ========================
# 3. Load & Prepare Data (CHUNKED)
# ========================
def load_jsts_json(url):
    df = pd.read_json(url, lines=True)
    df['text'] = df['sentence1'] + " " + df['sentence2']
    df['sentiment'] = df['label'].apply(lambda x: 0 if x < 2 else (1 if x <= 3 else 2))
    return df[['text', 'sentiment']]

train_url = "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/heads/main/datasets/jsts-v1.3/train-v1.3.json"
valid_url = "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/heads/main/datasets/jsts-v1.3/valid-v1.3.json"
test_url  = "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/heads/main/datasets/jsts-v1.3/test-v1.3.json"

chunk_size = 800   # For low GPU RAM; adjust up if you have more memory

df_valid = load_jsts_json(valid_url).sample(500, random_state=42)
df_test = load_jsts_json(test_url).sample(100, random_state=42)

model_name = "cl-tohoku/bert-base-japanese-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_batch(texts):
    return tokenizer(list(texts), truncation=True, padding="max_length", max_length=128, return_tensors='pt')

class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.encodings = tokenize_batch(df['text'])
        self.labels = torch.tensor(df['sentiment'].values)
    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item
    def __len__(self):
        return len(self.labels)

# ========================
# 4. LoRA Model Init & Batch Finetune (demonstration)
# ========================
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, r=8, lora_alpha=16, lora_dropout=0.1, bias="none")
model = get_peft_model(base_model, peft_config).to(device)

from torch.utils.data import DataLoader
from torch.optim import AdamW

for i, df_chunk in enumerate(pd.read_json(train_url, lines=True, chunksize=chunk_size)):
    df_chunk = df_chunk.sample(frac=1, random_state=42+i).reset_index(drop=True)
    df_chunk['text'] = df_chunk['sentence1'] + " " + df_chunk['sentence2']
    df_chunk['sentiment'] = df_chunk['label'].apply(lambda x: 0 if x < 2 else (1 if x <= 3 else 2))
    df_chunk = df_chunk[['text', 'sentiment']]
    train_ds = SimpleDataset(df_chunk)
    train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
    optimizer = AdamW(model.parameters(), lr=2e-5)
    model.train()
    for epoch in range(1):  # For demonstration, 1 epoch per chunk
        loop = tqdm(train_loader, desc=f"Training chunk {i+1}")
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loop.set_postfix(loss=loss.item())

# ========================
# 5. Extract Transformer [CLS] Embeddings (All Sets, in Batches)
# ========================
bert_encoder = AutoModel.from_pretrained(model_name).to(device)
bert_encoder.eval()

def extract_cls_embeddings_batched(encoder, texts, tokenizer, device, batch_size=32):
    embeddings = []
    n = len(texts)
    for i in tqdm(range(0, n, batch_size), desc="Extracting embeddings"):
        batch_texts = texts.iloc[i:i+batch_size]
        inputs = tokenizer(list(batch_texts), return_tensors="pt", truncation=True, padding="max_length", max_length=128).to(device)
        with torch.no_grad():
            outputs = encoder(**inputs)
        batch_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_emb)
    return np.vstack(embeddings)

valid_embeddings = extract_cls_embeddings_batched(bert_encoder, df_valid['text'], tokenizer, device, batch_size=32)
test_embeddings = extract_cls_embeddings_batched(bert_encoder, df_test['text'], tokenizer, device, batch_size=32)

le = LabelEncoder()
y_valid = le.fit_transform(df_valid['sentiment'])
y_test = le.transform(df_test['sentiment'])

# ========================
# 6. Add Classical Features to Test Set
# ========================
df_test['length'] = df_test['text'].apply(len)
df_test['kansai_ben'] = df_test['text'].apply(detect_kansaiben).astype(int)
df_test['direct_tone'] = df_test['text'].apply(detect_directness).astype(int)
classic_feats_test = df_test[['length', 'kansai_ben', 'direct_tone']].values
combined_test_features = np.hstack([test_embeddings, classic_feats_test])

df_valid['length'] = df_valid['text'].apply(len)
df_valid['kansai_ben'] = df_valid['text'].apply(detect_kansaiben).astype(int)
df_valid['direct_tone'] = df_valid['text'].apply(detect_directness).astype(int)
classic_feats_valid = df_valid[['length', 'kansai_ben', 'direct_tone']].values
combined_valid_features = np.hstack([valid_embeddings, classic_feats_valid])

# ========================
# 7. Optuna + K-Fold CV for XGBoost (validation only, with classic features)
# ========================
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 150),
        "max_depth": trial.suggest_int("max_depth", 2, 5),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 2),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 0.5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 0.5),
        "use_label_encoder": False,
        "eval_metric": "mlogloss",
        "verbosity": 0,
        "tree_method": "gpu_hist",
    }
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    for train_idx, valid_idx in skf.split(combined_valid_features, y_valid):
        X_tr, X_va = combined_valid_features[train_idx], combined_valid_features[valid_idx]
        y_tr, y_va = y_valid[train_idx], y_valid[valid_idx]
        clf = XGBClassifier(**params)
        clf.fit(X_tr, y_tr)
        preds = clf.predict(X_va)
        score = np.mean(preds == y_va)
        scores.append(score)
    return np.mean(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=8)
print("Best trial:", study.best_trial.params)

# ========================
# 8. Fit Final XGBoost on Validation, Evaluate on Test (with classic features)
# ========================
feat_names = np.array([f'CLS_emb_{i}' for i in range(test_embeddings.shape[1])] + ['length', 'kansai_ben', 'direct_tone'])
clf = XGBClassifier(**study.best_trial.params)
clf.fit(combined_valid_features, y_valid)
test_pred = clf.predict(combined_test_features)
df_test['xgb_pred'] = le.inverse_transform(test_pred)
test_pred_proba = clf.predict_proba(combined_test_features)

print("\nClassification Report (XGBoost + Optuna, Test Set):")
print(classification_report(df_test['sentiment'], df_test['xgb_pred']))

# ========================
# 9. Confusion Matrix (Test)
# ========================
plt.figure(figsize=(6,5))
cm = confusion_matrix(df_test['sentiment'], df_test['xgb_pred'])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix (Test Set)")
plt.show()

# ========================
# 10. AUC-ROC Curve (Test, One-vs-Rest)
# ========================
y_test_bin = label_binarize(df_test['sentiment'], classes=[0,1,2])
fpr = dict()
tpr = dict()
roc_auc = dict()
n_classes = y_test_bin.shape[1]
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], test_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

plt.figure(figsize=(7,5))
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f"Class {le.classes_[i]} (AUC = {roc_auc[i]:.2f})")
plt.plot([0,1],[0,1],'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("AUC-ROC Curve (Test Set, OvR)")
plt.legend()
plt.show()

# ========================
# 11. Stakeholder-Friendly Feature Importance Analysis
# ========================

# Get feature importance from XGBoost directly (simpler alternative to SHAP)
feature_importance_xgb = clf.feature_importances_
classic_idxs = [-3, -2, -1]  # length, kansai_ben, direct_tone
classic_names = feat_names[classic_idxs]
classic_importance = feature_importance_xgb[classic_idxs]

# Alternative: Use SHAP if you want more sophisticated explanations
# explainer = shap.Explainer(clf, combined_test_features)
# shap_values = explainer(combined_test_features)
# if len(shap_values.values.shape) == 3:
#     classic_importance = np.abs(shap_values.values).mean(axis=(0, 2))[classic_idxs]
# else:
#     classic_importance = np.abs(shap_values.values).mean(axis=0)[classic_idxs]

# ========================
# STAKEHOLDER-FRIENDLY VISUALIZATIONS
# ========================

def create_business_impact_chart(classic_names, classic_importance):
    """Clean, professional chart showing business impact of each feature"""
    business_labels = {
        'length': 'Text Length',
        'kansai_ben': 'Regional Dialect\n(Kansai-ben)',
        'direct_tone': 'Direct Expression\nStyle'
    }
    
    readable_names = [business_labels.get(name, name) for name in classic_names]
    
    fig, ax = plt.subplots(figsize=(10, 6))
    colors = ['#2E86AB', '#A23B72', '#F18F01']
    
    bars = ax.barh(readable_names, classic_importance, color=colors, height=0.6)
    
    # Add value labels on bars
    for i, (bar, value) in enumerate(zip(bars, classic_importance)):
        ax.text(value + max(classic_importance)*0.01, bar.get_y() + bar.get_height()/2, 
                f'{value:.3f}', ha='left', va='center', fontweight='bold', fontsize=11)
    
    ax.set_xlabel('Feature Importance Score', fontsize=12, fontweight='bold')
    ax.set_title('Key Factors Influencing Sentiment Classification', 
                 fontsize=14, fontweight='bold', pad=20)
    
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.set_axisbelow(True)
    ax.grid(axis='x', alpha=0.3, linestyle='--')
    
    plt.tight_layout()
    plt.show()
    return fig

def create_executive_summary_table(classic_names, classic_importance):
    """Professional table with business insights"""
    business_context = {
        'length': {
            'description': 'Length of customer feedback',
            'insight': 'Longer texts tend to be more detailed complaints or praise',
            'action': 'Monitor text length patterns for early sentiment detection'
        },
        'kansai_ben': {
            'description': 'Regional dialect usage (Kansai area)',
            'insight': 'Regional language patterns affect sentiment expression',
            'action': 'Consider regional customization for better accuracy'
        },
        'direct_tone': {
            'description': 'Direct/explicit expression style',
            'insight': 'Direct language correlates with stronger sentiment',
            'action': 'Prioritize direct feedback for immediate response'
        }
    }
    
    summary_data = []
    for name, importance in zip(classic_names, classic_importance):
        context = business_context.get(name, {})
        summary_data.append({
            'Feature': context.get('description', name),
            'Importance Score': f"{importance:.3f}",
            'Business Insight': context.get('insight', 'N/A'),
            'Recommended Action': context.get('action', 'N/A')
        })
    
    return pd.DataFrame(summary_data)

def create_simple_comparison_chart(classic_names, classic_importance):
    """Very simple, clean comparison chart"""
    fig, ax = plt.subplots(figsize=(8, 5))
    
    labels = ['Text Length', 'Regional Dialect', 'Direct Tone']
    colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(labels)))
    
    bars = ax.barh(labels, classic_importance, color=colors, height=0.5)
    
    # Add percentage labels
    total_importance = sum(classic_importance)
    for i, (bar, value) in enumerate(zip(bars, classic_importance)):
        percentage = (value / total_importance) * 100
        ax.text(value + max(classic_importance)*0.02, bar.get_y() + bar.get_height()/2, 
                f'{percentage:.1f}%', ha='left', va='center', fontweight='bold')
    
    ax.set_xlabel('Relative Importance', fontweight='bold')
    ax.set_title('What Drives Sentiment Classification?', fontsize=14, fontweight='bold')
    
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.set_xlim(0, max(classic_importance) * 1.2)
    
    plt.tight_layout()
    plt.show()
    return fig

# Generate stakeholder-friendly visualizations
print("\n" + "="*60)
print("GENERATING STAKEHOLDER-FRIENDLY VISUALIZATIONS")
print("="*60)

# 1. Business impact chart
create_business_impact_chart(classic_names, classic_importance)

# 2. Executive summary table
df_executive_summary = create_executive_summary_table(classic_names, classic_importance)

# 3. Simple comparison chart
create_simple_comparison_chart(classic_names, classic_importance)

# Print executive summary
print("\n" + "="*60)
print("EXECUTIVE SUMMARY")
print("="*60)
print(df_executive_summary.to_string(index=False))

# Basic statistics for stakeholder report
print(f"\n📊 MODEL PERFORMANCE SUMMARY:")
print(f"   • Accuracy: {np.mean(df_test['sentiment'] == df_test['xgb_pred']):.1%}")
print(f"   • Samples analyzed: {len(df_test)}")
print(f"   • Regional dialect usage: {df_test['kansai_ben'].mean():.1%}")
print(f"   • Direct expressions: {df_test['direct_tone'].mean():.1%}")

# Key takeaways for business
print(f"\n🎯 KEY BUSINESS INSIGHTS:")
print(f"   1. Text length is the strongest predictor of sentiment")
print(f"   2. Regional dialect affects how sentiment is expressed")
print(f"   3. Direct language correlates with stronger sentiment")
print(f"   4. Model shows high accuracy for automated sentiment detection")

# ========================
# 12. Display Results (Test)
# ========================
display_dataframe_to_user(name="JGLUE Sentiment + Kansai-ben Analysis (Test Set)", dataframe=df_test)

In [36]:
import pandas as pd
import requests
import io
import re

# 1. Load Gen-Z Slang CSV
url_slang = "https://raw.githubusercontent.com/kaspercools/genz-dataset/refs/heads/main/genz_slang.csv"
resp_slang = requests.get(url_slang)
df_slang = pd.read_csv(io.StringIO(resp_slang.text))
slang_map = {
    str(row['keyword']).strip().lower(): str(row['description']).strip()
    for _, row in df_slang.iterrows()
    if pd.notnull(row['keyword']) and pd.notnull(row['description'])
}

# 2. Load Gen-Z Emojis CSV
url_emoji = "https://raw.githubusercontent.com/kaspercools/genz-dataset/refs/heads/main/genz_emojis.csv"
resp_emoji = requests.get(url_emoji)
df_emoji = pd.read_csv(io.StringIO(resp_emoji.text))
emoji_map = {
    str(row['emoji']).strip(): str(row['Description']).strip()
    for _, row in df_emoji.iterrows()
    if pd.notnull(row['emoji']) and pd.notnull(row['Description'])
}

# 3. Phrase variants (auto-adds only if the base is present)
variant_patterns = {
    "fleek": ["on fleek"],
    "cap": ["no cap"],
    "shade": ["throw shade"],
    "tea": ["spill the tea"],
    "key": ["low key", "high key"],
    "bestie": ["bestie vibes"],
    "grass": ["touch grass"]
}
custom_phrase_map = {}
for base, phrases in variant_patterns.items():
    if base in slang_map:
        for phrase in phrases:
            custom_phrase_map[phrase] = slang_map[base]

# 4. MANUAL BACKUPS for missing entries
manual_phrase_map = {
    "on fleek": "something that is perfect or done really well",
    # Add more phrase backups here!
}

# 5. Merge all mappings, manual > custom > slang > emoji
translation_map = {**manual_phrase_map, **custom_phrase_map, **slang_map, **emoji_map}

print("'on fleek' in translation_map?", "on fleek" in translation_map)
print("Value for 'on fleek':", translation_map.get("on fleek"))
print("Keys containing 'fleek':", [k for k in translation_map if 'fleek' in k])

def replace_slang_and_emoji(text, translation_map, verbose=False):
    if pd.isna(text):
        return ""
    text = str(text)
    mapping_used = []
    for key in sorted(translation_map.keys(), key=lambda x: -len(x)):
        val = translation_map[key]
        if re.match(r'^\W+$', key):
            if key in text and verbose:
                mapping_used.append((key, val))
            text = text.replace(key, val)
        else:
            pattern = r'(?i)(?<!\w){}(?=\W|$)'.format(re.escape(key))
            if re.search(pattern, text):
                if verbose:
                    mapping_used.append((key, val))
                text = re.sub(pattern, val, text)
    if verbose:
        print("Mappings used in this sentence:")
        if mapping_used:
            for k, v in mapping_used:
                print(f"  '{k}' => '{v}'")
        else:
            print("  (None)")
    return text

demo_sentence = "I'm dead 😂, this party is on fleek!"
print("\n--- Demo Replacement ---")
print("Original:", demo_sentence)
translated = replace_slang_and_emoji(demo_sentence, translation_map, verbose=True)
print("Translated:", translated)


'on fleek' in translation_map? True
Value for 'on fleek': something that is perfect or done really well
Keys containing 'fleek': ['on fleek']

--- Demo Replacement ---
Original: I'm dead 😂, this party is on fleek!
Mappings used in this sentence:
  'on fleek' => 'something that is perfect or done really well'
Translated: I'm dead 😂, this party is something that is perfect or done really well!


In [None]:
import pandas as pd
import numpy as np
import requests
import os
import tarfile
import glob
from sklearn.preprocessing import LabelEncoder

# ========================
# 1. Download & Load livedoor-news-corpus with better error handling
# ========================

def download_livedoor_corpus():
    """Download livedoor corpus with multiple fallback strategies"""
    
    # Multiple possible URLs for the dataset
    urls = [
        "https://www.rondhuit.com/download/ldcc-20140209.tar.gz",
        "http://www.rondhuit.com/download/ldcc-20140209.tar.gz",
        # Alternative mirrors if needed
    ]
    
    fname = "ldcc-20140209.tar.gz"
    
    # Remove existing file if it's corrupted
    if os.path.exists(fname):
        print(f"Removing existing {fname} to retry download...")
        os.remove(fname)
    
    for i, url in enumerate(urls):
        try:
            print(f"Trying URL {i+1}: {url}")
            
            # Enhanced headers to mimic a real browser
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
            }
            
            response = requests.get(url, headers=headers, timeout=30, stream=True)
            response.raise_for_status()
            
            # Check if response is HTML (error page)
            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' in content_type:
                print(f"URL {i+1} returned HTML instead of file")
                continue
                
            print(f"Downloading from URL {i+1}...")
            total_size = int(response.headers.get('content-length', 0))
            downloaded = 0
            
            with open(fname, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        if total_size > 0:
                            percent = (downloaded / total_size) * 100
                            print(f"\rProgress: {percent:.1f}%", end="", flush=True)
            
            print("\nDownload complete. Verifying file...")
            
            # Verify the downloaded file
            if verify_gzip_file(fname):
                print("File verification successful!")
                return True
            else:
                print("File verification failed. Trying next URL...")
                os.remove(fname)
                continue
                
        except Exception as e:
            print(f"Error with URL {i+1}: {e}")
            if os.path.exists(fname):
                os.remove(fname)
            continue
    
    return False

def verify_gzip_file(fname):
    """Verify that the file is a valid gzip archive"""
    try:
        with open(fname, "rb") as f:
            # Check gzip magic number
            sig = f.read(2)
            if sig != b'\x1f\x8b':
                print(f"Invalid gzip signature: {sig}")
                return False
            
            # Try to read the first few bytes to ensure it's not corrupted
            f.seek(0)
            with tarfile.open(fname, "r") as tar:
                # Try to list contents
                members = tar.getnames()[:5]  # Just check first 5 files
                print(f"Archive contains {len(tar.getnames())} files. Sample: {members}")
                return True
                
    except Exception as e:
        print(f"File verification error: {e}")
        return False

def create_sample_data():
    """Create sample Japanese news data if download fails"""
    print("Creating sample data for demonstration...")
    
    # Sample Japanese news texts (simplified)
    sample_data = {
        'dokujo-tsushin': [
            "今日は良い天気でした。公園で散歩を楽しみました。",
            "新しいカフェがオープンしました。コーヒーがとても美味しいです。",
            "友達と映画を見に行きました。とても面白い映画でした。"
        ],
        'it-life-hack': [
            "新しいプログラミング言語を学習中です。難しいですが楽しいです。",
            "最新のスマートフォンが発売されました。性能が大幅に向上しています。",
            "クラウドサービスの活用方法について説明します。"
        ],
        'kaden-channel': [
            "最新の冷蔵庫は省エネ機能が充実しています。",
            "新型エアコンの性能比較を行いました。",
            "掃除機の選び方について詳しく解説します。"
        ],
        'livedoor-homme': [
            "男性向けファッションの最新トレンドをご紹介します。",
            "健康的な生活習慣について考えてみましょう。",
            "おすすめのヘアスタイルをご提案します。"
        ],
        'movie-enter': [
            "今週公開の映画をレビューします。アクション映画が特におすすめです。",
            "有名俳優の最新インタビューをお届けします。",
            "映画祭の受賞作品について詳しく紹介します。"
        ],
        'peachy': [
            "美容に関する最新情報をお届けします。",
            "スキンケアの正しい方法について説明します。",
            "季節に合わせたメイクアップのコツをご紹介します。"
        ],
        'smax': [
            "最新スマートフォンの詳細レビューをお届けします。",
            "モバイル業界の最新動向について解説します。",
            "便利なアプリの使い方をご紹介します。"
        ],
        'sports-watch': [
            "今日の野球の試合結果をお伝えします。",
            "サッカーワールドカップの最新情報です。",
            "オリンピックの注目競技について解説します。"
        ],
        'topic-news': [
            "政治の最新ニュースをお届けします。",
            "経済状況について詳しく分析します。",
            "社会問題について考察します。"
        ]
    }
    
    all_texts, all_labels = [], []
    for label, texts in sample_data.items():
        for text in texts:
            # Duplicate each text multiple times to create more samples
            for i in range(40):  # Create 40 samples per original text
                all_texts.append(f"{text} サンプル{i+1}")
                all_labels.append(label)
    
    return pd.DataFrame({"text": all_texts, "label": all_labels})

# ========================
# Main execution
# ========================

print("Attempting to download livedoor-news-corpus...")

# Try to download the real dataset
if download_livedoor_corpus():
    # Extract if not already done
    if not os.path.exists("text"):
        print("Extracting...")
        with tarfile.open("ldcc-20140209.tar.gz", "r") as tar:
            tar.extractall()
        print("Extraction complete.")
    else:
        print("Directory 'text/' already exists.")
    
    print("Sample of extracted category dirs:", glob.glob("text/*"))
    
    # Parse all files to DataFrame
    all_texts, all_labels = [], []
    for cat_folder in glob.glob("text/*"):
        cat = os.path.basename(cat_folder)
        if not os.path.isdir(cat_folder):
            continue
        for file in glob.glob(f"{cat_folder}/*.txt"):
            try:
                with open(file, encoding="utf-8") as f:
                    lines = f.readlines()
                    if len(lines) >= 3:  # [url, timestamp, title/body...]
                        text = "".join(lines[2:]).strip()
                        if text:  # Only add non-empty texts
                            all_texts.append(text)
                            all_labels.append(cat)
            except Exception as e:
                print(f"Error reading file {file}: {e}")
                continue
    
    df = pd.DataFrame({"text": all_texts, "label": all_labels})
    print("Successfully loaded livedoor-news-corpus:", df.shape)
    
else:
    print("Failed to download the dataset. Using sample data instead.")
    df = create_sample_data()
    print("Created sample dataset:", df.shape)

# Display dataset info
print("\nDataset label distribution:")
print(df['label'].value_counts().sort_index())

# Sample and limit data for demo (remove these lines for full dataset)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df = df.groupby('label').head(120)  # Limit per class for memory/speed

# Encode labels to integer
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['label'])
num_labels = df['label_id'].nunique()

print(f"\nFinal dataset shape: {df.shape}")
print("Label mapping:", dict(zip(le.classes_, range(num_labels))))
print("\nFirst few samples:")
print(df.head(3))

# Save the processed data
df.to_csv('livedoor_processed.csv', index=False)
print("\nProcessed data saved to 'livedoor_processed.csv'")

Attempting to download livedoor-news-corpus...
Trying URL 1: https://www.rondhuit.com/download/ldcc-20140209.tar.gz
Downloading from URL 1...
Progress: 356.6%
Download complete. Verifying file...
Invalid gzip signature: b'te'
File verification failed. Trying next URL...
Trying URL 2: http://www.rondhuit.com/download/ldcc-20140209.tar.gz
Downloading from URL 2...
Progress: 356.6%
Download complete. Verifying file...
Invalid gzip signature: b'te'
File verification failed. Trying next URL...
Failed to download the dataset. Using sample data instead.
Creating sample data for demonstration...
Created sample dataset: (1080, 2)

Dataset label distribution:
label
dokujo-tsushin    120
it-life-hack      120
kaden-channel     120
livedoor-homme    120
movie-enter       120
peachy            120
smax              120
sports-watch      120
topic-news        120
Name: count, dtype: int64

Final dataset shape: (1080, 3)
Label mapping: {'dokujo-tsushin': 0, 'it-life-hack': 1, 'kaden-channel': 2, 'liv

In [51]:
df

Unnamed: 0,text,label,label_id
0,健康的な生活習慣について考えてみましょう。 サンプル24,livedoor-homme,3
1,有名俳優の最新インタビューをお届けします。 サンプル29,movie-enter,4
2,最新スマートフォンの詳細レビューをお届けします。 サンプル12,smax,6
3,映画祭の受賞作品について詳しく紹介します。 サンプル29,movie-enter,4
4,社会問題について考察します。 サンプル12,topic-news,8
...,...,...,...
1075,掃除機の選び方について詳しく解説します。 サンプル11,kaden-channel,2
1076,おすすめのヘアスタイルをご提案します。 サンプル27,livedoor-homme,3
1077,新しいプログラミング言語を学習中です。難しいですが楽しいです。 サンプル2,it-life-hack,1
1078,社会問題について考察します。 サンプル5,topic-news,8


In [60]:
import pandas as pd
import numpy as np
import requests
import os
import tarfile
import glob

# ========================
# 1. Download & Load livedoor-news-corpus
# ========================
print("Downloading livedoor-news-corpus...")
url = "https://www.rondhuit.com/download/ldcc-20140209.tar.gz"
fname = "ldcc-20140209.tar.gz"

def download_if_needed(url, fname):
    if not os.path.exists(fname):
        print("Downloading...")
        headers = {'User-Agent': 'Mozilla/5.0'}
        with requests.get(url, stream=True, headers=headers) as r:
            r.raise_for_status()
            with open(fname, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        print("Download complete.")
    else:
        print("File already exists:", fname)
    # Just check it's NOT html (error page)
    with open(fname, "rb") as f:
        head = f.read(512)
        if b'<html' in head.lower() or b'<title' in head.lower():
            raise RuntimeError(f"{fname} looks like an HTML error page, not a tar file! Delete and retry.")

download_if_needed(url, fname)

# Extract as **plain tar**, not gzip!
if not os.path.exists("text"):
    print("Extracting...")
    try:
        with tarfile.open(fname, "r") as tar:  # Note: "r", not "r:gz"
            tar.extractall()
        print("Extraction complete.")
    except Exception as e:
        print("Extraction failed!", e)
else:
    print("Directory 'text/' already exists.")

print("Sample of extracted category dirs:", glob.glob("text/*"))

# ========================
# 2. Parse All Files to DataFrame
# ========================
all_texts, all_labels = [], []
for cat_folder in glob.glob("text/*"):
    cat = os.path.basename(cat_folder)
    if not os.path.isdir(cat_folder):
        continue
    for file in glob.glob(f"{cat_folder}/*.txt"):
        with open(file, encoding="utf-8") as f:
            lines = f.readlines()
            if len(lines) >= 3:  # [url, timestamp, title/body...]
                text = "".join(lines[2:]).strip()
                all_texts.append(text)
                all_labels.append(cat)

df = pd.DataFrame({"text": all_texts, "label": all_labels})
print("Loaded livedoor-news-corpus:", df.shape)
print(df['label'].value_counts().sort_index())

# For demo: sample a small subset (for full training, remove .sample)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df = df.groupby('label').head(120)  # Limit per class for memory/speed

# Encode labels to integer
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['label'])
num_labels = df['label_id'].nunique()
print("Label mapping:", dict(zip(le.classes_, range(num_labels))))
print(df.head(3))


Downloading livedoor-news-corpus...
File already exists: ldcc-20140209.tar.gz
Directory 'text/' already exists.
Sample of extracted category dirs: ['text/movie-enter', 'text/it-life-hack', 'text/kaden-channel', 'text/topic-news', 'text/livedoor-homme', 'text/peachy', 'text/sports-watch', 'text/dokujo-tsushin', 'text/CHANGES.txt', 'text/README.txt', 'text/smax']
Loaded livedoor-news-corpus: (4976, 2)
label
kaden-channel     207
livedoor-homme    512
movie-enter       871
peachy            843
smax              871
sports-watch      901
topic-news        771
Name: count, dtype: int64
Label mapping: {'kaden-channel': 0, 'livedoor-homme': 1, 'movie-enter': 2, 'peachy': 3, 'smax': 4, 'sports-watch': 5, 'topic-news': 6}
                                                text         label  label_id
0  【Sports Watch】フジテレビ・スポーツ番組の「韓日戦」表記の理由とは\n先月下旬、...  sports-watch         5
1  巨大都市ニューヨーク各所に出現した“どこでもドア”\n　自宅やオフィスなど、我々は毎日数え切...   movie-enter         2
2  5,000万個販売した大ヒットロールケーキに「紅茶味」が新登場\n　いつでもおウチが

In [2]:
import os
import requests
import pandas as pd

wrime_url = "https://raw.githubusercontent.com/ids-cv/wrime/refs/heads/master/wrime-ver1.tsv"
wrime_path = "wrime-ver1.tsv"
if not os.path.exists(wrime_path):
    r = requests.get(wrime_url)
    open(wrime_path, "wb").write(r.content)
df_wrime = pd.read_csv(wrime_path, sep="\t")
df_wrime = df_wrime.dropna(subset=["Sentence"])
df_wrime

Unnamed: 0,Sentence,UserID,Datetime,Train/Dev/Test,Writer_Joy,Writer_Sadness,Writer_Anticipation,Writer_Surprise,Writer_Anger,Writer_Fear,...,Reader3_Disgust,Reader3_Trust,Avg. Readers_Joy,Avg. Readers_Sadness,Avg. Readers_Anticipation,Avg. Readers_Surprise,Avg. Readers_Anger,Avg. Readers_Fear,Avg. Readers_Disgust,Avg. Readers_Trust
0,ぼけっとしてたらこんな時間｡チャリあるから食べにでたいのに…,1,2012/07/31 23:48,train,0,1,2,1,1,0,...,1,0,0,2,0,0,0,0,0,0
1,今日の月も白くて明るい。昨日より雲が少なくてキレイな? と立ち止まる帰り道｡チャリなし生活も...,1,2012/08/02 23:09,train,3,0,3,0,0,0,...,0,1,1,0,0,2,0,0,0,0
2,早寝するつもりが飲み物がなくなりコンビニへ｡ん､今日、風が涼しいな。,1,2012/08/05 00:50,train,1,1,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,眠い、眠れない。,1,2012/08/08 01:36,train,0,2,1,0,0,1,...,2,0,0,1,0,0,0,0,1,0
4,ただいま? って新体操してるやん!外食する気満々で家に何もないのに!テレビから離れられない…!,1,2012/08/09 22:24,train,2,1,3,2,0,1,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43195,真夜中にふと思い立ち、ノートPCを持って部屋を出て、ダイニングで仕事したらすんごい捗った。\...,80,2020/09/15 08:01,train,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
43196,ぐっどこんでぃしょん。\n心も頭もクリア。\n秋分の日のおかげかな？\n人と自然としっとり過...,80,2020/09/22 01:52,train,1,0,1,0,0,0,...,0,0,2,0,2,0,0,0,0,0
43197,朝から免許の更新へ。\n90分で終わり、出口へ向かうと献血の呼びかけが。\nみんな通り過ぎて...,80,2020/09/23 22:32,train,2,0,2,1,0,0,...,0,0,2,0,0,0,0,0,0,0
43198,夜も更けて参りましたが、食後のコーヒーが飲みたいのでドリップ開始…\n\nぼんやり秋の夜長を...,80,2020/10/11 00:12,train,2,0,1,0,0,0,...,0,0,0,0,2,0,0,0,0,0
