# In [None]:
# Enhanced Japanese ABSA Pipeline with MultiLLM + LoRA (no embedding extractor, LLM-only)
# Requirements: torch, transformers, peft, datasets, matplotlib, seaborn, pandas, numpy, google-cloud-aiplatform

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import gc
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Dict, List, Tuple, Optional, Any, Union
from dataclasses import dataclass
import logging

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay
)

from transformers import (
    AutoModel,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from peft import get_peft_model, LoraConfig, TaskType

try:
    from datasets import Dataset
except ImportError:
    raise ImportError("You need to 'pip install datasets'")

# Vertex AI imports (only for data generation)
import vertexai
from vertexai.preview.generative_models import GenerativeModel

# Route INFO-level (and higher) log records through the root logger.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every class below.
logger = logging.getLogger(__name__)
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Shared plotting look: the 'seaborn-v0_8' matplotlib style sheet and seaborn's "husl" palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

@dataclass
class VertexAIConfig:
    """Vertex AI project settings.

    In the code visible here only ``project_id`` and ``location`` are read
    (they are passed to ``vertexai.init``). The remaining fields look like
    deployment/training-resource settings -- NOTE(review): confirm where they
    are consumed.
    """
    # Google Cloud project passed to vertexai.init().
    project_id: str = "your-project-id"
    # Region passed to vertexai.init().
    location: str = "us-central1"
    staging_bucket: str = "gs://your-bucket-name"
    model_display_name: str = "japanese-absa-model"
    endpoint_display_name: str = "japanese-absa-endpoint"
    # None means no service account was configured.
    service_account: Optional[str] = None
    machine_type: str = "n1-standard-4"
    accelerator_type: str = "NVIDIA_TESLA_T4"
    accelerator_count: int = 1
    use_gpu: bool = True

@dataclass
class ModelConfig:
    """Split/training settings plus the encoder checkpoints to compare.

    ``model_candidates`` may be left as ``None``; it is then replaced with the
    built-in list of Japanese/multilingual checkpoints after construction.
    """
    n_splits: int = 5
    test_size: float = 0.2
    random_state: int = 42
    batch_size: int = 64
    max_text_length: int = 256
    model_candidates: list = None

    def __post_init__(self):
        # A caller-supplied list is kept untouched.
        if self.model_candidates is not None:
            return
        # A fresh list per instance, so instances never share the default.
        self.model_candidates = [
            "cl-tohoku/bert-base-japanese-whole-word-masking",
            "rinna/japanese-roberta-base",
            "studio-ousia/luke-japanese-base-lite",
            "xlm-roberta-base",
        ]

@dataclass
class AspectConfig:
    """Aspect name -> Japanese keywords used to detect that aspect in a review.

    Leaving ``aspects`` as ``None`` installs the built-in eight-aspect keyword
    table after construction.
    """
    aspects: Dict[str, List[str]] = None

    def __post_init__(self):
        # Respect any mapping the caller supplied.
        if self.aspects is not None:
            return
        default_keywords = {}
        default_keywords['quality'] = ['品質', '質', '良い', '悪い', '高品質', '低品質', 'クオリティ', '品質管理']
        default_keywords['service'] = ['サービス', '対応', '接客', '親切', '丁寧', '態度', 'スタッフ', '店員']
        default_keywords['price'] = ['価格', '値段', '料金', '安い', '高い', 'コスト', '費用', '価格設定']
        default_keywords['convenience'] = ['便利', '不便', '簡単', '難しい', '使いやすい', '使いにくい', 'アクセス']
        default_keywords['speed'] = ['速い', '遅い', '早い', 'スピード', '迅速', '時間', '待ち時間']
        default_keywords['atmosphere'] = ['雰囲気', '環境', '空間', '居心地', '快適', '不快', '清潔']
        default_keywords['taste'] = ['味', '美味しい', 'まずい', '美味', '風味', '食感', '新鮮']
        default_keywords['design'] = ['デザイン', '見た目', '外観', 'おしゃれ', 'かっこいい', '美しい']
        self.aspects = default_keywords

class VertexAIDataGenerator:
    """Builds a labelled review DataFrame from Gemini output plus hand-written samples.

    Each prompt asks the "gemini-pro" generative model for reviews about one
    (aspect, sentiment) pair; the reply is parsed line by line into records
    with the keys ``review_text``, ``sentiment`` (0/1/2), ``aspect``,
    ``text_length`` and ``generated``.
    """

    # Integer ids stored in the 'sentiment' column for each label name.
    _SENTIMENT_IDS = {'positive': 2, 'neutral': 1, 'negative': 0}

    def __init__(self, config: VertexAIConfig, aspect_config: AspectConfig):
        """Store the configs and initialise the generative model immediately."""
        self.config = config
        self.aspect_config = aspect_config
        self.generative_model = None
        self._initialize_model()

    def _initialize_model(self):
        """Initialise Vertex AI and create the generative model.

        Raises:
            Exception: whatever ``vertexai.init`` / ``GenerativeModel`` raised,
                re-raised after being logged.
        """
        try:
            vertexai.init(project=self.config.project_id, location=self.config.location)
            self.generative_model = GenerativeModel("gemini-pro")
            logger.info("Initialized Vertex AI generative model")
        except Exception as e:
            logger.error(f"Failed to initialize generative model: {e}")
            raise

    def generate_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """Generate reviews for every prompt and append the manual samples.

        Args:
            num_samples: target number of generated reviews; it is divided
                evenly across the prompts to get a per-prompt cap.

        Returns:
            DataFrame of generated records followed by the manual records.
            A prompt whose generation or parsing fails is logged and skipped.
        """
        logger.info(f"Generating {num_samples} training samples using Vertex AI...")
        prompts = self._create_generation_prompts()
        generated_data = []
        # Keep at least one sample per prompt (what the integer division plus
        # parser effectively produced before) and tolerate an empty aspect map
        # instead of dividing by zero.
        samples_per_prompt = max(1, num_samples // len(prompts)) if prompts else 0
        for i, prompt in enumerate(prompts):
            logger.info(f"Generating data for prompt {i+1}/{len(prompts)}")
            try:
                response = self.generative_model.generate_content(
                    prompt, generation_config={
                        "max_output_tokens": 2048,
                        "temperature": 0.7,
                        "top_p": 0.8,
                        "top_k": 40
                    }
                )
                samples = self._parse_generated_response(response.text, samples_per_prompt)
                generated_data.extend(samples)
            except Exception as e:
                # Best effort: one failed prompt must not abort the whole run.
                logger.warning(f"Error generating data for prompt {i}: {e}")
                continue
        df = pd.DataFrame(generated_data)
        manual_df = pd.DataFrame(self._create_manual_samples())
        df = pd.concat([df, manual_df], ignore_index=True)
        logger.info(f"Generated {len(df)} training samples")
        return df

    def _create_generation_prompts(self) -> List[str]:
        """Return one prompt per (aspect, sentiment) pair, in aspect order."""
        prompts = []
        for aspect, keywords in self.aspect_config.aspects.items():
            for sentiment in ['positive', 'negative', 'neutral']:
                prompt = f"""
Generate 20 realistic Japanese customer reviews about {aspect} ({', '.join(keywords[:3])}) 
with {sentiment} sentiment. Each review should be 20-100 characters long.
Format each review as:
Review: [Japanese text]
Sentiment: {sentiment}
Aspect: {aspect}
Example:
Review: このサービスの品質は素晴らしいです。
Sentiment: positive
Aspect: quality
Generate 20 similar reviews:
"""
                prompts.append(prompt)
        return prompts

    def _parse_generated_response(self, response_text: str, max_samples: int) -> List[Dict]:
        """Parse ``Review:/Sentiment:/Aspect:`` triples out of a model reply.

        A record is emitted when an ``Aspect:`` line arrives and a review and
        sentiment have already been seen. Field labels and the sentiment value
        are matched case-insensitively; an unrecognised sentiment maps to
        neutral (1).

        Args:
            response_text: raw text returned by the model.
            max_samples: maximum number of records to return.

        Returns:
            At most ``max_samples`` record dicts (empty list for empty text or
            a non-positive cap).
        """
        samples = []
        if max_samples <= 0 or not response_text:
            return samples
        review = sentiment = aspect = None
        for raw_line in response_text.splitlines():
            # Split on the first colon only, so a colon (or the word
            # "Review:") inside the review text itself is left intact.
            label, separator, value = raw_line.strip().partition(':')
            if not separator:
                continue
            label = label.strip().lower()
            value = value.strip()
            if label == 'review':
                review = value
            elif label == 'sentiment':
                # The model may capitalise the label ("Positive").
                sentiment = value.lower()
            elif label == 'aspect':
                aspect = value
                if review and sentiment and aspect:
                    samples.append({
                        'review_text': review,
                        'sentiment': self._SENTIMENT_IDS.get(sentiment, 1),
                        'aspect': aspect,
                        'text_length': len(review),
                        'generated': True
                    })
                    review = sentiment = aspect = None
                    if len(samples) >= max_samples:
                        break
        return samples

    def _create_manual_samples(self) -> List[Dict]:
        """Return the hand-written seed reviews, tagged ``generated=False``."""
        manual_samples = [
            {'review_text': 'この商品の品質は期待以上でした。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '高品質な材料を使用していて満足です。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '作りがしっかりしていて良い商品です。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '品質が悪くてがっかりしました。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': '安っぽい材料で作られている感じがします。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': 'クオリティが低すぎて使い物になりません。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': '普通の商品だと思います。', 'sentiment': 1, 'aspect': 'quality'},
            {'review_text': '特に良くも悪くもありません。', 'sentiment': 1, 'aspect': 'service'},
            {'review_text': '標準的な価格帯の商品です。', 'sentiment': 1, 'aspect': 'price'},
        ]
        for sample in manual_samples:
            sample['text_length'] = len(sample['review_text'])
            sample['generated'] = False
        return manual_samples

class EnhancedBusinessInsightExtractor:
    """Turns a labelled review DataFrame into per-aspect metrics and printed reports.

    Sentiment is read from the ``sentiment`` column (0 negative, 1 neutral,
    2 positive); aspect membership from the ``aspect_<name>`` 0/1 columns.
    """

    def __init__(self, aspect_config: AspectConfig = None, label_map: Dict[int, str] = None):
        self.aspect_config = aspect_config if aspect_config else AspectConfig()
        self.label_map = label_map if label_map else {0: "negative", 1: "neutral", 2: "positive"}
        self.colors = {'negative': '#FF6B6B', 'neutral': '#FFD93D', 'positive': '#6BCF7F'}

    def calculate_aspect_metrics(self, df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
        """Return counts, percentage rates and a net score for each mentioned aspect.

        Aspects whose flag column is missing, or that no row mentions, are
        left out of the result.
        """
        per_aspect = {}
        for aspect_name in self.aspect_config.aspects.keys():
            flag_column = f'aspect_{aspect_name}'
            if flag_column not in df.columns:
                continue
            mentioned = df[df[flag_column] == 1]
            mention_total = len(mentioned)
            if mention_total == 0:
                continue
            tally = mentioned['sentiment'].value_counts()
            neg = tally.get(0, 0)
            neu = tally.get(1, 0)
            pos = tally.get(2, 0)
            per_aspect[aspect_name] = {
                'total_mentions': mention_total,
                'negative_count': neg,
                'neutral_count': neu,
                'positive_count': pos,
                'negative_rate': neg / mention_total * 100,
                'neutral_rate': neu / mention_total * 100,
                'positive_rate': pos / mention_total * 100,
                'sentiment_score': (pos - neg) / mention_total
            }
        return per_aspect

    def generate_business_recommendations(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Print the aspect summary and recommendations; return them with the metrics.

        Aspects are ranked by ``negative_rate * log(total_mentions + 1)``,
        highest first. Only the first five recommendations are printed, but
        all of them are returned.
        """
        metrics = self.calculate_aspect_metrics(df)
        recommendations = []
        banner = "=" * 50
        print("\n" + banner)
        print("BUSINESS INSIGHTS & RECOMMENDATIONS")
        print(banner)
        priority_aspects = sorted(
            (
                (name, stats['negative_rate'] * np.log(stats['total_mentions'] + 1), stats)
                for name, stats in metrics.items()
            ),
            key=lambda entry: entry[1],
            reverse=True,
        )
        print(f"\n📊 ASPECT PERFORMANCE SUMMARY")
        print("-" * 40)
        for name, _score, stats in priority_aspects:
            neg_rate = stats['negative_rate']
            pos_rate = stats['positive_rate']
            print(f"• {name.upper()}: {stats['total_mentions']} mentions")
            print(f"  ├─ Negative: {neg_rate:.1f}% ({stats['negative_count']} reviews)")
            print(f"  ├─ Neutral:  {stats['neutral_rate']:.1f}% ({stats['neutral_count']} reviews)")
            print(f"  └─ Positive: {pos_rate:.1f}% ({stats['positive_count']} reviews)")
            # At most one recommendation per aspect; negative feedback wins.
            message = None
            if neg_rate > 30:
                message = f"🔴 URGENT: Address {name} issues - {neg_rate:.1f}% negative feedback"
            elif neg_rate > 20:
                message = f"🟡 ATTENTION: Monitor {name} - {neg_rate:.1f}% negative feedback"
            elif pos_rate > 70:
                message = f"🟢 STRENGTH: Leverage {name} success - {pos_rate:.1f}% positive feedback"
            if message is not None:
                recommendations.append(message)
        print(f"\n🎯 ACTIONABLE RECOMMENDATIONS")
        print("-" * 40)
        for position, text in enumerate(recommendations[:5], start=1):
            print(f"{position}. {text}")
        return {
            "metrics": metrics,
            "recommendations": recommendations,
            "priority_aspects": priority_aspects
        }

    def executive_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Print overall sentiment statistics and return them as a dict.

        Reads the ``sentiment`` and ``text_length`` columns. The overall score
        is (positive % - negative %) / 100.
        """
        banner = "=" * 50
        print("\n" + banner)
        print("EXECUTIVE SUMMARY")
        print(banner)
        total_reviews = len(df)
        sentiment_dist = df['sentiment'].value_counts(normalize=True) * 100
        positive_share = sentiment_dist.get(2, 0)
        negative_share = sentiment_dist.get(0, 0)
        overall_sentiment_score = (positive_share - negative_share) / 100
        avg_text_length = df['text_length'].mean()
        print(f"📈 Total Reviews Analyzed: {total_reviews:,}")
        print(f"📊 Overall Sentiment Score: {overall_sentiment_score:.3f} (-1 to +1)")
        print(f"📝 Average Review Length: {avg_text_length:.0f} characters")
        print(f"\n🎭 Sentiment Distribution:")
        for code in (0, 1, 2):
            label = self.label_map[code]
            pct = sentiment_dist.get(code, 0)
            # 50-cell bar: one filled cell per two percentage points.
            filled = int(pct / 2)
            bar = "█" * filled + "░" * (50 - filled)
            print(f"  {label.capitalize():>8}: {pct:5.1f}% |{bar}|")
        return {
            "total_reviews": total_reviews,
            "sentiment_distribution": sentiment_dist.to_dict(),
            "overall_sentiment_score": overall_sentiment_score,
            "avg_text_length": avg_text_length
        }

class AdvancedMultiTaskModel(nn.Module):
    """Encoder with one main classification head and optional auxiliary heads.

    The encoder's ``last_hidden_state`` is mean-pooled over the sequence,
    passed through dropout, and fed to every head.

    Args:
        base_model: encoder callable with ``input_ids``/``attention_mask``
            keywords; must expose ``config.hidden_size`` and return an object
            with ``last_hidden_state``.
        num_labels: output size of the main head.
        aux_tasks: optional mapping ``task name -> number of classes``; one
            linear head is created per entry.
        dropout: dropout probability applied to the pooled vector.
    """

    def __init__(self, base_model, num_labels=3, aux_tasks=None, dropout=0.2):
        super().__init__()
        self.base_model = base_model
        self.config = base_model.config
        self.dropout = nn.Dropout(dropout)
        hidden_size = base_model.config.hidden_size
        self.main_head = nn.Linear(hidden_size, num_labels)
        self.aux_heads = nn.ModuleDict()
        if aux_tasks:
            for name, classes in aux_tasks.items():
                self.aux_heads[name] = nn.Linear(hidden_size, classes)

    def forward(self, input_ids, attention_mask=None, **kwargs):
        """Return ``{'logits': ..., '<task>_logits': ...}`` for the batch.

        Extra keyword arguments are accepted and ignored.
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state
        if attention_mask is None:
            pooled = hidden.mean(dim=1)
        else:
            # Average over real tokens only: a plain mean would let padding
            # positions dilute the sentence representation.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            # clamp guards against division by zero for an all-padding row.
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
        pooled = self.dropout(pooled)
        out = {'logits': self.main_head(pooled)}
        for name, head in self.aux_heads.items():
            out[name + '_logits'] = head(pooled)
        return out

class VertexAIJapaneseABSAPipeline:
    """Generates reviews, flags aspects, and fine-tunes/compares LoRA encoders.

    The flow driven by :meth:`pipeline` is: generate and filter data, add
    keyword-based aspect flags, then train every checkpoint in
    ``model_config.model_candidates`` and keep the one with the best macro F1.
    """

    def __init__(self, vertex_config: VertexAIConfig, model_config: ModelConfig, aspect_config: AspectConfig):
        self.vertex_config = vertex_config
        self.model_config = model_config
        self.aspect_config = aspect_config
        self.data_generator = VertexAIDataGenerator(vertex_config, aspect_config)
        self.insight_extractor = EnhancedBusinessInsightExtractor(aspect_config)
        # Filled in by train_evaluate_multillm_lora with the best candidate.
        self.model = None
        self.tokenizer = None
        self.trainer = None

    def generate_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """Generate reviews, drop unusable rows and add simple surface features.

        Rows with a missing review, 5 or fewer characters, or more than
        ``model_config.max_text_length`` characters are removed. Adds the
        columns ``word_count``, ``exclamation_count`` and ``question_count``.
        """
        logger.info("Generating training data...")
        df = self.data_generator.generate_training_data(num_samples)
        df = df.dropna(subset=['review_text'])
        lengths = df['review_text'].str.len()
        # .copy() so the column assignments below write to an independent
        # frame rather than to a filtered view of the generator's output.
        df = df[(lengths > 5) & (lengths <= self.model_config.max_text_length)].copy()
        df['word_count'] = df['review_text'].str.split().str.len()
        # Series.str.count takes a regular expression: a bare '?' is an
        # invalid pattern, so the marks are put in character classes. The
        # full-width forms are included because the reviews are Japanese.
        df['exclamation_count'] = df['review_text'].str.count('[!！]')
        df['question_count'] = df['review_text'].str.count('[?？]')
        logger.info(f"Generated {len(df)} training samples")
        return df

    def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add a 0/1 ``aspect_<name>`` column per aspect and ``text_length``.

        An aspect is flagged when any of its keywords occurs in the review.
        The frame is modified in place and also returned.
        """
        logger.info(f"Extracting aspect flags for {len(df)} samples...")
        for aspect, keywords in self.aspect_config.aspects.items():
            if not keywords:
                # An empty alternation would match every review.
                df[f'aspect_{aspect}'] = 0
                continue
            pattern = '|'.join(re.escape(kw) for kw in keywords)
            df[f'aspect_{aspect}'] = df['review_text'].str.contains(pattern, na=False, regex=True).astype(int)
        df['text_length'] = df['review_text'].str.len()
        return df

    def prepare_aux_labels(self, df: pd.DataFrame):
        """Attach placeholder auxiliary labels ``high_joy/sadness/anger``.

        The labels are random 0/1 values for demonstration only; replace them
        with real annotations when available.
        """
        for emo in ['joy', 'sadness', 'anger']:
            df[f"high_{emo}"] = np.random.randint(0, 2, size=len(df))
        return df

    def prepare_datasets(self, df, tokenizer):
        """Tokenise the reviews and return a torch-formatted ``Dataset``.

        Texts are truncated/padded to 128 tokens; the sentiment and the three
        auxiliary columns are exposed as ``labels`` and ``<emotion>_labels``.
        """
        def tokenize_function(examples):
            tok = tokenizer(
                examples['review_text'],
                truncation=True,
                padding='max_length',
                max_length=128
            )
            tok['labels'] = examples['sentiment']
            tok['joy_labels'] = examples['high_joy']
            tok['sadness_labels'] = examples['high_sadness']
            tok['anger_labels'] = examples['high_anger']
            return tok
        dataset = Dataset.from_pandas(df[['review_text', 'sentiment', 'high_joy', 'high_sadness', 'high_anger']])
        dataset = dataset.map(tokenize_function, batched=True)
        columns = ['input_ids', 'attention_mask', 'labels', 'joy_labels', 'sadness_labels', 'anger_labels']
        dataset.set_format(type='torch', columns=columns)
        return dataset

    def setup_model_and_tokenizer(self, model_name, num_labels, aux_tasks=None):
        """Load a checkpoint, wrap it with the multi-task heads and LoRA.

        Returns:
            ``(tokenizer, model)`` with the model moved to CUDA when available.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        base_model = AutoModel.from_pretrained(model_name)
        model = AdvancedMultiTaskModel(base_model, num_labels=num_labels, aux_tasks=aux_tasks)
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=8,
            lora_alpha=16,
            lora_dropout=0.1,
            target_modules=["query", "value"]
        )
        model = get_peft_model(model, lora_config)
        # PEFT leaves only the adapter weights trainable, and the custom heads
        # are not named like the modules it keeps trainable by default for
        # sequence classification. Re-enable them, otherwise the randomly
        # initialised heads would never be updated.
        for param_name, param in model.named_parameters():
            if 'main_head' in param_name or 'aux_heads' in param_name:
                param.requires_grad = True
        model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
        return tokenizer, model

    def train_evaluate_multillm_lora(self, df):
        """Train each candidate checkpoint and keep the best by macro F1.

        Prints the executive summary, recommendations and per-model reports,
        shows a confusion matrix for the winner, and stores the winner on
        ``self.model`` / ``self.tokenizer`` / ``self.trainer``.

        Returns:
            dict with ``model_name``, ``trainer``, ``tokenizer``, ``model``,
            ``test_df``, ``y_pred`` and ``y_true`` for the best candidate.

        Raises:
            ValueError: if ``model_config.model_candidates`` is empty.
        """
        candidates = self.model_config.model_candidates
        if not candidates:
            raise ValueError("model_config.model_candidates is empty; nothing to train")
        # Prepare data and split
        df = self.prepare_aux_labels(df)
        train_df, test_df = train_test_split(
            df,
            test_size=self.model_config.test_size,
            stratify=df['sentiment'],
            random_state=self.model_config.random_state,
        )
        # Features/plots
        self.insight_extractor.executive_summary(train_df)
        self.insight_extractor.generate_business_recommendations(train_df)
        num_labels = 3
        aux_tasks = {'joy': 2, 'sadness': 2, 'anger': 2}
        # Pinned so the reports keep three rows even if a class is absent.
        class_ids = [0, 1, 2]
        class_names = ["negative", "neutral", "positive"]

        class MultiTaskTrainer(Trainer):
            """Trainer whose loss is main cross-entropy + 0.2 * each auxiliary one."""

            def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
                labels = inputs.pop("labels")
                joy_labels = inputs.pop("joy_labels")
                sadness_labels = inputs.pop("sadness_labels")
                anger_labels = inputs.pop("anger_labels")
                outputs = model(**inputs)
                loss = F.cross_entropy(outputs['logits'], labels)
                loss = loss + 0.2 * F.cross_entropy(outputs['joy_logits'], joy_labels)
                loss = loss + 0.2 * F.cross_entropy(outputs['sadness_logits'], sadness_labels)
                loss = loss + 0.2 * F.cross_entropy(outputs['anger_logits'], anger_labels)
                return (loss, outputs) if return_outputs else loss

        best_f1 = -1
        best_result = None
        for model_name in candidates:
            print(f"\n===== Training {model_name} =====")
            tokenizer, model = self.setup_model_and_tokenizer(model_name, num_labels, aux_tasks)
            train_dataset = self.prepare_datasets(train_df, tokenizer)
            test_dataset = self.prepare_datasets(test_df, tokenizer)
            training_args = TrainingArguments(
                output_dir="./results",
                num_train_epochs=2,
                per_device_train_batch_size=8,
                per_device_eval_batch_size=8,
                eval_strategy="epoch",
                save_strategy="no",
                learning_rate=1e-4,
                logging_dir="./logs",
                logging_steps=100,
                fp16=torch.cuda.is_available(),
                report_to=[],
                dataloader_num_workers=0,
                max_grad_norm=1.0,
                # The label columns must reach compute_loss untouched.
                remove_unused_columns=False,
                label_names=["labels", "joy_labels", "sadness_labels", "anger_labels"],
            )
            data_collator = DataCollatorWithPadding(tokenizer, padding=True, max_length=128)
            trainer = MultiTaskTrainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=test_dataset,
                data_collator=data_collator,
                tokenizer=tokenizer,
            )
            trainer.train()
            preds = trainer.predict(test_dataset)
            # predictions[0] holds the main-head logits (first entry of the output dict).
            y_pred = np.argmax(preds.predictions[0], axis=1)
            y_true = test_df['sentiment'].values
            report = classification_report(
                y_true, y_pred, labels=class_ids, target_names=class_names,
                output_dict=True, zero_division=0,
            )
            macro_f1 = report['macro avg']['f1-score']
            print(classification_report(
                y_true, y_pred, labels=class_ids, target_names=class_names, zero_division=0,
            ))
            if macro_f1 > best_f1:
                best_f1 = macro_f1
                best_result = {
                    "model_name": model_name,
                    "trainer": trainer,
                    "tokenizer": tokenizer,
                    "model": model,
                    "test_df": test_df,
                    "y_pred": y_pred,
                    "y_true": y_true
                }
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        print(f"\n===== Best model: {best_result['model_name']} (macro F1={best_f1:.4f}) =====")
        cm = confusion_matrix(best_result["y_true"], best_result["y_pred"], labels=class_ids)
        ConfusionMatrixDisplay(cm, display_labels=["neg", "neu", "pos"]).plot(cmap="Blues")
        plt.title(f"Confusion Matrix ({best_result['model_name']})")
        plt.tight_layout()
        plt.show()
        self.model = best_result['model']
        self.tokenizer = best_result['tokenizer']
        self.trainer = best_result['trainer']
        return best_result

    def pipeline(self, num_samples: int = 1000):
        """Run data generation, feature extraction and model comparison.

        Returns:
            The best-candidate dict produced by ``train_evaluate_multillm_lora``.
        """
        df = self.generate_training_data(num_samples)
        df = self.extract_features(df)
        best_result = self.train_evaluate_multillm_lora(df)
        print("\nPipeline finished! Best LLM+LoRA saved in memory.")
        return best_result

# --------------- Main Entrypoint ---------------
# Script entry point: build the three configs with their defaults (apart from
# the Vertex AI project settings) and run the full pipeline end to end.
if __name__ == "__main__":
    # NOTE(review): project_id and staging_bucket are hard-coded here --
    # replace them with your own project/bucket before running.
    vertex_config = VertexAIConfig(
        project_id="able-balm-454718-n8",
        location="us-central1",
        staging_bucket="gs://your-staging-bucket",
        use_gpu=True
    )
    model_config = ModelConfig()
    aspect_config = AspectConfig()
    # Constructing the pipeline also constructs the data generator, which
    # initialises Vertex AI immediately.
    pipeline = VertexAIJapaneseABSAPipeline(vertex_config, model_config, aspect_config)
    # Ask for 1200 generated reviews.
    pipeline.pipeline(num_samples=1200)


# In [None]:
# Enhanced Japanese ABSA Pipeline with MultiLLM + LoRA + RAG (with all business logic, plots, and analytics preserved)
# Requirements: torch, transformers, peft, datasets, matplotlib, seaborn, pandas, numpy, google-cloud-aiplatform, sentence-transformers, faiss-cpu

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import gc
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Dict, List, Tuple, Optional, Any, Union
from dataclasses import dataclass
from pathlib import Path
import logging

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay
)
from sklearn.preprocessing import LabelEncoder

from transformers import (
    AutoModel,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from peft import get_peft_model, LoraConfig, TaskType

try:
    from datasets import Dataset
except ImportError:
    raise ImportError("You need to 'pip install datasets'")

# RAG imports
try:
    from sentence_transformers import SentenceTransformer
    import faiss
except ImportError:
    raise ImportError("You need to 'pip install sentence-transformers faiss-cpu'")

# Vertex AI imports
from google.cloud import aiplatform
import vertexai
from vertexai.language_models import TextEmbeddingModel
from vertexai.preview.generative_models import GenerativeModel

# Route INFO-level (and higher) log records through the root logger.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every class below.
logger = logging.getLogger(__name__)
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Shared plotting look: the 'seaborn-v0_8' matplotlib style sheet and seaborn's "husl" palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

@dataclass
class VertexAIConfig:
    """Vertex AI project settings.

    In the code visible here only ``project_id`` and ``location`` are read
    (they are passed to ``vertexai.init``). The remaining fields look like
    deployment/training-resource settings -- NOTE(review): confirm where they
    are consumed.
    """
    # Google Cloud project passed to vertexai.init().
    project_id: str = "your-project-id"
    # Region passed to vertexai.init().
    location: str = "us-central1"
    staging_bucket: str = "gs://your-bucket-name"
    model_display_name: str = "japanese-absa-model"
    endpoint_display_name: str = "japanese-absa-endpoint"
    # None means no service account was configured.
    service_account: Optional[str] = None
    machine_type: str = "n1-standard-4"
    accelerator_type: str = "NVIDIA_TESLA_T4"
    accelerator_count: int = 1
    use_gpu: bool = True

@dataclass
class ModelConfig:
    """Split, embedding and training settings plus the encoder checkpoints to compare.

    ``embedding_model_name`` is the local SentenceTransformer used when the
    Vertex AI embedding model (``vertex_embedding_model``) cannot be
    initialised. ``model_candidates`` may be left as ``None``; it is then
    replaced with the built-in checkpoint list after construction.
    """
    n_splits: int = 5
    embedding_model_name: str = "intfloat/multilingual-e5-base"
    test_size: float = 0.2
    random_state: int = 42
    batch_size: int = 64
    max_text_length: int = 512
    vertex_embedding_model: str = "textembedding-gecko-multilingual@001"
    model_candidates: list = None

    def __post_init__(self):
        # A caller-supplied list is kept untouched.
        if self.model_candidates is not None:
            return
        # A fresh list per instance, so instances never share the default.
        self.model_candidates = [
            "cl-tohoku/bert-base-japanese-whole-word-masking",
            "rinna/japanese-roberta-base",
            "studio-ousia/luke-japanese-base-lite",
            "xlm-roberta-base",
        ]

@dataclass
class AspectConfig:
    """Aspect name -> Japanese keywords used to detect that aspect in a review.

    Leaving ``aspects`` as ``None`` installs the built-in eight-aspect keyword
    table after construction.
    """
    aspects: Dict[str, List[str]] = None

    def __post_init__(self):
        # Respect any mapping the caller supplied.
        if self.aspects is not None:
            return
        default_keywords = {}
        default_keywords['quality'] = ['品質', '質', '良い', '悪い', '高品質', '低品質', 'クオリティ', '品質管理']
        default_keywords['service'] = ['サービス', '対応', '接客', '親切', '丁寧', '態度', 'スタッフ', '店員']
        default_keywords['price'] = ['価格', '値段', '料金', '安い', '高い', 'コスト', '費用', '価格設定']
        default_keywords['convenience'] = ['便利', '不便', '簡単', '難しい', '使いやすい', '使いにくい', 'アクセス']
        default_keywords['speed'] = ['速い', '遅い', '早い', 'スピード', '迅速', '時間', '待ち時間']
        default_keywords['atmosphere'] = ['雰囲気', '環境', '空間', '居心地', '快適', '不快', '清潔']
        default_keywords['taste'] = ['味', '美味しい', 'まずい', '美味', '風味', '食感', '新鮮']
        default_keywords['design'] = ['デザイン', '見た目', '外観', 'おしゃれ', 'かっこいい', '美しい']
        self.aspects = default_keywords

# ==========================
# Embedding & Data Generation
# ==========================

class VertexAIEmbeddingExtractor:
    """Produces text embeddings via Vertex AI, with a local SentenceTransformer fallback.

    The Vertex AI model named by ``model_config.vertex_embedding_model`` is
    tried first; if its initialisation raises, the SentenceTransformer named
    by ``model_config.embedding_model_name`` is loaded instead.
    """

    def __init__(self, config: VertexAIConfig, model_config: ModelConfig):
        self.config = config
        self.model_config = model_config
        self.embedding_model = None
        self._initialize_vertex_ai()

    def _initialize_vertex_ai(self):
        """Load the Vertex AI embedding model, or the local fallback on any error."""
        vertex_model_name = self.model_config.vertex_embedding_model
        try:
            vertexai.init(project=self.config.project_id, location=self.config.location)
            self.embedding_model = TextEmbeddingModel.from_pretrained(vertex_model_name)
            logger.info(f"Initialized Vertex AI embedding model: {self.model_config.vertex_embedding_model}")
        except Exception as e:
            logger.warning(f"Failed to initialize Vertex AI embeddings: {e}")
            self.embedding_model = SentenceTransformer(self.model_config.embedding_model_name)

    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` with whichever backend was initialised.

        Vertex AI requests are sent five texts at a time; the local model
        encodes with ``model_config.batch_size`` and normalised vectors.

        Raises:
            Exception: any backend error, re-raised after being logged.
        """
        try:
            if not isinstance(self.embedding_model, TextEmbeddingModel):
                # Local SentenceTransformer fallback.
                return self.embedding_model.encode(
                    texts,
                    show_progress_bar=True,
                    batch_size=self.model_config.batch_size,
                    normalize_embeddings=True,
                )
            chunk_size = 5
            vectors = []
            for start in range(0, len(texts), chunk_size):
                response = self.embedding_model.get_embeddings(texts[start:start + chunk_size])
                for item in response:
                    vectors.append(item.values)
            return np.array(vectors)
        except Exception as e:
            logger.error(f"Error getting embeddings: {e}")
            raise

class VertexAIDataGenerator:
    """Builds a labelled review DataFrame from Gemini output plus hand-written samples.

    Each prompt asks the "gemini-pro" generative model for reviews about one
    (aspect, sentiment) pair; the reply is parsed line by line into records
    with the keys ``review_text``, ``sentiment`` (0/1/2), ``aspect``,
    ``text_length`` and ``generated``.
    """

    # Integer ids stored in the 'sentiment' column for each label name.
    _SENTIMENT_IDS = {'positive': 2, 'neutral': 1, 'negative': 0}

    def __init__(self, config: VertexAIConfig, aspect_config: AspectConfig):
        """Store the configs and initialise the generative model immediately."""
        self.config = config
        self.aspect_config = aspect_config
        self.generative_model = None
        self._initialize_model()

    def _initialize_model(self):
        """Initialise Vertex AI and create the generative model.

        Raises:
            Exception: whatever ``vertexai.init`` / ``GenerativeModel`` raised,
                re-raised after being logged.
        """
        try:
            vertexai.init(project=self.config.project_id, location=self.config.location)
            self.generative_model = GenerativeModel("gemini-pro")
            logger.info("Initialized Vertex AI generative model")
        except Exception as e:
            logger.error(f"Failed to initialize generative model: {e}")
            raise

    def generate_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """Generate reviews for every prompt and append the manual samples.

        Args:
            num_samples: target number of generated reviews; it is divided
                evenly across the prompts to get a per-prompt cap.

        Returns:
            DataFrame of generated records followed by the manual records.
            A prompt whose generation or parsing fails is logged and skipped.
        """
        logger.info(f"Generating {num_samples} training samples using Vertex AI...")
        prompts = self._create_generation_prompts()
        generated_data = []
        # Keep at least one sample per prompt (what the integer division plus
        # parser effectively produced before) and tolerate an empty aspect map
        # instead of dividing by zero.
        samples_per_prompt = max(1, num_samples // len(prompts)) if prompts else 0
        for i, prompt in enumerate(prompts):
            logger.info(f"Generating data for prompt {i+1}/{len(prompts)}")
            try:
                response = self.generative_model.generate_content(
                    prompt, generation_config={
                        "max_output_tokens": 2048,
                        "temperature": 0.7,
                        "top_p": 0.8,
                        "top_k": 40
                    }
                )
                samples = self._parse_generated_response(response.text, samples_per_prompt)
                generated_data.extend(samples)
            except Exception as e:
                # Best effort: one failed prompt must not abort the whole run.
                logger.warning(f"Error generating data for prompt {i}: {e}")
                continue
        df = pd.DataFrame(generated_data)
        manual_df = pd.DataFrame(self._create_manual_samples())
        df = pd.concat([df, manual_df], ignore_index=True)
        logger.info(f"Generated {len(df)} training samples")
        return df

    def _create_generation_prompts(self) -> List[str]:
        """Return one prompt per (aspect, sentiment) pair, in aspect order."""
        prompts = []
        for aspect, keywords in self.aspect_config.aspects.items():
            for sentiment in ['positive', 'negative', 'neutral']:
                prompt = f"""
Generate 20 realistic Japanese customer reviews about {aspect} ({', '.join(keywords[:3])}) 
with {sentiment} sentiment. Each review should be 20-100 characters long.
Format each review as:
Review: [Japanese text]
Sentiment: {sentiment}
Aspect: {aspect}
Example:
Review: このサービスの品質は素晴らしいです。
Sentiment: positive
Aspect: quality
Generate 20 similar reviews:
"""
                prompts.append(prompt)
        return prompts

    def _parse_generated_response(self, response_text: str, max_samples: int) -> List[Dict]:
        """Parse ``Review:/Sentiment:/Aspect:`` triples out of a model reply.

        A record is emitted when an ``Aspect:`` line arrives and a review and
        sentiment have already been seen. Field labels and the sentiment value
        are matched case-insensitively; an unrecognised sentiment maps to
        neutral (1).

        Args:
            response_text: raw text returned by the model.
            max_samples: maximum number of records to return.

        Returns:
            At most ``max_samples`` record dicts (empty list for empty text or
            a non-positive cap).
        """
        samples = []
        if max_samples <= 0 or not response_text:
            return samples
        review = sentiment = aspect = None
        for raw_line in response_text.splitlines():
            # Split on the first colon only, so a colon (or the word
            # "Review:") inside the review text itself is left intact.
            label, separator, value = raw_line.strip().partition(':')
            if not separator:
                continue
            label = label.strip().lower()
            value = value.strip()
            if label == 'review':
                review = value
            elif label == 'sentiment':
                # The model may capitalise the label ("Positive").
                sentiment = value.lower()
            elif label == 'aspect':
                aspect = value
                if review and sentiment and aspect:
                    samples.append({
                        'review_text': review,
                        'sentiment': self._SENTIMENT_IDS.get(sentiment, 1),
                        'aspect': aspect,
                        'text_length': len(review),
                        'generated': True
                    })
                    review = sentiment = aspect = None
                    if len(samples) >= max_samples:
                        break
        return samples

    def _create_manual_samples(self) -> List[Dict]:
        """Return the hand-written seed reviews, tagged ``generated=False``."""
        manual_samples = [
            {'review_text': 'この商品の品質は期待以上でした。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '高品質な材料を使用していて満足です。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '作りがしっかりしていて良い商品です。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '品質が悪くてがっかりしました。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': '安っぽい材料で作られている感じがします。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': 'クオリティが低すぎて使い物になりません。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': '普通の商品だと思います。', 'sentiment': 1, 'aspect': 'quality'},
            {'review_text': '特に良くも悪くもありません。', 'sentiment': 1, 'aspect': 'service'},
            {'review_text': '標準的な価格帯の商品です。', 'sentiment': 1, 'aspect': 'price'},
        ]
        for sample in manual_samples:
            sample['text_length'] = len(sample['review_text'])
            sample['generated'] = False
        return manual_samples

# =============================
# Business Insight & Analytics
# =============================

class EnhancedBusinessInsightExtractor:
    """Aggregate sentiment per aspect and print business-facing console reports.

    The methods read an integer ``sentiment`` column (0, 1, 2), a
    ``text_length`` column and binary ``aspect_<name>`` indicator columns
    from the DataFrame they are given.
    """

    def __init__(self, aspect_config: AspectConfig = None, label_map: Dict[int, str] = None):
        if not aspect_config:
            aspect_config = AspectConfig()
        if not label_map:
            label_map = {0: "negative", 1: "neutral", 2: "positive"}
        self.aspect_config = aspect_config
        self.label_map = label_map
        self.colors = {'negative': '#FF6B6B','neutral': '#FFD93D','positive': '#6BCF7F'}

    def calculate_aspect_metrics(self, df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
        """Return counts, percentage rates and a net score for every mentioned aspect.

        Aspects whose indicator column is missing, or that no row mentions,
        are left out of the result.
        """
        per_aspect = {}
        for name in self.aspect_config.aspects:
            flag_column = f'aspect_{name}'
            if flag_column not in df.columns:
                continue
            mentioning = df[df[flag_column] == 1]
            mention_total = len(mentioning)
            if mention_total == 0:
                continue
            counts = mentioning['sentiment'].value_counts()
            negatives = counts.get(0, 0)
            neutrals = counts.get(1, 0)
            positives = counts.get(2, 0)
            per_aspect[name] = {
                'total_mentions': mention_total,
                'negative_count': negatives,
                'neutral_count': neutrals,
                'positive_count': positives,
                'negative_rate': negatives / mention_total * 100,
                'neutral_rate': neutrals / mention_total * 100,
                'positive_rate': positives / mention_total * 100,
                # Net share of positive over negative mentions, in [-1, 1].
                'sentiment_score': (positives - negatives) / mention_total
            }
        return per_aspect

    @staticmethod
    def _recommendation_for(aspect, data):
        """Return the advice line for one aspect, or None if no threshold is crossed."""
        negative_rate = data['negative_rate']
        if negative_rate > 30:
            return f"🔴 URGENT: Address {aspect} issues - {negative_rate:.1f}% negative feedback"
        if negative_rate > 20:
            return f"🟡 ATTENTION: Monitor {aspect} - {negative_rate:.1f}% negative feedback"
        if data['positive_rate'] > 70:
            return f"🟢 STRENGTH: Leverage {aspect} success - {data['positive_rate']:.1f}% positive feedback"
        return None

    def generate_business_recommendations(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Print the per-aspect summary and return metrics, advice and the ranking.

        Aspects are ranked by negative rate weighted with the log of their
        mention count; only the first five recommendations are printed, but
        all of them are returned.
        """
        metrics = self.calculate_aspect_metrics(df)
        recommendations = []
        rule = "=" * 50
        print("\n" + rule)
        print("BUSINESS INSIGHTS & RECOMMENDATIONS")
        print(rule)
        priority_aspects = sorted(
            (
                (aspect, data['negative_rate'] * np.log(data['total_mentions'] + 1), data)
                for aspect, data in metrics.items()
            ),
            key=lambda entry: entry[1],
            reverse=True,
        )
        print("\n📊 ASPECT PERFORMANCE SUMMARY")
        print("-" * 40)
        for aspect, _, data in priority_aspects:
            print(f"• {aspect.upper()}: {data['total_mentions']} mentions")
            print(f"  ├─ Negative: {data['negative_rate']:.1f}% ({data['negative_count']} reviews)")
            print(f"  ├─ Neutral:  {data['neutral_rate']:.1f}% ({data['neutral_count']} reviews)")
            print(f"  └─ Positive: {data['positive_rate']:.1f}% ({data['positive_count']} reviews)")
            advice = self._recommendation_for(aspect, data)
            if advice is not None:
                recommendations.append(advice)
        print("\n🎯 ACTIONABLE RECOMMENDATIONS")
        print("-" * 40)
        for position, advice in enumerate(recommendations[:5], 1):
            print(f"{position}. {advice}")
        return {
            "metrics": metrics,
            "recommendations": recommendations,
            "priority_aspects": priority_aspects
        }

    def executive_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Print overall volume, net sentiment and a text bar chart; return the figures."""
        rule = "=" * 50
        print("\n" + rule)
        print("EXECUTIVE SUMMARY")
        print(rule)
        review_total = len(df)
        shares = df['sentiment'].value_counts(normalize=True) * 100
        net_score = (shares.get(2, 0) - shares.get(0, 0)) / 100
        mean_length = df['text_length'].mean()
        print(f"📈 Total Reviews Analyzed: {review_total:,}")
        print(f"📊 Overall Sentiment Score: {net_score:.3f} (-1 to +1)")
        print(f"📝 Average Review Length: {mean_length:.0f} characters")
        print("\n🎭 Sentiment Distribution:")
        for code in (0, 1, 2):
            name = self.label_map[code]
            share = shares.get(code, 0)
            # One bar cell per two percentage points, 50 cells in total.
            filled = int(share / 2)
            gauge = "█" * filled + "░" * (50 - filled)
            print(f"  {name.capitalize():>8}: {share:5.1f}% |{gauge}|")
        return {
            "total_reviews": review_total,
            "sentiment_distribution": shares.to_dict(),
            "overall_sentiment_score": net_score,
            "avg_text_length": mean_length
        }

# ===================================
# LLM+LoRA advanced model and pipeline
# ===================================

class AdvancedMultiTaskModel(nn.Module):
    """Shared encoder with one main classification head and optional auxiliary heads.

    Args:
        base_model: Encoder whose output exposes ``last_hidden_state`` and whose
            ``config`` exposes ``hidden_size``.
        num_labels: Number of classes for the main head.
        aux_tasks: Optional mapping of auxiliary task name to its class count;
            each entry gets its own linear head.
        dropout: Dropout probability applied to the pooled sequence vector.
    """

    def __init__(self, base_model, num_labels=3, aux_tasks=None, dropout=0.2):
        super().__init__()
        self.base_model = base_model
        # Re-exported on the wrapper; presumably read by tooling that expects
        # ``model.config`` (e.g. PEFT / Trainer) — confirm before removing.
        self.config = base_model.config
        self.dropout = nn.Dropout(dropout)
        hidden_size = base_model.config.hidden_size
        self.main_head = nn.Linear(hidden_size, num_labels)
        self.aux_heads = nn.ModuleDict()
        if aux_tasks:
            for name, classes in aux_tasks.items():
                self.aux_heads[name] = nn.Linear(hidden_size, classes)

    def forward(self, input_ids, attention_mask=None, **kwargs):
        """Encode the batch and return a dict of logits.

        Returns:
            ``{'logits': ...}`` for the main task plus ``'<name>_logits'`` for
            every auxiliary head. Extra keyword arguments are accepted and ignored.
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state
        if attention_mask is None:
            pooled = hidden.mean(dim=1)
        else:
            # Average only over real tokens: with fixed-length padding a plain
            # mean would be dominated by the states of padding positions.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1.0)  # avoid 0/0 on empty rows
            pooled = (hidden * mask).sum(dim=1) / token_counts
        pooled = self.dropout(pooled)
        out = {'logits': self.main_head(pooled)}
        for name, head in self.aux_heads.items():
            out[name + '_logits'] = head(pooled)
        return out

class VertexAIJapaneseABSAPipeline:
    """Generate review data, derive aspect features and fine-tune LoRA classifiers.

    After ``train_evaluate_multillm_lora`` has run, the best candidate is kept
    on ``self.model`` / ``self.tokenizer`` / ``self.trainer`` and the training
    split on ``self.df_train`` (used as the retrieval corpus for RAG).
    """

    # Sentiment class ids and their display names, in label order. They are
    # passed explicitly to the sklearn metrics so that a class missing from a
    # small test split cannot break the reports.
    SENTIMENT_IDS = [0, 1, 2]
    SENTIMENT_NAMES = ["negative", "neutral", "positive"]
    # Token length shared by tokenisation and batch collation.
    MAX_SEQ_LENGTH = 128

    def __init__(self, vertex_config: VertexAIConfig, model_config: ModelConfig, aspect_config: AspectConfig):
        self.vertex_config = vertex_config
        self.model_config = model_config
        self.aspect_config = aspect_config
        self.data_generator = VertexAIDataGenerator(vertex_config, aspect_config)
        self.embedding_extractor = VertexAIEmbeddingExtractor(vertex_config, model_config)
        self.insight_extractor = EnhancedBusinessInsightExtractor(aspect_config)
        self.model = None
        self.tokenizer = None
        self.trainer = None
        self.df_train = None  # filled in by train_evaluate_multillm_lora

    def generate_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """Generate reviews, drop unusable rows and add simple surface features.

        Rows without text, with 5 or fewer characters, or longer than
        ``model_config.max_text_length`` are removed.
        """
        logger.info("Generating training data...")
        df = self.data_generator.generate_training_data(num_samples)
        df = df.dropna(subset=['review_text'])
        lengths = df['review_text'].str.len()
        # Copy so the column assignments below write to an independent frame.
        df = df[(lengths > 5) & (lengths <= self.model_config.max_text_length)].copy()
        df['word_count'] = df['review_text'].str.split().str.len()
        # Series.str.count takes a regular expression: a bare '?' is not a valid
        # pattern. Character classes also cover the full-width marks that
        # Japanese text normally uses.
        df['exclamation_count'] = df['review_text'].str.count(r'[!！]')
        df['question_count'] = df['review_text'].str.count(r'[?？]')
        logger.info(f"Generated {len(df)} training samples")
        return df

    def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add a 0/1 ``aspect_<name>`` column per configured aspect and ``text_length``.

        The columns are added to ``df`` in place; the same frame is returned.
        """
        logger.info(f"Extracting features for {len(df)} samples...")
        for aspect, keywords in self.aspect_config.aspects.items():
            # Keywords are literal strings, so escape them before joining into one regex.
            pattern = '|'.join([re.escape(kw) for kw in keywords])
            df[f'aspect_{aspect}'] = df['review_text'].str.contains(pattern, na=False, regex=True).astype(int)
        df['text_length'] = df['review_text'].str.len()
        return df

    def prepare_aux_labels(self, df: pd.DataFrame):
        """Attach placeholder ``high_joy`` / ``high_sadness`` / ``high_anger`` labels.

        The labels are random 0/1 values for demonstration only; replace them
        with real annotations when available. The generator is seeded from
        ``model_config.random_state`` so repeated runs see the same labels.
        """
        rng = np.random.default_rng(self.model_config.random_state)
        for emo in ('joy', 'sadness', 'anger'):
            df[f"high_{emo}"] = rng.integers(0, 2, size=len(df))
        return df

    def prepare_datasets(self, df, tokenizer):
        """Tokenise the reviews and return a torch-formatted ``Dataset`` with all labels."""
        max_length = self.MAX_SEQ_LENGTH

        def tokenize_function(examples):
            tok = tokenizer(
                examples['review_text'],
                truncation=True,
                padding='max_length',
                max_length=max_length
            )
            tok['labels'] = examples['sentiment']
            tok['joy_labels'] = examples['high_joy']
            tok['sadness_labels'] = examples['high_sadness']
            tok['anger_labels'] = examples['high_anger']
            return tok

        dataset = Dataset.from_pandas(df[['review_text', 'sentiment', 'high_joy', 'high_sadness', 'high_anger']])
        dataset = dataset.map(tokenize_function, batched=True)
        columns = ['input_ids', 'attention_mask', 'labels', 'joy_labels', 'sadness_labels', 'anger_labels']
        dataset.set_format(type='torch', columns=columns)
        return dataset

    def setup_model_and_tokenizer(self, model_name, num_labels, aux_tasks=None):
        """Load ``model_name``, wrap it in the multi-task model and attach LoRA adapters.

        Returns:
            ``(tokenizer, model)`` with the model moved to CUDA when available.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        base_model = AutoModel.from_pretrained(model_name)
        model = AdvancedMultiTaskModel(base_model, num_labels=num_labels, aux_tasks=aux_tasks)
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=8,
            lora_alpha=16,
            lora_dropout=0.1,
            target_modules=["query", "value"]
        )
        model = get_peft_model(model, lora_config)
        # PEFT freezes every weight that is not a LoRA adapter (or a module listed
        # in modules_to_save). The classification heads are freshly initialised
        # and named differently from PEFT's defaults, so re-enable their
        # gradients explicitly; otherwise they would never be trained.
        for param_name, param in model.named_parameters():
            if "main_head" in param_name or "aux_heads" in param_name:
                param.requires_grad = True
        model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
        return tokenizer, model

    def train_evaluate_multillm_lora(self, df):
        """Train every candidate model and keep the one with the best macro F1.

        Prints the business reports for the training split, a classification
        report per candidate and a confusion matrix for the winner.

        Returns:
            Dict with the winning ``model_name``, ``trainer``, ``tokenizer``,
            ``model``, ``test_df``, ``y_pred`` and ``y_true``.

        Raises:
            ValueError: If ``model_config.model_candidates`` is empty.
        """
        df = self.prepare_aux_labels(df)
        train_df, test_df = train_test_split(
            df,
            test_size=self.model_config.test_size,
            stratify=df['sentiment'],
            random_state=self.model_config.random_state,
        )
        self.insight_extractor.executive_summary(train_df)
        self.insight_extractor.generate_business_recommendations(train_df)
        num_labels = len(self.SENTIMENT_IDS)
        aux_tasks = {'joy': 2, 'sadness': 2, 'anger': 2}

        class MultiTaskTrainer(Trainer):
            """Trainer whose loss is the main cross-entropy plus 0.2-weighted auxiliary losses."""

            def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
                # Labels are popped so that only model inputs reach forward().
                labels = inputs.pop("labels")
                joy_labels = inputs.pop("joy_labels")
                sadness_labels = inputs.pop("sadness_labels")
                anger_labels = inputs.pop("anger_labels")
                outputs = model(**inputs)
                loss = F.cross_entropy(outputs['logits'], labels)
                loss += 0.2 * F.cross_entropy(outputs['joy_logits'], joy_labels)
                loss += 0.2 * F.cross_entropy(outputs['sadness_logits'], sadness_labels)
                loss += 0.2 * F.cross_entropy(outputs['anger_logits'], anger_labels)
                return (loss, outputs) if return_outputs else loss

        best_f1 = -1
        best_result = None
        for model_name in self.model_config.model_candidates:
            print(f"\n===== Training {model_name} =====")
            tokenizer, model = self.setup_model_and_tokenizer(model_name, num_labels, aux_tasks)
            train_dataset = self.prepare_datasets(train_df, tokenizer)
            test_dataset = self.prepare_datasets(test_df, tokenizer)
            training_args = TrainingArguments(
                output_dir="./results",
                num_train_epochs=2,
                per_device_train_batch_size=8,
                per_device_eval_batch_size=8,
                eval_strategy="epoch",
                save_strategy="no",
                learning_rate=1e-4,
                logging_dir="./logs",
                logging_steps=100,
                fp16=torch.cuda.is_available(),
                report_to=[],
                dataloader_num_workers=0,
                max_grad_norm=1.0,
                remove_unused_columns=False,
                label_names=["labels", "joy_labels", "sadness_labels", "anger_labels"],
            )
            data_collator = DataCollatorWithPadding(tokenizer, padding=True, max_length=self.MAX_SEQ_LENGTH)
            trainer = MultiTaskTrainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=test_dataset,
                data_collator=data_collator,
                tokenizer=tokenizer,
            )
            trainer.train()
            preds = trainer.predict(test_dataset)
            # predictions is a tuple with one entry per output head; the main head comes first.
            y_pred = np.argmax(preds.predictions[0], axis=1)
            y_true = test_df['sentiment'].values
            report = classification_report(
                y_true, y_pred,
                labels=self.SENTIMENT_IDS,
                target_names=self.SENTIMENT_NAMES,
                output_dict=True,
                zero_division=0,
            )
            macro_f1 = report['macro avg']['f1-score']
            print(classification_report(
                y_true, y_pred,
                labels=self.SENTIMENT_IDS,
                target_names=self.SENTIMENT_NAMES,
                zero_division=0,
            ))
            if macro_f1 > best_f1:
                best_f1 = macro_f1
                best_result = {
                    "model_name": model_name,
                    "trainer": trainer,
                    "tokenizer": tokenizer,
                    "model": model,
                    "test_df": test_df,
                    "y_pred": y_pred,
                    "y_true": y_true
                }
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        if best_result is None:
            raise ValueError("No model was trained: model_config.model_candidates is empty")
        print(f"\n===== Best model: {best_result['model_name']} (macro F1={best_f1:.4f}) =====")
        cm = confusion_matrix(best_result["y_true"], best_result["y_pred"], labels=self.SENTIMENT_IDS)
        ConfusionMatrixDisplay(cm, display_labels=["neg", "neu", "pos"]).plot(cmap="Blues")
        plt.title(f"Confusion Matrix ({best_result['model_name']})")
        plt.tight_layout()
        plt.show()
        self.model = best_result['model']
        self.tokenizer = best_result['tokenizer']
        self.trainer = best_result['trainer']
        self.df_train = train_df # <- for RAG
        return best_result

    def pipeline(self, num_samples: int = 1000):
        """Run data generation, feature extraction and model selection end to end."""
        df = self.generate_training_data(num_samples)
        df = self.extract_features(df)
        best_result = self.train_evaluate_multillm_lora(df)
        print("\nPipeline finished! Best LLM+LoRA saved in memory.")
        return best_result

# ----------------- RAG Inference -----------------
class RAGInference:
    """Retrieval-augmented sentiment inference for ABSA.

    The corpus reviews are embedded once and stored in a FAISS inner-product
    index. At prediction time the most similar reviews are appended to the
    input text before it is classified.

    Args:
        model: Classifier returning a dict with a ``'logits'`` entry.
        tokenizer: Tokenizer matching ``model``.
        df_corpus: DataFrame with a ``review_text`` column to retrieve from.
        embedding_extractor: Object exposing ``get_embeddings(list_of_texts)``.
        top_k: Maximum number of reviews to retrieve per query.

    Raises:
        ValueError: If ``df_corpus`` has no rows.
    """

    def __init__(
        self, model, tokenizer, df_corpus, embedding_extractor, top_k=3
    ):
        # Local import: this cell's top-level imports do not include faiss.
        import faiss

        self.model = model
        self.tokenizer = tokenizer
        self.df_corpus = df_corpus.reset_index(drop=True)
        self.top_k = top_k
        self.embedding_extractor = embedding_extractor
        if len(self.df_corpus) == 0:
            raise ValueError("df_corpus must contain at least one review")
        print("Building RAG vector index from corpus...")
        corpus_embeddings = self._unit_rows(
            self.embedding_extractor.get_embeddings(list(self.df_corpus['review_text']))
        )
        self.index = faiss.IndexFlatIP(corpus_embeddings.shape[1])
        self.index.add(corpus_embeddings)
        self.corpus_embeddings = corpus_embeddings

    @staticmethod
    def _unit_rows(vectors):
        """Return ``vectors`` as a contiguous float32 matrix with unit-length rows.

        With unit-length rows the inner product used by the index equals
        cosine similarity, whichever embedding backend produced the vectors.
        """
        matrix = np.ascontiguousarray(vectors, dtype=np.float32)
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return matrix / np.maximum(norms, 1e-12)  # guard against all-zero rows

    def retrieve(self, query):
        """Return up to ``top_k`` corpus reviews most similar to ``query``."""
        q_emb = self._unit_rows(self.embedding_extractor.get_embeddings([query]))
        k = min(self.top_k, len(self.df_corpus))
        if k <= 0:
            return []
        _, neighbours = self.index.search(q_emb, k)
        texts = self.df_corpus['review_text']
        # FAISS pads missing neighbours with -1; as a row position that would
        # silently select the last review, so drop those entries.
        return [texts.iloc[i] for i in neighbours[0] if i >= 0]

    def predict(self, text):
        """Classify ``text`` with retrieved context appended.

        Returns:
            ``(pred, retrieved)``: the arg-max class id of the main logits and
            the list of retrieved reviews.
        """
        retrieved = self.retrieve(text)
        context = " ".join(retrieved)
        input_text = f"{text} [CONTEXT] {context}"
        tokens = self.tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
        tokens = {k: v.to(next(self.model.parameters()).device) for k, v in tokens.items()}
        # Inference must run with dropout disabled.
        self.model.eval()
        with torch.no_grad():
            logits = self.model(**tokens)['logits']
        pred = logits.argmax(dim=-1).item()
        return pred, retrieved

# --------------- Main Entrypoint ---------------
if __name__ == "__main__":
    # Example run: build the configs, train/select a model, then try one
    # retrieval-augmented prediction.
    # NOTE(review): the project id is hard-coded here — confirm it should be
    # committed rather than read from configuration or the environment.
    vertex_config = VertexAIConfig(
        project_id="able-balm-454718-n8",
        location="us-central1",
        staging_bucket="gs://your-staging-bucket",
        use_gpu=True
    )
    # NOTE(review): the ModelConfig declared at the top of this cell does not
    # appear to define embedding_model_name or vertex_embedding_model; if that
    # declaration is the one in scope, this call raises TypeError. Confirm
    # which ModelConfig definition is intended here.
    model_config = ModelConfig(
        n_splits=5,
        embedding_model_name="intfloat/multilingual-e5-base",
        test_size=0.2,
        random_state=42,
        batch_size=64,
        max_text_length=256,
        vertex_embedding_model="textembedding-gecko-multilingual@001"
    )
    aspect_config = AspectConfig()
    pipeline = VertexAIJapaneseABSAPipeline(vertex_config, model_config, aspect_config)
    # Populates pipeline.model, pipeline.tokenizer and pipeline.df_train.
    pipeline.pipeline(num_samples=1200)

    # ---- RAG inference (example) ----
    print("\n--- Initializing RAG inference wrapper with vector DB embeddings ---")
    # The training split doubles as the retrieval corpus.
    # NOTE(review): RAGInference uses faiss, which this cell's import list does
    # not include — confirm it is importable before this point.
    rag = RAGInference(
        pipeline.model, pipeline.tokenizer,
        pipeline.df_train, pipeline.embedding_extractor, top_k=3
    )

    example_text = "この商品はとても便利ですが、品質が少し心配です。"
    # pred is a class id (0/1/2) used below to index the label names.
    pred, retrieved = rag.predict(example_text)
    print(f"\nRAG Sentiment prediction: {['negative','neutral','positive'][pred]}")
    print(f"Retrieved context:\n- " + "\n- ".join(retrieved))


In [None]:
# Enhanced Japanese ABSA Pipeline with Vertex AI + MultiLLM + LoRA + RAG (Cloud-Ready)
# Requirements: torch, transformers, peft, datasets, matplotlib, seaborn, pandas, numpy,
# google-cloud-aiplatform, sentence-transformers, faiss-cpu

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import gc
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Dict, List, Tuple, Optional, Any, Union
from dataclasses import dataclass
from pathlib import Path
import logging

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder

from transformers import (
    AutoModel, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
)
from peft import get_peft_model, LoraConfig, TaskType

from datasets import Dataset

from sentence_transformers import SentenceTransformer
import faiss

# Vertex AI imports
from google.cloud import aiplatform
import vertexai
from vertexai.language_models import TextEmbeddingModel
from vertexai.preview.generative_models import GenerativeModel

# Configure the root logger at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Suppress every Python warning for the rest of the session (this also hides
# deprecation notices from the libraries used below).
warnings.filterwarnings("ignore")
# Shared look for the plots produced later in the notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

@dataclass
class VertexAIConfig:
    """Vertex AI project settings plus model/endpoint deployment options."""
    # Project and region are passed to vertexai.init() by the embedding
    # extractor and the data generator.
    project_id: str = "your-project-id"
    location: str = "us-central1"
    # NOTE(review): the remaining fields are not read by any code reviewed
    # here; presumably they describe a deployment target — confirm they are
    # still needed.
    staging_bucket: str = "gs://your-bucket-name"
    model_display_name: str = "japanese-absa-model"
    endpoint_display_name: str = "japanese-absa-endpoint"
    service_account: Optional[str] = None
    machine_type: str = "n1-standard-4"
    accelerator_type: str = "NVIDIA_TESLA_T4"
    accelerator_count: int = 1
    use_gpu: bool = True

@dataclass
class ModelConfig:
    """Training hyper-parameters, embedding model names and candidate encoders."""
    n_splits: int = 5
    embedding_model_name: str = "intfloat/multilingual-e5-base"
    test_size: float = 0.2
    random_state: int = 42
    batch_size: int = 64
    max_text_length: int = 512
    vertex_embedding_model: str = "textembedding-gecko-multilingual@001"
    model_candidates: list = None

    def __post_init__(self):
        # None means "use the built-in shortlist"; any explicit list, even an
        # empty one, is kept as given.
        if self.model_candidates is not None:
            return
        default_candidates = (
            "cl-tohoku/bert-base-japanese-whole-word-masking",
            "rinna/japanese-roberta-base",
            "studio-ousia/luke-japanese-base-lite",
            "xlm-roberta-base",
        )
        # A fresh list per instance so configs never share mutable state.
        self.model_candidates = list(default_candidates)

@dataclass
class AspectConfig:
    """Mapping of aspect name to the Japanese keywords that signal it."""
    aspects: Dict[str, List[str]] = None

    def __post_init__(self):
        # Only fill in the default vocabulary when no mapping was supplied;
        # an explicit mapping (even an empty one) is left untouched.
        if self.aspects is not None:
            return
        self.aspects = dict(
            quality=['品質', '質', '良い', '悪い', '高品質', '低品質', 'クオリティ', '品質管理'],
            service=['サービス', '対応', '接客', '親切', '丁寧', '態度', 'スタッフ', '店員'],
            price=['価格', '値段', '料金', '安い', '高い', 'コスト', '費用', '価格設定'],
            convenience=['便利', '不便', '簡単', '難しい', '使いやすい', '使いにくい', 'アクセス'],
            speed=['速い', '遅い', '早い', 'スピード', '迅速', '時間', '待ち時間'],
            atmosphere=['雰囲気', '環境', '空間', '居心地', '快適', '不快', '清潔'],
            taste=['味', '美味しい', 'まずい', '美味', '風味', '食感', '新鮮'],
            design=['デザイン', '見た目', '外観', 'おしゃれ', 'かっこいい', '美しい'],
        )

# ==========================
# Embedding & Data Generation
# ==========================

class VertexAIEmbeddingExtractor:
    """Embed texts with Vertex AI, falling back to a local SentenceTransformer.

    The fallback is chosen once, at construction time, when the Vertex AI
    model cannot be initialised.
    """

    def __init__(self, config: VertexAIConfig, model_config: ModelConfig):
        self.config = config
        self.model_config = model_config
        self.embedding_model = None
        self._initialize_vertex_ai()

    def _initialize_vertex_ai(self):
        """Load the Vertex AI embedding model, or the local model if that fails."""
        try:
            vertexai.init(project=self.config.project_id, location=self.config.location)
            vertex_model_name = self.model_config.vertex_embedding_model
            self.embedding_model = TextEmbeddingModel.from_pretrained(vertex_model_name)
            logger.info(f"Initialized Vertex AI embedding model: {self.model_config.vertex_embedding_model}")
        except Exception as e:
            # Best effort: any initialisation problem switches to the local model.
            logger.warning(f"Failed to initialize Vertex AI embeddings: {e}")
            self.embedding_model = SentenceTransformer(self.model_config.embedding_model_name)

    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        """Return one embedding row per input text.

        Errors from either backend are logged and re-raised.
        """
        try:
            if not isinstance(self.embedding_model, TextEmbeddingModel):
                # Local SentenceTransformer path; these vectors are L2-normalised.
                return self.embedding_model.encode(
                    texts, show_progress_bar=True, batch_size=self.model_config.batch_size, normalize_embeddings=True
                )
            vectors = []
            step = 5  # texts sent per Vertex AI request
            for start in range(0, len(texts), step):
                response = self.embedding_model.get_embeddings(texts[start:start + step])
                vectors.extend(item.values for item in response)
            return np.array(vectors)
        except Exception as e:
            logger.error(f"Error getting embeddings: {e}")
            raise

class VertexAIDataGenerator:
    """Create labelled Japanese review samples with a Vertex AI generative model.

    Generated rows are combined with a small hand-written seed set, so the
    result still contains data when every generation request fails.
    """

    # Numeric ids for the sentiment names the prompts ask the model to emit.
    _SENTIMENT_IDS = {'positive': 2, 'neutral': 1, 'negative': 0}

    def __init__(self, config: VertexAIConfig, aspect_config: AspectConfig):
        self.config = config
        self.aspect_config = aspect_config
        self.generative_model = None
        self._initialize_model()

    def _initialize_model(self):
        """Initialise Vertex AI and load the generative model.

        Failures are logged and re-raised: without the model no data can be generated.
        """
        try:
            vertexai.init(project=self.config.project_id, location=self.config.location)
            self.generative_model = GenerativeModel("gemini-pro")
            logger.info("Initialized Vertex AI generative model")
        except Exception as e:
            logger.error(f"Failed to initialize generative model: {e}")
            raise

    def generate_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """Generate reviews for every aspect/sentiment pair and append the seed rows.

        ``num_samples`` is split evenly across the prompts and caps how many
        parsed samples are kept per prompt. Each prompt asks the model for 20
        reviews, so the realised total can be lower than ``num_samples``.
        A failed request is logged and skipped.
        """
        logger.info(f"Generating {num_samples} training samples using Vertex AI...")
        prompts = self._create_generation_prompts()
        generated_data = []
        # An aspect config without aspects yields no prompts; guard the division.
        # At least one sample per prompt is always kept.
        samples_per_prompt = max(1, num_samples // len(prompts)) if prompts else 0
        for i, prompt in enumerate(prompts, 1):
            logger.info(f"Generating data for prompt {i}/{len(prompts)}")
            try:
                response = self.generative_model.generate_content(
                    prompt, generation_config={
                        "max_output_tokens": 2048,
                        "temperature": 0.7,
                        "top_p": 0.8,
                        "top_k": 40
                    }
                )
                samples = self._parse_generated_response(response.text, samples_per_prompt)
                generated_data.extend(samples)
            except Exception as e:
                # Best effort: one failed request must not abort the whole run.
                logger.warning(f"Error generating data for prompt {i}/{len(prompts)}: {e}")
                continue
        df = pd.DataFrame(generated_data)
        manual_samples = self._create_manual_samples()
        manual_df = pd.DataFrame(manual_samples)
        df = pd.concat([df, manual_df], ignore_index=True)
        logger.info(f"Generated {len(df)} training samples")
        return df

    def _create_generation_prompts(self) -> List[str]:
        """Build one prompt per (aspect, sentiment) pair, citing the first three keywords."""
        prompts = []
        for aspect, keywords in self.aspect_config.aspects.items():
            for sentiment in ['positive', 'negative', 'neutral']:
                prompt = f"""
Generate 20 realistic Japanese customer reviews about {aspect} ({', '.join(keywords[:3])}) 
with {sentiment} sentiment. Each review should be 20-100 characters long.
Format each review as:
Review: [Japanese text]
Sentiment: {sentiment}
Aspect: {aspect}
Example:
Review: このサービスの品質は素晴らしいです。
Sentiment: positive
Aspect: quality
Generate 20 similar reviews:
"""
                prompts.append(prompt)
        return prompts

    def _parse_generated_response(self, response_text: str, max_samples: int) -> List[Dict]:
        """Parse ``Review:`` / ``Sentiment:`` / ``Aspect:`` triples from the model output.

        A sample is emitted on each ``Aspect:`` line once a review and a
        sentiment have been seen. Sentiment names are matched
        case-insensitively; unrecognised names fall back to neutral (1).
        Parsing stops once ``max_samples`` samples have been collected.
        """
        samples = []
        current_review = None
        current_sentiment = None
        for raw_line in response_text.split('\n'):
            line = raw_line.strip()
            # Slice off only the leading marker so a review that itself
            # contains e.g. "Review:" keeps its text intact.
            if line.startswith('Review:'):
                current_review = line[len('Review:'):].strip()
            elif line.startswith('Sentiment:'):
                current_sentiment = line[len('Sentiment:'):].strip()
            elif line.startswith('Aspect:'):
                current_aspect = line[len('Aspect:'):].strip()
                if current_review and current_sentiment and current_aspect:
                    samples.append({
                        'review_text': current_review,
                        'sentiment': self._SENTIMENT_IDS.get(current_sentiment.lower(), 1),
                        'aspect': current_aspect,
                        'text_length': len(current_review),
                        'generated': True
                    })
                    current_review = None
                    current_sentiment = None
                    if len(samples) >= max_samples:
                        break
        return samples

    def _create_manual_samples(self) -> List[Dict]:
        """Return the hand-written seed reviews, flagged with ``generated=False``."""
        manual_samples = [
            {'review_text': 'この商品の品質は期待以上でした。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '高品質な材料を使用していて満足です。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '作りがしっかりしていて良い商品です。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '品質が悪くてがっかりしました。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': '安っぽい材料で作られている感じがします。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': 'クオリティが低すぎて使い物になりません。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': '普通の商品だと思います。', 'sentiment': 1, 'aspect': 'quality'},
            {'review_text': '特に良くも悪くもありません。', 'sentiment': 1, 'aspect': 'service'},
            {'review_text': '標準的な価格帯の商品です。', 'sentiment': 1, 'aspect': 'price'},
        ]
        for sample in manual_samples:
            sample['text_length'] = len(sample['review_text'])
            sample['generated'] = False
        return manual_samples

# =============================
# Business Insight & Analytics
# =============================

class EnhancedBusinessInsightExtractor:
    """Aggregate sentiment per aspect and print business-facing console reports.

    The methods read an integer ``sentiment`` column (0, 1, 2), a
    ``text_length`` column and binary ``aspect_<name>`` indicator columns
    from the DataFrame they are given.
    """

    def __init__(self, aspect_config: AspectConfig = None, label_map: Dict[int, str] = None):
        if not aspect_config:
            aspect_config = AspectConfig()
        if not label_map:
            label_map = {0: "negative", 1: "neutral", 2: "positive"}
        self.aspect_config = aspect_config
        self.label_map = label_map
        self.colors = {'negative': '#FF6B6B','neutral': '#FFD93D','positive': '#6BCF7F'}

    def calculate_aspect_metrics(self, df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
        """Return counts, percentage rates and a net score for every mentioned aspect.

        Aspects whose indicator column is missing, or that no row mentions,
        are left out of the result.
        """
        per_aspect = {}
        for name in self.aspect_config.aspects:
            flag_column = f'aspect_{name}'
            if flag_column not in df.columns:
                continue
            mentioning = df[df[flag_column] == 1]
            mention_total = len(mentioning)
            if mention_total == 0:
                continue
            counts = mentioning['sentiment'].value_counts()
            negatives = counts.get(0, 0)
            neutrals = counts.get(1, 0)
            positives = counts.get(2, 0)
            per_aspect[name] = {
                'total_mentions': mention_total,
                'negative_count': negatives,
                'neutral_count': neutrals,
                'positive_count': positives,
                'negative_rate': negatives / mention_total * 100,
                'neutral_rate': neutrals / mention_total * 100,
                'positive_rate': positives / mention_total * 100,
                # Net share of positive over negative mentions, in [-1, 1].
                'sentiment_score': (positives - negatives) / mention_total
            }
        return per_aspect

    @staticmethod
    def _recommendation_for(aspect, data):
        """Return the advice line for one aspect, or None if no threshold is crossed."""
        negative_rate = data['negative_rate']
        if negative_rate > 30:
            return f"🔴 URGENT: Address {aspect} issues - {negative_rate:.1f}% negative feedback"
        if negative_rate > 20:
            return f"🟡 ATTENTION: Monitor {aspect} - {negative_rate:.1f}% negative feedback"
        if data['positive_rate'] > 70:
            return f"🟢 STRENGTH: Leverage {aspect} success - {data['positive_rate']:.1f}% positive feedback"
        return None

    def generate_business_recommendations(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Print the per-aspect summary and return metrics, advice and the ranking.

        Aspects are ranked by negative rate weighted with the log of their
        mention count; only the first five recommendations are printed, but
        all of them are returned.
        """
        metrics = self.calculate_aspect_metrics(df)
        recommendations = []
        rule = "=" * 50
        print("\n" + rule)
        print("BUSINESS INSIGHTS & RECOMMENDATIONS")
        print(rule)
        priority_aspects = sorted(
            (
                (aspect, data['negative_rate'] * np.log(data['total_mentions'] + 1), data)
                for aspect, data in metrics.items()
            ),
            key=lambda entry: entry[1],
            reverse=True,
        )
        print("\n📊 ASPECT PERFORMANCE SUMMARY")
        print("-" * 40)
        for aspect, _, data in priority_aspects:
            print(f"• {aspect.upper()}: {data['total_mentions']} mentions")
            print(f"  ├─ Negative: {data['negative_rate']:.1f}% ({data['negative_count']} reviews)")
            print(f"  ├─ Neutral:  {data['neutral_rate']:.1f}% ({data['neutral_count']} reviews)")
            print(f"  └─ Positive: {data['positive_rate']:.1f}% ({data['positive_count']} reviews)")
            advice = self._recommendation_for(aspect, data)
            if advice is not None:
                recommendations.append(advice)
        print("\n🎯 ACTIONABLE RECOMMENDATIONS")
        print("-" * 40)
        for position, advice in enumerate(recommendations[:5], 1):
            print(f"{position}. {advice}")
        return {
            "metrics": metrics,
            "recommendations": recommendations,
            "priority_aspects": priority_aspects
        }

    def executive_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Print overall volume, net sentiment and a text bar chart; return the figures."""
        rule = "=" * 50
        print("\n" + rule)
        print("EXECUTIVE SUMMARY")
        print(rule)
        review_total = len(df)
        shares = df['sentiment'].value_counts(normalize=True) * 100
        net_score = (shares.get(2, 0) - shares.get(0, 0)) / 100
        mean_length = df['text_length'].mean()
        print(f"📈 Total Reviews Analyzed: {review_total:,}")
        print(f"📊 Overall Sentiment Score: {net_score:.3f} (-1 to +1)")
        print(f"📝 Average Review Length: {mean_length:.0f} characters")
        print("\n🎭 Sentiment Distribution:")
        for code in (0, 1, 2):
            name = self.label_map[code]
            share = shares.get(code, 0)
            # One bar cell per two percentage points, 50 cells in total.
            filled = int(share / 2)
            gauge = "█" * filled + "░" * (50 - filled)
            print(f"  {name.capitalize():>8}: {share:5.1f}% |{gauge}|")
        return {
            "total_reviews": review_total,
            "sentiment_distribution": shares.to_dict(),
            "overall_sentiment_score": net_score,
            "avg_text_length": mean_length
        }

# ===================================
# LLM+LoRA advanced model and pipeline
# ===================================

class AdvancedMultiTaskModel(nn.Module):
    """Encoder with one main classification head and optional auxiliary heads.

    The encoder's token states are mean-pooled into a single vector per
    sequence, passed through dropout, and fed to every head.

    Args:
        base_model: encoder exposing ``config.hidden_size`` and returning an
            object with ``last_hidden_state`` when called with ``input_ids``
            and ``attention_mask``.
        num_labels: number of classes of the main head.
        aux_tasks: optional mapping ``task name -> number of classes``; one
            extra linear head is created per entry.
        dropout: dropout probability applied to the pooled vector.
    """

    def __init__(self, base_model, num_labels=3, aux_tasks=None, dropout=0.2):
        super().__init__()
        self.base_model = base_model
        # NOTE(review): presumably re-exported for wrappers that read
        # `model.config` (e.g. PEFT) - confirm.
        self.config = base_model.config
        self.dropout = nn.Dropout(dropout)
        hidden_size = base_model.config.hidden_size
        self.main_head = nn.Linear(hidden_size, num_labels)
        self.aux_heads = nn.ModuleDict()
        if aux_tasks:
            for name, classes in aux_tasks.items():
                self.aux_heads[name] = nn.Linear(hidden_size, classes)

    def forward(self, input_ids, attention_mask=None, **kwargs):
        """Return ``{'logits': ..., '<task>_logits': ...}`` for the batch.

        Extra keyword arguments are accepted and ignored so callers may pass
        tokenizer outputs or wrapper-specific arguments unchanged.
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state
        if attention_mask is None:
            pooled = hidden.mean(dim=1)
        else:
            # Average over real tokens only: with fixed-length padding an
            # unmasked mean is dominated by pad positions for short inputs.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1.0)  # avoid 0/0 on empty rows
            pooled = (hidden * mask).sum(dim=1) / token_counts
        pooled = self.dropout(pooled)
        out = {'logits': self.main_head(pooled)}
        for name, head in self.aux_heads.items():
            out[name + '_logits'] = head(pooled)
        return out

class VertexAIJapaneseABSAPipeline:
    def __init__(self, vertex_config: VertexAIConfig, model_config: ModelConfig, aspect_config: AspectConfig):
        """Wire up the helper components and reset the training state.

        Args:
            vertex_config: passed to the data generator and the embedding
                extractor.
            model_config: training/evaluation settings; also passed to the
                embedding extractor.
            aspect_config: aspect keyword lists; passed to the data generator
                and the insight extractor.
        """
        self.vertex_config = vertex_config
        self.model_config = model_config
        self.aspect_config = aspect_config
        self.data_generator = VertexAIDataGenerator(vertex_config, aspect_config)
        self.embedding_extractor = VertexAIEmbeddingExtractor(vertex_config, model_config)
        self.insight_extractor = EnhancedBusinessInsightExtractor(aspect_config)
        # Filled in by train_evaluate_multillm_lora(); declared here so the
        # attributes always exist, even before training has run.
        self.model = None
        self.tokenizer = None
        self.trainer = None
        self.df_train = None

    def generate_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """Generate reviews, drop unusable rows and add simple text counts.

        Rows with a missing ``review_text``, with 5 characters or fewer, or
        longer than ``model_config.max_text_length`` characters are removed.
        Three columns are added: ``word_count`` (whitespace-separated tokens),
        ``exclamation_count`` and ``question_count``.

        Args:
            num_samples: forwarded to the data generator.

        Returns:
            The filtered frame with the extra columns.
        """
        logger.info("Generating training data...")
        df = self.data_generator.generate_training_data(num_samples)
        df = df.dropna(subset=['review_text'])
        char_counts = df['review_text'].str.len()
        keep = (char_counts > 5) & (char_counts <= self.model_config.max_text_length)
        # Copy so the column assignments below never write into a slice.
        df = df[keep].copy()
        df['word_count'] = df['review_text'].str.split().str.len()
        # Series.str.count() takes a regular expression: a bare '?' is an
        # invalid pattern and raises re.error.  Character classes keep the
        # marks literal and also cover the full-width forms used in Japanese.
        df['exclamation_count'] = df['review_text'].str.count(r'[!！]')
        df['question_count'] = df['review_text'].str.count(r'[?？]')
        logger.info(f"Generated {len(df)} training samples")
        return df

    def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add keyword-based aspect flags and a character count to ``df``.

        For each aspect in ``aspect_config.aspects`` a 0/1 column
        ``aspect_<name>`` records whether any of its keywords occurs in
        ``review_text``; ``text_length`` holds the review length in
        characters.  The columns are written onto the frame that was passed
        in, and that same frame is returned.
        """
        logger.info(f"Extracting features for {len(df)} samples...")
        reviews = df['review_text']
        for aspect_name, keyword_list in self.aspect_config.aspects.items():
            # Keywords are matched literally, joined into one alternation.
            alternation = '|'.join(map(re.escape, keyword_list))
            mentioned = reviews.str.contains(alternation, na=False, regex=True)
            df[f'aspect_{aspect_name}'] = mentioned.astype(int)
        df['text_length'] = reviews.str.len()
        return df

    def prepare_aux_labels(self, df: pd.DataFrame):
        for emo in ['joy', 'sadness', 'anger']:
            df[f"high_{emo}"] = np.random.randint(0, 2, size=len(df))
        return df

    def prepare_datasets(self, df, tokenizer):
        """Turn ``df`` into a tokenised, torch-formatted ``Dataset``.

        ``review_text`` is tokenised to a fixed length of 128 and the
        sentiment / emotion columns are copied into the label fields the
        trainer reads.  Only the token and label columns are exposed in the
        torch format.
        """
        # label field exposed to the trainer -> source column in the frame
        label_sources = {
            'labels': 'sentiment',
            'joy_labels': 'high_joy',
            'sadness_labels': 'high_sadness',
            'anger_labels': 'high_anger',
        }

        def tokenize_function(examples):
            encoded = tokenizer(
                examples['review_text'],
                truncation=True,
                padding='max_length',
                max_length=128
            )
            for label_field, source_column in label_sources.items():
                encoded[label_field] = examples[source_column]
            return encoded

        source_columns = ['review_text'] + list(label_sources.values())
        dataset = Dataset.from_pandas(df[source_columns])
        dataset = dataset.map(tokenize_function, batched=True)
        exposed_columns = ['input_ids', 'attention_mask'] + list(label_sources)
        dataset.set_format(type='torch', columns=exposed_columns)
        return dataset

    def setup_model_and_tokenizer(self, model_name, num_labels, aux_tasks=None):
        """Load ``model_name``, add the task heads and wrap it with LoRA.

        Args:
            model_name: checkpoint name passed to ``AutoTokenizer`` and
                ``AutoModel``.
            num_labels: number of classes of the main head.
            aux_tasks: optional ``task name -> number of classes`` mapping
                for the auxiliary heads.

        Returns:
            ``(tokenizer, model)`` with the model already moved to CUDA when
            available, otherwise CPU.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        base_model = AutoModel.from_pretrained(model_name)
        model = AdvancedMultiTaskModel(base_model, num_labels=num_labels, aux_tasks=aux_tasks)
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=8,
            lora_alpha=16,
            lora_dropout=0.1,
            target_modules=["query", "value"]
        )
        model = get_peft_model(model, lora_config)
        # get_peft_model() freezes every parameter that is not a LoRA weight.
        # The heads of AdvancedMultiTaskModel are freshly initialised and are
        # not covered by the adapter config, so without re-enabling gradients
        # they would stay at their random initial values for the whole run.
        for param_name, param in model.named_parameters():
            if "main_head" in param_name or "aux_heads" in param_name:
                param.requires_grad = True
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model = model.to(device)
        return tokenizer, model

    def train_evaluate_multillm_lora(self, df):
        """Fine-tune every candidate model and keep the one with the best macro F1.

        Steps: attach auxiliary labels, make a stratified train/test split,
        print the business reports for the training part, then for each name
        in ``model_config.model_candidates`` train for two epochs and score
        on the test part.  The best model, tokenizer and trainer are stored
        on ``self`` together with ``self.df_train``.

        Args:
            df: frame with at least ``review_text`` and ``sentiment`` plus
                whatever columns the insight extractor reads.

        Returns:
            dict with ``model_name``, ``trainer``, ``tokenizer``, ``model``,
            ``test_df``, ``y_pred`` and ``y_true`` of the best candidate.

        Raises:
            ValueError: if ``model_config.model_candidates`` is empty.
        """
        candidates = list(self.model_config.model_candidates or [])
        if not candidates:
            # Fail early with a clear message instead of a TypeError on the
            # missing best result after the (empty) loop.
            raise ValueError("model_config.model_candidates is empty - nothing to train")

        df = self.prepare_aux_labels(df)
        train_df, test_df = train_test_split(
            df,
            test_size=self.model_config.test_size,
            stratify=df['sentiment'],
            random_state=self.model_config.random_state,
        )
        self.insight_extractor.executive_summary(train_df)
        self.insight_extractor.generate_business_recommendations(train_df)

        class_names = ["negative", "neutral", "positive"]
        num_labels = len(class_names)
        # Passed to the metrics so report rows always line up with
        # class_names, even if a class is absent from y_true and y_pred.
        class_ids = list(range(num_labels))
        aux_tasks = {'joy': 2, 'sadness': 2, 'anger': 2}
        aux_weight = 0.2

        class MultiTaskTrainer(Trainer):
            """Trainer whose loss is main cross-entropy plus weighted auxiliary terms."""

            def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
                # **kwargs absorbs extra arguments some Trainer versions pass.
                labels = inputs.pop("labels")
                aux_labels = {name: inputs.pop(f"{name}_labels") for name in aux_tasks}
                outputs = model(**inputs)
                loss = F.cross_entropy(outputs['logits'], labels)
                for name, target in aux_labels.items():
                    loss = loss + aux_weight * F.cross_entropy(outputs[f'{name}_logits'], target)
                return (loss, outputs) if return_outputs else loss

        best_f1 = -1
        best_result = None
        for model_name in candidates:
            print(f"\n===== Training {model_name} =====")
            tokenizer, model = self.setup_model_and_tokenizer(model_name, num_labels, aux_tasks)
            train_dataset = self.prepare_datasets(train_df, tokenizer)
            test_dataset = self.prepare_datasets(test_df, tokenizer)
            training_args = TrainingArguments(
                output_dir="./results",
                num_train_epochs=2,
                per_device_train_batch_size=8,
                per_device_eval_batch_size=8,
                evaluation_strategy="epoch",
                save_strategy="no",
                learning_rate=1e-4,
                logging_dir="./logs",
                logging_steps=100,
                fp16=torch.cuda.is_available(),
                report_to=[],
                dataloader_num_workers=0,
                max_grad_norm=1.0,
                # The extra label columns must reach compute_loss().
                remove_unused_columns=False,
                label_names=["labels", "joy_labels", "sadness_labels", "anger_labels"],
            )
            data_collator = DataCollatorWithPadding(tokenizer, padding=True, max_length=128)
            trainer = MultiTaskTrainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=test_dataset,
                data_collator=data_collator,
                tokenizer=tokenizer,
            )
            trainer.train()
            preds = trainer.predict(test_dataset)
            # predictions[0] holds the main-head logits (first entry of the model's output dict).
            y_pred = np.argmax(preds.predictions[0], axis=1)
            y_true = test_df['sentiment'].values
            report = classification_report(
                y_true, y_pred, labels=class_ids, target_names=class_names,
                output_dict=True, zero_division=0,
            )
            macro_f1 = report['macro avg']['f1-score']
            print(classification_report(
                y_true, y_pred, labels=class_ids, target_names=class_names, zero_division=0,
            ))
            if macro_f1 > best_f1:
                best_f1 = macro_f1
                best_result = {
                    "model_name": model_name,
                    "trainer": trainer,
                    "tokenizer": tokenizer,
                    "model": model,
                    "test_df": test_df,
                    "y_pred": y_pred,
                    "y_true": y_true
                }
            # Drop the loop's own references first; otherwise a losing model
            # stays reachable and the collection below cannot free it.
            del trainer, model, tokenizer, train_dataset, test_dataset
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        print(f"\n===== Best model: {best_result['model_name']} (macro F1={best_f1:.4f}) =====")
        cm = confusion_matrix(best_result["y_true"], best_result["y_pred"], labels=class_ids)
        ConfusionMatrixDisplay(cm, display_labels=["neg", "neu", "pos"]).plot(cmap="Blues")
        plt.title(f"Confusion Matrix ({best_result['model_name']})")
        plt.tight_layout()
        plt.show()
        self.model = best_result['model']
        self.tokenizer = best_result['tokenizer']
        self.trainer = best_result['trainer']
        self.df_train = train_df # <- for RAG
        return best_result

    def pipeline(self, num_samples: int = 1000):
        df = self.generate_training_data(num_samples)
        df = self.extract_features(df)
        best_result = self.train_evaluate_multillm_lora(df)
        print("\nPipeline finished! Best LLM+LoRA saved in memory.")
        return best_result

# ----------------- RAG Inference -----------------
class RAGInference:
    """Retrieval-augmented sentiment inference for ABSA.

    Embeds every ``review_text`` of ``df_corpus`` once, stores the vectors in
    a FAISS inner-product index, and at prediction time appends the nearest
    corpus reviews to the input before running the classifier.

    Args:
        model: classifier returning a mapping with a ``'logits'`` entry.
        tokenizer: tokenizer matching ``model``.
        df_corpus: frame with a ``review_text`` column to retrieve from.
        embedding_extractor: object with ``get_embeddings(list_of_texts)``
            returning a 2-D array.
        top_k: default number of reviews to retrieve.
    """
    def __init__(
        self, model, tokenizer, df_corpus, embedding_extractor, top_k=3
    ):
        self.model = model
        self.tokenizer = tokenizer
        # Positional row numbers must line up with the index ids.
        self.df_corpus = df_corpus.reset_index(drop=True)
        self.top_k = top_k
        self.embedding_extractor = embedding_extractor
        print("Building RAG vector index from corpus...")
        corpus_embeddings = self.embedding_extractor.get_embeddings(list(self.df_corpus['review_text']))
        self.index = faiss.IndexFlatIP(corpus_embeddings.shape[1])
        self.index.add(corpus_embeddings.astype(np.float32))
        self.corpus_embeddings = corpus_embeddings

    def retrieve(self, query, top_k=None):
        """Return the corpus reviews most similar to ``query``.

        Args:
            query: text to search for.
            top_k: number of reviews wanted; ``None`` uses the value given
                at construction.  Callers such as a reranker may override it
                per call.

        Returns:
            Up to ``top_k`` review texts, best match first (fewer when the
            corpus is smaller).
        """
        k = self.top_k if top_k is None else top_k
        k = min(int(k), len(self.df_corpus))
        if k <= 0:
            return []
        q_emb = self.embedding_extractor.get_embeddings([query]).astype(np.float32)
        _, neighbours = self.index.search(q_emb, k)
        # The index pads missing results with -1; iloc[-1] would silently
        # return the last corpus row, so those slots are dropped.
        return [self.df_corpus.iloc[i]['review_text'] for i in neighbours[0] if i >= 0]

    def predict(self, text):
        """Classify ``text`` with retrieved reviews appended as context.

        Returns:
            ``(predicted class index, list of retrieved reviews)``.
        """
        retrieved = self.retrieve(text)
        context = " ".join(retrieved)
        input_text = f"{text} [CONTEXT] {context}"
        tokens = self.tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
        device = next(self.model.parameters()).device
        tokens = {k: v.to(device) for k, v in tokens.items()}
        # Inference must run with dropout disabled; restore the previous
        # mode afterwards so a caller that is mid-training is not affected.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                logits = self.model(**tokens)['logits']
        finally:
            if was_training:
                self.model.train()
        pred = logits.argmax(dim=-1).item()
        return pred, retrieved

# --------------- Main Entrypoint ---------------
# Script entry point: train the LoRA candidates, then run one example
# through the retrieval-augmented classifier.
if __name__ == "__main__":
    # Hardware: Vertex AI/Cloud/Local (config controls everything)
    # The project id and bucket below look like placeholders to be replaced.
    vertex_config = VertexAIConfig(
        project_id="your-gcp-project",
        location="us-central1",
        staging_bucket="gs://your-staging-bucket",
        use_gpu=True,
        machine_type="n1-standard-4",
        accelerator_type="NVIDIA_TESLA_T4",
        accelerator_count=1,
    )
    # NOTE(review): the ModelConfig declared at the top of this cell lists no
    # `embedding_model_name` / `vertex_embedding_model` fields; if that
    # definition is the one in effect, this call raises TypeError - confirm
    # which ModelConfig is live before running.
    model_config = ModelConfig(
        n_splits=5,
        embedding_model_name="intfloat/multilingual-e5-base",
        test_size=0.2,
        random_state=42,
        batch_size=64,
        max_text_length=256,
        vertex_embedding_model="textembedding-gecko-multilingual@001"
    )
    aspect_config = AspectConfig()
    pipeline = VertexAIJapaneseABSAPipeline(vertex_config, model_config, aspect_config)
    # Generates data, extracts features and selects the best model; the
    # winner is kept on the pipeline object (model / tokenizer / df_train).
    pipeline.pipeline(num_samples=1200)

    # ---- RAG inference (example) ----
    print("\n--- Initializing RAG inference wrapper with vector DB embeddings ---")
    # The training split doubles as the retrieval corpus.
    rag = RAGInference(
        pipeline.model, pipeline.tokenizer,
        pipeline.df_train, pipeline.embedding_extractor, top_k=3
    )

    example_text = "この商品はとても便利ですが、品質が少し心配です。"
    pred, retrieved = rag.predict(example_text)
    # `pred` is a class index: 0 = negative, 1 = neutral, 2 = positive.
    print(f"\nRAG Sentiment prediction: {['negative','neutral','positive'][pred]}")
    print(f"Retrieved context:\n- " + "\n- ".join(retrieved))


In [None]:
# ==========================
# All imports & dataclasses
# ==========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import gc
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Dict, List, Tuple, Optional, Any, Union
from dataclasses import dataclass
import logging

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

from transformers import (
    AutoModel, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
)
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset

from sentence_transformers import SentenceTransformer
import faiss

# Vertex AI
import vertexai
from vertexai.language_models import TextEmbeddingModel
from vertexai.preview.generative_models import GenerativeModel

# ========== Logging & Styles ==========
# Root logging at INFO so the pipeline's progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the classes defined below.
logger = logging.getLogger(__name__)
# Installs a blanket "ignore" filter: every warning category is silenced,
# not only warnings raised by this module.
warnings.filterwarnings("ignore")
# Plot defaults for all figures: matplotlib's seaborn-v0_8 style sheet and
# seaborn's "husl" colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

@dataclass
class VertexAIConfig:
    """Google Cloud / Vertex AI settings shared by the pipeline components.

    ``project_id`` and ``location`` are what the classes in this file pass
    to ``vertexai.init``.  The defaults for project and bucket look like
    placeholders and need real values.
    """
    project_id: str = "your-project-id"
    location: str = "us-central1"
    staging_bucket: str = "gs://your-bucket-name"
    model_display_name: str = "japanese-absa-model"
    endpoint_display_name: str = "japanese-absa-endpoint"
    # No service account by default; the annotation now admits the None default.
    service_account: Optional[str] = None
    machine_type: str = "n1-standard-4"
    accelerator_type: str = "NVIDIA_TESLA_T4"
    accelerator_count: int = 1
    use_gpu: bool = True

@dataclass
class ModelConfig:
    """Training, evaluation and embedding settings.

    ``model_candidates`` lists the checkpoints to fine-tune and compare.
    When it is left as ``None`` a default list of four encoders is filled in
    after construction, freshly created for every instance.
    """
    n_splits: int = 5
    # Local sentence-embedding model, used when Vertex AI is unavailable.
    embedding_model_name: str = "intfloat/multilingual-e5-base"
    test_size: float = 0.2
    random_state: int = 42
    batch_size: int = 64
    max_text_length: int = 512
    vertex_embedding_model: str = "textembedding-gecko-multilingual@001"
    # None means "use the default candidates"; annotated accordingly.
    model_candidates: Optional[List[str]] = None

    def __post_init__(self):
        if self.model_candidates is None:
            self.model_candidates = [
                "cl-tohoku/bert-base-japanese-whole-word-masking",
                "rinna/japanese-roberta-base",
                "studio-ousia/luke-japanese-base-lite",
                "xlm-roberta-base"
            ]

@dataclass
class AspectConfig:
    """Aspect names mapped to the Japanese keywords that signal them.

    Leaving ``aspects`` as ``None`` installs the built-in eight-aspect
    keyword table; each instance gets its own dictionary.
    """
    # None means "use the built-in table"; annotated accordingly.
    aspects: Optional[Dict[str, List[str]]] = None

    def __post_init__(self):
        if self.aspects is None:
            self.aspects = {
                'quality': ['品質', '質', '良い', '悪い', '高品質', '低品質', 'クオリティ', '品質管理'],
                'service': ['サービス', '対応', '接客', '親切', '丁寧', '態度', 'スタッフ', '店員'],
                'price': ['価格', '値段', '料金', '安い', '高い', 'コスト', '費用', '価格設定'],
                'convenience': ['便利', '不便', '簡単', '難しい', '使いやすい', '使いにくい', 'アクセス'],
                'speed': ['速い', '遅い', '早い', 'スピード', '迅速', '時間', '待ち時間'],
                'atmosphere': ['雰囲気', '環境', '空間', '居心地', '快適', '不快', '清潔'],
                'taste': ['味', '美味しい', 'まずい', '美味', '風味', '食感', '新鮮'],
                'design': ['デザイン', '見た目', '外観', 'おしゃれ', 'かっこいい', '美しい']
            }

# =============================================
# Embedding & Data Generation (same as before)
# =============================================

class VertexAIEmbeddingExtractor:
    """Text embeddings from Vertex AI, with a local SentenceTransformer fallback.

    Both backends return one L2-normalised row per input text, so an inner
    product between rows is a cosine similarity regardless of which backend
    ended up being used.
    """
    def __init__(self, config: VertexAIConfig, model_config: ModelConfig):
        self.config = config
        self.model_config = model_config
        self.embedding_model = None
        self._initialize_vertex_ai()

    def _initialize_vertex_ai(self):
        """Load the Vertex AI embedding model, or the local model on any failure."""
        try:
            vertexai.init(project=self.config.project_id, location=self.config.location)
            self.embedding_model = TextEmbeddingModel.from_pretrained(self.model_config.vertex_embedding_model)
            logger.info(f"Initialized Vertex AI embedding model: {self.model_config.vertex_embedding_model}")
        except Exception as e:
            # Deliberate best-effort: whatever went wrong, continue with the
            # local model named in model_config.embedding_model_name.
            logger.warning(f"Failed to initialize Vertex AI embeddings: {e}")
            self.embedding_model = SentenceTransformer(self.model_config.embedding_model_name)

    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` and return an array with one unit-length row per text.

        Raises:
            Exception: any backend error is logged and re-raised unchanged.
        """
        try:
            if isinstance(self.embedding_model, TextEmbeddingModel):
                embeddings = []
                batch_size = 5  # texts sent per Vertex AI request
                for i in range(0, len(texts), batch_size):
                    batch = texts[i:i + batch_size]
                    batch_embeddings = self.embedding_model.get_embeddings(batch)
                    embeddings.extend([emb.values for emb in batch_embeddings])
                vectors = np.array(embeddings)
                # The local branch below asks for normalised vectors; scale
                # these rows the same way so both backends are
                # interchangeable.  Skipped for empty input (1-D array).
                if vectors.ndim == 2 and vectors.shape[0]:
                    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
                    vectors = vectors / np.clip(norms, 1e-12, None)
                return vectors
            else:
                return self.embedding_model.encode(
                    texts, show_progress_bar=True, batch_size=self.model_config.batch_size, normalize_embeddings=True
                )
        except Exception as e:
            logger.error(f"Error getting embeddings: {e}")
            raise

class VertexAIDataGenerator:
    def __init__(self, config: VertexAIConfig, aspect_config: AspectConfig):
        # Keeps both configs and immediately loads the generative model, so
        # construction fails if the model cannot be initialised.
        self.config = config
        self.aspect_config = aspect_config
        # Replaced by a GenerativeModel in _initialize_model().
        self.generative_model = None
        self._initialize_model()
    def _initialize_model(self):
        # Initialises the Vertex AI SDK for the configured project/location
        # and creates the "gemini-pro" generative model.  Unlike the
        # embedding extractor there is no fallback: any failure is logged
        # and re-raised to the caller.
        try:
            vertexai.init(project=self.config.project_id, location=self.config.location)
            self.generative_model = GenerativeModel("gemini-pro")
            logger.info("Initialized Vertex AI generative model")
        except Exception as e:
            logger.error(f"Failed to initialize generative model: {e}")
            raise
    def generate_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """Generate labelled reviews with the LLM and append the manual seed rows.

        ``num_samples`` is divided across the generation prompts; the
        remainder goes to the first prompts so the requested total is
        ``num_samples`` exactly.  Prompts whose share is zero are skipped
        without calling the model.  Each prompt asks the model for 20
        reviews, so a larger per-prompt share cannot be fully met by one
        response.

        A failing prompt is logged and skipped; the remaining prompts are
        still processed.

        Args:
            num_samples: number of generated samples to ask for (the manual
                samples are added on top).

        Returns:
            Frame with generated rows followed by the manual rows.
        """
        logger.info(f"Generating {num_samples} training samples using Vertex AI...")
        prompts = self._create_generation_prompts()
        generated_data = []
        # divmod on an empty prompt list would divide by zero.
        if prompts:
            base_quota, remainder = divmod(max(num_samples, 0), len(prompts))
        else:
            base_quota, remainder = 0, 0
        generation_config = {
            "max_output_tokens": 2048,
            "temperature": 0.7,
            "top_p": 0.8,
            "top_k": 40
        }
        for i, prompt in enumerate(prompts):
            quota = base_quota + (1 if i < remainder else 0)
            if quota == 0:
                # Nothing requested from this prompt: skip the API call.
                continue
            logger.info(f"Generating data for prompt {i+1}/{len(prompts)}")
            try:
                response = self.generative_model.generate_content(
                    prompt, generation_config=generation_config
                )
                samples = self._parse_generated_response(response.text, quota)
                generated_data.extend(samples)
            except Exception as e:
                # Best-effort by design: one bad response must not abort the
                # whole run.  Index is 1-based to match the info message.
                logger.warning(f"Error generating data for prompt {i+1}: {e}")
                continue
        df = pd.DataFrame(generated_data)
        manual_samples = self._create_manual_samples()
        manual_df = pd.DataFrame(manual_samples)
        df = pd.concat([df, manual_df], ignore_index=True)
        logger.info(f"Generated {len(df)} training samples")
        return df
    def _create_generation_prompts(self) -> List[str]:
        prompts = []
        for aspect, keywords in self.aspect_config.aspects.items():
            for sentiment in ['positive', 'negative', 'neutral']:
                prompt = f"""
Generate 20 realistic Japanese customer reviews about {aspect} ({', '.join(keywords[:3])}) 
with {sentiment} sentiment. Each review should be 20-100 characters long.
Format each review as:
Review: [Japanese text]
Sentiment: {sentiment}
Aspect: {aspect}
Example:
Review: このサービスの品質は素晴らしいです。
Sentiment: positive
Aspect: quality
Generate 20 similar reviews:
"""
                prompts.append(prompt)
        return prompts
    def _parse_generated_response(self, response_text: str, max_samples: int) -> List[Dict]:
        samples = []
        lines = response_text.split('\n')
        current_review = None
        current_sentiment = None
        current_aspect = None
        for line in lines:
            line = line.strip()
            if line.startswith('Review:'):
                current_review = line.replace('Review:', '').strip()
            elif line.startswith('Sentiment:'):
                current_sentiment = line.replace('Sentiment:', '').strip()
            elif line.startswith('Aspect:'):
                current_aspect = line.replace('Aspect:', '').strip()
                if current_review and current_sentiment and current_aspect:
                    sentiment_map = {'positive': 2, 'neutral': 1, 'negative': 0}
                    samples.append({
                        'review_text': current_review,
                        'sentiment': sentiment_map.get(current_sentiment, 1),
                        'aspect': current_aspect,
                        'text_length': len(current_review),
                        'generated': True
                    })
                    current_review = None
                    current_sentiment = None
                    current_aspect = None
                    if len(samples) >= max_samples:
                        break
        return samples
    def _create_manual_samples(self) -> List[Dict]:
        manual_samples = [
            {'review_text': 'この商品の品質は期待以上でした。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '高品質な材料を使用していて満足です。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '作りがしっかりしていて良い商品です。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '品質が悪くてがっかりしました。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': '安っぽい材料で作られている感じがします。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': 'クオリティが低すぎて使い物になりません。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': '普通の商品だと思います。', 'sentiment': 1, 'aspect': 'quality'},
            {'review_text': '特に良くも悪くもありません。', 'sentiment': 1, 'aspect': 'service'},
            {'review_text': '標準的な価格帯の商品です。', 'sentiment': 1, 'aspect': 'price'},
        ]
        for sample in manual_samples:
            sample['text_length'] = len(sample['review_text'])
            sample['generated'] = False
        return manual_samples

# =======================
# Analytics/Plotting/etc.
# (Same as before, not pasted for brevity)
# ...Keep your EnhancedBusinessInsightExtractor here
# =======================

# =======================
# LoRA MultiLLM pipeline
# (Same as before, not pasted for brevity)
# ...Keep AdvancedMultiTaskModel & VertexAIJapaneseABSAPipeline here
# =======================

# =======================
# RAG Retriever
# (Same as before, not pasted for brevity)
# ...Keep your RAGInference here
# =======================

# ======================================
# Gemini/PaLM2 RAG Pipeline Class
# ======================================

class GeminiRAGReranker:
    """Sentiment classification by a Vertex AI generative model over retrieved reviews.

    Args:
        retriever: object whose ``retrieve(text, top_k=...)`` returns a list
            of reference review texts.
        gcp_project: project passed to ``vertexai.init``.
        gcp_location: location passed to ``vertexai.init``.
        model_name: generative model to load.
    """

    def __init__(self, retriever, gcp_project, gcp_location, model_name="gemini-pro"):
        vertexai.init(project=gcp_project, location=gcp_location)
        self.retriever = retriever
        self.model = GenerativeModel(model_name)

    def prompt(self, user_text, context_texts):
        """Compose the Japanese classification prompt with numbered reference reviews."""
        numbered_refs = "\n".join(
            f"{rank}. {snippet}" for rank, snippet in enumerate(context_texts, start=1)
        )
        sections = [
            f"ユーザーレビュー: {user_text}",
            "参考レビュー:",
            numbered_refs,
            "-----",
            "このレビューを「ポジティブ」「ニュートラル」「ネガティブ」から分類し、その理由も日本語で簡潔に説明してください。",
            "出力例: ",
            "Sentiment: ポジティブ",
            "Reason: ...理由...",
        ]
        return "\n".join(sections)

    def predict(self, user_text, top_k=3):
        """Return ``(model response text, retrieved reviews)`` for ``user_text``."""
        references = self.retriever.retrieve(user_text, top_k=top_k)
        request = self.prompt(user_text, references)
        answer = self.model.generate_content(request)
        return answer.text, references

# ================
# Main Entrypoint
# ================

if __name__ == "__main__":
    # ---- Stage 1: fine-tune the local LoRA candidates ----
    vertex_config = VertexAIConfig(
        project_id="your-gcp-project",
        location="us-central1",
        staging_bucket="gs://your-staging-bucket",
        machine_type="n1-standard-4",
        accelerator_type="NVIDIA_TESLA_T4",
        accelerator_count=1,
        use_gpu=True,
    )
    model_config = ModelConfig(
        embedding_model_name="intfloat/multilingual-e5-base",
        vertex_embedding_model="textembedding-gecko-multilingual@001",
        max_text_length=256,
        batch_size=64,
        n_splits=5,
        test_size=0.2,
        random_state=42,
    )
    aspect_config = AspectConfig()
    pipeline = VertexAIJapaneseABSAPipeline(vertex_config, model_config, aspect_config)
    pipeline.pipeline(num_samples=1200)

    # ---- Stage 2: retrieval-augmented prediction with the local model ----
    # The training split serves as the retrieval corpus.
    print("\n--- Initializing RAG inference with vector DB embeddings (local LoRA) ---")
    rag = RAGInference(
        pipeline.model,
        pipeline.tokenizer,
        pipeline.df_train,
        pipeline.embedding_extractor,
        top_k=3,
    )

    example_text = "この商品はとても便利ですが、品質が少し心配です。"
    pred, retrieved = rag.predict(example_text)
    sentiment_names = ['negative', 'neutral', 'positive']  # indexed by class id
    print(f"\n[Local LoRA RAG] Sentiment prediction: {sentiment_names[pred]}")
    print("Retrieved context:\n- " + "\n- ".join(retrieved))

    # ---- Stage 3: same retriever, classification delegated to Gemini ----
    print("\n--- Initializing Gemini/VertexAI LLM RAG inference ---")
    # Only the retriever's .retrieve() is used here, not the local classifier.
    gemini_rag = GeminiRAGReranker(
        retriever=rag,
        gcp_project=vertex_config.project_id,
        gcp_location=vertex_config.location,
        model_name="gemini-pro",  # or "chat-bison"
    )
    gemini_output, gemini_retrieved = gemini_rag.predict(example_text, top_k=3)
    print("\n[Gemini RAG] Response:")
    print(gemini_output)
    print("Retrieved context:")
    for c in gemini_retrieved:
        print("-", c)
