In [None]:
# Enhanced Japanese Aspect-Based Sentiment Analysis Pipeline with Vertex AI
# Requirements: google-cloud-aiplatform, sentence-transformers, xgboost, optuna, scikit-learn, matplotlib, seaborn, pandas, numpy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import json
import re
import os
import gc
import warnings
from typing import Dict, List, Tuple, Optional, Any, Union
from dataclasses import dataclass
from pathlib import Path
import logging
from datetime import datetime
import base64
import pickle

# Vertex AI imports
from google.cloud import aiplatform
from google.cloud.aiplatform import gapic
from google.cloud.aiplatform.gapic.schema import predict
from google.cloud import storage
import vertexai
from vertexai.language_models import TextGenerationModel, TextEmbeddingModel
from vertexai.preview.generative_models import GenerativeModel

# Traditional ML imports
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay,
    precision_recall_fscore_support, accuracy_score, f1_score
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
from sentence_transformers import SentenceTransformer

# Configure logging: INFO-level root configuration plus the module-level
# logger used throughout this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# NOTE: this silences *every* warning process-wide, not only the noisy ones.
warnings.filterwarnings("ignore")

# Set plotting style (global matplotlib style and seaborn colour palette)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

@dataclass
class VertexAIConfig:
    """Configuration for Vertex AI.

    Bundles the GCP project, region and bucket settings together with
    display names and hardware options. The string defaults for
    ``project_id`` and ``staging_bucket`` are placeholders.
    """
    project_id: str = "your-project-id"  # Replace with your GCP project ID
    location: str = "us-central1"  # Region passed to vertexai.init / aiplatform.init
    staging_bucket: str = "gs://your-bucket-name"  # Replace with your bucket
    model_display_name: str = "japanese-absa-model"
    endpoint_display_name: str = "japanese-absa-endpoint"
    service_account: Optional[str] = None  # Optional service account
    machine_type: str = "n1-standard-4"
    accelerator_type: str = "NVIDIA_TESLA_T4"
    accelerator_count: int = 1
    use_gpu: bool = True  # True selects XGBoost tree_method "gpu_hist", False selects "auto"

@dataclass
class ModelConfig:
    """Configuration for the ABSA model.

    Groups the cross-validation, embedding, tuning and data-filtering
    settings read by the pipeline classes in this file.
    """
    n_splits: int = 5  # Number of StratifiedKFold folds used during tuning
    embedding_model_name: str = "intfloat/multilingual-e5-base"  # SentenceTransformer fallback model
    optuna_trials: int = 30  # Number of Optuna trials
    test_size: float = 0.2  # Hold-out fraction given to train_test_split
    random_state: int = 42  # Seed shared by the splitters and XGBoost
    batch_size: int = 64  # Batch size for SentenceTransformer.encode
    max_text_length: int = 512  # Reviews with more characters than this are dropped
    vertex_embedding_model: str = "textembedding-gecko-multilingual@001"  # Vertex AI embedding model id

@dataclass
class AspectConfig:
    """Aspect categories and the Japanese keywords that signal each one.

    ``aspects`` maps an aspect name to its list of trigger keywords. When it
    is left as ``None`` the built-in eight-aspect vocabulary is installed
    after construction; any other value (including an empty mapping) is
    kept exactly as supplied.
    """
    aspects: Dict[str, List[str]] = None

    def __post_init__(self):
        # A caller-supplied mapping is never overwritten.
        if self.aspects is not None:
            return

        # Keyword order and aspect order are significant: other code iterates
        # this mapping and slices the keyword lists.
        self.aspects = dict(
            quality=['品質', '質', '良い', '悪い', '高品質', '低品質', 'クオリティ', '品質管理'],
            service=['サービス', '対応', '接客', '親切', '丁寧', '態度', 'スタッフ', '店員'],
            price=['価格', '値段', '料金', '安い', '高い', 'コスト', '費用', '価格設定'],
            convenience=['便利', '不便', '簡単', '難しい', '使いやすい', '使いにくい', 'アクセス'],
            speed=['速い', '遅い', '早い', 'スピード', '迅速', '時間', '待ち時間'],
            atmosphere=['雰囲気', '環境', '空間', '居心地', '快適', '不快', '清潔'],
            taste=['味', '美味しい', 'まずい', '美味', '風味', '食感', '新鮮'],
            design=['デザイン', '見た目', '外観', 'おしゃれ', 'かっこいい', '美しい'],
        )

class VertexAIEmbeddingExtractor:
    """Produce text embeddings with Vertex AI, or with a local model as fallback.

    On construction the Vertex AI embedding model named in ``model_config``
    is loaded. If that fails for any reason a ``SentenceTransformer`` is
    loaded instead, and ``get_embeddings`` dispatches on whichever model
    ended up in ``self.embedding_model``.
    """

    def __init__(self, config: VertexAIConfig, model_config: ModelConfig):
        self.embedding_model = None
        self.model_config = model_config
        self.config = config
        self._initialize_vertex_ai()

    def _initialize_vertex_ai(self):
        """Load the Vertex AI embedding model, falling back to SentenceTransformer."""
        try:
            vertexai.init(project=self.config.project_id, location=self.config.location)
            vertex_model_name = self.model_config.vertex_embedding_model
            self.embedding_model = TextEmbeddingModel.from_pretrained(vertex_model_name)
            logger.info(f"Initialized Vertex AI embedding model: {self.model_config.vertex_embedding_model}")
        except Exception as e:
            # Any failure (credentials, quota, unknown model, ...) triggers the
            # local fallback rather than aborting construction.
            logger.warning(f"Failed to initialize Vertex AI embeddings: {e}")
            logger.info("Falling back to SentenceTransformer")
            fallback_name = self.model_config.embedding_model_name
            self.embedding_model = SentenceTransformer(fallback_name)

    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` with whichever model was loaded at construction.

        Args:
            texts: Strings to embed.

        Returns:
            An array of embedding vectors, one per input text.

        Raises:
            Exception: Any error from the underlying model is logged and
                re-raised unchanged.
        """
        try:
            if not isinstance(self.embedding_model, TextEmbeddingModel):
                # SentenceTransformer fallback: one call, normalized vectors.
                return self.embedding_model.encode(
                    texts,
                    show_progress_bar=True,
                    batch_size=self.model_config.batch_size,
                    normalize_embeddings=True
                )

            # Vertex AI path: send small chunks (Vertex AI has rate limits).
            chunk_size = 5
            vectors = []
            for start in range(0, len(texts), chunk_size):
                chunk = texts[start:start + chunk_size]
                chunk_result = self.embedding_model.get_embeddings(chunk)
                vectors.extend(item.values for item in chunk_result)

            return np.array(vectors)
        except Exception as e:
            logger.error(f"Error getting embeddings: {e}")
            raise

class VertexAIDataGenerator:
    """Generate labelled Japanese review data with a Vertex AI generative model.

    One prompt is built per (aspect, sentiment) pair. The model's free-text
    answer is parsed back into rows with the columns ``review_text``,
    ``sentiment`` (0 = negative, 1 = neutral, 2 = positive), ``aspect``,
    ``text_length`` and ``generated``. A small set of hand-written samples is
    always appended.
    """

    # How many reviews a single prompt asks the model to write. A larger
    # per-prompt quota is met by issuing several requests.
    REVIEWS_PER_REQUEST = 20

    # Numeric class ids stored in the ``sentiment`` column.
    _SENTIMENT_IDS = {'positive': 2, 'neutral': 1, 'negative': 0}

    def __init__(self, config: "VertexAIConfig", aspect_config: "AspectConfig"):
        self.config = config
        self.aspect_config = aspect_config
        self.generative_model = None
        self._initialize_model()

    def _initialize_model(self):
        """Initialize the generative model; log and re-raise on failure."""
        try:
            vertexai.init(project=self.config.project_id, location=self.config.location)
            self.generative_model = GenerativeModel("gemini-pro")
            logger.info("Initialized Vertex AI generative model")
        except Exception as e:
            logger.error(f"Failed to initialize generative model: {e}")
            raise

    def generate_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """Generate roughly ``num_samples`` samples plus the manual samples.

        The request is divided evenly over all prompts
        (``num_samples // number_of_prompts`` each). Because one request only
        yields about ``REVIEWS_PER_REQUEST`` reviews, several requests are
        issued per prompt when the per-prompt quota is larger than that.

        Args:
            num_samples: Target number of model-generated samples.

        Returns:
            DataFrame with the generated rows followed by the manual rows.
            If generation falls short (errors, unparsable answers) a warning
            is logged and whatever was obtained is returned.
        """
        logger.info(f"Generating {num_samples} training samples using Vertex AI...")

        # Create prompts for different aspects and sentiments
        prompts = self._create_generation_prompts()

        # Guard against an empty aspect configuration (no prompts at all).
        samples_per_prompt = num_samples // len(prompts) if prompts else 0

        generated_data = []
        if samples_per_prompt <= 0:
            logger.warning(
                f"num_samples={num_samples} is too small to request data from "
                f"{len(prompts)} prompts; using manual samples only"
            )
        else:
            for i, prompt in enumerate(prompts):
                logger.info(f"Generating data for prompt {i+1}/{len(prompts)}")
                generated_data.extend(
                    self._generate_for_prompt(i, prompt, samples_per_prompt)
                )

            expected = samples_per_prompt * len(prompts)
            if len(generated_data) < expected:
                logger.warning(
                    f"Requested {expected} generated samples but only "
                    f"obtained {len(generated_data)}"
                )

        # Create DataFrame
        df = pd.DataFrame(generated_data)

        # Add some manual examples to ensure quality
        manual_df = pd.DataFrame(self._create_manual_samples())
        df = pd.concat([df, manual_df], ignore_index=True)

        logger.info(f"Generated {len(df)} training samples")
        return df

    def _generate_for_prompt(self, prompt_index: int, prompt: str, quota: int) -> List[Dict]:
        """Collect up to ``quota`` samples for one prompt.

        Issues at most ``ceil(quota / REVIEWS_PER_REQUEST)`` requests and
        stops early on the first error or empty answer, so a failing prompt
        is never retried in a tight loop.
        """
        collected = []
        max_requests = -(-quota // self.REVIEWS_PER_REQUEST)  # ceiling division

        for _ in range(max_requests):
            try:
                # Generate text using Vertex AI
                response = self.generative_model.generate_content(
                    prompt,
                    generation_config={
                        "max_output_tokens": 2048,
                        "temperature": 0.7,
                        "top_p": 0.8,
                        "top_k": 40
                    }
                )
                # ``response.text`` can itself raise (e.g. blocked answers),
                # so it stays inside the try block.
                batch = self._parse_generated_response(
                    response.text, quota - len(collected)
                )
            except Exception as e:
                logger.warning(f"Error generating data for prompt {prompt_index}: {e}")
                break

            if not batch:
                break
            collected.extend(batch)
            if len(collected) >= quota:
                break

        return collected

    def _create_generation_prompts(self) -> List[str]:
        """Create one generation prompt per (aspect, sentiment) combination."""
        prompts = []
        per_request = self.REVIEWS_PER_REQUEST

        # Create prompts for each aspect and sentiment combination
        for aspect, keywords in self.aspect_config.aspects.items():
            for sentiment in ['positive', 'negative', 'neutral']:
                prompt = f"""
Generate {per_request} realistic Japanese customer reviews about {aspect} ({', '.join(keywords[:3])}) 
with {sentiment} sentiment. Each review should be 20-100 characters long.

Format each review as:
Review: [Japanese text]
Sentiment: {sentiment}
Aspect: {aspect}

Example:
Review: このサービスの品質は素晴らしいです。
Sentiment: positive
Aspect: quality

Generate {per_request} similar reviews:
"""
                prompts.append(prompt)

        return prompts

    def _parse_generated_response(self, response_text: str, max_samples: int) -> List[Dict]:
        """Parse a ``Review:/Sentiment:/Aspect:`` formatted answer into rows.

        A sample is emitted when an ``Aspect:`` line arrives and a review,
        sentiment and aspect have all been seen since the last emitted
        sample.

        Args:
            response_text: Raw model output.
            max_samples: Upper bound on returned samples; values <= 0 yield
                an empty list.

        Returns:
            List of sample dicts. Sentiment labels are matched
            case-insensitively; unrecognized labels map to neutral (1).
        """
        samples = []
        if max_samples <= 0 or not response_text:
            return samples

        current_review = None
        current_sentiment = None
        current_aspect = None

        for raw_line in response_text.split('\n'):
            line = raw_line.strip()

            # Only the leading label is removed, so a review that happens to
            # contain the word "Review:" is kept intact.
            if line.startswith('Review:'):
                current_review = line[len('Review:'):].strip()
            elif line.startswith('Sentiment:'):
                current_sentiment = line[len('Sentiment:'):].strip()
            elif line.startswith('Aspect:'):
                current_aspect = line[len('Aspect:'):].strip()

                # If we have all three components, create a sample
                if current_review and current_sentiment and current_aspect:
                    samples.append({
                        'review_text': current_review,
                        'sentiment': self._SENTIMENT_IDS.get(current_sentiment.lower(), 1),
                        'aspect': current_aspect,
                        'text_length': len(current_review),
                        'generated': True
                    })

                    # Reset for next sample
                    current_review = None
                    current_sentiment = None
                    current_aspect = None

                    if len(samples) >= max_samples:
                        break

        return samples

    def _create_manual_samples(self) -> List[Dict]:
        """Return the hand-written samples, tagged with ``generated=False``."""
        manual_samples = [
            # Quality - Positive
            {'review_text': 'この商品の品質は期待以上でした。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '高品質な材料を使用していて満足です。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '作りがしっかりしていて良い商品です。', 'sentiment': 2, 'aspect': 'quality'},

            # Quality - Negative
            {'review_text': '品質が悪くてがっかりしました。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': '安っぽい材料で作られている感じがします。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': 'クオリティが低すぎて使い物になりません。', 'sentiment': 0, 'aspect': 'quality'},

            # Service - Positive
            {'review_text': 'スタッフの対応が素晴らしかったです。', 'sentiment': 2, 'aspect': 'service'},
            {'review_text': '親切で丁寧な接客に感謝します。', 'sentiment': 2, 'aspect': 'service'},
            {'review_text': 'サービスが良くて気持ちよく利用できました。', 'sentiment': 2, 'aspect': 'service'},

            # Service - Negative
            {'review_text': '店員の態度が悪くて不快でした。', 'sentiment': 0, 'aspect': 'service'},
            {'review_text': 'サービスの質が低くて残念です。', 'sentiment': 0, 'aspect': 'service'},
            {'review_text': '接客が雑で二度と来たくありません。', 'sentiment': 0, 'aspect': 'service'},

            # Price - Positive
            {'review_text': '価格が安くてお得感があります。', 'sentiment': 2, 'aspect': 'price'},
            {'review_text': 'コストパフォーマンスが良い商品です。', 'sentiment': 2, 'aspect': 'price'},
            {'review_text': '適正価格で満足しています。', 'sentiment': 2, 'aspect': 'price'},

            # Price - Negative
            {'review_text': '値段が高すぎて手が出ません。', 'sentiment': 0, 'aspect': 'price'},
            {'review_text': '価格設定が不適切だと思います。', 'sentiment': 0, 'aspect': 'price'},
            {'review_text': 'コストが高くて続けられません。', 'sentiment': 0, 'aspect': 'price'},

            # Neutral samples
            {'review_text': '普通の商品だと思います。', 'sentiment': 1, 'aspect': 'quality'},
            {'review_text': '特に良くも悪くもありません。', 'sentiment': 1, 'aspect': 'service'},
            {'review_text': '標準的な価格帯の商品です。', 'sentiment': 1, 'aspect': 'price'},
        ]

        # Add text_length and generated flag
        for sample in manual_samples:
            sample['text_length'] = len(sample['review_text'])
            sample['generated'] = False

        return manual_samples

class EnhancedBusinessInsightExtractor:
    """Turn per-review sentiment and aspect flags into printed business reports.

    Expects DataFrames with a numeric ``sentiment`` column (0/1/2) and, per
    configured aspect, an ``aspect_<name>`` indicator column.
    """

    def __init__(self, aspect_config: AspectConfig = None, label_map: Dict[int, str] = None):
        self.aspect_config = aspect_config if aspect_config else AspectConfig()
        self.label_map = label_map if label_map else {0: "negative", 1: "neutral", 2: "positive"}
        self.colors = {
            'negative': '#FF6B6B',
            'neutral': '#FFD93D', 
            'positive': '#6BCF7F'
        }

    def calculate_aspect_metrics(self, df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
        """Compute mention counts, sentiment rates and a net score per aspect.

        Aspects whose indicator column is missing, or that are never
        flagged with 1, are left out of the result.
        """
        per_aspect = {}

        for name in self.aspect_config.aspects:
            flag_column = f'aspect_{name}'
            if flag_column not in df.columns:
                continue

            mentioned = df[df[flag_column] == 1]
            n_mentions = len(mentioned)
            if n_mentions == 0:
                continue

            counts = mentioned['sentiment'].value_counts()
            n_neg = counts.get(0, 0)
            n_neu = counts.get(1, 0)
            n_pos = counts.get(2, 0)

            per_aspect[name] = {
                'total_mentions': n_mentions,
                'negative_count': n_neg,
                'neutral_count': n_neu,
                'positive_count': n_pos,
                'negative_rate': n_neg / n_mentions * 100,
                'neutral_rate': n_neu / n_mentions * 100,
                'positive_rate': n_pos / n_mentions * 100,
                # Net sentiment in [-1, 1]: share of positive minus negative.
                'sentiment_score': (n_pos - n_neg) / n_mentions
            }

        return per_aspect

    def generate_business_recommendations(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Print an aspect summary and recommendations; return the raw data.

        Returns a dict with ``metrics``, the full ``recommendations`` list
        (only the first five are printed) and ``priority_aspects`` as
        ``(aspect, priority_score, metrics)`` tuples, highest score first.
        """
        metrics = self.calculate_aspect_metrics(df)
        recommendations = []
        heavy_rule = "=" * 50
        light_rule = "-" * 40

        print("\n" + heavy_rule)
        print("BUSINESS INSIGHTS & RECOMMENDATIONS")
        print(heavy_rule)

        # Priority = negative rate weighted by the log of the mention count.
        priority_aspects = sorted(
            (
                (name, stats['negative_rate'] * np.log(stats['total_mentions'] + 1), stats)
                for name, stats in metrics.items()
            ),
            key=lambda entry: entry[1],
            reverse=True,
        )

        print("\n📊 ASPECT PERFORMANCE SUMMARY")
        print(light_rule)
        for name, _, stats in priority_aspects:
            neg_rate = stats['negative_rate']
            pos_rate = stats['positive_rate']

            print(f"• {name.upper()}: {stats['total_mentions']} mentions")
            print(f"  ├─ Negative: {neg_rate:.1f}% ({stats['negative_count']} reviews)")
            print(f"  ├─ Neutral:  {stats['neutral_rate']:.1f}% ({stats['neutral_count']} reviews)")
            print(f"  └─ Positive: {pos_rate:.1f}% ({stats['positive_count']} reviews)")

            # At most one recommendation per aspect; negative findings win.
            if neg_rate > 30:
                recommendations.append(f"🔴 URGENT: Address {name} issues - {neg_rate:.1f}% negative feedback")
            elif neg_rate > 20:
                recommendations.append(f"🟡 ATTENTION: Monitor {name} - {neg_rate:.1f}% negative feedback")
            elif pos_rate > 70:
                recommendations.append(f"🟢 STRENGTH: Leverage {name} success - {pos_rate:.1f}% positive feedback")

        print("\n🎯 ACTIONABLE RECOMMENDATIONS")
        print(light_rule)
        for position, text in enumerate(recommendations[:5], 1):
            print(f"{position}. {text}")

        return {
            "metrics": metrics,
            "recommendations": recommendations,
            "priority_aspects": priority_aspects
        }

    def executive_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Print headline figures and a sentiment bar chart; return them as a dict.

        Reads the ``sentiment`` and ``text_length`` columns of ``df``.
        """
        heavy_rule = "=" * 50
        print("\n" + heavy_rule)
        print("EXECUTIVE SUMMARY")
        print(heavy_rule)

        total_reviews = len(df)
        # Percentage share of each sentiment class.
        sentiment_dist = df['sentiment'].value_counts(normalize=True) * 100

        positive_share = sentiment_dist.get(2, 0)
        negative_share = sentiment_dist.get(0, 0)
        overall_sentiment_score = (positive_share - negative_share) / 100

        avg_text_length = df['text_length'].mean()

        print(f"📈 Total Reviews Analyzed: {total_reviews:,}")
        print(f"📊 Overall Sentiment Score: {overall_sentiment_score:.3f} (-1 to +1)")
        print(f"📝 Average Review Length: {avg_text_length:.0f} characters")

        print("\n🎭 Sentiment Distribution:")
        for class_id in (0, 1, 2):
            label = self.label_map[class_id]
            pct = sentiment_dist.get(class_id, 0)
            filled = int(pct / 2)  # 50-character bar, so one cell per 2%
            bar = "█" * filled + "░" * (50 - filled)
            print(f"  {label.capitalize():>8}: {pct:5.1f}% |{bar}|")

        return {
            "total_reviews": total_reviews,
            "sentiment_distribution": sentiment_dist.to_dict(),
            "overall_sentiment_score": overall_sentiment_score,
            "avg_text_length": avg_text_length
        }

class VertexAIJapaneseABSAPipeline:
    """End-to-end Japanese ABSA pipeline built on Vertex AI and XGBoost.

    Stages (see ``pipeline``): synthetic data generation, train/test split,
    feature extraction (aspect keyword flags + scaled text embeddings),
    Optuna hyperparameter search, final training, evaluation, printed
    business insights, and upload of the pickled model artifact to GCS.
    """

    def __init__(self, vertex_config: VertexAIConfig, model_config: ModelConfig, aspect_config: AspectConfig):
        self.vertex_config = vertex_config
        self.model_config = model_config
        self.aspect_config = aspect_config

        # Initialize components
        self.data_generator = VertexAIDataGenerator(vertex_config, aspect_config)
        self.embedding_extractor = VertexAIEmbeddingExtractor(vertex_config, model_config)
        self.insight_extractor = EnhancedBusinessInsightExtractor(aspect_config)

        # Model components
        self.model = None
        self.scaler = None
        self.feature_columns = None
        self.endpoint = None

        # Initialize Vertex AI
        self._initialize_vertex_ai()

    def _initialize_vertex_ai(self):
        """Initialize the Vertex AI platform SDK; log and re-raise on failure."""
        try:
            aiplatform.init(
                project=self.vertex_config.project_id,
                location=self.vertex_config.location,
                staging_bucket=self.vertex_config.staging_bucket
            )
            logger.info("Vertex AI initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Vertex AI: {e}")
            raise

    def generate_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """Generate, clean and lightly featurize the training data.

        Rows with a missing review, 5 or fewer characters, or more than
        ``model_config.max_text_length`` characters are dropped. Adds the
        ``word_count``, ``exclamation_count`` and ``question_count`` columns.
        """
        logger.info("Generating training data...")

        # Generate data using Vertex AI
        df = self.data_generator.generate_training_data(num_samples)

        # Clean and validate data
        df = df.dropna(subset=['review_text'])
        text_len = df['review_text'].str.len()
        # .copy() so the column assignments below write to an independent frame.
        df = df[(text_len > 5) & (text_len <= self.model_config.max_text_length)].copy()

        # Add additional features
        # NOTE: whitespace split; Japanese text is usually unsegmented, so this
        # is often 1 per review.
        df['word_count'] = df['review_text'].str.split().str.len()
        # Series.str.count takes a regular expression. A bare '?' is not a
        # valid pattern, so character classes are used; they also match the
        # full-width marks common in Japanese text.
        df['exclamation_count'] = df['review_text'].str.count(r'[!！]')
        df['question_count'] = df['review_text'].str.count(r'[?？]')

        logger.info(f"Generated {len(df)} training samples")
        return df

    def extract_features(self, df: pd.DataFrame, fit_scaler: bool = False) -> pd.DataFrame:
        """Add aspect flags and (scaled) embedding columns to ``df``.

        Args:
            df: Frame with a ``review_text`` column; it is modified in place
                and also returned.
            fit_scaler: When True a new ``StandardScaler`` is fitted on this
                frame's embeddings and stored on ``self.scaler``. Otherwise
                an existing scaler, if any, is applied; with no scaler the
                embeddings are left unscaled.

        Returns:
            The same frame with ``aspect_<name>`` and ``emb_<i>`` columns.
        """
        logger.info(f"Extracting features for {len(df)} samples...")

        # Extract aspect features: 1 when any keyword of the aspect occurs.
        for aspect, keywords in self.aspect_config.aspects.items():
            pattern = '|'.join([re.escape(kw) for kw in keywords])
            df[f'aspect_{aspect}'] = df['review_text'].str.contains(
                pattern, na=False, regex=True
            ).astype(int)

        # Extract embeddings using Vertex AI
        logger.info("Generating embeddings...")
        embeddings = self.embedding_extractor.get_embeddings(df['review_text'].tolist())

        # Add embedding features
        embedding_dim = embeddings.shape[1]
        emb_cols = [f'emb_{i}' for i in range(embedding_dim)]
        for i, col in enumerate(emb_cols):
            df[col] = embeddings[:, i]

        # Scale embeddings
        if fit_scaler:
            self.scaler = StandardScaler()
            df[emb_cols] = self.scaler.fit_transform(df[emb_cols])
        elif self.scaler is not None:
            df[emb_cols] = self.scaler.transform(df[emb_cols])

        logger.info("Feature extraction completed")
        return df

    def prepare_features(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, List[str]]:
        """Select the model inputs from ``df``.

        Returns:
            ``(X, y, feature_cols)`` where the features are every
            ``aspect_*`` / ``emb_*`` column plus the four text statistics,
            in the frame's column order, and ``y`` is the ``sentiment``
            column.
        """
        text_stat_cols = ('text_length', 'word_count', 'exclamation_count', 'question_count')
        feature_cols = [
            col for col in df.columns
            if col.startswith(('aspect_', 'emb_')) or col in text_stat_cols
        ]

        X = df[feature_cols].values
        y = df['sentiment'].values

        logger.info(f"Feature matrix shape: {X.shape}")
        logger.info(f"Target distribution: {np.bincount(y)}")

        return X, y, feature_cols

    def optimize_hyperparameters(self, X: np.ndarray, y: np.ndarray) -> Tuple[Dict[str, Any], Any]:
        """Search XGBoost hyperparameters with Optuna.

        Each trial is scored by the mean weighted F1 over a stratified
        k-fold split of ``(X, y)``.

        Returns:
            ``(best_params, study)`` -- the best trial's parameters and the
            Optuna study object.
        """
        logger.info("Starting hyperparameter optimization...")

        def objective(trial):
            # XGBoost parameters
            params = {
                "objective": "multi:softprob",
                "num_class": 3,
                "eval_metric": "mlogloss",
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
                "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                "subsample": trial.suggest_float("subsample", 0.6, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
                "gamma": trial.suggest_float("gamma", 0, 5),
                "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
                "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
                "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
                "tree_method": "gpu_hist" if self.vertex_config.use_gpu else "auto",
                "use_label_encoder": False,
                "verbosity": 0,
                "random_state": self.model_config.random_state
            }

            # Cross-validation
            skf = StratifiedKFold(n_splits=self.model_config.n_splits, shuffle=True,
                                  random_state=self.model_config.random_state)

            scores = []
            for train_idx, val_idx in skf.split(X, y):
                X_train, X_val = X[train_idx], X[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]

                # Train model
                model = xgb.XGBClassifier(**params)
                model.fit(X_train, y_train)

                # Predict and evaluate
                y_pred = model.predict(X_val)
                scores.append(f1_score(y_val, y_pred, average='weighted'))

            return np.mean(scores)

        # Run optimization
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=self.model_config.optuna_trials)

        logger.info(f"Best score: {study.best_value:.4f}")
        logger.info(f"Best params: {study.best_params}")

        return study.best_params, study

    def train_model(self, X: np.ndarray, y: np.ndarray, params: Dict[str, Any]) -> xgb.XGBClassifier:
        """Train the final XGBoost classifier and store it on ``self.model``.

        ``params`` (typically the tuned values) override the fixed defaults
        set here.
        """
        logger.info("Training XGBoost model...")

        # Update parameters
        model_params = {
            "objective": "multi:softprob",
            "num_class": 3,
            "use_label_encoder": False,
            "tree_method": "gpu_hist" if self.vertex_config.use_gpu else "auto",
            "random_state": self.model_config.random_state,
            **params
        }

        # Train model
        self.model = xgb.XGBClassifier(**model_params)
        self.model.fit(X, y)

        logger.info("Model training completed")
        return self.model

    def save_model_to_gcs(self, model_path: str = None) -> str:
        """Pickle the model artifact and upload it to the staging bucket.

        Args:
            model_path: File name of the artifact; defaults to a
                timestamped ``absa_model_<YYYYmmdd_HHMMSS>.pkl``.

        Returns:
            The ``gs://`` URI of the uploaded object.
        """
        if model_path is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_path = f"absa_model_{timestamp}.pkl"

        # Create model artifact
        model_artifact = {
            'model': self.model,
            'scaler': self.scaler,
            'feature_columns': self.feature_columns,
            'vertex_config': self.vertex_config,
            'model_config': self.model_config,
            'aspect_config': self.aspect_config
        }

        # Save to local file first
        local_path = f"/tmp/{model_path}"
        with open(local_path, 'wb') as f:
            pickle.dump(model_artifact, f)

        # Split "gs://bucket/optional/prefix" into bucket name and object
        # prefix: a bucket name must not contain '/', so a staging path with
        # a prefix (or a trailing slash) cannot be used as the bucket as-is.
        staging = self.vertex_config.staging_bucket
        if staging.startswith('gs://'):
            staging = staging[len('gs://'):]
        bucket_name, _, prefix = staging.strip('/').partition('/')
        blob_name = '/'.join(
            part for part in (prefix.strip('/'), 'absa_models', model_path) if part
        )

        # Upload to GCS
        storage_client = storage.Client(project=self.vertex_config.project_id)
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.upload_from_filename(local_path)

        model_uri = f"gs://{bucket_name}/{blob_name}"
        logger.info(f"Model saved to GCS: {model_uri}")
        return model_uri

    def evaluate(self, X_test: np.ndarray, y_test: np.ndarray, feature_columns: List[str]):
        """Evaluate ``self.model`` on the hold-out set and save diagnostic plots.

        Returns:
            Dict with ``classification_report`` (dict form),
            ``confusion_matrix`` (3x3, rows/columns ordered negative,
            neutral, positive), ``feature_importance`` (sorted Series) and
            ``plots`` (file paths; ``shap_summary`` is ``None`` when the
            SHAP plot could not be produced).
        """
        logger.info("Evaluating model...")
        y_pred = self.model.predict(X_test)

        # Metrics
        cr = classification_report(y_test, y_pred, digits=3, output_dict=True)
        logger.info("Classification report:\n" + classification_report(y_test, y_pred, digits=3))
        # Pin the label set so the matrix is always 3x3 and matches the three
        # display labels even if a class is absent from this split.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])
        disp = ConfusionMatrixDisplay(cm, display_labels=['negative', 'neutral', 'positive'])
        disp.plot(cmap="Blues")
        plt.title("Confusion Matrix")
        plt.tight_layout()
        plt.savefig("/tmp/absa_cm.png")
        plt.close()
        logger.info("Confusion matrix saved as /tmp/absa_cm.png")

        # Feature importance
        importances = self.model.feature_importances_
        fi = pd.Series(importances, index=feature_columns).sort_values(ascending=False)
        plt.figure(figsize=(10,6))
        sns.barplot(x=fi.values[:20], y=fi.index[:20], palette='husl')
        plt.title("Top 20 Feature Importances")
        plt.tight_layout()
        plt.savefig("/tmp/absa_feature_importance.png")
        plt.close()
        logger.info("Feature importance plot saved as /tmp/absa_feature_importance.png")

        # SHAP is optional: a missing package or an explainer error must not
        # fail the evaluation, but the result must not advertise a plot that
        # was never written.
        shap_plot_path = None
        try:
            import shap
            explainer = shap.TreeExplainer(self.model)
            shap_values = explainer.shap_values(X_test)
            shap.summary_plot(shap_values, X_test, feature_names=feature_columns, show=False)
            plt.tight_layout()
            plt.savefig("/tmp/absa_shap_summary.png")
            plt.close()
            shap_plot_path = "/tmp/absa_shap_summary.png"
            logger.info("SHAP summary plot saved as /tmp/absa_shap_summary.png")
        except Exception as e:
            logger.warning(f"SHAP failed: {e}")

        return {
            "classification_report": cr,
            "confusion_matrix": cm,
            "feature_importance": fi,
            "plots": {
                "cm": "/tmp/absa_cm.png",
                "feature_importance": "/tmp/absa_feature_importance.png",
                "shap_summary": shap_plot_path
            }
        }

    def pipeline(self, num_samples: int = 1000):
        """Run the full workflow and return the model URI, metrics and study.

        Returns:
            Dict with ``model_uri`` (GCS location of the pickled artifact),
            ``results`` (output of ``evaluate``) and ``optuna_study``.
        """
        # 1. Data Generation
        df = self.generate_training_data(num_samples)

        # 2. Train/test split on the raw rows, *before* feature extraction,
        #    so the scaler is fitted on training data only and the test frame
        #    keeps exactly the rows that are evaluated.
        train_df, test_df = train_test_split(
            df,
            test_size=self.model_config.test_size,
            stratify=df['sentiment'],
            random_state=self.model_config.random_state,
        )

        # 3. Feature extraction: fit the scaler on train, reuse it on test.
        train_df = self.extract_features(train_df.copy(), fit_scaler=True)
        test_df = self.extract_features(test_df.copy(), fit_scaler=False)
        X_train, y_train, feature_columns = self.prepare_features(train_df)
        X_test, y_test, _ = self.prepare_features(test_df)
        self.feature_columns = feature_columns

        # 4. Hyperparameter tuning
        best_params, study = self.optimize_hyperparameters(X_train, y_train)

        # 5. Final training
        self.train_model(X_train, y_train, best_params)

        # 6. Evaluation
        results = self.evaluate(X_test, y_test, feature_columns)

        # 7. Business Insights (rows of test_df align with X_test row by row)
        df_test = test_df.copy()
        df_test['predicted_sentiment'] = self.model.predict(X_test)
        self.insight_extractor.executive_summary(df_test)
        self.insight_extractor.generate_business_recommendations(df_test)

        # 8. Save model
        model_uri = self.save_model_to_gcs()

        return {
            "model_uri": model_uri,
            "results": results,
            "optuna_study": study
        }


# --------------- Main Entrypoint ---------------

if __name__ == "__main__":
    # ==== Configure (customize these before running!) ====
    # Constructing the pipeline below immediately initializes the Vertex AI
    # clients, so project_id and staging_bucket must point at real resources.
    vertex_config = VertexAIConfig(
        project_id="able-balm-454718-n8",  # YOUR GCP PROJECT
        location="us-central1",
        staging_bucket="gs://your-staging-bucket",  # YOUR BUCKET
        use_gpu=True
    )
    # Same values as the ModelConfig defaults except max_text_length
    # (256 here, 512 by default).
    model_config = ModelConfig(
        n_splits=5,
        embedding_model_name="intfloat/multilingual-e5-base",
        optuna_trials=30,
        test_size=0.2,
        random_state=42,
        batch_size=64,
        max_text_length=256,
        vertex_embedding_model="textembedding-gecko-multilingual@001"
    )
    # Built-in aspect vocabulary.
    aspect_config = AspectConfig()

    # Run every stage: generation, features, tuning, training, evaluation,
    # insights and upload.
    pipeline = VertexAIJapaneseABSAPipeline(vertex_config, model_config, aspect_config)
    result = pipeline.pipeline(num_samples=10_000)

    print("\n\n=========================")
    print("Pipeline finished!")
    print(f"Model saved at: {result['model_uri']}")
    print("=========================")


In [None]:
# Enhanced Japanese Aspect-Based Sentiment Analysis Pipeline with Vertex AI
# Requirements: google-cloud-aiplatform, sentence-transformers, xgboost, optuna, scikit-learn, matplotlib, seaborn, pandas, numpy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import json
import re
import os
import gc
import warnings
from typing import Dict, List, Tuple, Optional, Any, Union
from dataclasses import dataclass
from pathlib import Path
import logging
from datetime import datetime
import base64
import pickle

# Vertex AI imports
from google.cloud import aiplatform
from google.cloud.aiplatform import gapic
from google.cloud.aiplatform.gapic.schema import predict
from google.cloud import storage
import vertexai
from vertexai.language_models import TextGenerationModel, TextEmbeddingModel
from vertexai.preview.generative_models import GenerativeModel

# Traditional ML imports
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay,
    precision_recall_fscore_support, accuracy_score, f1_score
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
from sentence_transformers import SentenceTransformer

# Configure logging: INFO level on the root logger, plus a module-level
# logger that the classes below use for progress messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# NOTE: this suppresses every warning category for the whole process,
# including deprecation warnings raised by third-party libraries.
warnings.filterwarnings("ignore")

# Set plotting style (matplotlib style sheet first, then the seaborn palette)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

@dataclass
class VertexAIConfig:
    """Configuration for Vertex AI.

    Every field has a default, so ``VertexAIConfig()`` is valid; the
    placeholder project and bucket values must be replaced before any real
    Vertex AI or GCS call is made.
    """
    project_id: str = "your-project-id"  # Replace with your GCP project ID
    location: str = "us-central1"  # Region passed to vertexai/aiplatform init
    staging_bucket: str = "gs://your-bucket-name"  # Replace with your bucket
    model_display_name: str = "japanese-absa-model"
    endpoint_display_name: str = "japanese-absa-endpoint"
    # Optional service account; None means "not specified".
    service_account: Optional[str] = None
    machine_type: str = "n1-standard-4"
    accelerator_type: str = "NVIDIA_TESLA_T4"
    accelerator_count: int = 1
    # When True the pipeline selects XGBoost's GPU tree method.
    use_gpu: bool = True

@dataclass
class ModelConfig:
    """Tunable settings for feature extraction, tuning and evaluation.

    All fields carry defaults, so ``ModelConfig()`` yields a usable setup.
    """
    n_splits: int = 5  # number of stratified CV folds
    embedding_model_name: str = "intfloat/multilingual-e5-base"  # SentenceTransformer fallback model
    optuna_trials: int = 30  # Optuna trials per hyperparameter search
    test_size: float = 0.2  # fraction of rows held out for evaluation
    random_state: int = 42  # seed shared by the splitters and XGBoost
    batch_size: int = 64  # encode batch size for the SentenceTransformer fallback
    max_text_length: int = 512  # reviews longer than this many characters are dropped
    vertex_embedding_model: str = "textembedding-gecko-multilingual@001"  # Vertex AI embedding model id

@dataclass
class AspectConfig:
    """Configuration for aspect categories.

    ``aspects`` maps an aspect name to the Japanese keywords that signal it.
    ``None`` is used as the "not supplied" sentinel (a dict literal cannot be
    a dataclass default); in that case ``__post_init__`` installs a fresh copy
    of the built-in keyword table, so instances never share one dict.
    """
    aspects: Optional[Dict[str, List[str]]] = None

    def __post_init__(self):
        """Fill in the default aspect/keyword table when none was given."""
        if self.aspects is None:
            self.aspects = {
                'quality': ['品質', '質', '良い', '悪い', '高品質', '低品質', 'クオリティ', '品質管理'],
                'service': ['サービス', '対応', '接客', '親切', '丁寧', '態度', 'スタッフ', '店員'],
                'price': ['価格', '値段', '料金', '安い', '高い', 'コスト', '費用', '価格設定'],
                'convenience': ['便利', '不便', '簡単', '難しい', '使いやすい', '使いにくい', 'アクセス'],
                'speed': ['速い', '遅い', '早い', 'スピード', '迅速', '時間', '待ち時間'],
                'atmosphere': ['雰囲気', '環境', '空間', '居心地', '快適', '不快', '清潔'],
                'taste': ['味', '美味しい', 'まずい', '美味', '風味', '食感', '新鮮'],
                'design': ['デザイン', '見た目', '外観', 'おしゃれ', 'かっこいい', '美しい']
            }

class VertexAIEmbeddingExtractor:
    """Embed texts with Vertex AI, or with a SentenceTransformer if that fails to load."""

    def __init__(self, config: VertexAIConfig, model_config: ModelConfig):
        """Keep both configs and load an embedding backend immediately."""
        self.embedding_model = None
        self.config = config
        self.model_config = model_config
        self._initialize_vertex_ai()

    def _initialize_vertex_ai(self):
        """Load the Vertex AI embedding model; on any error use the local fallback.

        The fallback is the SentenceTransformer named by
        ``model_config.embedding_model_name``.
        """
        try:
            vertexai.init(
                project=self.config.project_id,
                location=self.config.location,
            )
            vertex_model = TextEmbeddingModel.from_pretrained(
                self.model_config.vertex_embedding_model
            )
            self.embedding_model = vertex_model
            logger.info(f"Initialized Vertex AI embedding model: {self.model_config.vertex_embedding_model}")
        except Exception as e:
            logger.warning(f"Failed to initialize Vertex AI embeddings: {e}")
            logger.info("Falling back to SentenceTransformer")
            self.embedding_model = SentenceTransformer(self.model_config.embedding_model_name)

    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        """Return one embedding row per input text.

        Vertex AI requests are sent five texts at a time; the fallback model
        encodes with ``model_config.batch_size`` and normalised vectors.
        Errors are logged and re-raised unchanged.
        """
        try:
            if not isinstance(self.embedding_model, TextEmbeddingModel):
                # Local SentenceTransformer path
                return self.embedding_model.encode(
                    texts,
                    show_progress_bar=True,
                    batch_size=self.model_config.batch_size,
                    normalize_embeddings=True,
                )

            # Vertex AI path: small request chunks because of rate limits
            chunk_size = 5
            vectors = []
            for start in range(0, len(texts), chunk_size):
                chunk = texts[start:start + chunk_size]
                results = self.embedding_model.get_embeddings(chunk)
                vectors += [item.values for item in results]
            return np.array(vectors)
        except Exception as e:
            logger.error(f"Error getting embeddings: {e}")
            raise

class VertexAIDataGenerator:
    """Generate training data using Vertex AI's generative models.

    One prompt is built per (aspect, sentiment) pair of ``aspect_config``;
    each plain-text answer is parsed into rows and a fixed set of
    hand-written reviews is appended at the end.
    """

    # Numeric encoding of sentiment labels used in the 'sentiment' column.
    _SENTIMENT_MAP = {'positive': 2, 'neutral': 1, 'negative': 0}

    def __init__(self, config: VertexAIConfig, aspect_config: AspectConfig,
                 model_name: str = "gemini-pro"):
        """Store the configs and load the generative model.

        Args:
            config: Vertex AI project/location settings.
            aspect_config: aspect -> keyword table used to build prompts.
            model_name: generative model id; defaults to the previously
                hard-coded "gemini-pro".

        Raises:
            Exception: re-raised from model initialisation after logging.
        """
        self.config = config
        self.aspect_config = aspect_config
        self.model_name = model_name
        self.generative_model = None
        self._initialize_model()

    def _initialize_model(self):
        """Initialize generative model; log and re-raise on failure."""
        try:
            vertexai.init(project=self.config.project_id, location=self.config.location)
            self.generative_model = GenerativeModel(self.model_name)
            logger.info("Initialized Vertex AI generative model")
        except Exception as e:
            logger.error(f"Failed to initialize generative model: {e}")
            raise

    def generate_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """Generate training data using Vertex AI.

        Args:
            num_samples: target number of generated rows, split evenly over
                the prompts (at least one row is accepted per prompt).

        Returns:
            DataFrame with the generated rows followed by the manual rows.
            A prompt whose request or parsing fails is skipped with a warning.
        """
        logger.info(f"Generating {num_samples} training samples using Vertex AI...")

        # Create prompts for different aspects and sentiments
        prompts = self._create_generation_prompts()

        generated_data: List[Dict] = []
        # Guard the division: an empty aspect table yields no prompts at all.
        # max(1, ...) keeps the old effective minimum of one row per prompt.
        samples_per_prompt = max(1, num_samples // len(prompts)) if prompts else 0

        for i, prompt in enumerate(prompts, start=1):
            logger.info(f"Generating data for prompt {i}/{len(prompts)}")

            try:
                # Generate text using Vertex AI
                response = self.generative_model.generate_content(
                    prompt,
                    generation_config={
                        "max_output_tokens": 2048,
                        "temperature": 0.7,
                        "top_p": 0.8,
                        "top_k": 40
                    }
                )

                # Parse response and create samples
                samples = self._parse_generated_response(response.text, samples_per_prompt)
            except Exception as e:
                # Best effort: one failed prompt must not abort the whole run.
                logger.warning(f"Error generating data for prompt {i}: {e}")
                continue

            generated_data.extend(samples)

        # Create DataFrame
        df = pd.DataFrame(generated_data)

        # Add some manual examples to ensure quality
        manual_df = pd.DataFrame(self._create_manual_samples())

        df = pd.concat([df, manual_df], ignore_index=True)

        logger.info(f"Generated {len(df)} training samples")
        return df

    def _create_generation_prompts(self) -> List[str]:
        """Create one generation prompt per aspect/sentiment combination."""
        prompts = []

        # Create prompts for each aspect and sentiment combination
        for aspect, keywords in self.aspect_config.aspects.items():
            for sentiment in ['positive', 'negative', 'neutral']:
                prompt = f"""
Generate 20 realistic Japanese customer reviews about {aspect} ({', '.join(keywords[:3])}) 
with {sentiment} sentiment. Each review should be 20-100 characters long.

Format each review as:
Review: [Japanese text]
Sentiment: {sentiment}
Aspect: {aspect}

Example:
Review: このサービスの品質は素晴らしいです。
Sentiment: positive
Aspect: quality

Generate 20 similar reviews:
"""
                prompts.append(prompt)

        return prompts

    def _parse_generated_response(self, response_text: str, max_samples: int) -> List[Dict]:
        """Parse generated response into structured data.

        Expects repeated ``Review:`` / ``Sentiment:`` / ``Aspect:`` line
        triples, with the ``Aspect:`` line closing a record.

        Args:
            response_text: raw model output (may be empty or None).
            max_samples: hard cap on returned rows; ``<= 0`` returns ``[]``.

        Returns:
            List of row dicts with keys review_text, sentiment (0/1/2),
            aspect, text_length and generated=True. Unknown sentiment labels
            are encoded as neutral (1); labels are matched case-insensitively.
        """
        samples: List[Dict] = []
        if max_samples <= 0 or not response_text:
            return samples

        review: Optional[str] = None
        sentiment: Optional[str] = None

        for raw_line in response_text.split('\n'):
            line = raw_line.strip()

            if line.startswith('Review:'):
                # Slice off only the leading label so a literal "Review:"
                # inside the review text itself is preserved.
                review = line[len('Review:'):].strip()
                # A new record starts: never reuse the previous record's label.
                sentiment = None
            elif line.startswith('Sentiment:'):
                sentiment = line[len('Sentiment:'):].strip().lower()
            elif line.startswith('Aspect:'):
                aspect = line[len('Aspect:'):].strip()

                # Only complete records (all three parts non-empty) are kept.
                if review and sentiment and aspect:
                    samples.append({
                        'review_text': review,
                        'sentiment': self._SENTIMENT_MAP.get(sentiment, 1),
                        'aspect': aspect,
                        'text_length': len(review),
                        'generated': True
                    })
                    if len(samples) >= max_samples:
                        break

                # The Aspect line ends the record whether or not it was valid.
                review = None
                sentiment = None

        return samples

    def _create_manual_samples(self) -> List[Dict]:
        """Create high-quality manual samples (flagged ``generated=False``)."""
        manual_samples = [
            # Quality - Positive
            {'review_text': 'この商品の品質は期待以上でした。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '高品質な材料を使用していて満足です。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '作りがしっかりしていて良い商品です。', 'sentiment': 2, 'aspect': 'quality'},

            # Quality - Negative
            {'review_text': '品質が悪くてがっかりしました。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': '安っぽい材料で作られている感じがします。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': 'クオリティが低すぎて使い物になりません。', 'sentiment': 0, 'aspect': 'quality'},

            # Service - Positive
            {'review_text': 'スタッフの対応が素晴らしかったです。', 'sentiment': 2, 'aspect': 'service'},
            {'review_text': '親切で丁寧な接客に感謝します。', 'sentiment': 2, 'aspect': 'service'},
            {'review_text': 'サービスが良くて気持ちよく利用できました。', 'sentiment': 2, 'aspect': 'service'},

            # Service - Negative
            {'review_text': '店員の態度が悪くて不快でした。', 'sentiment': 0, 'aspect': 'service'},
            {'review_text': 'サービスの質が低くて残念です。', 'sentiment': 0, 'aspect': 'service'},
            {'review_text': '接客が雑で二度と来たくありません。', 'sentiment': 0, 'aspect': 'service'},

            # Price - Positive
            {'review_text': '価格が安くてお得感があります。', 'sentiment': 2, 'aspect': 'price'},
            {'review_text': 'コストパフォーマンスが良い商品です。', 'sentiment': 2, 'aspect': 'price'},
            {'review_text': '適正価格で満足しています。', 'sentiment': 2, 'aspect': 'price'},

            # Price - Negative
            {'review_text': '値段が高すぎて手が出ません。', 'sentiment': 0, 'aspect': 'price'},
            {'review_text': '価格設定が不適切だと思います。', 'sentiment': 0, 'aspect': 'price'},
            {'review_text': 'コストが高くて続けられません。', 'sentiment': 0, 'aspect': 'price'},

            # Neutral samples
            {'review_text': '普通の商品だと思います。', 'sentiment': 1, 'aspect': 'quality'},
            {'review_text': '特に良くも悪くもありません。', 'sentiment': 1, 'aspect': 'service'},
            {'review_text': '標準的な価格帯の商品です。', 'sentiment': 1, 'aspect': 'price'},
        ]

        # Add text_length and generated flag
        for sample in manual_samples:
            sample['text_length'] = len(sample['review_text'])
            sample['generated'] = False

        return manual_samples

class EnhancedBusinessInsightExtractor:
    """Derive per-aspect sentiment metrics and print business-facing reports."""

    def __init__(self, aspect_config: AspectConfig = None, label_map: Dict[int, str] = None):
        """Keep the aspect table and label names, substituting defaults when falsy."""
        if not aspect_config:
            aspect_config = AspectConfig()
        if not label_map:
            label_map = {0: "negative", 1: "neutral", 2: "positive"}
        self.aspect_config = aspect_config
        self.label_map = label_map
        self.colors = {
            'negative': '#FF6B6B',
            'neutral': '#FFD93D', 
            'positive': '#6BCF7F'
        }

    def calculate_aspect_metrics(self, df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
        """Compute counts, percentage rates and a net score for every aspect.

        Aspects without an ``aspect_<name>`` column, or with no flagged rows,
        are left out of the result.
        """
        metrics = {}

        for aspect in self.aspect_config.aspects:
            flag_column = f'aspect_{aspect}'
            if flag_column not in df.columns:
                continue

            flagged = df[df[flag_column] == 1]
            total_mentions = len(flagged)
            if total_mentions == 0:
                continue

            counts = flagged['sentiment'].value_counts()
            negative, neutral, positive = (counts.get(code, 0) for code in (0, 1, 2))

            metrics[aspect] = {
                'total_mentions': total_mentions,
                'negative_count': negative,
                'neutral_count': neutral,
                'positive_count': positive,
                'negative_rate': negative / total_mentions * 100,
                'neutral_rate': neutral / total_mentions * 100,
                'positive_rate': positive / total_mentions * 100,
                'sentiment_score': (positive - negative) / total_mentions,
            }

        return metrics

    def generate_business_recommendations(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Print the aspect summary and up to five recommendations.

        Returns a dict holding the metrics, the full recommendation list and
        the aspects ordered by ``negative_rate * log(total_mentions + 1)``.
        """
        metrics = self.calculate_aspect_metrics(df)

        heavy_rule = "=" * 50
        light_rule = "-" * 40

        print("\n" + heavy_rule)
        print("BUSINESS INSIGHTS & RECOMMENDATIONS")
        print(heavy_rule)

        # Rank aspects: most negative, most mentioned first
        priority_aspects = sorted(
            (
                (name, stats['negative_rate'] * np.log(stats['total_mentions'] + 1), stats)
                for name, stats in metrics.items()
            ),
            key=lambda entry: entry[1],
            reverse=True,
        )

        recommendations = []

        print("\n📊 ASPECT PERFORMANCE SUMMARY")
        print(light_rule)
        for name, _score, stats in priority_aspects:
            negative_rate = stats['negative_rate']
            positive_rate = stats['positive_rate']

            print(f"• {name.upper()}: {stats['total_mentions']} mentions")
            print(f"  ├─ Negative: {negative_rate:.1f}% ({stats['negative_count']} reviews)")
            print(f"  ├─ Neutral:  {stats['neutral_rate']:.1f}% ({stats['neutral_count']} reviews)")
            print(f"  └─ Positive: {positive_rate:.1f}% ({stats['positive_count']} reviews)")

            # At most one recommendation per aspect, most severe rule first
            if negative_rate > 30:
                recommendations.append(f"🔴 URGENT: Address {name} issues - {negative_rate:.1f}% negative feedback")
            elif negative_rate > 20:
                recommendations.append(f"🟡 ATTENTION: Monitor {name} - {negative_rate:.1f}% negative feedback")
            elif positive_rate > 70:
                recommendations.append(f"🟢 STRENGTH: Leverage {name} success - {positive_rate:.1f}% positive feedback")

        print("\n🎯 ACTIONABLE RECOMMENDATIONS")
        print(light_rule)
        for position, text in enumerate(recommendations[:5], 1):
            print(f"{position}. {text}")

        return {
            "metrics": metrics,
            "recommendations": recommendations,
            "priority_aspects": priority_aspects,
        }

    def executive_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Print headline numbers plus a text bar chart and return them as a dict."""
        heavy_rule = "=" * 50
        print("\n" + heavy_rule)
        print("EXECUTIVE SUMMARY")
        print(heavy_rule)

        total_reviews = len(df)
        # Percentage share of each sentiment code
        sentiment_dist = df['sentiment'].value_counts(normalize=True) * 100

        positive_share = sentiment_dist.get(2, 0)
        negative_share = sentiment_dist.get(0, 0)
        overall_sentiment_score = (positive_share - negative_share) / 100

        avg_text_length = df['text_length'].mean()

        print(f"📈 Total Reviews Analyzed: {total_reviews:,}")
        print(f"📊 Overall Sentiment Score: {overall_sentiment_score:.3f} (-1 to +1)")
        print(f"📝 Average Review Length: {avg_text_length:.0f} characters")

        print("\n🎭 Sentiment Distribution:")
        for code in (0, 1, 2):
            name = self.label_map[code]
            share = sentiment_dist.get(code, 0)
            filled = int(share / 2)  # one bar cell per two percentage points
            bar = "█" * filled + "░" * (50 - filled)
            print(f"  {name.capitalize():>8}: {share:5.1f}% |{bar}|")

        return {
            "total_reviews": total_reviews,
            "sentiment_distribution": sentiment_dist.to_dict(),
            "overall_sentiment_score": overall_sentiment_score,
            "avg_text_length": avg_text_length,
        }

class VertexAIJapaneseABSAPipeline:
    """Vertex AI Japanese ABSA Pipeline.

    Chains data generation, aspect/embedding feature extraction,
    Optuna-tuned XGBoost training, evaluation plots, console business
    insights and a pickle export of the trained artefacts to GCS.
    """

    # Sentiment class ids and their display names, in matching order.
    _CLASS_IDS = [0, 1, 2]
    _CLASS_NAMES = ['negative', 'neutral', 'positive']

    def __init__(self, vertex_config: VertexAIConfig, model_config: ModelConfig, aspect_config: AspectConfig):
        """Store the configs, build the helper components and init Vertex AI.

        Raises:
            Exception: initialisation errors of the data generator or of
                ``aiplatform.init`` are propagated.
        """
        self.vertex_config = vertex_config
        self.model_config = model_config
        self.aspect_config = aspect_config

        # Initialize components
        self.data_generator = VertexAIDataGenerator(vertex_config, aspect_config)
        self.embedding_extractor = VertexAIEmbeddingExtractor(vertex_config, model_config)
        self.insight_extractor = EnhancedBusinessInsightExtractor(aspect_config)

        # Model components
        self.model = None
        self.scaler = None
        self.feature_columns = None
        self.endpoint = None

        # Initialize Vertex AI
        self._initialize_vertex_ai()

    def _initialize_vertex_ai(self):
        """Initialise the aiplatform SDK; log and re-raise on failure."""
        try:
            aiplatform.init(
                project=self.vertex_config.project_id,
                location=self.vertex_config.location,
                staging_bucket=self.vertex_config.staging_bucket
            )
            logger.info("Vertex AI initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Vertex AI: {e}")
            raise

    def _base_xgb_params(self) -> Dict[str, Any]:
        """Return the XGBoost settings shared by tuning, CV and final training."""
        # NOTE(review): "gpu_hist" and "use_label_encoder" are reported as
        # deprecated by recent XGBoost releases - confirm against the
        # installed version before upgrading.
        return {
            "objective": "multi:softprob",
            "num_class": 3,
            "use_label_encoder": False,
            "tree_method": "gpu_hist" if self.vertex_config.use_gpu else "auto",
            "random_state": self.model_config.random_state,
        }

    def generate_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """Generate reviews, drop unusable rows and add simple text features.

        Rows are kept when ``review_text`` is present and its length is in
        ``(5, model_config.max_text_length]`` characters.

        Returns:
            A new DataFrame with ``word_count``, ``exclamation_count`` and
            ``question_count`` columns added.
        """
        logger.info("Generating training data...")
        df = self.data_generator.generate_training_data(num_samples)
        df = df.dropna(subset=['review_text'])
        lengths = df['review_text'].str.len()
        keep = (lengths > 5) & (lengths <= self.model_config.max_text_length)
        # .copy() so the column assignments below do not write into a slice.
        df = df[keep].copy()
        df['word_count'] = df['review_text'].str.split().str.len()
        # Series.str.count takes a regular expression: a bare '?' is an
        # invalid pattern and raises re.error. Character classes make both
        # marks literal and also cover the full-width forms used in Japanese.
        df['exclamation_count'] = df['review_text'].str.count(r'[!！]')
        df['question_count'] = df['review_text'].str.count(r'[?？]')
        logger.info(f"Generated {len(df)} training samples")
        return df

    def extract_features(self, df: pd.DataFrame, fit_scaler: bool = False) -> pd.DataFrame:
        """Add aspect keyword flags and (optionally scaled) embedding columns.

        Args:
            df: frame with a ``review_text`` column; it is not modified.
            fit_scaler: fit a new StandardScaler on these embeddings; when
                False an already fitted ``self.scaler`` is applied if present.

        Returns:
            A new DataFrame with ``aspect_<name>`` and ``emb_<i>`` columns.

        Raises:
            ValueError: if the embedding backend does not return one vector
                per row (for example for an empty frame).
        """
        logger.info(f"Extracting features for {len(df)} samples...")
        df = df.copy()
        texts = df['review_text']
        for aspect, keywords in self.aspect_config.aspects.items():
            if keywords:
                pattern = '|'.join(re.escape(kw) for kw in keywords)
                df[f'aspect_{aspect}'] = texts.str.contains(pattern, na=False, regex=True).astype(int)
            else:
                # An empty alternation would match every row.
                df[f'aspect_{aspect}'] = 0
        logger.info("Generating embeddings...")
        embeddings = np.asarray(self.embedding_extractor.get_embeddings(texts.tolist()))
        if embeddings.ndim != 2 or embeddings.shape[0] != len(df):
            raise ValueError(
                f"Expected a 2-D embedding matrix with {len(df)} rows, got shape {embeddings.shape}"
            )
        emb_cols = [f'emb_{i}' for i in range(embeddings.shape[1])]
        if fit_scaler:
            self.scaler = StandardScaler()
            embeddings = self.scaler.fit_transform(embeddings)
        elif self.scaler is not None:
            embeddings = self.scaler.transform(embeddings)
        # Attach all embedding columns in one concat instead of hundreds of
        # single-column inserts; drop stale emb_* columns from a previous run.
        stale = [col for col in df.columns if str(col).startswith('emb_')]
        emb_df = pd.DataFrame(embeddings, columns=emb_cols, index=df.index)
        df = pd.concat([df.drop(columns=stale), emb_df], axis=1)
        logger.info("Feature extraction completed")
        return df

    def prepare_features(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, List[str]]:
        """Return ``(X, y, feature_columns)`` built from the feature frame.

        Features are every ``aspect_*`` / ``emb_*`` column plus the four
        text statistics; the target is the integer ``sentiment`` column.
        """
        feature_cols = [
            col for col in df.columns
            if col.startswith(('aspect_', 'emb_')) or col in ['text_length', 'word_count', 'exclamation_count', 'question_count']
        ]
        X = df[feature_cols].values
        # np.bincount and XGBoost both need integer class ids.
        y = df['sentiment'].astype(int).values
        logger.info(f"Feature matrix shape: {X.shape}")
        logger.info(f"Target distribution: {np.bincount(y)}")
        return X, y, feature_cols

    def optimize_hyperparameters(self, X: np.ndarray, y: np.ndarray) -> Tuple[Dict[str, Any], optuna.Study]:
        """Tune XGBoost with Optuna, maximising mean weighted F1 over stratified folds.

        Returns:
            ``(best_params, study)``.
        """
        logger.info("Starting hyperparameter optimization...")

        def objective(trial):
            params = {
                **self._base_xgb_params(),
                "eval_metric": "mlogloss",
                "verbosity": 0,
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
                "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                "subsample": trial.suggest_float("subsample", 0.6, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
                "gamma": trial.suggest_float("gamma", 0, 5),
                "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
                "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
                "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            }
            skf = StratifiedKFold(n_splits=self.model_config.n_splits, shuffle=True,
                                  random_state=self.model_config.random_state)
            scores = []
            for train_idx, val_idx in skf.split(X, y):
                model = xgb.XGBClassifier(**params)
                model.fit(X[train_idx], y[train_idx])
                y_pred = model.predict(X[val_idx])
                scores.append(f1_score(y[val_idx], y_pred, average='weighted'))
            return np.mean(scores)

        # Seed the sampler so the search is reproducible like every other
        # randomised step in this class.
        sampler = optuna.samplers.TPESampler(seed=self.model_config.random_state)
        study = optuna.create_study(direction="maximize", sampler=sampler)
        study.optimize(objective, n_trials=self.model_config.optuna_trials)
        logger.info(f"Best score: {study.best_value:.4f}")
        logger.info(f"Best params: {study.best_params}")
        return study.best_params, study

    def train_model(self, X: np.ndarray, y: np.ndarray, params: Dict[str, Any]) -> xgb.XGBClassifier:
        """Fit the final classifier; ``params`` override the shared base settings."""
        logger.info("Training XGBoost model...")
        model_params = {**self._base_xgb_params(), **params}
        self.model = xgb.XGBClassifier(**model_params)
        self.model.fit(X, y)
        logger.info("Model training completed")
        return self.model

    def save_model_to_gcs(self, model_path: str = None) -> str:
        """Pickle model, scaler, feature list and configs and upload them to GCS.

        Args:
            model_path: file name of the artefact; defaults to a timestamped
                ``absa_model_<YYYYmmdd_HHMMSS>.pkl``.

        Returns:
            The ``gs://`` URI of the uploaded object.

        Raises:
            RuntimeError: if no model has been trained yet.
        """
        if self.model is None:
            raise RuntimeError("No trained model to save; call train_model() first")
        if model_path is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_path = f"absa_model_{timestamp}.pkl"
        model_artifact = {
            'model': self.model,
            'scaler': self.scaler,
            'feature_columns': self.feature_columns,
            'vertex_config': self.vertex_config,
            'model_config': self.model_config,
            'aspect_config': self.aspect_config
        }
        local_path = f"/tmp/{model_path}"
        with open(local_path, 'wb') as f:
            pickle.dump(model_artifact, f)
        # Split "gs://bucket/optional/prefix" into bucket and object prefix;
        # a plain "gs://bucket" yields the same object name as before.
        bucket_path = self.vertex_config.staging_bucket
        if bucket_path.startswith('gs://'):
            bucket_path = bucket_path[len('gs://'):]
        bucket_name, _, prefix = bucket_path.strip('/').partition('/')
        blob_name = '/'.join(part for part in (prefix, 'absa_models', model_path) if part)
        storage_client = storage.Client(project=self.vertex_config.project_id)
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.upload_from_filename(local_path)
        model_uri = f"gs://{bucket_name}/{blob_name}"
        logger.info(f"Model saved to GCS: {model_uri}")
        return model_uri

    def evaluate(self, X_test: np.ndarray, y_test: np.ndarray, feature_columns: List[str]):
        """Score the trained model and write diagnostic plots under /tmp.

        Returns:
            Dict with the classification report (dict form), the 3x3
            confusion matrix, feature importances sorted descending and the
            plot paths; ``plots['shap_summary']`` is None when SHAP is not
            installed or fails.
        """
        logger.info("Evaluating model...")
        y_pred = self.model.predict(X_test)
        cr = classification_report(y_test, y_pred, digits=3, output_dict=True)
        logger.info("Classification report:\n" + classification_report(y_test, y_pred, digits=3))
        # Fix the label set so the matrix is always 3x3 and matches the
        # display labels even if a class is absent from this test split.
        cm = confusion_matrix(y_test, y_pred, labels=self._CLASS_IDS)
        disp = ConfusionMatrixDisplay(cm, display_labels=self._CLASS_NAMES)
        disp.plot(cmap="Blues")
        plt.title("Confusion Matrix")
        plt.tight_layout()
        plt.savefig("/tmp/absa_cm.png")
        plt.close()
        logger.info("Confusion matrix saved as /tmp/absa_cm.png")
        importances = self.model.feature_importances_
        fi = pd.Series(importances, index=feature_columns).sort_values(ascending=False)
        plt.figure(figsize=(10,6))
        sns.barplot(x=fi.values[:20], y=fi.index[:20], palette='husl')
        plt.title("Top 20 Feature Importances")
        plt.tight_layout()
        plt.savefig("/tmp/absa_feature_importance.png")
        plt.close()
        logger.info("Feature importance plot saved as /tmp/absa_feature_importance.png")
        shap_path = None
        try:
            # Optional dependency: evaluation still succeeds without SHAP.
            import shap
            explainer = shap.TreeExplainer(self.model)
            shap_values = explainer.shap_values(X_test)
            shap.summary_plot(shap_values, X_test, feature_names=feature_columns, show=False)
            plt.tight_layout()
            plt.savefig("/tmp/absa_shap_summary.png")
            plt.close()
            shap_path = "/tmp/absa_shap_summary.png"
            logger.info("SHAP summary plot saved as /tmp/absa_shap_summary.png")
        except Exception as e:
            logger.warning(f"SHAP failed: {e}")
        return {
            "classification_report": cr,
            "confusion_matrix": cm,
            "feature_importance": fi,
            "plots": {
                "cm": "/tmp/absa_cm.png",
                "feature_importance": "/tmp/absa_feature_importance.png",
                "shap_summary": shap_path
            }
        }

    def cross_validate(self, X, y, feature_columns, n_splits=5):
        """Run stratified K-fold CV with default XGBoost settings and print per-fold scores.

        ``feature_columns`` is accepted for signature compatibility and is
        not used.

        Returns:
            Dict with ``mean_accuracy``, ``mean_f1`` and ``fold_scores``
            (list of ``(accuracy, weighted_f1)`` tuples).
        """
        print("\n==========================")
        print(f"Starting {n_splits}-Fold Cross-Validation")
        print("==========================")
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=self.model_config.random_state)
        all_scores = []
        for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            model = xgb.XGBClassifier(**self._base_xgb_params())
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            f1 = f1_score(y_val, y_pred, average='weighted')
            acc = accuracy_score(y_val, y_pred)
            report = classification_report(y_val, y_pred, digits=3)
            print(f"\n--- Fold {fold} ---")
            print(f"Weighted F1: {f1:.4f}  |  Accuracy: {acc:.4f}")
            print(report)
            all_scores.append((acc, f1))
        mean_acc = np.mean([x[0] for x in all_scores])
        mean_f1 = np.mean([x[1] for x in all_scores])
        print("\n==========================")
        print(f"CV Mean Accuracy: {mean_acc:.4f}")
        print(f"CV Mean Weighted F1: {mean_f1:.4f}")
        print("==========================\n")
        return {"mean_accuracy": mean_acc, "mean_f1": mean_f1, "fold_scores": all_scores}

    def pipeline(self, num_samples: int = 1000):
        """Run the full workflow and return model URI, evaluation results and Optuna study."""
        # 1. Data generation and feature extraction
        # NOTE: the scaler is fitted on all rows before the split below, so
        # test rows influence the embedding scaling.
        df = self.generate_training_data(num_samples)
        df = self.extract_features(df, fit_scaler=True)
        X, y, feature_columns = self.prepare_features(df)
        self.feature_columns = feature_columns

        # 2. Cross Validation (diagnostic only, default hyperparameters)
        self.cross_validate(X, y, feature_columns, n_splits=self.model_config.n_splits)

        # 3. Train/test split. Row positions are split alongside X and y so
        # the held-out rows can be recovered from df afterwards; y is a plain
        # numpy array and carries no index of its own.
        row_positions = np.arange(len(y))
        X_train, X_test, y_train, y_test, _, test_rows = train_test_split(
            X, y, row_positions,
            test_size=self.model_config.test_size, stratify=y, random_state=self.model_config.random_state
        )

        # 4. Hyperparameter tuning
        best_params, study = self.optimize_hyperparameters(X_train, y_train)

        # 5. Final training
        self.train_model(X_train, y_train, best_params)

        # 6. Evaluation
        results = self.evaluate(X_test, y_test, feature_columns)

        # 7. Business Insights on exactly the held-out rows
        df_test = df.iloc[test_rows].copy()
        df_test['predicted_sentiment'] = self.model.predict(X_test)
        self.insight_extractor.executive_summary(df_test)
        self.insight_extractor.generate_business_recommendations(df_test)

        # 8. Save model
        model_uri = self.save_model_to_gcs()

        return {
            "model_uri": model_uri,
            "results": results,
            "optuna_study": study
        }

# --------------- Main Entrypoint ---------------

if __name__ == "__main__":
    # Script entry point: build the three config objects, run the whole
    # pipeline once and print where the model artefact was stored.
    # ==== Configure (customize these before running!) ====
    # NOTE(review): staging_bucket below is still a placeholder value;
    # presumably the final GCS upload fails until it is replaced - confirm.
    vertex_config = VertexAIConfig(
        project_id="able-balm-454718-n8",  # YOUR GCP PROJECT
        location="us-central1",
        staging_bucket="gs://your-staging-bucket",  # YOUR BUCKET
        use_gpu=True
    )
    # max_text_length=256 overrides the dataclass default of 512; the other
    # values restate the defaults explicitly.
    model_config = ModelConfig(
        n_splits=5,
        embedding_model_name="intfloat/multilingual-e5-base",
        optuna_trials=30,
        test_size=0.2,
        random_state=42,
        batch_size=64,
        max_text_length=256,
        vertex_embedding_model="textembedding-gecko-multilingual@001"
    )
    # Default aspect/keyword table.
    aspect_config = AspectConfig()

    pipeline = VertexAIJapaneseABSAPipeline(vertex_config, model_config, aspect_config)
    result = pipeline.pipeline(num_samples=1200)

    # pipeline() returns a dict; "model_uri" is the gs:// location of the pickle.
    print("\n\n=========================")
    print("Pipeline finished!")
    print(f"Model saved at: {result['model_uri']}")
    print("=========================")


In [None]:
# Enhanced Japanese Aspect-Based Sentiment Analysis Pipeline with Vertex AI Endpoints
# Requirements: google-cloud-aiplatform, xgboost, optuna, sentence-transformers, scikit-learn, matplotlib, seaborn, pandas, numpy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import json
import re
import os
import gc
import warnings
from typing import Dict, List, Tuple, Optional, Any, Union
from dataclasses import dataclass
from pathlib import Path
import logging
from datetime import datetime
import base64
import pickle

# Vertex AI imports
from google.cloud import aiplatform
from google.cloud.aiplatform.gapic import EndpointServiceClient
from google.cloud import storage
import vertexai
from vertexai.language_models import TextEmbeddingModel
from vertexai.preview.generative_models import GenerativeModel

# ML imports
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay,
    precision_recall_fscore_support, accuracy_score, f1_score
)
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sentence_transformers import SentenceTransformer

# Plotting and logging
# Module-wide setup: INFO-level logging on the root logger and a module logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Silences every warning category for the whole process, including
# deprecation warnings raised by the ML libraries used below.
warnings.filterwarnings("ignore")
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release — confirm against the installed version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

@dataclass
class VertexAIConfig:
    """GCP project and Vertex AI deployment settings consumed by the pipeline."""
    # Project passed to vertexai.init / aiplatform.init, storage.Client and Model.upload.
    project_id: str = "your-project-id"
    # Region passed to the same initialisation and upload calls.
    location: str = "us-central1"
    # "gs://..." bucket used for aiplatform staging and as the model upload target.
    staging_bucket: str = "gs://your-bucket-name"
    # Prefix of the display name given to the uploaded model (a timestamp is appended).
    model_display_name: str = "japanese-absa-model"
    # Not referenced by the pipeline code in this cell.
    endpoint_display_name: str = "japanese-absa-endpoint"
    # Optional; not referenced by the pipeline code in this cell.
    service_account: Optional[str] = None
    # Machine type handed to Model.deploy.
    machine_type: str = "n1-standard-4"
    # Accelerator settings; only forwarded to Model.deploy when use_gpu is True.
    accelerator_type: str = "NVIDIA_TESLA_T4"
    accelerator_count: int = 1
    # Also selects the XGBoost tree_method ("gpu_hist" when True, "auto" otherwise).
    use_gpu: bool = True

@dataclass
class ModelConfig:
    """Training, tuning and embedding settings for the ABSA classifier."""
    # Number of StratifiedKFold folds used in cross-validation and in tuning.
    n_splits: int = 5
    # SentenceTransformer model loaded when the Vertex AI embedding model
    # cannot be initialised.
    embedding_model_name: str = "intfloat/multilingual-e5-base"
    # Number of Optuna trials run during hyperparameter optimisation.
    optuna_trials: int = 30
    # Fraction of samples held out by train_test_split.
    test_size: float = 0.2
    # Seed shared by the fold splitter, the train/test split and XGBoost.
    random_state: int = 42
    # Batch size passed to SentenceTransformer.encode (fallback path only).
    batch_size: int = 64
    # Reviews longer than this many characters are dropped from the training data.
    max_text_length: int = 512
    # Name passed to TextEmbeddingModel.from_pretrained.
    vertex_embedding_model: str = "textembedding-gecko-multilingual@001"

@dataclass
class AspectConfig:
    """Aspect categories and the Japanese keywords that signal each of them.

    ``aspects`` maps an aspect name to its keyword list. When it is left as
    ``None`` the built-in eight-aspect vocabulary is installed after init.
    """
    aspects: Dict[str, List[str]] = None

    def __post_init__(self):
        # A caller-supplied mapping (even an empty one) is kept untouched.
        if self.aspects is not None:
            return
        default_keywords = [
            ('quality', ['品質', '質', '良い', '悪い', '高品質', '低品質', 'クオリティ', '品質管理']),
            ('service', ['サービス', '対応', '接客', '親切', '丁寧', '態度', 'スタッフ', '店員']),
            ('price', ['価格', '値段', '料金', '安い', '高い', 'コスト', '費用', '価格設定']),
            ('convenience', ['便利', '不便', '簡単', '難しい', '使いやすい', '使いにくい', 'アクセス']),
            ('speed', ['速い', '遅い', '早い', 'スピード', '迅速', '時間', '待ち時間']),
            ('atmosphere', ['雰囲気', '環境', '空間', '居心地', '快適', '不快', '清潔']),
            ('taste', ['味', '美味しい', 'まずい', '美味', '風味', '食感', '新鮮']),
            ('design', ['デザイン', '見た目', '外観', 'おしゃれ', 'かっこいい', '美しい']),
        ]
        # Built per instance so instances never share the default lists.
        self.aspects = dict(default_keywords)

class VertexAIEmbeddingExtractor:
    """Produces text embeddings, preferring Vertex AI over a local model.

    On construction the Vertex AI ``TextEmbeddingModel`` named in the model
    config is loaded; if that fails for any reason a ``SentenceTransformer``
    is loaded instead and used for all later calls.
    """

    def __init__(self, config: VertexAIConfig, model_config: ModelConfig):
        self.config = config
        self.model_config = model_config
        self.embedding_model = None
        self._initialize_vertex_ai()

    def _initialize_vertex_ai(self):
        """Load the Vertex AI embedding model, or the local fallback on failure."""
        try:
            vertexai.init(project=self.config.project_id, location=self.config.location)
            vertex_model = TextEmbeddingModel.from_pretrained(
                self.model_config.vertex_embedding_model
            )
            self.embedding_model = vertex_model
            logger.info(f"Initialized Vertex AI embedding model: {self.model_config.vertex_embedding_model}")
        except Exception as e:
            # Any failure (auth, quota, unknown model, ...) switches to the local model.
            logger.warning(f"Failed to initialize Vertex AI embeddings: {e}")
            logger.info("Falling back to SentenceTransformer")
            self.embedding_model = SentenceTransformer(self.model_config.embedding_model_name)

    def get_embeddings(self, texts: list) -> np.ndarray:
        """Return one embedding row per entry of ``texts``.

        Errors from either backend are logged and re-raised.
        """
        try:
            if not isinstance(self.embedding_model, TextEmbeddingModel):
                # Local fallback: encode everything in one call, L2-normalised.
                return self.embedding_model.encode(
                    texts,
                    show_progress_bar=True,
                    batch_size=self.model_config.batch_size,
                    normalize_embeddings=True
                )
            # Vertex AI path: send the texts in requests of five at a time.
            chunk_size = 5
            vectors = []
            for start in range(0, len(texts), chunk_size):
                response = self.embedding_model.get_embeddings(texts[start:start + chunk_size])
                vectors.extend(item.values for item in response)
            return np.array(vectors)
        except Exception as e:
            logger.error(f"Error getting embeddings: {e}")
            raise

class VertexAIDataGenerator:
    """Creates a synthetic Japanese review dataset with a Vertex AI generative model.

    One prompt is built per (aspect, sentiment) pair; each model response is
    parsed into review records, and a small set of hand-written reviews is
    appended to whatever the model produced.
    """

    # Maps the sentiment word emitted by the model to the integer class label.
    _SENTIMENT_LABELS = {'positive': 2, 'neutral': 1, 'negative': 0}

    def __init__(self, config: "VertexAIConfig", aspect_config: "AspectConfig"):
        self.config = config
        self.aspect_config = aspect_config
        self.generative_model = None
        self._initialize_model()

    def _initialize_model(self):
        """Initialise Vertex AI and load the generative model; re-raise on failure."""
        try:
            vertexai.init(project=self.config.project_id, location=self.config.location)
            # NOTE(review): the model name is hard-coded — confirm it is still offered.
            self.generative_model = GenerativeModel("gemini-pro")
            logger.info("Initialized Vertex AI generative model")
        except Exception as e:
            logger.error(f"Failed to initialize generative model: {e}")
            raise

    def generate_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """Generate reviews for every prompt and return them as a DataFrame.

        Args:
            num_samples: Target number of generated reviews; it is divided
                evenly across the prompts to cap each response.

        Returns:
            DataFrame with columns ``review_text``, ``sentiment``, ``aspect``,
            ``text_length`` and ``generated``. The hand-written samples are
            always included, so the frame is non-empty even if every model
            call fails.
        """
        logger.info(f"Generating {num_samples} training samples using Vertex AI...")
        prompts = self._create_generation_prompts()
        generated_data = []
        # An aspect config without aspects yields no prompts; avoid dividing by zero.
        samples_per_prompt = num_samples // len(prompts) if prompts else 0
        for i, prompt in enumerate(prompts, 1):
            logger.info(f"Generating data for prompt {i}/{len(prompts)}")
            try:
                response = self.generative_model.generate_content(
                    prompt,
                    generation_config={
                        "max_output_tokens": 2048,
                        "temperature": 0.7,
                        "top_p": 0.8,
                        "top_k": 40
                    }
                )
                samples = self._parse_generated_response(response.text, samples_per_prompt)
                generated_data.extend(samples)
            except Exception as e:
                # Best effort: a failed prompt is skipped, the rest still run.
                logger.warning(f"Error generating data for prompt {i}: {e}")
                continue
        df = pd.DataFrame(generated_data)
        manual_samples = self._create_manual_samples()
        manual_df = pd.DataFrame(manual_samples)
        df = pd.concat([df, manual_df], ignore_index=True)
        logger.info(f"Generated {len(df)} training samples")
        return df

    def _create_generation_prompts(self) -> list:
        """Return one prompt per (aspect, sentiment) combination."""
        prompts = []
        for aspect, keywords in self.aspect_config.aspects.items():
            for sentiment in ['positive', 'negative', 'neutral']:
                prompt = f"""
Generate 20 realistic Japanese customer reviews about {aspect} ({', '.join(keywords[:3])}) 
with {sentiment} sentiment. Each review should be 20-100 characters long.
Format each review as:
Review: [Japanese text]
Sentiment: {sentiment}
Aspect: {aspect}
Example:
Review: このサービスの品質は素晴らしいです。
Sentiment: positive
Aspect: quality
Generate 20 similar reviews:
"""
                prompts.append(prompt)
        return prompts

    def _parse_generated_response(self, response_text: str, max_samples: int) -> list:
        """Parse ``Review:/Sentiment:/Aspect:`` triples out of a model response.

        A record is emitted when an ``Aspect:`` line is reached and all three
        fields are non-empty. Unknown sentiment words map to neutral (1).
        Parsing stops once ``max_samples`` records have been collected.
        """
        samples = []
        current_review = None
        current_sentiment = None
        current_aspect = None
        for raw_line in response_text.split('\n'):
            line = raw_line.strip()
            # Slice off only the leading label; str.replace would also delete
            # any later occurrence of the label inside the review text.
            if line.startswith('Review:'):
                current_review = line[len('Review:'):].strip()
            elif line.startswith('Sentiment:'):
                current_sentiment = line[len('Sentiment:'):].strip()
            elif line.startswith('Aspect:'):
                current_aspect = line[len('Aspect:'):].strip()
                if current_review and current_sentiment and current_aspect:
                    samples.append({
                        'review_text': current_review,
                        # Case-insensitive so "Positive" is not mislabelled as neutral.
                        'sentiment': self._SENTIMENT_LABELS.get(current_sentiment.lower(), 1),
                        'aspect': current_aspect,
                        'text_length': len(current_review),
                        'generated': True
                    })
                    current_review = None
                    current_sentiment = None
                    current_aspect = None
                    if len(samples) >= max_samples:
                        break
        return samples

    def _create_manual_samples(self) -> list:
        """Return the fixed hand-written reviews, tagged ``generated=False``."""
        manual_samples = [
            {'review_text': 'この商品の品質は期待以上でした。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '高品質な材料を使用していて満足です。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '作りがしっかりしていて良い商品です。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '品質が悪くてがっかりしました。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': '安っぽい材料で作られている感じがします。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': 'クオリティが低すぎて使い物になりません。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': '普通の商品だと思います。', 'sentiment': 1, 'aspect': 'quality'},
            {'review_text': '特に良くも悪くもありません。', 'sentiment': 1, 'aspect': 'service'},
            {'review_text': '標準的な価格帯の商品です。', 'sentiment': 1, 'aspect': 'price'},
        ]
        for sample in manual_samples:
            sample['text_length'] = len(sample['review_text'])
            sample['generated'] = False
        return manual_samples

class EnhancedBusinessInsightExtractor:
    """Prints and returns business-oriented summaries of a labelled review frame.

    The frame is expected to carry an integer ``sentiment`` column
    (0/1/2), a ``text_length`` column and one ``aspect_<name>`` 0/1 flag
    column per aspect of interest.
    """

    def __init__(self, aspect_config: AspectConfig = None, label_map: Dict[int, str] = None):
        # Falsy arguments fall back to the stock aspect vocabulary / label names.
        self.aspect_config = aspect_config or AspectConfig()
        self.label_map = label_map or {0: "negative", 1: "neutral", 2: "positive"}
        self.colors = dict(negative='#FF6B6B', neutral='#FFD93D', positive='#6BCF7F')

    def calculate_aspect_metrics(self, df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
        """Return per-aspect mention counts, sentiment rates (percent) and score.

        Aspects without a flag column, or with no flagged rows, are omitted.
        """
        metrics = {}
        for aspect in self.aspect_config.aspects:
            flag_col = f'aspect_{aspect}'
            if flag_col not in df.columns:
                continue
            mentioned = df[df[flag_col] == 1]
            total_mentions = len(mentioned)
            if total_mentions == 0:
                continue
            counts = mentioned['sentiment'].value_counts()
            negative = counts.get(0, 0)
            neutral = counts.get(1, 0)
            positive = counts.get(2, 0)
            metrics[aspect] = {
                'total_mentions': total_mentions,
                'negative_count': negative,
                'neutral_count': neutral,
                'positive_count': positive,
                'negative_rate': negative / total_mentions * 100,
                'neutral_rate': neutral / total_mentions * 100,
                'positive_rate': positive / total_mentions * 100,
                # Net sentiment in [-1, 1]: share of positives minus share of negatives.
                'sentiment_score': (positive - negative) / total_mentions
            }
        return metrics

    def generate_business_recommendations(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Print the aspect summary plus up to five recommendations.

        Returns a dict with the raw ``metrics``, the full ``recommendations``
        list and ``priority_aspects`` (aspect, priority score, metrics) sorted
        by descending priority.
        """
        metrics = self.calculate_aspect_metrics(df)
        heavy_rule = "=" * 50
        light_rule = "-" * 40
        print("\n" + heavy_rule)
        print("BUSINESS INSIGHTS & RECOMMENDATIONS")
        print(heavy_rule)
        # Priority grows with the negative rate and, logarithmically, with volume.
        scored = [
            (name, stats['negative_rate'] * np.log(stats['total_mentions'] + 1), stats)
            for name, stats in metrics.items()
        ]
        priority_aspects = sorted(scored, key=lambda entry: entry[1], reverse=True)
        print("\n📊 ASPECT PERFORMANCE SUMMARY")
        print(light_rule)
        recommendations = []
        for name, _, stats in priority_aspects:
            negative_rate = stats['negative_rate']
            positive_rate = stats['positive_rate']
            print(f"• {name.upper()}: {stats['total_mentions']} mentions")
            print(f"  ├─ Negative: {negative_rate:.1f}% ({stats['negative_count']} reviews)")
            print(f"  ├─ Neutral:  {stats['neutral_rate']:.1f}% ({stats['neutral_count']} reviews)")
            print(f"  └─ Positive: {positive_rate:.1f}% ({stats['positive_count']} reviews)")
            advice = None
            if negative_rate > 30:
                advice = f"🔴 URGENT: Address {name} issues - {negative_rate:.1f}% negative feedback"
            elif negative_rate > 20:
                advice = f"🟡 ATTENTION: Monitor {name} - {negative_rate:.1f}% negative feedback"
            elif positive_rate > 70:
                advice = f"🟢 STRENGTH: Leverage {name} success - {positive_rate:.1f}% positive feedback"
            if advice is not None:
                recommendations.append(advice)
        print("\n🎯 ACTIONABLE RECOMMENDATIONS")
        print(light_rule)
        # Only the first five are printed; the full list is still returned.
        for number, rec in enumerate(recommendations[:5], 1):
            print(f"{number}. {rec}")
        return {
            "metrics": metrics,
            "recommendations": recommendations,
            "priority_aspects": priority_aspects
        }

    def executive_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Print overall volume, sentiment score and distribution; return them."""
        heavy_rule = "=" * 50
        print("\n" + heavy_rule)
        print("EXECUTIVE SUMMARY")
        print(heavy_rule)
        total_reviews = len(df)
        # Percentages per sentiment class.
        sentiment_dist = df['sentiment'].value_counts(normalize=True) * 100
        positive_share = sentiment_dist.get(2, 0)
        negative_share = sentiment_dist.get(0, 0)
        overall_sentiment_score = (positive_share - negative_share) / 100
        avg_text_length = df['text_length'].mean()
        print(f"📈 Total Reviews Analyzed: {total_reviews:,}")
        print(f"📊 Overall Sentiment Score: {overall_sentiment_score:.3f} (-1 to +1)")
        print(f"📝 Average Review Length: {avg_text_length:.0f} characters")
        print("\n🎭 Sentiment Distribution:")
        for class_id in (0, 1, 2):
            label = self.label_map[class_id]
            pct = sentiment_dist.get(class_id, 0)
            # 50-cell text bar: one filled cell per two percentage points.
            filled = int(pct / 2)
            bar = "█" * filled + "░" * (50 - filled)
            print(f"  {label.capitalize():>8}: {pct:5.1f}% |{bar}|")
        return {
            "total_reviews": total_reviews,
            "sentiment_distribution": sentiment_dist.to_dict(),
            "overall_sentiment_score": overall_sentiment_score,
            "avg_text_length": avg_text_length
        }

class VertexAIJapaneseABSAPipeline:
    def __init__(self, vertex_config: VertexAIConfig, model_config: ModelConfig, aspect_config: AspectConfig):
        """Store the configs, build the collaborators and initialise Vertex AI.

        Constructing the data generator and the embedding extractor already
        contacts Vertex AI (each initialises its own model in its
        constructor), so creating a pipeline has network side effects and
        raises if the generative model or ``aiplatform.init`` fails.
        """
        self.vertex_config = vertex_config
        self.model_config = model_config
        self.aspect_config = aspect_config
        self.data_generator = VertexAIDataGenerator(vertex_config, aspect_config)
        self.embedding_extractor = VertexAIEmbeddingExtractor(vertex_config, model_config)
        self.insight_extractor = EnhancedBusinessInsightExtractor(aspect_config)
        # Populated later by train_model / extract_features / pipeline / deploy.
        self.model = None
        self.scaler = None
        self.feature_columns = None
        self.endpoint = None
        self._initialize_vertex_ai()

    def _initialize_vertex_ai(self):
        """Initialise the aiplatform SDK from the Vertex config; log and re-raise errors."""
        try:
            init_kwargs = {
                "project": self.vertex_config.project_id,
                "location": self.vertex_config.location,
                "staging_bucket": self.vertex_config.staging_bucket,
            }
            aiplatform.init(**init_kwargs)
            logger.info("Vertex AI initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Vertex AI: {e}")
            raise

    def generate_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """Generate reviews, filter them by length and add simple count features.

        Args:
            num_samples: Number of samples requested from the data generator.

        Returns:
            A DataFrame (an independent copy) restricted to reviews longer
            than 5 and at most ``model_config.max_text_length`` characters,
            with ``word_count``, ``exclamation_count`` and ``question_count``
            columns added. Only ASCII ``!`` and ``?`` are counted.
        """
        logger.info("Generating training data...")
        df = self.data_generator.generate_training_data(num_samples)
        df = df.dropna(subset=['review_text'])
        review_lengths = df['review_text'].str.len()
        keep = (review_lengths > 5) & (review_lengths <= self.model_config.max_text_length)
        # Copy so the feature columns are written to a real frame, not a slice view.
        df = df[keep].copy()
        df['word_count'] = df['review_text'].str.split().str.len()
        df['exclamation_count'] = df['review_text'].str.count('!')
        # Series.str.count takes a regular expression: a bare '?' is an invalid
        # pattern ("nothing to repeat"), so the literal must be escaped.
        df['question_count'] = df['review_text'].str.count(r'\?')
        logger.info(f"Generated {len(df)} training samples")
        return df

    def extract_features(self, df: pd.DataFrame, fit_scaler: bool = False) -> pd.DataFrame:
        """Add aspect flags and (optionally scaled) embedding columns to ``df``.

        The frame is modified in place and also returned. With
        ``fit_scaler=True`` a new StandardScaler is fitted on the embedding
        columns and stored on the pipeline; otherwise an already stored
        scaler, if any, is applied.
        """
        logger.info(f"Extracting features for {len(df)} samples...")
        # One 0/1 column per aspect: does the review contain any of its keywords?
        for aspect, keywords in self.aspect_config.aspects.items():
            keyword_regex = '|'.join(map(re.escape, keywords))
            hits = df['review_text'].str.contains(keyword_regex, na=False, regex=True)
            df[f'aspect_{aspect}'] = hits.astype(int)
        logger.info("Generating embeddings...")
        embeddings = self.embedding_extractor.get_embeddings(df['review_text'].tolist())
        emb_cols = [f'emb_{i}' for i in range(embeddings.shape[1])]
        for position, column in enumerate(emb_cols):
            df[column] = embeddings[:, position]
        if fit_scaler:
            self.scaler = StandardScaler()
            df[emb_cols] = self.scaler.fit_transform(df[emb_cols])
        elif self.scaler is not None:
            df[emb_cols] = self.scaler.transform(df[emb_cols])
        logger.info("Feature extraction completed")
        return df

    def prepare_features(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, list]:
        """Split ``df`` into the feature matrix, the label vector and the feature names.

        Features are every ``aspect_*`` / ``emb_*`` column plus the four
        scalar text statistics, in the frame's column order.
        """
        scalar_features = ['text_length', 'word_count', 'exclamation_count', 'question_count']
        feature_cols = []
        for col in df.columns:
            if col.startswith(('aspect_', 'emb_')) or col in scalar_features:
                feature_cols.append(col)
        X = df[feature_cols].values
        y = df['sentiment'].values
        logger.info(f"Feature matrix shape: {X.shape}")
        logger.info(f"Target distribution: {np.bincount(y)}")
        return X, y, feature_cols

    def optimize_hyperparameters(self, X: np.ndarray, y: np.ndarray) -> Tuple[dict, Any]:
        """Tune XGBoost hyperparameters with Optuna.

        Each trial is scored by the mean weighted F1 over a stratified
        k-fold split of ``(X, y)``; the study maximises that score.

        Returns:
            ``(best_params, study)`` — the best sampled parameters and the
            finished Optuna study.
        """
        logger.info("Starting hyperparameter optimization...")

        def objective(trial):
            # Search space sampled per trial.
            sampled = {
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
                "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                "subsample": trial.suggest_float("subsample", 0.6, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
                "gamma": trial.suggest_float("gamma", 0, 5),
                "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
                "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
                "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            }
            # Settings that stay fixed across trials.
            params = {
                "objective": "multi:softprob",
                "num_class": 3,
                "eval_metric": "mlogloss",
                **sampled,
                "tree_method": "gpu_hist" if self.vertex_config.use_gpu else "auto",
                "use_label_encoder": False,
                "verbosity": 0,
                "random_state": self.model_config.random_state,
            }
            folds = StratifiedKFold(
                n_splits=self.model_config.n_splits,
                shuffle=True,
                random_state=self.model_config.random_state,
            )
            fold_scores = []
            for fit_rows, holdout_rows in folds.split(X, y):
                classifier = xgb.XGBClassifier(**params)
                classifier.fit(X[fit_rows], y[fit_rows])
                predicted = classifier.predict(X[holdout_rows])
                fold_scores.append(f1_score(y[holdout_rows], predicted, average='weighted'))
            return np.mean(fold_scores)

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=self.model_config.optuna_trials)
        logger.info(f"Best score: {study.best_value:.4f}")
        logger.info(f"Best params: {study.best_params}")
        return study.best_params, study

    def train_model(self, X: np.ndarray, y: np.ndarray, params: dict) -> xgb.XGBClassifier:
        """Fit the final 3-class XGBoost classifier and keep it on ``self.model``.

        Entries of ``params`` take precedence over the built-in defaults.
        """
        logger.info("Training XGBoost model...")
        tree_method = "gpu_hist" if self.vertex_config.use_gpu else "auto"
        model_params = dict(
            objective="multi:softprob",
            num_class=3,
            use_label_encoder=False,
            tree_method=tree_method,
            random_state=self.model_config.random_state,
        )
        # Caller-supplied (e.g. tuned) values override the defaults above.
        model_params.update(params)
        classifier = xgb.XGBClassifier(**model_params)
        self.model = classifier
        classifier.fit(X, y)
        logger.info("Model training completed")
        return self.model

    def save_model_to_gcs(self, model_path: str = None) -> str:
        """Pickle the model bundle and upload it to the staging bucket.

        The bundle holds the model, the scaler, the feature column names and
        the three config objects. It is written to a temporary local file,
        uploaded under ``absa_models/`` and the local file is then removed.

        Args:
            model_path: Object file name; defaults to a timestamped
                ``absa_model_<YYYYmmdd_HHMMSS>.pkl``.

        Returns:
            The ``gs://`` URI of the uploaded object. If ``staging_bucket``
            includes a path after the bucket name, that path is used as the
            object prefix.
        """
        import tempfile  # local import: not part of this cell's module-level imports

        if model_path is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_path = f"absa_model_{timestamp}.pkl"
        model_artifact = {
            'model': self.model,
            'scaler': self.scaler,
            'feature_columns': self.feature_columns,
            'vertex_config': self.vertex_config,
            'model_config': self.model_config,
            'aspect_config': self.aspect_config
        }

        # Split "gs://bucket[/prefix]" into the bucket name and an optional
        # object prefix; a bucket name itself can never contain '/'.
        bucket_spec = self.vertex_config.staging_bucket
        if bucket_spec.startswith('gs://'):
            bucket_spec = bucket_spec[len('gs://'):]
        bucket_name, _, prefix = bucket_spec.strip('/').partition('/')
        blob_name = '/'.join(
            part for part in (prefix.strip('/'), 'absa_models', model_path) if part
        )

        fd, local_path = tempfile.mkstemp(suffix='.pkl')
        try:
            with os.fdopen(fd, 'wb') as f:
                pickle.dump(model_artifact, f)
            storage_client = storage.Client(project=self.vertex_config.project_id)
            bucket = storage_client.bucket(bucket_name)
            blob = bucket.blob(blob_name)
            blob.upload_from_filename(local_path)
        finally:
            # Never leave the pickled model behind in the temp directory.
            os.remove(local_path)

        gcs_uri = f"gs://{bucket_name}/{blob_name}"
        logger.info(f"Model saved to GCS: {gcs_uri}")
        return gcs_uri

    def deploy_model_to_vertex_ai(self, model_uri: str) -> str:
        """Register the model in Vertex AI, deploy it and return the endpoint name.

        The uploaded model gets ``model_display_name`` plus a month/day/time
        suffix as its display name. The deployed endpoint is also stored on
        ``self.endpoint``.

        Args:
            model_uri: Artifact location handed to ``Model.upload``.

        Returns:
            The resource name of the endpoint the model was deployed to.
        """
        display_name = self.vertex_config.model_display_name + "-" + datetime.now().strftime("%m%d-%H%M%S")
        # NOTE(review): pipeline() passes the URI returned by save_model_to_gcs(),
        # which points at a single pickle file containing a dict of artifacts.
        # artifact_uri is presumably expected to be a directory the serving
        # container can load a model file from — confirm against the Vertex AI
        # prebuilt-container requirements.
        # Container for XGBoost 1.7+ (prebuilt container)
        model = aiplatform.Model.upload(
            display_name=display_name,
            artifact_uri=model_uri,
            serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-7:latest",
            project=self.vertex_config.project_id,
            location=self.vertex_config.location,
            sync=True
        )
        logger.info(f"Model registered in Vertex AI: {model.resource_name}")
        # NOTE(review): accelerators are requested when use_gpu is set although
        # the serving image name says "xgboost-cpu" — verify this combination.
        endpoint = model.deploy(
            machine_type=self.vertex_config.machine_type,
            accelerator_type=self.vertex_config.accelerator_type if self.vertex_config.use_gpu else None,
            accelerator_count=self.vertex_config.accelerator_count if self.vertex_config.use_gpu else None,
            traffic_split={"0": 100}
        )
        logger.info(f"Model deployed to endpoint: {endpoint.resource_name}")
        self.endpoint = endpoint
        return endpoint.resource_name

    def predict_with_endpoint(self, endpoint_name: str, instances: np.ndarray, feature_columns: list) -> np.ndarray:
        """Send feature rows to a deployed endpoint and return predicted class ids.

        Each row is sent as a ``{feature_name: float}`` dict; the class id is
        the argmax over the per-row prediction returned by the endpoint.
        """
        endpoint = aiplatform.Endpoint(endpoint_name)
        payload = []
        for row in instances:
            payload.append({name: float(value) for name, value in zip(feature_columns, row)})
        response = endpoint.predict(instances=payload)
        return np.argmax(response.predictions, axis=1)

    def evaluate(self, X_test: np.ndarray, y_test: np.ndarray, feature_columns: list, use_endpoint: bool = False, endpoint_name: str = None):
        """Score the model on a held-out set and save a confusion-matrix plot.

        Predictions come from the deployed endpoint when ``use_endpoint`` is
        true and ``endpoint_name`` is given, otherwise from the in-memory
        model.

        Returns:
            Dict with the ``classification_report`` (as a dict), the 3x3
            ``confusion_matrix`` (rows/columns ordered negative, neutral,
            positive) and the path of the saved plot under ``plots['cm']``.
        """
        logger.info("Evaluating model...")
        if use_endpoint and endpoint_name is not None:
            y_pred = self.predict_with_endpoint(endpoint_name, X_test, feature_columns)
        else:
            y_pred = self.model.predict(X_test)
        cr = classification_report(y_test, y_pred, digits=3, output_dict=True)
        logger.info("Classification report:\n" + classification_report(y_test, y_pred, digits=3))
        class_ids = [0, 1, 2]
        class_names = ['negative', 'neutral', 'positive']
        # Pin the label set: without it the matrix shrinks when a class is
        # absent from both y_test and y_pred, and the three display labels
        # below would no longer match its shape.
        cm = confusion_matrix(y_test, y_pred, labels=class_ids)
        ConfusionMatrixDisplay(cm, display_labels=class_names).plot(cmap="Blues")
        plt.title("Confusion Matrix")
        plt.tight_layout()
        plt.savefig("/tmp/absa_cm.png")
        plt.close()
        return {
            "classification_report": cr,
            "confusion_matrix": cm,
            "plots": {
                "cm": "/tmp/absa_cm.png"
            }
        }

    def pipeline(self, num_samples: int = 1000) -> Dict[str, Any]:
        """Run the full workflow: data, features, CV, tuning, training, deploy, evaluation.

        Args:
            num_samples: Number of synthetic samples to request.

        Returns:
            Dict with ``model_uri`` (GCS location of the saved bundle),
            ``endpoint_name``, ``results`` (output of ``evaluate``) and
            ``optuna_study``.
        """
        # 1. Data generation, feature extraction and descriptive insights
        df = self.generate_training_data(num_samples)
        # NOTE(review): the scaler is fitted here on the complete dataset,
        # i.e. before the train/test split below, so held-out rows influence
        # the scaling statistics.
        df = self.extract_features(df, fit_scaler=True)
        X, y, feature_columns = self.prepare_features(df)
        self.feature_columns = feature_columns
        self.insight_extractor.executive_summary(df)
        self.insight_extractor.generate_business_recommendations(df)

        # 2. Cross-validation
        # Runs on all rows with default XGBoost settings; results are only printed.
        self.cross_validate(X, y, feature_columns, n_splits=self.model_config.n_splits)

        # 3. Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.model_config.test_size, stratify=y, random_state=self.model_config.random_state
        )

        # 4. Hyperparameter tuning
        best_params, study = self.optimize_hyperparameters(X_train, y_train)

        # 5. Final training
        self.train_model(X_train, y_train, best_params)

        # 6. Save model
        model_uri = self.save_model_to_gcs()

        # 7. Deploy to Vertex AI endpoint
        endpoint_name = self.deploy_model_to_vertex_ai(model_uri)

        # 8. Evaluate via endpoint
        results = self.evaluate(X_test, y_test, feature_columns, use_endpoint=True, endpoint_name=endpoint_name)

        return {
            "model_uri": model_uri,
            "endpoint_name": endpoint_name,
            "results": results,
            "optuna_study": study
        }

    def cross_validate(self, X, y, feature_columns, n_splits=5):
        """Print per-fold and mean accuracy / weighted F1 for a default XGBoost model.

        Uses a shuffled, seeded StratifiedKFold. ``feature_columns`` is
        accepted but not used. Nothing is returned.
        """
        rule = "=========================="
        print("\n" + rule)
        print(f"Starting {n_splits}-Fold Cross-Validation")
        print(rule)
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=self.model_config.random_state)
        tree_method = 'gpu_hist' if self.vertex_config.use_gpu else 'auto'
        accuracies = []
        f1_scores = []
        for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(X, y), 1):
            classifier = xgb.XGBClassifier(
                objective='multi:softprob',
                num_class=3,
                use_label_encoder=False,
                tree_method=tree_method,
                random_state=self.model_config.random_state
            )
            classifier.fit(X[fit_rows], y[fit_rows])
            expected = y[holdout_rows]
            predicted = classifier.predict(X[holdout_rows])
            f1 = f1_score(expected, predicted, average='weighted')
            acc = accuracy_score(expected, predicted)
            report = classification_report(expected, predicted, digits=3)
            print(f"\n--- Fold {fold} ---")
            print(f"Weighted F1: {f1:.4f}  |  Accuracy: {acc:.4f}")
            print(report)
            accuracies.append(acc)
            f1_scores.append(f1)
        mean_acc = np.mean(accuracies)
        mean_f1 = np.mean(f1_scores)
        print("\n" + rule)
        print(f"CV Mean Accuracy: {mean_acc:.4f}")
        print(f"CV Mean Weighted F1: {mean_f1:.4f}")
        print(rule + "\n")

# --------- Entrypoint ---------
if __name__ == "__main__":
    # Script entry point: configure, run the pipeline end to end (including
    # endpoint deployment) and print the resulting model URI and endpoint name.
    vertex_config = VertexAIConfig(
        project_id="able-balm-454718-n8",
        location="us-central1",
        staging_bucket="gs://your-staging-bucket",  # placeholder value; set a real bucket before running
        use_gpu=True
    )
    model_config = ModelConfig(
        n_splits=5,
        embedding_model_name="intfloat/multilingual-e5-base",
        optuna_trials=30,
        test_size=0.2,
        random_state=42,
        batch_size=64,
        max_text_length=256,  # overrides the ModelConfig default of 512
        vertex_embedding_model="textembedding-gecko-multilingual@001"
    )
    # No arguments: AspectConfig installs its default aspect keywords.
    aspect_config = AspectConfig()
    pipeline = VertexAIJapaneseABSAPipeline(vertex_config, model_config, aspect_config)
    # Requests 1200 synthetic samples; the result dict carries 'model_uri' and 'endpoint_name'.
    result = pipeline.pipeline(num_samples=1200)
    print("\n\n=========================")
    print("Pipeline finished!")
    print(f"Model saved at: {result['model_uri']}")
    print(f"Model endpoint deployed at: {result['endpoint_name']}")
    print("=========================")


In [None]:
# Enhanced Japanese Aspect-Based Sentiment Analysis Pipeline with Vertex AI Deployment
# Requirements: google-cloud-aiplatform, sentence-transformers, xgboost, optuna, scikit-learn, matplotlib, seaborn, pandas, numpy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import json
import re
import os
import gc
import warnings
from typing import Dict, List, Tuple, Optional, Any, Union
from dataclasses import dataclass
from pathlib import Path
import logging
from datetime import datetime
import base64
import pickle
import tempfile
import joblib

# Vertex AI imports
from google.cloud import aiplatform
from google.cloud.aiplatform import gapic
from google.cloud.aiplatform.gapic.schema import predict
from google.cloud import storage
import vertexai
from vertexai.language_models import TextGenerationModel, TextEmbeddingModel
from vertexai.preview.generative_models import GenerativeModel

# Traditional ML imports
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay,
    precision_recall_fscore_support, accuracy_score, f1_score
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
from sentence_transformers import SentenceTransformer

# Configure logging
# INFO-level logging on the root logger plus a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Silences every warning category for the whole process, including
# deprecation warnings raised by the ML libraries.
warnings.filterwarnings("ignore")

# Set plotting style
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release — confirm against the installed version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

@dataclass
class VertexAIConfig:
    """Configuration for Vertex AI.

    Groups the GCP project settings with the serving options (machine,
    accelerator, replica counts). How each field is consumed is defined by
    the pipeline code elsewhere in this file.
    """
    project_id: str = "your-project-id"  # Replace with your GCP project ID
    location: str = "us-central1"
    staging_bucket: str = "gs://your-bucket-name"  # Replace with your bucket
    model_display_name: str = "japanese-absa-model"
    endpoint_display_name: str = "japanese-absa-endpoint"
    service_account: Optional[str] = None  # Optional service account
    machine_type: str = "n1-standard-4"
    accelerator_type: str = "NVIDIA_TESLA_T4"
    accelerator_count: int = 1
    use_gpu: bool = True
    # Replica bounds — presumably the autoscaling range of the deployed
    # endpoint; TODO confirm against the deployment code.
    min_replica_count: int = 1
    max_replica_count: int = 3
    # Boolean switch; presumably enables model explanations — TODO confirm.
    explanation_config: bool = True

@dataclass
class ModelConfig:
    """Configuration for the ABSA model.

    Covers cross-validation, tuning, the train/test split and the two
    embedding model names. The notes below assume the consuming code matches
    the field names — verify against the pipeline.
    """
    n_splits: int = 5  # presumably the number of cross-validation folds
    embedding_model_name: str = "intfloat/multilingual-e5-base"  # presumably the SentenceTransformer model
    optuna_trials: int = 30  # presumably the number of Optuna trials
    test_size: float = 0.2  # presumably the held-out fraction
    random_state: int = 42  # presumably the shared random seed
    batch_size: int = 64
    max_text_length: int = 512  # unit (characters vs tokens) is defined by the consumer — TODO confirm
    vertex_embedding_model: str = "textembedding-gecko-multilingual@001"

@dataclass
class AspectConfig:
    """Aspect categories mapped to the Japanese keywords that indicate them.

    Leaving ``aspects`` as ``None`` installs the default eight categories.
    """
    aspects: Dict[str, List[str]] = None

    def __post_init__(self):
        # Keep whatever mapping the caller provided, including an empty one.
        if self.aspects is not None:
            return
        defaults = {}
        defaults['quality'] = ['品質', '質', '良い', '悪い', '高品質', '低品質', 'クオリティ', '品質管理']
        defaults['service'] = ['サービス', '対応', '接客', '親切', '丁寧', '態度', 'スタッフ', '店員']
        defaults['price'] = ['価格', '値段', '料金', '安い', '高い', 'コスト', '費用', '価格設定']
        defaults['convenience'] = ['便利', '不便', '簡単', '難しい', '使いやすい', '使いにくい', 'アクセス']
        defaults['speed'] = ['速い', '遅い', '早い', 'スピード', '迅速', '時間', '待ち時間']
        defaults['atmosphere'] = ['雰囲気', '環境', '空間', '居心地', '快適', '不快', '清潔']
        defaults['taste'] = ['味', '美味しい', 'まずい', '美味', '風味', '食感', '新鮮']
        defaults['design'] = ['デザイン', '見た目', '外観', 'おしゃれ', 'かっこいい', '美しい']
        self.aspects = defaults

class VertexAICustomPredictor:
    """Custom predictor for Vertex AI endpoint.

    Loads a pickled artifact bundle (classifier, scaler, feature column list
    and the three config objects) and turns raw review texts into sentiment
    predictions plus keyword-detected aspects.
    """

    def __init__(self, model_artifact_path: str):
        """Remember the artifact path and eagerly load every artifact from it.

        Args:
            model_artifact_path: Local path of the pickle file to load.

        Raises:
            Exception: Anything raised while loading is logged and re-raised.
        """
        self.model_artifact_path = model_artifact_path
        self.model = None
        self.scaler = None
        self.feature_columns = None
        self.vertex_config = None
        self.model_config = None
        self.aspect_config = None
        self.embedding_extractor = None
        self._load_model()

    def _load_model(self):
        """Load model artifacts and build the embedding extractor.

        The pickle must contain the keys ``model``, ``scaler``,
        ``feature_columns``, ``vertex_config``, ``model_config`` and
        ``aspect_config``.

        Raises:
            Exception: Any I/O, unpickling or missing-key error is logged and
                re-raised unchanged.
        """
        try:
            # SECURITY: pickle.load executes arbitrary code embedded in the
            # file. Only load artifacts produced by a trusted training run.
            with open(self.model_artifact_path, 'rb') as f:
                artifacts = pickle.load(f)

            self.model = artifacts['model']
            self.scaler = artifacts['scaler']
            self.feature_columns = artifacts['feature_columns']
            self.vertex_config = artifacts['vertex_config']
            self.model_config = artifacts['model_config']
            self.aspect_config = artifacts['aspect_config']

            # Initialize embedding extractor
            self.embedding_extractor = VertexAIEmbeddingExtractor(
                self.vertex_config, self.model_config
            )

            logger.info("Model artifacts loaded successfully")

        except Exception as e:
            logger.error(f"Failed to load model artifacts: {e}")
            raise

    def predict(self, instances: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Predict sentiment and aspects for input instances.

        Args:
            instances: Dicts carrying the review under the ``'text'`` key. A
                missing or ``None`` text is treated as the empty string.

        Returns:
            One dict per instance with the predicted label, its numeric class
            (0 negative, 1 neutral, 2 positive), the top probability, the
            per-class probabilities, the detected aspects and the input text.
            An empty ``instances`` yields an empty list.

        Raises:
            Exception: Any failure during feature extraction or inference is
                logged and re-raised.
        """
        # Nothing to score; also avoids asking the embedder for zero texts,
        # whose result has no second dimension to read.
        if not instances:
            return []

        try:
            # Extract texts from instances (None -> '' so regex/str ops work)
            texts = [instance.get('text') or '' for instance in instances]

            # Create DataFrame and extract features
            df = self._extract_features(pd.DataFrame({'review_text': texts}))

            # Prepare features in the column order stored with the model
            X = df[self.feature_columns].values

            # Make predictions
            predictions = self.model.predict(X)
            probabilities = self.model.predict_proba(X)

            # Format results
            label_map = {0: "negative", 1: "neutral", 2: "positive"}
            results = []

            for text, pred, proba in zip(texts, predictions, probabilities):
                label_id = int(pred)  # plain int for dict lookup and JSON output
                results.append({
                    'predicted_sentiment': label_map[label_id],
                    'sentiment_score': label_id,
                    'confidence': float(np.max(proba)),
                    'probabilities': {
                        'negative': float(proba[0]),
                        'neutral': float(proba[1]),
                        'positive': float(proba[2])
                    },
                    'detected_aspects': self._detect_aspects(text),
                    'text': text
                })

            return results

        except Exception as e:
            logger.error(f"Prediction error: {e}")
            raise

    def _extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract features from text data.

        Adds text statistics, one ``aspect_<name>`` indicator per configured
        aspect and one ``emb_<i>`` column per embedding dimension to ``df``
        (in place) and returns it. Embedding columns are scaled when a scaler
        is present.

        Args:
            df: Frame with a ``review_text`` column.

        Returns:
            The same frame with the feature columns added.
        """
        review_text = df['review_text']

        # Add text statistics
        df['text_length'] = review_text.str.len()
        df['word_count'] = review_text.str.split().str.len()
        # Series.str.count interprets its argument as a regular expression, so
        # the literal characters must be escaped: a bare '?' is an invalid
        # pattern and raises re.error.
        # NOTE(review): only ASCII '!' and '?' are counted, not the full-width
        # forms; keep this in sync with the training-time features.
        df['exclamation_count'] = review_text.str.count(re.escape('!'))
        df['question_count'] = review_text.str.count(re.escape('?'))

        # Extract aspect features
        for aspect, keywords in self.aspect_config.aspects.items():
            pattern = '|'.join([re.escape(kw) for kw in keywords])
            df[f'aspect_{aspect}'] = review_text.str.contains(
                pattern, na=False, regex=True
            ).astype(int)

        # Generate embeddings
        embeddings = self.embedding_extractor.get_embeddings(review_text.tolist())
        embedding_dim = embeddings.shape[1]
        emb_cols = [f'emb_{i}' for i in range(embedding_dim)]

        for i, col in enumerate(emb_cols):
            df[col] = embeddings[:, i]

        # Scale embeddings
        if self.scaler is not None:
            df[emb_cols] = self.scaler.transform(df[emb_cols])

        return df

    def _detect_aspects(self, text: str) -> List[str]:
        """Detect aspects mentioned in text.

        Args:
            text: Review text to scan.

        Returns:
            Names of the configured aspects for which at least one keyword
            occurs in ``text``, in configuration order.
        """
        detected_aspects = []

        for aspect, keywords in self.aspect_config.aspects.items():
            pattern = '|'.join([re.escape(kw) for kw in keywords])
            if re.search(pattern, text):
                detected_aspects.append(aspect)

        return detected_aspects

class VertexAIEmbeddingExtractor:
    """Text embedding backend: Vertex AI first, SentenceTransformer as fallback."""

    def __init__(self, config: VertexAIConfig, model_config: ModelConfig):
        """Keep both configs and immediately pick an embedding backend."""
        self.config = config
        self.model_config = model_config
        self.embedding_model = None
        self._initialize_vertex_ai()

    def _initialize_vertex_ai(self):
        """Set up the Vertex AI embedding model, or the local fallback on any error."""
        try:
            vertexai.init(
                project=self.config.project_id,
                location=self.config.location,
            )
            vertex_model_name = self.model_config.vertex_embedding_model
            self.embedding_model = TextEmbeddingModel.from_pretrained(vertex_model_name)
            logger.info(f"Initialized Vertex AI embedding model: {self.model_config.vertex_embedding_model}")
        except Exception as err:
            # Any initialization failure switches to the local model instead of failing.
            logger.warning(f"Failed to initialize Vertex AI embeddings: {err}")
            logger.info("Falling back to SentenceTransformer")
            fallback_name = self.model_config.embedding_model_name
            self.embedding_model = SentenceTransformer(fallback_name)

    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` with whichever backend was selected at construction.

        Errors from either backend are logged and re-raised.
        """
        try:
            if not isinstance(self.embedding_model, TextEmbeddingModel):
                # Fallback path: SentenceTransformer batches internally.
                return self.embedding_model.encode(
                    texts,
                    show_progress_bar=True,
                    batch_size=self.model_config.batch_size,
                    normalize_embeddings=True
                )

            # Vertex AI path: send the texts in small chunks.
            chunk_size = 5  # Vertex AI has rate limits
            vectors = []
            for start in range(0, len(texts), chunk_size):
                response = self.embedding_model.get_embeddings(texts[start:start + chunk_size])
                vectors += [item.values for item in response]

            return np.array(vectors)
        except Exception as err:
            logger.error(f"Error getting embeddings: {err}")
            raise

class VertexAIDataGenerator:
    """Generate training data using Vertex AI's generative models.

    One prompt is built per (aspect, sentiment) pair; the model's plain-text
    answer is parsed into labelled review samples and a small set of
    hand-written samples is appended.
    """

    def __init__(self, config: VertexAIConfig, aspect_config: AspectConfig,
                 model_name: str = "gemini-pro"):
        """Store configuration and initialize the generative model.

        Args:
            config: Project/location settings for ``vertexai.init``.
            aspect_config: Aspect/keyword table driving the prompts.
            model_name: Name of the generative model to instantiate. Defaults
                to the previously hard-coded ``"gemini-pro"``.

        Raises:
            Exception: Model initialization errors are logged and re-raised.
        """
        self.config = config
        self.aspect_config = aspect_config
        self.model_name = model_name
        self.generative_model = None
        self._initialize_model()

    def _initialize_model(self):
        """Initialize generative model; failures are logged and re-raised."""
        try:
            vertexai.init(project=self.config.project_id, location=self.config.location)
            self.generative_model = GenerativeModel(self.model_name)
            logger.info("Initialized Vertex AI generative model")
        except Exception as e:
            logger.error(f"Failed to initialize generative model: {e}")
            raise

    def generate_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """Generate training data using Vertex AI.

        Args:
            num_samples: Target number of generated samples. It is split
                evenly across prompts (at least one per prompt); the result
                can be smaller when the model returns fewer parsable reviews.

        Returns:
            DataFrame with columns ``review_text``, ``sentiment`` (0/1/2),
            ``aspect``, ``text_length`` and ``generated``. The manual samples
            are always included, with ``generated`` set to False.
        """
        logger.info(f"Generating {num_samples} training samples using Vertex AI...")

        # Create prompts for different aspects and sentiments
        prompts = self._create_generation_prompts()

        generated_data = []
        # An empty aspect table yields no prompts; guard the division so the
        # manual samples are still returned instead of raising ZeroDivisionError.
        samples_per_prompt = max(1, num_samples // len(prompts)) if prompts else 0

        for i, prompt in enumerate(prompts, start=1):
            logger.info(f"Generating data for prompt {i}/{len(prompts)}")

            try:
                # Generate text using Vertex AI
                response = self.generative_model.generate_content(
                    prompt,
                    generation_config={
                        "max_output_tokens": 2048,
                        "temperature": 0.7,
                        "top_p": 0.8,
                        "top_k": 40
                    }
                )

                # Parse response and create samples
                generated_data.extend(
                    self._parse_generated_response(response.text, samples_per_prompt)
                )

            except Exception as e:
                # Best effort: one failed prompt must not abort the whole run.
                logger.warning(f"Error generating data for prompt {i}/{len(prompts)}: {e}")
                continue

        # Add some manual examples to ensure quality. Generated and manual
        # samples share the same keys, so a single frame is built from both.
        df = pd.DataFrame(generated_data + self._create_manual_samples())

        logger.info(f"Generated {len(df)} training samples")
        return df

    def _create_generation_prompts(self) -> List[str]:
        """Create prompts for data generation.

        Returns:
            One prompt per aspect and per sentiment (positive, negative,
            neutral), each naming the first three keywords of the aspect.
        """
        prompts = []

        # Create prompts for each aspect and sentiment combination
        for aspect, keywords in self.aspect_config.aspects.items():
            for sentiment in ['positive', 'negative', 'neutral']:
                prompt = f"""
Generate 20 realistic Japanese customer reviews about {aspect} ({', '.join(keywords[:3])}) 
with {sentiment} sentiment. Each review should be 20-100 characters long.

Format each review as:
Review: [Japanese text]
Sentiment: {sentiment}
Aspect: {aspect}

Example:
Review: このサービスの品質は素晴らしいです。
Sentiment: positive
Aspect: quality

Generate 20 similar reviews:
"""
                prompts.append(prompt)

        return prompts

    def _parse_generated_response(self, response_text: str, max_samples: int) -> List[Dict]:
        """Parse generated response into structured data.

        Records are expected as ``Review:`` / ``Sentiment:`` / ``Aspect:``
        lines in that order. A ``Review:`` line starts a new record and an
        ``Aspect:`` line closes it, so fields of a malformed record never leak
        into the next one.

        Args:
            response_text: Raw text returned by the generative model.
            max_samples: Upper bound on returned samples; values <= 0 yield
                an empty list.

        Returns:
            List of sample dicts. Sentiment labels are matched
            case-insensitively; unrecognized labels fall back to neutral (1).
        """
        samples = []
        if max_samples <= 0 or not response_text:
            return samples

        # Map sentiment to numeric
        sentiment_map = {'positive': 2, 'neutral': 1, 'negative': 0}
        review_prefix, sentiment_prefix, aspect_prefix = 'Review:', 'Sentiment:', 'Aspect:'

        current_review = None
        current_sentiment = None
        current_aspect = None

        for line in response_text.split('\n'):
            line = line.strip()

            if line.startswith(review_prefix):
                # Slice off only the leading marker; str.replace would also
                # delete the marker text if it appears inside the review.
                current_review = line[len(review_prefix):].strip()
                current_sentiment = None
                current_aspect = None
            elif line.startswith(sentiment_prefix):
                current_sentiment = line[len(sentiment_prefix):].strip()
            elif line.startswith(aspect_prefix):
                current_aspect = line[len(aspect_prefix):].strip()

                # If we have all three components, create a sample
                if current_review and current_sentiment and current_aspect:
                    samples.append({
                        'review_text': current_review,
                        # Models vary label casing ("Positive"); normalize it.
                        'sentiment': sentiment_map.get(current_sentiment.lower(), 1),
                        'aspect': current_aspect,
                        'text_length': len(current_review),
                        'generated': True
                    })

                    if len(samples) >= max_samples:
                        break

                # Reset for next sample, whether or not this one was complete
                current_review = None
                current_sentiment = None
                current_aspect = None

        return samples

    def _create_manual_samples(self) -> List[Dict]:
        """Create high-quality manual samples.

        Returns:
            Hand-written samples for quality, service and price, each with
            ``text_length`` filled in and ``generated`` set to False.
        """
        manual_samples = [
            # Quality - Positive
            {'review_text': 'この商品の品質は期待以上でした。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '高品質な材料を使用していて満足です。', 'sentiment': 2, 'aspect': 'quality'},
            {'review_text': '作りがしっかりしていて良い商品です。', 'sentiment': 2, 'aspect': 'quality'},

            # Quality - Negative
            {'review_text': '品質が悪くてがっかりしました。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': '安っぽい材料で作られている感じがします。', 'sentiment': 0, 'aspect': 'quality'},
            {'review_text': 'クオリティが低すぎて使い物になりません。', 'sentiment': 0, 'aspect': 'quality'},

            # Service - Positive
            {'review_text': 'スタッフの対応が素晴らしかったです。', 'sentiment': 2, 'aspect': 'service'},
            {'review_text': '親切で丁寧な接客に感謝します。', 'sentiment': 2, 'aspect': 'service'},
            {'review_text': 'サービスが良くて気持ちよく利用できました。', 'sentiment': 2, 'aspect': 'service'},

            # Service - Negative
            {'review_text': '店員の態度が悪くて不快でした。', 'sentiment': 0, 'aspect': 'service'},
            {'review_text': 'サービスの質が低くて残念です。', 'sentiment': 0, 'aspect': 'service'},
            {'review_text': '接客が雑で二度と来たくありません。', 'sentiment': 0, 'aspect': 'service'},

            # Price - Positive
            {'review_text': '価格が安くてお得感があります。', 'sentiment': 2, 'aspect': 'price'},
            {'review_text': 'コストパフォーマンスが良い商品です。', 'sentiment': 2, 'aspect': 'price'},
            {'review_text': '適正価格で満足しています。', 'sentiment': 2, 'aspect': 'price'},

            # Price - Negative
            {'review_text': '値段が高すぎて手が出ません。', 'sentiment': 0, 'aspect': 'price'},
            {'review_text': '価格設定が不適切だと思います。', 'sentiment': 0, 'aspect': 'price'},
            {'review_text': 'コストが高くて続けられません。', 'sentiment': 0, 'aspect': 'price'},

            # Neutral samples
            {'review_text': '普通の商品だと思います。', 'sentiment': 1, 'aspect': 'quality'},
            {'review_text': '特に良くも悪くもありません。', 'sentiment': 1, 'aspect': 'service'},
            {'review_text': '標準的な価格帯の商品です。', 'sentiment': 1, 'aspect': 'price'},
        ]

        # Add text_length and generated flag
        for sample in manual_samples:
            sample['text_length'] = len(sample['review_text'])
            sample['generated'] = False

        return manual_samples

class EnhancedBusinessInsightExtractor:
    """Enhanced business insight extractor with better analytics.

    Works on a DataFrame that carries a numeric ``sentiment`` column
    (0 negative, 1 neutral, 2 positive), ``aspect_<name>`` indicator columns
    and, for the executive summary, a ``text_length`` column.
    """

    def __init__(self, aspect_config: Optional[AspectConfig] = None,
                 label_map: Optional[Dict[int, str]] = None):
        """Set up aspect table, label names and display colours.

        Args:
            aspect_config: Aspect table; a default ``AspectConfig`` is created
                when omitted.
            label_map: Numeric sentiment to label; defaults to
                negative/neutral/positive for 0/1/2.
        """
        self.aspect_config = aspect_config or AspectConfig()
        self.label_map = label_map or {0: "negative", 1: "neutral", 2: "positive"}
        self.colors = {
            'negative': '#FF6B6B',
            'neutral': '#FFD93D', 
            'positive': '#6BCF7F'
        }

    def calculate_aspect_metrics(self, df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
        """Calculate detailed metrics for each aspect.

        Aspects without an indicator column, or without any mention, are
        omitted from the result.

        Args:
            df: Reviews with ``sentiment`` and ``aspect_<name>`` columns.

        Returns:
            Mapping aspect -> counts, percentage rates and a sentiment score
            ((positive - negative) / mentions). All values are plain Python
            ``int``/``float`` so the dict can be passed to ``json.dumps``.
        """
        metrics = {}

        for aspect in self.aspect_config.aspects:
            aspect_col = f'aspect_{aspect}'
            if aspect_col not in df.columns:
                continue

            aspect_data = df[df[aspect_col] == 1]
            total_mentions = len(aspect_data)
            if total_mentions == 0:
                continue

            sentiment_counts = aspect_data['sentiment'].value_counts()
            # value_counts yields numpy integers; convert once so callers get
            # native ints (numpy.int64 is rejected by the json module).
            negative = int(sentiment_counts.get(0, 0))
            neutral = int(sentiment_counts.get(1, 0))
            positive = int(sentiment_counts.get(2, 0))

            metrics[aspect] = {
                'total_mentions': total_mentions,
                'negative_count': negative,
                'neutral_count': neutral,
                'positive_count': positive,
                'negative_rate': negative / total_mentions * 100,
                'neutral_rate': neutral / total_mentions * 100,
                'positive_rate': positive / total_mentions * 100,
                'sentiment_score': (positive - negative) / total_mentions
            }

        return metrics

    def generate_business_recommendations(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Generate actionable business recommendations.

        Prints a per-aspect summary and the first five recommendations.

        Args:
            df: Reviews with ``sentiment`` and ``aspect_<name>`` columns.

        Returns:
            Dict with ``metrics``, the full ``recommendations`` list and
            ``priority_aspects`` as (aspect, priority score, metrics) tuples
            sorted by descending priority.
        """
        metrics = self.calculate_aspect_metrics(df)
        recommendations = []

        print("\n" + "="*50)
        print("BUSINESS INSIGHTS & RECOMMENDATIONS")
        print("="*50)

        # Sort aspects by priority: negative rate weighted by the log of the
        # mention count, so volume matters but does not dominate.
        priority_aspects = [
            (aspect, float(data['negative_rate'] * np.log(data['total_mentions'] + 1)), data)
            for aspect, data in metrics.items()
        ]
        priority_aspects.sort(key=lambda x: x[1], reverse=True)

        print("\n📊 ASPECT PERFORMANCE SUMMARY")
        print("-" * 40)
        for aspect, _, data in priority_aspects:
            print(f"• {aspect.upper()}: {data['total_mentions']} mentions")
            print(f"  ├─ Negative: {data['negative_rate']:.1f}% ({data['negative_count']} reviews)")
            print(f"  ├─ Neutral:  {data['neutral_rate']:.1f}% ({data['neutral_count']} reviews)")
            print(f"  └─ Positive: {data['positive_rate']:.1f}% ({data['positive_count']} reviews)")

            # Generate specific recommendations (negative thresholds win over
            # the strength message)
            if data['negative_rate'] > 30:
                recommendations.append(f"🔴 URGENT: Address {aspect} issues - {data['negative_rate']:.1f}% negative feedback")
            elif data['negative_rate'] > 20:
                recommendations.append(f"🟡 ATTENTION: Monitor {aspect} - {data['negative_rate']:.1f}% negative feedback")
            elif data['positive_rate'] > 70:
                recommendations.append(f"🟢 STRENGTH: Leverage {aspect} success - {data['positive_rate']:.1f}% positive feedback")

        print("\n🎯 ACTIONABLE RECOMMENDATIONS")
        print("-" * 40)
        for i, rec in enumerate(recommendations[:5], 1):
            print(f"{i}. {rec}")

        return {
            "metrics": metrics,
            "recommendations": recommendations,
            "priority_aspects": priority_aspects
        }

    def executive_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Generate executive summary with key metrics.

        Prints totals and a text bar chart of the sentiment distribution.

        Args:
            df: Reviews with ``sentiment`` and ``text_length`` columns.

        Returns:
            Dict with ``total_reviews``, ``sentiment_distribution`` (percent
            per sentiment value), ``overall_sentiment_score`` in [-1, 1] and
            ``avg_text_length``.
        """
        print("\n" + "="*50)
        print("EXECUTIVE SUMMARY")
        print("="*50)

        total_reviews = len(df)
        sentiment_dist = df['sentiment'].value_counts(normalize=True) * 100

        # Calculate key metrics: share of positive minus share of negative
        overall_sentiment_score = float(
            (sentiment_dist.get(2, 0) - sentiment_dist.get(0, 0)) / 100
        )

        avg_text_length = float(df['text_length'].mean())

        print(f"📈 Total Reviews Analyzed: {total_reviews:,}")
        print(f"📊 Overall Sentiment Score: {overall_sentiment_score:.3f} (-1 to +1)")
        print(f"📝 Average Review Length: {avg_text_length:.0f} characters")

        print("\n🎭 Sentiment Distribution:")
        for sentiment in [0, 1, 2]:
            label = self.label_map[sentiment]
            pct = sentiment_dist.get(sentiment, 0)
            bar_length = int(pct / 2)  # Scale for display: 50 cells == 100%
            bar = "█" * bar_length + "░" * (50 - bar_length)
            print(f"  {label.capitalize():>8}: {pct:5.1f}% |{bar}|")

        return {
            "total_reviews": total_reviews,
            "sentiment_distribution": sentiment_dist.to_dict(),
            "overall_sentiment_score": overall_sentiment_score,
            "avg_text_length": avg_text_length
        }

class VertexAIModelDeployer:
    """Deploy trained model to Vertex AI endpoint.

    After a successful ``upload_model`` / ``deploy_model`` call the resulting
    objects are also kept on ``self.model`` / ``self.endpoint``.
    """

    def __init__(self, vertex_config: VertexAIConfig):
        """Store the deployment configuration; nothing is contacted yet."""
        self.vertex_config = vertex_config
        self.model = None
        self.endpoint = None

    def create_custom_container_image(self, model_path: str) -> str:
        """Create custom container image for model serving.

        Writes a Dockerfile, requirements.txt and a Flask ``predictor.py``
        into a temporary directory, copies the model artifact next to them,
        then builds and pushes the image with the local ``docker`` CLI.

        Args:
            model_path: Local path of the pickled model artifacts.

        Returns:
            The pushed image URI (``gcr.io/<project_id>/absa-predictor:latest``).

        Raises:
            subprocess.CalledProcessError: If ``docker build`` or
                ``docker push`` exits with a non-zero status.
        """
        import shutil
        import subprocess
        import tempfile

        # NOTE(review): predictor.py imports VertexAICustomPredictor from
        # `vertex_ai_absa_pipeline`, but the Dockerfile only copies
        # model_artifacts.pkl and predictor.py. The module defining the
        # predictor has to reach the image some other way — confirm.

        # Dockerfile content
        dockerfile_content = """
FROM python:3.9-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \\
    gcc \\
    g++ \\
    && rm -rf /var/lib/apt/lists/*

# Copy requirements
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy model artifacts and predictor
COPY model_artifacts.pkl .
COPY predictor.py .

# Expose port
EXPOSE 8080

# Set environment variables
ENV AIP_STORAGE_URI=/app
ENV AIP_HEALTH_ROUTE=/health
ENV AIP_PREDICT_ROUTE=/predict

# Start the server
CMD ["python", "predictor.py"]
"""

        # Requirements.txt content
        requirements_content = """
google-cloud-aiplatform==1.38.0
xgboost==1.7.6
scikit-learn==1.3.0
pandas==2.0.3
numpy==1.24.3
sentence-transformers==2.2.2
vertexai==1.38.0
flask==2.3.2
"""

        # Predictor.py content
        predictor_content = """
import os
import json
import pickle
from flask import Flask, request, jsonify
from google.cloud import aiplatform
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Global predictor instance
predictor = None

def load_predictor():
    global predictor
    try:
        from vertex_ai_absa_pipeline import VertexAICustomPredictor
        predictor = VertexAICustomPredictor('model_artifacts.pkl')
        logger.info("Predictor loaded successfully")
    except Exception as e:
        logger.error(f"Failed to load predictor: {e}")
        raise

@app.route('/health', methods=['GET'])
def health():
    return jsonify({"status": "healthy"}), 200

@app.route('/predict', methods=['POST'])
def predict():
    try:
        # Parse request
        data = request.get_json()
        instances = data.get('instances', [])
        
        # Make predictions
        predictions = predictor.predict(instances)
        
        return jsonify({"predictions": predictions}), 200
        
    except Exception as e:
        logger.error(f"Prediction error: {e}")
        return jsonify({"error": str(e)}), 500

if __name__ == '__main__':
    load_predictor()
    app.run(host='0.0.0.0', port=8080)
"""

        build_files = {
            'Dockerfile': dockerfile_content,
            'requirements.txt': requirements_content,
            'predictor.py': predictor_content,
        }

        # Create temporary directory for Docker build; it is removed on exit
        with tempfile.TemporaryDirectory() as temp_dir:
            # Write files
            for file_name, content in build_files.items():
                with open(os.path.join(temp_dir, file_name), 'w') as f:
                    f.write(content)

            # Copy model artifacts
            shutil.copy(model_path, os.path.join(temp_dir, 'model_artifacts.pkl'))

            # Build and push container image
            image_uri = f"gcr.io/{self.vertex_config.project_id}/absa-predictor:latest"

            logger.info(f"Building container image: {image_uri}")

            # Build Docker image (requires Docker installed). Argument lists
            # are used so no shell is involved.
            subprocess.run(['docker', 'build', '-t', image_uri, temp_dir], check=True)

            # Push to registry
            subprocess.run(['docker', 'push', image_uri], check=True)

            return image_uri

    def upload_model(self, model_gcs_path: str, container_image_uri: str = None) -> aiplatform.Model:
        """Upload model to Vertex AI Model Registry.

        Args:
            model_gcs_path: Path of the artifact file; its parent directory is
                registered as the model's artifact URI.
            container_image_uri: Serving image. When omitted, the prebuilt
                scikit-learn CPU image is used.

        Returns:
            The uploaded model, also stored on ``self.model``.
        """
        logger.info("Uploading model to Vertex AI Model Registry...")

        model = aiplatform.Model.upload(
            display_name=self.vertex_config.model_display_name,
            artifact_uri=os.path.dirname(model_gcs_path),
            serving_container_image_uri=container_image_uri or "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest",
            serving_container_predict_route="/predict",
            serving_container_health_route="/health",
            project=self.vertex_config.project_id,
            location=self.vertex_config.location,
            sync=True,
        )
        # __init__ declares this attribute; populate it so callers can reach
        # the uploaded model through the deployer.
        self.model = model
        logger.info(f"Model uploaded. Resource name: {model.resource_name}")
        return model

    def deploy_model(self, model: aiplatform.Model) -> aiplatform.Endpoint:
        """Deploy uploaded model to Vertex AI endpoint.

        Accelerator settings are only sent when ``use_gpu`` is set, and the
        configured ``service_account`` (``None`` keeps the SDK default) is
        forwarded to the deployment.

        NOTE(review): ``vertex_config.endpoint_display_name`` is not used
        here; the endpoint is whatever ``model.deploy`` creates — confirm
        whether a named endpoint should be created first.

        Args:
            model: Model previously uploaded to the registry.

        Returns:
            The object returned by ``model.deploy`` (the endpoint), also
            stored on ``self.endpoint``.
        """
        cfg = self.vertex_config
        gpu_enabled = cfg.use_gpu

        logger.info("Deploying model to Vertex AI endpoint...")
        endpoint = model.deploy(
            machine_type=cfg.machine_type,
            min_replica_count=cfg.min_replica_count,
            max_replica_count=cfg.max_replica_count,
            accelerator_type=cfg.accelerator_type if gpu_enabled else None,
            accelerator_count=cfg.accelerator_count if gpu_enabled else None,
            service_account=cfg.service_account,
            traffic_split={"0": 100},  # Route all traffic to this model version
            sync=True
        )
        # Keep both ends of the deployment on the deployer for later use.
        self.model = model
        self.endpoint = endpoint
        logger.info(f"Model deployed to endpoint: {endpoint.resource_name}")
        return endpoint

# ---- Example End-to-End Usage ----

# Example entry point: uploads an already-trained model artifact and deploys
# it to an endpoint. All project, bucket and image names below are
# placeholders.
if __name__ == "__main__":
    # Replace these with your actual settings!
    vertex_config = VertexAIConfig(
        project_id="your-gcp-project",
        location="us-central1",
        staging_bucket="gs://your-staging-bucket",
        model_display_name="japanese-absa-model",
        endpoint_display_name="japanese-absa-endpoint",
        machine_type="n1-standard-4",
        accelerator_type="NVIDIA_TESLA_T4",
        accelerator_count=1,
        use_gpu=True,
        min_replica_count=1,
        max_replica_count=2,
        explanation_config=False,
    )

    # 1. (Assume model was already trained & saved)
    # upload_model registers the parent directory of this path as the artifact URI.
    model_gcs_path = "gs://your-staging-bucket/absa_models/model_artifacts.pkl"
    # Serving image; matches the URI pattern produced by create_custom_container_image.
    container_image_uri = "gcr.io/your-gcp-project/absa-predictor:latest"

    # 2. Deploy
    deployer = VertexAIModelDeployer(vertex_config)
    # Optionally: deployer.create_custom_container_image(model_path)  # If building/pushing container here
    model = deployer.upload_model(model_gcs_path, container_image_uri=container_image_uri)
    endpoint = deployer.deploy_model(model)

    print(f"\n✅ Model deployed! Endpoint resource name:\n{endpoint.resource_name}\n")