# In [None]:
# ============================================================================
# VERTEX AI XGBOOST PIPELINE WITH ADVANCED EMBEDDINGS
# ============================================================================

# 1. VERTEX AI SETUP AND IMPORTS
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import xgboost as xgb
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

# Vertex AI imports
from google.cloud import aiplatform
from google.cloud import storage
from google.cloud.aiplatform import CustomJob, CustomTrainingJob
from google.cloud.aiplatform.gapic import JobState
import google.auth
from google.auth import default

# ML imports
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
import logging

# Configure logging: records at INFO level and above are emitted
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the pipeline classes defined below
logger = logging.getLogger(__name__)

# 2. VERTEX AI CONFIGURATION
class VertexAIConfig:
    """Project-level Vertex AI settings; constructing one initializes the SDK.

    Attributes:
        project_id: Google Cloud project identifier.
        location: Vertex AI region.
        staging_bucket: GCS URI used for staging; when the caller passes a
            falsy value it defaults to ``gs://<project_id>-ml-staging``.
    """

    def __init__(self, project_id, location="us-central1", staging_bucket=None):
        # Derive the conventional staging bucket when none was supplied.
        if not staging_bucket:
            staging_bucket = f"gs://{project_id}-ml-staging"

        self.project_id = project_id
        self.location = location
        self.staging_bucket = staging_bucket

        # Hand the resolved settings to the Vertex AI SDK.
        aiplatform.init(
            project=self.project_id,
            location=self.location,
            staging_bucket=self.staging_bucket,
        )

        logger.info(f"Vertex AI initialized for project: {project_id}")

# 3. DATASET ADAPTER CLASS
class DatasetAdapter:
    """Flexible dataset adapter for different data formats and sources.

    Every loader returns a DataFrame in a standard layout:

    * ``text``  - the text column named by ``text_column``
    * ``label`` - the label column named by ``label_column``; string labels
      are integer-encoded with ``LabelEncoder``
    * any requested ``additional_features`` that exist in the source
    * ``label_names`` - only when labels were strings: the original label of
      each row (one value per row, aligned with ``label``)
    """

    def __init__(self, config):
        self.config = config
        self.storage_client = storage.Client(project=config.project_id)
        # Encoder fitted by the most recent load that had string labels;
        # ``label_encoder.classes_`` gives the code -> name mapping.
        self.label_encoder = None

    def load_dataset(self, dataset_config):
        """
        Load dataset based on configuration

        dataset_config example:
        {
            'type': 'csv',  # 'csv', 'bigquery', 'gcs'
            'source': 'path/to/file.csv' or 'project.dataset.table',
            'text_column': 'text',
            'label_column': 'label',
            'additional_features': ['feature1', 'feature2']
        }

        Raises:
            ValueError: if ``dataset_config['type']`` is not supported.
        """
        loaders = {
            'csv': self._load_csv,
            'bigquery': self._load_bigquery,
            'gcs': self._load_gcs,
        }
        dataset_type = dataset_config['type']
        loader = loaders.get(dataset_type)
        if loader is None:
            raise ValueError(f"Unsupported dataset type: {dataset_type}")
        return loader(dataset_config)

    def _load_csv(self, config):
        """Load dataset from local CSV"""
        df = pd.read_csv(config['source'])
        return self._process_dataframe(df, config)

    def _load_bigquery(self, config):
        """Load dataset from BigQuery.

        Uses ``config['query']`` verbatim when present; otherwise selects
        every column of the table named by ``config['source']``.
        """
        from google.cloud import bigquery
        client = bigquery.Client(project=self.config.project_id)

        # Only touch config['source'] when no explicit query was given, so a
        # query-only configuration does not fail with KeyError.
        if 'query' in config:
            query = config['query']
        else:
            query = f"SELECT * FROM `{config['source']}`"

        df = client.query(query).to_dataframe()
        return self._process_dataframe(df, config)

    @staticmethod
    def _parse_gcs_uri(uri):
        """Split ``gs://bucket/path/to/object`` into ``(bucket, object_path)``.

        Raises:
            ValueError: if the URI is not a ``gs://`` URI naming both a
                bucket and an object.
        """
        prefix = "gs://"
        if not isinstance(uri, str) or not uri.startswith(prefix):
            raise ValueError(f"Expected a gs://bucket/object URI, got: {uri!r}")
        bucket_name, _, blob_path = uri[len(prefix):].partition('/')
        if not bucket_name or not blob_path:
            raise ValueError(f"Expected a gs://bucket/object URI, got: {uri!r}")
        return bucket_name, blob_path

    def _load_gcs(self, config):
        """Load a CSV dataset from Google Cloud Storage."""
        import tempfile

        bucket_name, blob_path = self._parse_gcs_uri(config['source'])
        blob = self.storage_client.bucket(bucket_name).blob(blob_path)

        # Download into a private temporary directory that is removed once the
        # CSV has been parsed; the original file name is kept so pandas can
        # still infer compression from the extension.
        with tempfile.TemporaryDirectory() as tmp_dir:
            local_path = os.path.join(tmp_dir, os.path.basename(blob_path))
            blob.download_to_filename(local_path)
            df = pd.read_csv(local_path)

        return self._process_dataframe(df, config)

    def _process_dataframe(self, df, config):
        """Project ``df`` onto the standard columns described on the class."""
        processed_df = pd.DataFrame()

        # Standard columns
        processed_df['text'] = df[config['text_column']]
        processed_df['label'] = df[config['label_column']]

        # Additional features if specified; names absent from df are skipped
        for feature in config.get('additional_features') or []:
            if feature in df.columns:
                processed_df[feature] = df[feature]

        # Encode labels if they're strings
        labels = processed_df['label']
        if pd.api.types.is_object_dtype(labels) or pd.api.types.is_string_dtype(labels):
            le = LabelEncoder()
            # Keep each row's original label. Assigning ``le.classes_`` here
            # (one entry per class, not per row) would raise a length-mismatch
            # ValueError for any realistic dataset.
            processed_df['label_names'] = labels
            processed_df['label'] = le.fit_transform(labels)
            self.label_encoder = le

        return processed_df

# 4. VERTEX AI EMBEDDING EXTRACTOR
class VertexAIEmbeddingExtractor:
    """Extract sentence embeddings with a local Hugging Face model.

    The tokenizer and model are loaded lazily on the first call to
    ``get_embeddings`` and reloaded if ``model_name`` is changed afterwards.
    """

    def __init__(self, config, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.config = config
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Checkpoint name currently held in self.model (None until loaded).
        self._loaded_model_name = None

    def initialize_model(self):
        """Load the tokenizer and model named by ``self.model_name``."""
        logger.info(f"Initializing model: {self.model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
        self.model.to(self.device)
        self.model.eval()
        self._loaded_model_name = self.model_name

    @staticmethod
    def _masked_mean_pool(last_hidden_state, attention_mask):
        """Average token vectors over real tokens only.

        A plain ``mean(dim=1)`` also averages padding positions, which makes a
        sentence's embedding depend on the longest sentence in its batch.

        Args:
            last_hidden_state: tensor indexed (batch, token, hidden).
            attention_mask: tensor indexed (batch, token); non-zero marks a
                real token.

        Returns:
            Tensor indexed (batch, hidden).
        """
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
        summed = (last_hidden_state * mask).sum(dim=1)
        # clamp avoids division by zero for a row with no real tokens
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return summed / counts

    @staticmethod
    def _as_text(value):
        """Coerce one input item to ``str``; None/NaN become the empty string."""
        if isinstance(value, str):
            return value
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def get_embeddings(self, texts, batch_size=16, max_length=256):
        """Extract embeddings from texts.

        Args:
            texts: iterable of texts (list, array or pandas Series).
            batch_size: number of texts encoded per forward pass.
            max_length: token truncation length.

        Returns:
            2-D ``numpy`` array with one row per input text. A batch whose
            forward pass fails is logged and filled with zero vectors.
        """
        loaded_name = getattr(self, '_loaded_model_name', None)
        # Reload when nothing is loaded yet, or when model_name was changed
        # after a previous load (otherwise the old model would be reused).
        if self.model is None or (loaded_name is not None and loaded_name != self.model_name):
            self.initialize_model()

        texts = [self._as_text(item) for item in texts]
        # Width of the fallback/empty result; 384 is kept as a last resort for
        # model configs that do not expose hidden_size.
        hidden_size = getattr(self.model.config, 'hidden_size', 384)

        logger.info(f"Extracting embeddings for {len(texts)} texts...")

        if not texts:
            return np.zeros((0, hidden_size), dtype=np.float32)

        all_embeddings = []

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            tokens = self.tokenizer(
                batch,
                truncation=True,
                padding=True,
                max_length=max_length,
                return_tensors='pt'
            )
            tokens = {k: v.to(self.device) for k, v in tokens.items()}

            with torch.no_grad():
                try:
                    outputs = self.model(**tokens)
                    pooled = self._masked_mean_pool(
                        outputs.last_hidden_state, tokens['attention_mask']
                    )
                    all_embeddings.append(pooled.cpu().numpy())
                except Exception as e:
                    logger.error(f"Error in batch {i//batch_size}: {e}")
                    # Best-effort fallback: zero vectors of the model's width
                    # so the final vstack still lines up.
                    all_embeddings.append(
                        np.zeros((len(batch), hidden_size), dtype=np.float32)
                    )

        return np.vstack(all_embeddings)

# 5. VERTEX AI XGBOOST TRAINER
class VertexAIXGBoostTrainer:
    """XGBoost trainer: Optuna hyperparameter search followed by a final fit.

    Attributes:
        best_model: fitted ``XGBClassifier`` after ``train_final_model``.
        best_params: best Optuna parameters after ``optimize_hyperparameters``.
        best_score: cross-validated weighted F1 of ``best_params``.
    """

    # Single seed shared by the subsample, the Optuna sampler and XGBoost so a
    # run is reproducible end to end.
    RANDOM_SEED = 42

    def __init__(self, config):
        self.config = config
        self.best_model = None
        self.best_params = None
        self.best_score = None

    def optimize_hyperparameters(self, X_train, y_train, n_trials=50, timeout=600):
        """Optimize XGBoost hyperparameters using Optuna.

        Args:
            X_train: feature matrix supporting integer-array row indexing.
            y_train: label array supporting integer-array indexing.
            n_trials: maximum number of Optuna trials.
            timeout: search time budget in seconds.

        Returns:
            ``(best_params, best_score)``; both are also stored on ``self``.
        """
        # Create optimization subset for faster tuning. A seeded generator is
        # used instead of the global numpy RNG, which would make the search
        # non-reproducible even though every other component is seeded.
        optimization_size = min(2000, len(X_train))
        rng = np.random.default_rng(self.RANDOM_SEED)
        opt_indices = rng.choice(len(X_train), optimization_size, replace=False)
        X_opt = X_train[opt_indices]
        y_opt = y_train[opt_indices]

        # Loop-invariant: computed once instead of on every trial.
        num_class = len(np.unique(y_train))

        logger.info(f"Starting hyperparameter optimization with {optimization_size} samples...")

        def objective(trial):
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 800, step=50),
                'max_depth': trial.suggest_int('max_depth', 3, 12),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 2.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 2.0),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                'gamma': trial.suggest_float('gamma', 0.0, 2.0),
                'eval_metric': 'mlogloss',
                'random_state': self.RANDOM_SEED,
                'tree_method': 'hist',  # Use hist for better compatibility
                'objective': 'multi:softprob',
                'num_class': num_class
            }

            model = xgb.XGBClassifier(**params)
            cv_scores = cross_val_score(
                model, X_opt, y_opt,
                cv=3, scoring='f1_weighted', n_jobs=1
            )
            return cv_scores.mean()

        study = optuna.create_study(
            direction='maximize',
            sampler=TPESampler(seed=self.RANDOM_SEED),
            pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=10)
        )

        study.optimize(objective, n_trials=n_trials, timeout=timeout)

        self.best_params = study.best_params
        self.best_score = study.best_value

        logger.info(f"Best F1 Score: {self.best_score:.4f}")
        logger.info(f"Best Parameters: {self.best_params}")

        return self.best_params, self.best_score

    def train_final_model(self, X_train, y_train, X_test=None, y_test=None):
        """Train final model with optimized parameters.

        Args:
            X_train, y_train: full training data.
            X_test, y_test: optional held-out data, passed to ``fit`` as the
                evaluation set when both are given.

        Returns:
            The fitted ``XGBClassifier`` (also stored as ``best_model``).

        Raises:
            ValueError: if ``optimize_hyperparameters`` has not been run.
        """
        if self.best_params is None:
            raise ValueError("Must run optimize_hyperparameters first")

        # Searched parameters plus the same fixed settings used in the search.
        final_params = self.best_params.copy()
        final_params.update({
            'eval_metric': 'mlogloss',
            'random_state': self.RANDOM_SEED,
            'tree_method': 'hist',
            'objective': 'multi:softprob',
            'num_class': len(np.unique(y_train))
        })

        self.best_model = xgb.XGBClassifier(**final_params)

        if X_test is not None and y_test is not None:
            self.best_model.fit(
                X_train, y_train,
                eval_set=[(X_test, y_test)],
                verbose=False
            )
        else:
            self.best_model.fit(X_train, y_train)

        logger.info("Final model training completed")
        return self.best_model

# 6. MAIN PIPELINE CLASS
class VertexAIMLPipeline:
    """Main pipeline class that orchestrates the entire process:
    load data -> split -> embed text -> tune and train XGBoost -> evaluate ->
    register the model in Vertex AI (best effort).
    """

    # File name the model is stored under inside its artifact directory.
    MODEL_ARTIFACT_NAME = "model.pkl"

    def __init__(self, project_id, location="us-central1", staging_bucket=None):
        self.config = VertexAIConfig(project_id, location, staging_bucket)
        self.dataset_adapter = DatasetAdapter(self.config)
        self.embedding_extractor = VertexAIEmbeddingExtractor(self.config)
        self.trainer = VertexAIXGBoostTrainer(self.config)

    def run_pipeline(self, dataset_config, test_size=0.2, embedding_model=None):
        """Run the complete ML pipeline.

        Args:
            dataset_config: dataset description understood by
                ``DatasetAdapter.load_dataset``; an optional ``model_name``
                key names the registered model.
            test_size: fraction of rows held out for evaluation.
            embedding_model: optional Hugging Face model name overriding the
                extractor's default.

        Returns:
            dict with the fitted model, test predictions and probabilities,
            test labels, feature importances and the best search result.
        """

        # 1. Load dataset
        logger.info("Loading dataset...")
        df = self.dataset_adapter.load_dataset(dataset_config)

        # 2. Split data (stratified whenever there is more than one class)
        train_df, test_df = train_test_split(
            df, test_size=test_size, random_state=42,
            stratify=df['label'] if len(df['label'].unique()) > 1 else None
        )

        # 3. Extract embeddings
        if embedding_model and embedding_model != self.embedding_extractor.model_name:
            self.embedding_extractor.model_name = embedding_model
            # Drop any model loaded under the previous name; otherwise the
            # extractor would keep using it and ignore the override.
            self.embedding_extractor.model = None
            self.embedding_extractor.tokenizer = None

        X_train_embeddings = self.embedding_extractor.get_embeddings(train_df['text'])
        X_test_embeddings = self.embedding_extractor.get_embeddings(test_df['text'])

        # 4. Combine with additional features if available
        additional_features = [col for col in train_df.columns
                               if col not in ['text', 'label', 'label_names']]

        if additional_features:
            logger.info(f"Adding additional features: {additional_features}")
            X_train_additional = train_df[additional_features].fillna(0).values
            X_test_additional = test_df[additional_features].fillna(0).values

            X_train_final = np.hstack([X_train_embeddings, X_train_additional])
            X_test_final = np.hstack([X_test_embeddings, X_test_additional])
        else:
            X_train_final = X_train_embeddings
            X_test_final = X_test_embeddings

        y_train = train_df['label'].values
        y_test = test_df['label'].values

        logger.info(f"Final feature shapes: Train {X_train_final.shape}, Test {X_test_final.shape}")

        # 5. Optimize and train model
        self.trainer.optimize_hyperparameters(X_train_final, y_train)
        model = self.trainer.train_final_model(X_train_final, y_train, X_test_final, y_test)

        # 6. Evaluate
        y_pred = model.predict(X_test_final)
        y_proba = model.predict_proba(X_test_final)

        logger.info("\nFinal Model Classification Report:")
        print(classification_report(y_test, y_pred, digits=4))

        # 7. Save model to Vertex AI Model Registry (best effort; failures are
        #    logged inside save_model_to_vertex_ai and do not abort the run)
        self.save_model_to_vertex_ai(model, dataset_config.get('model_name', 'xgboost-model'))

        return {
            'model': model,
            'predictions': y_pred,
            'probabilities': y_proba,
            'test_labels': y_test,
            'feature_importance': model.feature_importances_,
            'best_params': self.trainer.best_params,
            'best_score': self.trainer.best_score
        }

    @staticmethod
    def _split_gcs_uri(uri):
        """Split ``gs://bucket/some/prefix`` into ``(bucket, 'some/prefix')``.

        Raises:
            ValueError: if ``uri`` is not a ``gs://`` URI with a bucket name.
        """
        scheme = "gs://"
        if not isinstance(uri, str) or not uri.startswith(scheme):
            raise ValueError(f"Expected a gs:// URI, got: {uri!r}")
        bucket_name, _, object_prefix = uri[len(scheme):].partition('/')
        if not bucket_name:
            raise ValueError(f"Expected a gs:// URI, got: {uri!r}")
        return bucket_name, object_prefix

    def save_model_to_vertex_ai(self, model, model_name):
        """Save model to Vertex AI Model Registry.

        The model is pickled, uploaded to
        ``<staging_bucket>/models/<model_name>/model.pkl`` and that directory
        is then registered as the model's artifact URI. Any failure is logged
        as a warning rather than raised.
        """
        try:
            import pickle
            import tempfile

            artifact_dir = f"{self.config.staging_bucket.rstrip('/')}/models/{model_name}/"
            bucket_name, object_prefix = self._split_gcs_uri(artifact_dir)

            # Serialize locally, then copy the file into the artifact
            # directory. Registering the URI without uploading anything would
            # leave Vertex AI pointing at an empty prefix.
            # NOTE(review): a pickle only loads reliably under matching
            # xgboost/Python versions - confirm against the serving image.
            with tempfile.TemporaryDirectory() as tmp_dir:
                local_path = os.path.join(tmp_dir, self.MODEL_ARTIFACT_NAME)
                with open(local_path, 'wb') as f:
                    pickle.dump(model, f)

                bucket = self.dataset_adapter.storage_client.bucket(bucket_name)
                blob = bucket.blob(f"{object_prefix}{self.MODEL_ARTIFACT_NAME}")
                blob.upload_from_filename(local_path)

            # Upload to Vertex AI (simplified - in practice you'd want more robust model serving)
            vertex_model = aiplatform.Model.upload(
                display_name=model_name,
                artifact_uri=artifact_dir,
                serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-4:latest"
            )

            logger.info(f"Model uploaded to Vertex AI: {vertex_model.resource_name}")

        except Exception as e:
            logger.warning(f"Could not upload model to Vertex AI: {e}")

# 7. EXAMPLE USAGE
def example_usage():
    """Demonstrate the pipeline on a CSV sentiment dataset and return its results.

    The project, bucket and file path below are placeholders that must be
    replaced before running.
    """
    # Optional numeric columns appended to the text embeddings.
    emotion_features = [
        "Writer_Joy", "Writer_Sadness", "Writer_Anticipation",
        "Writer_Surprise", "Writer_Anger", "Writer_Fear",
        "Writer_Disgust", "Writer_Trust",
    ]

    # Dataset description: 'type' may also be 'bigquery' or 'gcs'; adjust the
    # text and label column names to match your data.
    dataset_config = dict(
        type='csv',
        source='path/to/your/dataset.csv',
        text_column='Sentence',
        label_column='sentiment',
        additional_features=emotion_features,
        model_name='sentiment-xgboost-model',
    )

    pipeline = VertexAIMLPipeline(
        project_id="your-project-id",
        location="us-central1",
        staging_bucket="gs://your-bucket-name",
    )

    # The embedding model argument is optional; it is spelled out here to show
    # how to override it.
    return pipeline.run_pipeline(
        dataset_config=dataset_config,
        test_size=0.2,
        embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    )

# Uncomment to run example
# results = example_usage()

# In [None]:
# Advanced Multi-Task Japanese Sentiment Analysis with Vertex AI
# This notebook demonstrates how to use Vertex AI for Japanese sentiment analysis
# with multi-task learning capabilities

import os
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Tuple
import json
import time
from dataclasses import dataclass
from sklearn.metrics import classification_report, accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

# Vertex AI imports
from google.cloud import aiplatform
from google.cloud.aiplatform.gapic.schema import predict
import vertexai
from vertexai.language_models import TextGenerationModel, ChatModel
from vertexai.generative_models import GenerativeModel
import vertexai.preview.generative_models as generative_models

# Authentication and project setup
# Make sure you have set up authentication:
# gcloud auth application-default login
# OR set GOOGLE_APPLICATION_CREDENTIALS environment variable

# Module-level settings consumed by vertexai.init() below.
PROJECT_ID = "project-id"  # Replace with your project ID
LOCATION = "us-central1"  # or your preferred region
ENDPOINT_ID = None  # We'll use pre-trained models

# Initialize Vertex AI with the project and region set above
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Echo the active settings so the notebook output records them
print(f"Vertex AI initialized for project: {PROJECT_ID}")
print(f"Location: {LOCATION}")

# =============================================================================
# 1. DATA PREPARATION AND PREPROCESSING
# =============================================================================

@dataclass
class SentimentData:
    """One sentence with its sentiment label and optional emotion scores."""
    sentence: str  # text of the example
    sentiment: int  # 0: negative, 1: neutral, 2: positive
    joy: float = 0.0  # emotion score; 0.0 when not supplied
    sadness: float = 0.0  # emotion score; 0.0 when not supplied
    anger: float = 0.0  # emotion score; 0.0 when not supplied

class DataPreprocessor:
    """Data preprocessing utilities for Japanese text"""

    # SentimentData field -> DataFrame column holding that emotion score.
    _EMOTION_COLUMNS = {'joy': 'Joy', 'sadness': 'Sadness', 'anger': 'Anger'}

    def __init__(self):
        # Two-way lookup: label name -> id and id -> label name.
        self.label_mapping = {
            'negative': 0, 'neutral': 1, 'positive': 2,
            0: 'negative', 1: 'neutral', 2: 'positive'
        }

    def clean_japanese_text(self, text: str) -> str:
        """Trim the text and collapse whitespace runs to single spaces.

        Missing values (None/NaN) and blank strings become the placeholder
        ``"[EMPTY]"``. Non-string values are converted with ``str`` first, so
        a numeric cell no longer fails on ``.strip()``.
        """
        if not isinstance(text, str):
            if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
                return "[EMPTY]"
            text = str(text)

        # str.split() with no argument splits on any Unicode whitespace,
        # which also trims both ends.
        cleaned = ' '.join(text.split())
        return cleaned or "[EMPTY]"

    @staticmethod
    def _to_score(value) -> float:
        """Convert an emotion cell to float, mapping None/NaN to 0.0."""
        if value is None:
            return 0.0
        score = float(value)
        return 0.0 if score != score else score  # NaN is the only x != x

    def prepare_dataset(self, df: pd.DataFrame) -> "List[SentimentData]":
        """Convert DataFrame to SentimentData objects.

        Requires the columns ``Sentence`` and ``sentiment``; the columns
        ``Joy``, ``Sadness`` and ``Anger`` are used when present.

        Raises:
            ValueError: if a required column is missing.
        """
        required_columns = ['Sentence', 'sentiment']
        if any(col not in df.columns for col in required_columns):
            raise ValueError(f"DataFrame must contain columns: {required_columns}")

        # Resolve once which optional emotion columns exist.
        available = {
            field: column
            for field, column in self._EMOTION_COLUMNS.items()
            if column in df.columns
        }

        data = []
        for _, row in df.iterrows():
            scores = {
                field: self._to_score(row[column])
                for field, column in available.items()
            }
            data.append(SentimentData(
                sentence=self.clean_japanese_text(row['Sentence']),
                sentiment=int(row['sentiment']),
                **scores
            ))

        return data

# =============================================================================
# 2. VERTEX AI MODEL INTERFACE
# =============================================================================

class VertexAIMultiTaskModel:
    """
    Multi-task sentiment analysis using Vertex AI text models.

    Gemini models (name contains "gemini") and PaLM text models (name
    contains "bison") are supported. Any other name, or any initialization
    error, falls back to the default Gemini model.
    """

    DEFAULT_MODEL_NAME = "gemini-1.5-pro-002"
    _SENTIMENT_TO_SCORE = {'negative': 0, 'neutral': 1, 'positive': 2}
    _SCORE_TO_SENTIMENT = {0: 'negative', 1: 'neutral', 2: 'positive'}
    _EMOTIONS = ('joy', 'sadness', 'anger', 'fear', 'surprise')

    def __init__(self, model_name: str = "gemini-1.5-pro-002"):
        """
        Initialize with the best available models for Japanese text analysis

        Recommended models:
        - gemini-1.5-pro-002: Latest Gemini Pro with excellent multilingual support
        - gemini-1.5-flash-002: Faster version for real-time applications
        - text-bison@002: PaLM 2 for Text (good for Japanese)
        """
        self.model_name = model_name
        self.model = None
        self.generation_config = None
        self.safety_settings = None

        self._initialize_model()

    @staticmethod
    def _build_generation_config():
        """Generation parameters tuned for consistent classification output."""
        return generative_models.GenerationConfig(
            max_output_tokens=1024,
            temperature=0.1,  # Low temperature for consistent classification
            top_p=0.8,
            top_k=40,
        )

    @staticmethod
    def _build_safety_settings():
        """Safety settings passed with every Gemini request."""
        return [
            generative_models.SafetySetting(
                category=generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
                threshold=generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
            ),
            generative_models.SafetySetting(
                category=generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
                threshold=generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
            ),
        ]

    def _initialize_model(self):
        """Initialize the appropriate model based on model_name"""
        try:
            lowered = self.model_name.lower()
            if "gemini" in lowered:
                self.model = GenerativeModel(self.model_name)
                self.generation_config = self._build_generation_config()
                self.safety_settings = self._build_safety_settings()

            elif "bison" in lowered:
                self.model = TextGenerationModel.from_pretrained(self.model_name)

            else:
                # Without this branch an unknown name would be reported as
                # initialized while self.model stayed None.
                raise ValueError(f"Unsupported model name: {self.model_name}")

            print(f"Successfully initialized {self.model_name}")

        except Exception as e:
            print(f"❌ Error initializing model {self.model_name}: {str(e)}")
            # Fallback to Gemini Pro
            self.model_name = self.DEFAULT_MODEL_NAME
            self.model = GenerativeModel(self.model_name)
            # Give the fallback the same generation settings a directly
            # requested Gemini model gets; stay best-effort if that fails.
            if self.generation_config is None or self.safety_settings is None:
                try:
                    self.generation_config = self._build_generation_config()
                    self.safety_settings = self._build_safety_settings()
                except Exception as config_error:
                    print(f"Could not configure generation settings: {str(config_error)}")
            print(f"Falling back to {self.model_name}")

    def create_multi_task_prompt(self, text: str) -> str:
        """Create a comprehensive prompt for multi-task sentiment analysis"""

        prompt = f"""あなたは日本語テキストの感情分析の専門家です。以下のテキストを分析して、感情情報をJSON形式で出力してください。

テキスト: "{text}"

以下の形式で回答してください：

{{
    "sentiment": "positive/neutral/negative",
    "sentiment_score": 0-2の整数値 (0=negative, 1=neutral, 2=positive),
    "confidence": 0.0-1.0の信頼度,
    "emotions": {{
        "joy": 0.0-1.0のスコア,
        "sadness": 0.0-1.0のスコア,
        "anger": 0.0-1.0のスコア,
        "fear": 0.0-1.0のスコア,
        "surprise": 0.0-1.0のスコア
    }},
    "reasoning": "判断の根拠を簡潔に説明"
}}

重要な指示：
1. 日本語の文脈とニュアンスを正確に理解してください
2. 敬語や間接的な表現も考慮してください
3. 文化的な背景も判断に含めてください
4. JSONフォーマットを厳密に守ってください
5. reasoning以外は英語で記述してください"""

        return prompt

    def _generate(self, prompt: str) -> str:
        """Send ``prompt`` to the configured model and return its raw text."""
        lowered = self.model_name.lower()
        if "gemini" in lowered:
            response = self.model.generate_content(
                prompt,
                generation_config=self.generation_config,
                safety_settings=self.safety_settings
            )
        elif "bison" in lowered:
            response = self.model.predict(
                prompt,
                max_output_tokens=1024,
                temperature=0.1,
                top_p=0.8,
                top_k=40
            )
        else:
            raise ValueError(f"Unsupported model name: {self.model_name}")
        return response.text

    @staticmethod
    def _extract_json(result_text: str) -> Dict[str, Any]:
        """Parse the outermost ``{...}`` span of a model response.

        Raises:
            ValueError: if the response contains no braces.
            json.JSONDecodeError: if the span is not valid JSON.
        """
        start_idx = result_text.find('{')
        end_idx = result_text.rfind('}') + 1

        if start_idx == -1 or end_idx == 0:
            raise ValueError("No JSON found in response")

        return json.loads(result_text[start_idx:end_idx])

    def predict_single(self, text: str, max_retries: int = 3) -> Dict[str, Any]:
        """Predict sentiment for a single text with retry logic.

        API errors are retried with exponential backoff; unparseable
        responses are retried immediately. After ``max_retries`` failed
        attempts the default (neutral, low-confidence) prediction is returned
        instead of raising.
        """

        prompt = self.create_multi_task_prompt(text)

        for attempt in range(max_retries):
            try:
                result_text = self._generate(prompt)

                # Parse JSON response
                try:
                    result = self._extract_json(result_text)
                    # Validate and normalize result
                    return self._normalize_prediction(result)

                except (json.JSONDecodeError, ValueError) as e:
                    print(f"JSON parsing error on attempt {attempt + 1}: {str(e)}")
                    if attempt == max_retries - 1:
                        # Return default prediction
                        return self._get_default_prediction()
                    continue

            except Exception as e:
                print(f"API error on attempt {attempt + 1}: {str(e)}")
                if attempt == max_retries - 1:
                    return self._get_default_prediction()
                time.sleep(2 ** attempt)  # Exponential backoff

        return self._get_default_prediction()

    @staticmethod
    def _clamp_unit(value: Any, default: float) -> float:
        """Return ``value`` as a float clamped to [0, 1], or ``default`` when
        it is missing, non-numeric or NaN."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return default
        if number != number:  # NaN
            return default
        return max(0.0, min(1.0, number))

    def _normalize_prediction(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Normalize and validate prediction result.

        * ``sentiment`` is matched case-insensitively; when it is not a known
          label, a valid ``sentiment_score`` (0-2) is used to recover it,
          otherwise it becomes ``'neutral'``.
        * ``sentiment_score`` is always derived from the final label.
        * ``confidence`` and the five emotion scores are clamped to [0, 1];
          unusable values become 0.5 and 0.0 respectively.
        """
        if not isinstance(result, dict):
            result = {}

        # Normalize sentiment label
        sentiment = result.get('sentiment', 'neutral')
        sentiment = sentiment.strip().lower() if isinstance(sentiment, str) else ''
        if sentiment not in self._SENTIMENT_TO_SCORE:
            try:
                score = int(result.get('sentiment_score'))
            except (TypeError, ValueError, OverflowError):
                score = None
            sentiment = self._SCORE_TO_SENTIMENT.get(score, 'neutral')

        # Normalize emotion scores; unknown extra keys are kept untouched
        raw_emotions = result.get('emotions', {})
        emotions = dict(raw_emotions) if isinstance(raw_emotions, dict) else {}
        for emotion in self._EMOTIONS:
            emotions[emotion] = self._clamp_unit(emotions.get(emotion), 0.0)

        return {
            'sentiment': sentiment,
            'sentiment_score': self._SENTIMENT_TO_SCORE[sentiment],
            'confidence': self._clamp_unit(result.get('confidence', 0.5), 0.5),
            'emotions': emotions,
            'reasoning': result.get('reasoning', 'No reasoning provided')
        }

    def _get_default_prediction(self) -> Dict[str, Any]:
        """Return default prediction when API fails"""
        return {
            'sentiment': 'neutral',
            'sentiment_score': 1,
            'confidence': 0.1,
            'emotions': {
                'joy': 0.0,
                'sadness': 0.0,
                'anger': 0.0,
                'fear': 0.0,
                'surprise': 0.0
            },
            'reasoning': 'Default prediction due to API failure'
        }

    def predict_batch(self, texts: List[str], batch_size: int = 5,
                      request_delay: float = 1.0,
                      batch_delay: float = 3.0) -> List[Dict[str, Any]]:
        """Predict sentiment for a batch of texts with rate limiting.

        Args:
            texts: texts to classify, processed in order.
            batch_size: number of requests between the longer pauses.
            request_delay: seconds to wait between requests inside a batch.
            batch_delay: seconds to wait between batches.

        Returns:
            One prediction dict per input text, in input order.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        results = []
        total = len(texts)

        print(f"Starting batch prediction for {total} texts...")

        for i in range(0, total, batch_size):
            batch = texts[i:i + batch_size]

            for j, text in enumerate(batch):
                print(f"Processing {i + j + 1}/{total}: {text[:50]}...")
                results.append(self.predict_single(text))

                # Rate limiting - avoid hitting API limits
                if j < len(batch) - 1:  # Don't wait after the last item in batch
                    time.sleep(request_delay)

            # Longer pause between batches
            if i + batch_size < total:
                print(f"Completed batch {i//batch_size + 1}, pausing...")
                time.sleep(batch_delay)

        print("Batch prediction completed!")
        return results

# =============================================================================
# 3. EVALUATION AND ANALYSIS
# =============================================================================

class ModelEvaluator:
    """Comprehensive evaluation of multi-task sentiment analysis"""

    # Class ids and their display names, in matching order.
    CLASS_IDS = [0, 1, 2]
    CLASS_NAMES = ['Negative', 'Neutral', 'Positive']

    def __init__(self):
        self.results = {}

    def evaluate_predictions(self, y_true: List[int], predictions: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Evaluate model performance.

        Args:
            y_true: true class ids (0, 1 or 2).
            predictions: prediction dicts carrying ``sentiment_score`` and
                ``confidence``.

        Returns:
            dict with accuracy, macro/weighted F1, the per-class report,
            confidence statistics and the raw label/confidence lists. The
            per-class report is keyed by class id as a string ('0', '1',
            '2'); the display names are added as aliases of the same entries.
        """

        # Extract predictions
        y_pred = [pred['sentiment_score'] for pred in predictions]
        confidences = [pred['confidence'] for pred in predictions]

        # Calculate metrics
        accuracy = accuracy_score(y_true, y_pred)
        f1_macro = f1_score(y_true, y_pred, average='macro')
        f1_weighted = f1_score(y_true, y_pred, average='weighted')

        # Classification report. Passing explicit labels (and no
        # target_names) keys the report by '0'/'1'/'2', which is what the
        # consumers of this report index by, and keeps all three classes
        # present even when one is absent from the data.
        class_report = classification_report(
            y_true, y_pred,
            labels=self.CLASS_IDS,
            output_dict=True,
            zero_division=0
        )
        # Keep the display-name keys available for callers that relied on them.
        for class_id, class_name in zip(self.CLASS_IDS, self.CLASS_NAMES):
            class_report[class_name] = class_report[str(class_id)]

        # Confidence analysis (grouped by the TRUE label)
        avg_confidence = np.mean(confidences)
        confidence_by_class = {}
        for i in self.CLASS_IDS:
            class_confidences = [conf for true_label, conf in zip(y_true, confidences) if true_label == i]
            confidence_by_class[i] = np.mean(class_confidences) if class_confidences else 0.0

        evaluation_results = {
            'accuracy': accuracy,
            'f1_macro': f1_macro,
            'f1_weighted': f1_weighted,
            'classification_report': class_report,
            'average_confidence': avg_confidence,
            'confidence_by_class': confidence_by_class,
            'predictions': y_pred,
            'true_labels': y_true,
            'confidences': confidences
        }

        return evaluation_results

    def analyze_emotions(self, predictions: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Compute mean/std/max/min of each tracked emotion over predictions.

        Emotions with no scores at all report zeros; emotion keys other than
        the five tracked ones are ignored.
        """

        emotions_data = {
            'joy': [],
            'sadness': [],
            'anger': [],
            'fear': [],
            'surprise': []
        }

        for pred in predictions:
            emotions = pred.get('emotions', {})
            for emotion, score in emotions.items():
                if emotion in emotions_data:
                    emotions_data[emotion].append(score)

        # Calculate statistics
        emotion_stats = {}
        for emotion, scores in emotions_data.items():
            if scores:
                emotion_stats[emotion] = {
                    'mean': np.mean(scores),
                    'std': np.std(scores),
                    'max': np.max(scores),
                    'min': np.min(scores)
                }
            else:
                emotion_stats[emotion] = {
                    'mean': 0.0, 'std': 0.0, 'max': 0.0, 'min': 0.0
                }

        return emotion_stats

    @staticmethod
    def _class_f1(class_report: Dict[str, Any], class_id: int, class_name: str) -> float:
        """Look up a class's F1 by id key ('0') or display-name key; 0.0 if absent."""
        entry = class_report.get(str(class_id))
        if entry is None:
            entry = class_report.get(class_name)
        if entry is None:
            return 0.0
        return entry['f1-score']

    def plot_results(self, evaluation_results: Dict[str, Any], save_path: str = None):
        """Create visualizations of the results.

        Draws a 2x2 figure: confusion matrix, confidence histogram, per-class
        F1 and overall metrics. The figure is saved to ``save_path`` when one
        is given and then shown.
        """

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # Confusion Matrix. Explicit labels keep it 3x3 so the tick labels
        # stay aligned even when a class never occurs.
        from sklearn.metrics import confusion_matrix
        cm = confusion_matrix(
            evaluation_results['true_labels'],
            evaluation_results['predictions'],
            labels=self.CLASS_IDS
        )
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=self.CLASS_NAMES,
                    yticklabels=self.CLASS_NAMES,
                    ax=axes[0, 0])
        axes[0, 0].set_title('Confusion Matrix')
        axes[0, 0].set_ylabel('True Label')
        axes[0, 0].set_xlabel('Predicted Label')

        # Confidence Distribution
        axes[0, 1].hist(evaluation_results['confidences'], bins=20, alpha=0.7, color='skyblue')
        axes[0, 1].set_title('Confidence Score Distribution')
        axes[0, 1].set_xlabel('Confidence Score')
        axes[0, 1].set_ylabel('Frequency')

        # Performance by Class. The lookup accepts either key style, so a
        # report keyed by display names does not raise KeyError here.
        class_report = evaluation_results['classification_report']
        f1_scores = [
            self._class_f1(class_report, class_id, class_name)
            for class_id, class_name in zip(self.CLASS_IDS, self.CLASS_NAMES)
        ]

        axes[1, 0].bar(self.CLASS_NAMES, f1_scores, color=['red', 'gray', 'green'], alpha=0.7)
        axes[1, 0].set_title('F1-Score by Class')
        axes[1, 0].set_ylabel('F1-Score')
        axes[1, 0].set_ylim(0, 1)

        # Overall Metrics
        metrics = ['Accuracy', 'F1-Macro', 'F1-Weighted']
        values = [evaluation_results['accuracy'],
                  evaluation_results['f1_macro'],
                  evaluation_results['f1_weighted']]

        axes[1, 1].bar(metrics, values, color='orange', alpha=0.7)
        axes[1, 1].set_title('Overall Performance Metrics')
        axes[1, 1].set_ylabel('Score')
        axes[1, 1].set_ylim(0, 1)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')

        plt.show()

# =============================================================================
# 4. MAIN EXECUTION PIPELINE
# =============================================================================

def main_pipeline(train_df: pd.DataFrame, test_df: pd.DataFrame, 
                 model_name: str = "gemini-1.5-pro-002"):
    """
    Run the full evaluation flow for one Vertex AI model and persist a report.

    The steps run strictly in order: preprocess both frames, build the model
    wrapper, predict on the test sentences, score the predictions, summarise
    the emotion scores, plot, then write a CSV and a JSON file into the
    current working directory.

    Args:
        train_df: Training frame. It is preprocessed and its size is printed;
            this function makes no further use of it.
        test_df: Frame whose sentences are sent to the model and scored.
        model_name: Identifier forwarded to ``VertexAIMultiTaskModel``.

    Returns:
        Tuple ``(evaluation_results, predictions)``: the dict produced by
        ``ModelEvaluator.evaluate_predictions`` and the raw predictions from
        ``predict_batch``.
    """
    # Output artefacts (all relative to the working directory).
    plot_path = 'vertex_ai_results.png'
    csv_path = 'vertex_ai_detailed_results.csv'
    json_path = 'vertex_ai_summary.json'

    rule = "=" * 80
    print(rule)
    print("VERTEX AI MULTI-TASK JAPANESE SENTIMENT ANALYSIS")
    print(rule)

    # 1. Data Preparation
    print("\nStep 1: Data Preparation")
    preprocessor = DataPreprocessor()
    train_data, test_data = (
        preprocessor.prepare_dataset(frame) for frame in (train_df, test_df)
    )
    print(f"Training data: {len(train_data)} samples")
    print(f"Test data: {len(test_data)} samples")

    # 2. Model Initialization
    print(f"\n Step 2: Initializing Vertex AI Model ({model_name})")
    model = VertexAIMultiTaskModel(model_name=model_name)

    # 3. Predictions (small batches, as in the original call)
    print("\n Step 3: Generating Predictions")
    test_sentences = [sample.sentence for sample in test_data]
    predictions = model.predict_batch(test_sentences, batch_size=3)

    # 4. Evaluation
    print("\n Step 4: Model Evaluation")
    evaluator = ModelEvaluator()
    y_true = [sample.sentiment for sample in test_data]
    evaluation_results = evaluator.evaluate_predictions(y_true, predictions)

    print(f"\n PERFORMANCE RESULTS:")
    headline_metrics = (
        ("Accuracy", 'accuracy'),
        ("F1-Score (Macro)", 'f1_macro'),
        ("F1-Score (Weighted)", 'f1_weighted'),
        ("Average Confidence", 'average_confidence'),
    )
    for title, key in headline_metrics:
        print(f"{title}: {evaluation_results[key]:.4f}")

    print(f"\n DETAILED CLASSIFICATION REPORT:")
    class_report = evaluation_results['classification_report']
    # Report keys are the stringified class ids 0/1/2.
    for class_id, label in enumerate(['Negative', 'Neutral', 'Positive']):
        per_class = class_report.get(str(class_id)) if str(class_id) in class_report else None
        if per_class is None:
            continue
        print(f"{label:>10}: Precision={per_class['precision']:.3f}, "
              f"Recall={per_class['recall']:.3f}, F1={per_class['f1-score']:.3f}")

    # 5. Emotion Analysis
    print("\n Step 5: Emotion Analysis")
    emotion_stats = evaluator.analyze_emotions(predictions)
    print("Average Emotion Scores:")
    for emotion_name, summary in emotion_stats.items():
        print(f"{emotion_name.capitalize():>10}: {summary['mean']:.3f} (±{summary['std']:.3f})")

    # 6. Visualization — a plotting failure is reported but does not abort.
    print("\n Step 6: Creating Visualizations")
    try:
        evaluator.plot_results(evaluation_results, save_path=plot_path)
    except Exception as e:
        print(f"️ Visualization error: {str(e)}")

    # 7. Save Results
    print("\n Step 7: Saving Results")
    results_summary = {'model_name': model_name, 'test_samples': len(test_data)}
    results_summary.update(
        (key, evaluation_results[key])
        for key in ('accuracy', 'f1_macro', 'f1_weighted', 'average_confidence')
    )
    results_summary['emotion_stats'] = emotion_stats
    results_summary['timestamp'] = pd.Timestamp.now().isoformat()

    # One row per test sentence; column order matches the original report.
    emotion_columns = {
        name: [pred['emotions'][name] for pred in predictions]
        for name in ('joy', 'sadness', 'anger')
    }
    results_df = pd.DataFrame({
        'sentence': test_sentences,
        'true_sentiment': y_true,
        'predicted_sentiment': evaluation_results['predictions'],
        'confidence': evaluation_results['confidences'],
        **emotion_columns,
        'reasoning': [pred['reasoning'] for pred in predictions],
    })
    results_df.to_csv(csv_path, index=False, encoding='utf-8')

    with open(json_path, 'w', encoding='utf-8') as summary_file:
        json.dump(results_summary, summary_file, ensure_ascii=False, indent=2)

    print(" Results saved to:")
    for artefact in (csv_path, json_path, plot_path):
        print(f"  - {artefact}")

    return evaluation_results, predictions

# =============================================================================
# 5. USAGE EXAMPLE
# =============================================================================

if __name__ == "__main__":
    # Demo entry point: builds a five-sentence toy dataset and runs
    # main_pipeline once for every model listed in models_to_test.
    # Example usage - replace with your actual data loading
    print("📝 Loading data...")
    
    # Create sample data for demonstration
    # Replace this section with your actual data loading code
    sample_data = {
        'Sentence': [
            'この映画は本当に素晴らしかった！',
            'まあまあの出来だと思います。',
            'ひどい映画でした。時間の無駄。',
            '普通の内容でした。',
            '感動的で涙が出ました。'
        ],
        'sentiment': [2, 1, 0, 1, 2]  # positive, neutral, negative, neutral, positive
    }
    
    df_sample = pd.DataFrame(sample_data)
    
    # Train and test are the SAME five rows here (demo only), so the scores
    # printed below say nothing about generalisation.
    train_df = df_sample.copy()
    test_df = df_sample.copy()
    
    # Run the pipeline
    try:
        # Test with different models
        models_to_test = [
            "gemini-1.5-pro-002",      # Best overall performance
            "gemini-1.5-flash-002",    # Faster inference
            # "text-bison@002"          # PaLM 2 alternative
        ]
        
        # model name -> evaluation_results dict returned by main_pipeline
        all_results = {}
        
        for model_name in models_to_test:
            print(f"\n{'='*20} Testing {model_name} {'='*20}")
            
            # A failure for one model is reported and the loop moves on to
            # the next model instead of aborting the whole comparison.
            try:
                results, predictions = main_pipeline(train_df, test_df, model_name)
                all_results[model_name] = results
                
            except Exception as e:
                print(f"❌ Error with {model_name}: {str(e)}")
                continue
        
        # Compare models if multiple were tested
        if len(all_results) > 1:
            print(f"\n🏆 MODEL COMPARISON:")
            print(f"{'Model':<25} {'Accuracy':<10} {'F1-Macro':<10} {'F1-Weighted':<12}")
            print("-" * 60)
            
            for model_name, results in all_results.items():
                print(f"{model_name:<25} {results['accuracy']:<10.4f} "
                      f"{results['f1_macro']:<10.4f} {results['f1_weighted']:<12.4f}")
    
    # Outer safety net: only reached by errors raised outside the per-model
    # try block (e.g. while printing the comparison table).
    except Exception as e:
        print(f"Pipeline error: {str(e)}")
        print("Please check your Vertex AI setup and authentication.")

# NOTE: these prints sit at module level, outside the __main__ guard above,
# so they run every time this cell/module is executed — including when the
# pipeline run above reported an error.
print(" Vertex AI Multi-Task Sentiment Analysis Setup Complete!")
print("\n Next Steps:")
print("1. Set your PROJECT_ID variable")
print("2. Ensure Vertex AI API is enabled in your GCP project")
print("3. Set up authentication (gcloud auth application-default login)")
print("4. Replace sample data with your actual dataset")
print("5. Run the pipeline!")

In [None]:
# ============================================================================
# JAPANESE RAG WITH VERTEX AI
# ============================================================================

import os
import json
import logging
import asyncio
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
import numpy as np
import pandas as pd
from datetime import datetime
import uuid

# Google Cloud imports
from google.cloud import aiplatform
from google.cloud.aiplatform import MatchingEngineIndex, MatchingEngineIndexEndpoint
from google.cloud.aiplatform.gapic import IndexDatapoint
import vertexai
from vertexai.language_models import TextEmbeddingModel, TextGenerationModel
from vertexai.generative_models import GenerativeModel

# Neo4j for GraphRAG
from neo4j import GraphDatabase
import neo4j

# Text processing for Japanese
import MeCab
import re
from janome.tokenizer import Tokenizer

# LangChain for RAG framework
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader, PDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.schema import Document
from langchain.retrievers import VectorStoreRetriever
from langchain.chains import RetrievalQA
from langchain.llms.base import LLM

# Setup logging
# NOTE(review): logging.basicConfig() does nothing when the root logger
# already has handlers. If an earlier cell in the same process already called
# basicConfig, the format string below may not take effect — confirm.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the classes defined below in this cell.
logger = logging.getLogger(__name__)

# ============================================================================
# 1. VERTEX AI CONFIGURATION
# ============================================================================

@dataclass
class VertexAIConfig:
    """Project, region and model settings shared by the RAG components.

    Creating an instance has side effects: ``__post_init__`` initialises both
    the ``vertexai`` and ``aiplatform`` SDKs for the given project/location.

    Attributes:
        project_id: Google Cloud project id.
        location: Vertex AI region.
        staging_bucket: ``gs://`` URI used for staging. When left empty it
            defaults to ``gs://<project_id>-rag-staging``.
        embedding_model: Name passed to ``TextEmbeddingModel.from_pretrained``.
        generative_model: Name passed to ``GenerativeModel``.
    """
    project_id: str
    location: str = "us-central1"
    staging_bucket: Optional[str] = None  # None means "derive from project_id"
    embedding_model: str = "textembedding-gecko"
    generative_model: str = "gemini-1.5-pro-001"

    def __post_init__(self):
        """Fill in the default staging bucket and initialise the SDKs."""
        if not self.staging_bucket:
            self.staging_bucket = f"gs://{self.project_id}-rag-staging"

        # Initialize Vertex AI
        vertexai.init(project=self.project_id, location=self.location)
        # Forward the resolved bucket so the SDK-wide staging location agrees
        # with the one this config advertises.
        aiplatform.init(
            project=self.project_id,
            location=self.location,
            staging_bucket=self.staging_bucket,
        )

# ============================================================================
# 2. JAPANESE TEXT PREPROCESSING
# ============================================================================

class JapaneseTextProcessor:
    """Clean, tokenize and extract noun entities from Japanese text.

    MeCab is the primary analyser. When a MeCab tagger cannot be constructed,
    tokenization falls back to Janome; entity extraction is MeCab-only and
    returns an empty list in fallback mode.
    """

    # Compiled once: clean_text is called for every document and query.
    _WHITESPACE_RE = re.compile(r'\s+')
    _DISALLOWED_RE = re.compile(
        r'[^\w\s\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FAF\u3000-\u303F。、！？]'
    )

    def __init__(self):
        """Create the MeCab wakati tagger, or a Janome tokenizer as fallback."""
        self.janome = None
        self._chasen = None  # ChaSen-format tagger, created on first use
        try:
            self.mecab = MeCab.Tagger('-Owakati')
        except Exception:
            # Typically a missing dictionary / native library; keep working
            # with the pure-Python tokenizer instead of failing construction.
            logger.warning("MeCab not available, using Janome fallback")
            self.janome = Tokenizer()
            self.mecab = None

    def clean_text(self, text: str) -> str:
        """Collapse whitespace runs and drop characters outside the allow-list.

        Kept: word characters, whitespace, hiragana, katakana, CJK ideographs,
        the CJK punctuation block, and 。、！？.

        Args:
            text: Raw input text.

        Returns:
            The cleaned text with leading/trailing whitespace removed.
        """
        collapsed = self._WHITESPACE_RE.sub(' ', text)
        return self._DISALLOWED_RE.sub('', collapsed).strip()

    def tokenize(self, text: str) -> List[str]:
        """Split text into surface tokens, keeping tokens longer than 1 char.

        Args:
            text: Raw input text; it is cleaned with ``clean_text`` first.

        Returns:
            Token surfaces. Single-character tokens are discarded.
        """
        cleaned_text = self.clean_text(text)

        if self.mecab:
            tokens = self.mecab.parse(cleaned_text).strip().split()
        else:
            tokens = [token.surface for token in self.janome.tokenize(cleaned_text)]

        return [token for token in tokens if len(token) > 1]

    def extract_entities(self, text: str) -> List[Dict[str, Any]]:
        """Return every noun token of ``text`` with its character offsets.

        Args:
            text: Text to analyse (not cleaned, so offsets index into it).

        Returns:
            One dict per noun with keys ``text``, ``type`` (the ChaSen
            part-of-speech string), ``start`` and ``end``. Empty when MeCab
            is unavailable.
        """
        if not self.mecab:
            return []

        # Build the ChaSen-format tagger once instead of on every call.
        if getattr(self, '_chasen', None) is None:
            self._chasen = MeCab.Tagger('-Ochasen')

        return self._entities_from_chasen(text, self._chasen.parse(text))

    @staticmethod
    def _entities_from_chasen(text: str, chasen_output: str) -> List[Dict[str, Any]]:
        """Turn ChaSen-format output into noun entities with correct offsets.

        Tokens are located left to right from a moving cursor, so a surface
        form that occurs several times gets the offset of *its* occurrence
        rather than always the first one.
        """
        entities = []
        if not chasen_output:
            return entities

        cursor = 0
        for line in chasen_output.split('\n'):
            if not line or line == 'EOS':
                continue

            parts = line.split('\t')
            surface = parts[0]

            # Advance the cursor for every token, noun or not, so later
            # offsets stay aligned with the text.
            start = text.find(surface, cursor)
            if start == -1:
                start = text.find(surface)  # out-of-order token: best effort
            else:
                cursor = start + len(surface)

            if len(parts) < 4:
                continue

            pos = parts[3]
            # '名詞' also matches proper nouns ('固有名詞').
            if '名詞' in pos:
                entities.append({
                    'text': surface,
                    'type': pos,
                    'start': start,
                    'end': start + len(surface) if start != -1 else -1,
                })

        return entities

# ============================================================================
# 3. VERTEX AI EMBEDDING SERVICE
# ============================================================================

class VertexAIEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by a Vertex AI embedding model.

    Failures are handled on a best-effort basis: a failed request is logged
    and replaced by zero vectors so callers always get one vector per input.
    NOTE(review): zero vectors carry no similarity signal; callers that index
    the result may want to check the log for embedding errors.
    """

    def __init__(self, config: VertexAIConfig, batch_size: int = 5,
                 embedding_dim: int = 768):
        """Load the embedding model named in ``config``.

        Args:
            config: Supplies ``embedding_model``.
            batch_size: Number of texts sent per request (default 5, the
                value previously hard-coded to stay under rate limits).
            embedding_dim: Length of the zero-vector fallback; should match
                the model's output dimension (default 768).

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        self.config = config
        self.batch_size = batch_size
        self.embedding_dim = embedding_dim
        self.model = TextEmbeddingModel.from_pretrained(config.embedding_model)

    def _zero_vector(self) -> List[float]:
        """Return a fresh fallback vector of the configured dimension."""
        return [0.0] * self.embedding_dim

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` in batches, preserving input order.

        Args:
            texts: Documents to embed.

        Returns:
            One vector per input text. Texts of a failed batch get zero
            vectors.
        """
        embeddings = []
        step = self.batch_size

        for i in range(0, len(texts), step):
            batch = texts[i:i + step]
            try:
                batch_embeddings = self.model.get_embeddings(batch)
                embeddings.extend([emb.values for emb in batch_embeddings])
            except Exception as e:
                logger.error(f"Error embedding batch {i//step}: {e}")
                # Fallback to zero embeddings
                embeddings.extend([self._zero_vector() for _ in batch])

        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string.

        Returns:
            The embedding values, or a zero vector if the request fails.
        """
        try:
            embedding = self.model.get_embeddings([text])[0]
            return embedding.values
        except Exception as e:
            logger.error(f"Error embedding query: {e}")
            return self._zero_vector()

# ============================================================================
# 4. VERTEX AI VECTOR SEARCH INTEGRATION
# ============================================================================

class VertexAIVectorSearch:
    """Wrapper around a Vertex AI Vector Search index and its endpoint.

    Typical order of use: ``create_index`` -> ``deploy_index`` ->
    ``add_documents`` -> ``search``. Errors from the Vertex AI SDK are logged
    and turned into ``None`` / empty results rather than raised.
    """

    # Used by both deploy_index and search; the two must always agree.
    DEPLOYED_INDEX_ID = "deployed_index_id"

    def __init__(self, config: VertexAIConfig):
        """Store the config and create the embedding client.

        Args:
            config: Supplies the staging bucket and embedding model.
        """
        self.config = config
        self.index = None
        self.endpoint = None
        self.embeddings = VertexAIEmbeddings(config)
        # datapoint id -> Document, so search hits can be mapped back to text.
        self.documents_by_id = {}

    def create_index(self, display_name: str, dimensions: int = 768):
        """Create a new Tree-AH Vector Search index.

        Args:
            display_name: Display name of the index.
            dimensions: Embedding dimensionality of the index.

        Returns:
            The created index, or ``None`` if creation failed.
        """
        try:
            self.index = MatchingEngineIndex.create_tree_ah_index(
                display_name=display_name,
                contents_delta_uri=f"{self.config.staging_bucket}/vector_index/",
                dimensions=dimensions,
                approximate_neighbors_count=150,
                leaf_node_embedding_count=500,
                leaf_nodes_to_search_percent=7,
                description="Japanese RAG Vector Index"
            )

            logger.info(f"Created index: {self.index.resource_name}")
            return self.index

        except Exception as e:
            logger.error(f"Error creating index: {e}")
            return None

    def deploy_index(self, endpoint_display_name: str):
        """Create a public endpoint and deploy the current index to it.

        Args:
            endpoint_display_name: Display name of the new endpoint.

        Returns:
            The endpoint, or ``None`` if there is no index or deployment
            failed.
        """
        # Check first: creating an endpoint for a missing index would leave
        # an unused cloud resource behind.
        if self.index is None:
            logger.error("Error deploying index: no index has been created")
            return None

        try:
            endpoint = MatchingEngineIndexEndpoint.create(
                display_name=endpoint_display_name,
                public_endpoint_enabled=True
            )

            endpoint.deploy_index(
                index=self.index,
                deployed_index_id=self.DEPLOYED_INDEX_ID
            )

            # Only publish the endpoint once deployment succeeded, so search
            # never targets an endpoint without a deployed index.
            self.endpoint = endpoint
            logger.info(f"Index deployed to endpoint: {self.endpoint.resource_name}")
            return self.endpoint

        except Exception as e:
            logger.error(f"Error deploying index: {e}")
            return None

    def add_documents(self, documents: List[Document]) -> List[str]:
        """Embed ``documents`` and upsert them into the index.

        Args:
            documents: Documents whose ``page_content`` is embedded.

        Returns:
            The generated datapoint ids, in input order. Empty when there is
            no index, no documents, or the upsert failed.
        """
        # No index means nothing can be stored, so skip the embedding calls.
        if not self.index or not documents:
            return []

        vectors = self.embeddings.embed_documents(
            [doc.page_content for doc in documents]
        )

        pending = {}
        datapoints = []
        for doc, vector in zip(documents, vectors):
            datapoint_id = str(uuid.uuid4())
            pending[datapoint_id] = doc
            # restricts / crowding_tag are left at their empty defaults.
            datapoints.append(
                IndexDatapoint(datapoint_id=datapoint_id, feature_vector=vector)
            )

        # NOTE(review): upsert_datapoints presumably requires an index created
        # for streaming updates — confirm against create_index's settings.
        try:
            self.index.upsert_datapoints(datapoints)
        except Exception as e:
            logger.error(f"Error adding documents: {e}")
            return []

        self.documents_by_id.update(pending)
        logger.info(f"Added {len(datapoints)} documents to index")
        return list(pending)

    def get_document(self, datapoint_id: str) -> Optional[Document]:
        """Return the document stored under ``datapoint_id``, if known.

        Only documents added through this instance can be resolved.
        """
        return self.documents_by_id.get(datapoint_id)

    def search(self, query: str, top_k: int = 10) -> List[Dict]:
        """Find the nearest neighbours of ``query`` in the deployed index.

        Args:
            query: Query text; embedded with the same model as the documents.
            top_k: Number of neighbours to request.

        Returns:
            Dicts with ``id`` and ``distance``; empty if no endpoint is
            deployed or the lookup failed. Use ``get_document`` to map an id
            back to its document.
        """
        if not self.endpoint:
            logger.error("Index endpoint not deployed")
            return []

        query_embedding = self.embeddings.embed_query(query)

        try:
            results = self.endpoint.find_neighbors(
                deployed_index_id=self.DEPLOYED_INDEX_ID,
                queries=[query_embedding],
                num_neighbors=top_k
            )

            # One result list per query; only one query was sent.
            return [{"id": neighbor.id, "distance": neighbor.distance}
                    for neighbor in results[0]]

        except Exception as e:
            logger.error(f"Error searching: {e}")
            return []

# ============================================================================
# 5. TRADITIONAL RAG IMPLEMENTATION
# ============================================================================

class JapaneseRAGSystem:
    """Retrieval-augmented generation over Japanese documents.

    Documents are loaded, cleaned, chunked, embedded into a FAISS store, and
    the retrieved chunks are passed as context to a Vertex AI generative
    model.
    """

    def __init__(self, config: VertexAIConfig):
        """Create the text processor, embedding client and LLM.

        Args:
            config: Supplies the embedding and generative model names.
        """
        self.config = config
        self.text_processor = JapaneseTextProcessor()
        self.embeddings = VertexAIEmbeddings(config)
        self.vectorstore = None
        self.retriever = None
        self.llm = self._initialize_llm()

    def _initialize_llm(self):
        """Return the generative model named in the config."""
        return GenerativeModel(self.config.generative_model)

    def load_documents(self, file_paths: List[str]) -> List[Document]:
        """Load files and clean their text.

        Paths ending in ``.pdf`` use the PDF loader, everything else is read
        as UTF-8 text. A file that fails to load is logged and skipped.

        Args:
            file_paths: Paths of the files to load.

        Returns:
            The loaded documents with cleaned ``page_content``; may be empty.
        """
        documents = []

        for file_path in file_paths:
            try:
                if file_path.endswith('.pdf'):
                    # NOTE(review): confirm `PDFLoader` is the loader name
                    # exported by the installed langchain version.
                    loader = PDFLoader(file_path)
                else:
                    loader = TextLoader(file_path, encoding='utf-8')

                for doc in loader.load():
                    doc.page_content = self.text_processor.clean_text(doc.page_content)
                    documents.append(doc)

            except Exception as e:
                logger.error(f"Error loading {file_path}: {e}")

        return documents

    def split_documents(self, documents: List[Document],
                        chunk_size: int = 1000,
                        chunk_overlap: int = 200) -> List[Document]:
        """Split documents into overlapping chunks.

        Separators are tried from coarse to fine: paragraph, line, Japanese
        full stop, Japanese comma, space, then single characters.

        Args:
            documents: Documents to split.
            chunk_size: Maximum chunk length in characters.
            chunk_overlap: Characters shared between neighbouring chunks.

        Returns:
            The chunked documents.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", "。", "、", " ", ""]
        )

        return splitter.split_documents(documents)

    def build_vectorstore(self, documents: List[Document]):
        """Embed ``documents`` into a FAISS store and create the retriever.

        Args:
            documents: Chunks to index.

        Raises:
            ValueError: If ``documents`` is empty (for example because every
                file failed to load), since there is nothing to index.
        """
        if not documents:
            raise ValueError("Cannot build a vector store from an empty document list.")

        logger.info(f"Building vector store with {len(documents)} documents...")

        self.vectorstore = FAISS.from_documents(
            documents=documents,
            embedding=self.embeddings
        )

        self.retriever = VectorStoreRetriever(
            vectorstore=self.vectorstore,
            search_kwargs={"k": 5}
        )

        logger.info("Vector store built successfully")

    def query(self, question: str, context_window: int = 5) -> Dict[str, Any]:
        """Answer ``question`` from the indexed documents.

        Args:
            question: The user's question.
            context_window: Maximum number of retrieved chunks placed in the
                prompt. The retriever returns at most 5, so larger values
                have no additional effect.

        Returns:
            Dict with ``question``, ``answer``, ``sources`` (the chunks that
            were actually put into the prompt) and ``context``. If generation
            fails, ``answer`` is an apology message and ``sources`` /
            ``context`` are empty.

        Raises:
            ValueError: If ``build_vectorstore`` has not been called.
        """
        if not self.retriever:
            raise ValueError("Vector store not built. Call build_vectorstore first.")

        relevant_docs = self.retriever.get_relevant_documents(question)

        # Use one slice for both the prompt and the reported sources so the
        # answer's provenance matches what the model actually saw.
        context_docs = relevant_docs[:context_window]
        context = "\n\n".join([doc.page_content for doc in context_docs])

        prompt = f"""
以下の文書を参考にして、質問に答えてください。

文書:
{context}

質問: {question}

回答:
"""

        try:
            response = self.llm.generate_content(prompt)

            return {
                "question": question,
                "answer": response.text,
                "sources": [{"content": doc.page_content, "metadata": doc.metadata}
                            for doc in context_docs],
                "context": context
            }

        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return {
                "question": question,
                "answer": "申し訳ございませんが、回答を生成できませんでした。",
                "sources": [],
                "context": ""
            }


In [None]:
### GraphRAG

import networkx as nx
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Any, Tuple

class GraphRAG:
    """Small in-memory graph RAG built on a networkx graph.

    Each node is a document (``text`` + ``embedding``); edges are typed
    relations between documents. A query ranks nodes by cosine similarity,
    expands the best ones to their graph neighbours and asks the LLM to
    answer from that context.
    """

    def __init__(self, embedding_model=None, llm=None):
        """Create an empty graph.

        Args:
            embedding_model: Object exposing ``get_embeddings([text])``
                (items with ``.values``) or ``encode([text])``. Optional if
                embeddings are always supplied by the caller.
            llm: Object exposing ``predict(prompt)`` or
                ``generate_content(prompt)``. Required only for ``query``.
        """
        self.G = nx.Graph()
        self.embedding_model = embedding_model
        self.llm = llm

    @staticmethod
    def _as_vector(values) -> np.ndarray:
        """Coerce a list/array embedding into a flat float ndarray."""
        return np.asarray(values, dtype=float).reshape(-1)

    def _embed(self, text: str) -> np.ndarray:
        """Embed ``text`` with whichever interface the model offers."""
        try:
            raw = self.embedding_model.get_embeddings([text])[0].values
        except AttributeError:
            # Handle different embedding model interfaces
            raw = self.embedding_model.encode([text])[0]
        # `.values` is a plain list; normalise so later math works.
        return self._as_vector(raw)

    def add_document(self, doc_id: int, text: str, embedding: np.ndarray = None):
        """Add a document node to the graph.

        Args:
            doc_id: Node identifier.
            text: Document text.
            embedding: Precomputed embedding. When omitted and an embedding
                model is configured, the embedding is generated; otherwise
                the node is stored without one and is ignored by similarity
                search.
        """
        if embedding is not None:
            embedding = self._as_vector(embedding)
        elif self.embedding_model:
            embedding = self._embed(text)

        self.G.add_node(doc_id, text=text, embedding=embedding)

    def add_relation(self, doc_id1: int, doc_id2: int, relation_type: str = "reference"):
        """Add a typed edge between two existing documents.

        Raises:
            ValueError: If either document id is not in the graph.
        """
        if doc_id1 not in self.G.nodes or doc_id2 not in self.G.nodes:
            raise ValueError(f"One or both document IDs ({doc_id1}, {doc_id2}) not found in graph")
        self.G.add_edge(doc_id1, doc_id2, type=relation_type)

    def find_similar_nodes(self, query_embedding: np.ndarray, top_k: int = 3) -> List[Tuple[int, float]]:
        """Return the ``top_k`` nodes most similar to ``query_embedding``.

        Args:
            query_embedding: Query vector (array or list of floats).
            top_k: Maximum number of results.

        Returns:
            ``(node_id, cosine_similarity)`` pairs, best first. Nodes without
            an embedding are skipped; a zero vector scores 0.0.
        """
        query_vec = self._as_vector(query_embedding)
        query_norm = np.linalg.norm(query_vec)  # hoisted: same for all nodes

        similarities = []
        for node_id in self.G.nodes():
            stored = self.G.nodes[node_id].get('embedding')
            if stored is None:
                continue

            node_vec = self._as_vector(stored)
            denom = query_norm * np.linalg.norm(node_vec)
            similarity = float(np.dot(query_vec, node_vec) / denom) if denom else 0.0
            similarities.append((node_id, similarity))

        # Sort by similarity (descending) and return top-k
        similarities.sort(key=lambda pair: pair[1], reverse=True)
        return similarities[:top_k]

    def get_subgraph_context(self, node_ids: List[int], max_hops: int = 1) -> str:
        """Collect the text of ``node_ids`` and their nearby neighbours.

        Args:
            node_ids: Seed nodes, most relevant first.
            max_hops: How many edges to follow from each seed.

        Returns:
            One ``文書<id>: <text>`` line per node. Seeds come first in the
            given order, followed by neighbours in discovery order, so the
            output is deterministic.
        """
        # dict keys act as an insertion-ordered set.
        ordered_nodes = dict.fromkeys(node_ids)

        for node_id in node_ids:
            if node_id in self.G.nodes:
                reachable = nx.single_source_shortest_path_length(
                    self.G, node_id, cutoff=max_hops
                )
                ordered_nodes.update(dict.fromkeys(reachable))

        context_texts = []
        for node_id in ordered_nodes:
            if node_id in self.G.nodes and 'text' in self.G.nodes[node_id]:
                text = self.G.nodes[node_id]['text']
                context_texts.append(f"文書{node_id}: {text}")

        return "\n".join(context_texts)

    def query(self, query: str, top_k: int = 3, max_hops: int = 1) -> str:
        """Answer ``query`` from the most similar documents and neighbours.

        Args:
            query: Question text.
            top_k: Number of seed documents chosen by similarity.
            max_hops: Neighbourhood radius used to expand the seeds.

        Returns:
            The LLM's answer, a "no documents found" message, or an error
            message if generation failed.

        Raises:
            ValueError: If the embedding model or the LLM is missing.
        """
        if not self.embedding_model or not self.llm:
            raise ValueError("Both embedding_model and llm must be provided")

        # 1. Get query embedding
        query_embedding = self._embed(query)

        # 2. Find similar nodes
        similar_nodes = self.find_similar_nodes(query_embedding, top_k)
        if not similar_nodes:
            return "関連する文書が見つかりませんでした。"

        # 3. Get subgraph context
        top_node_ids = [node_id for node_id, _ in similar_nodes]
        context = self.get_subgraph_context(top_node_ids, max_hops)

        # 4. Generate response using LLM
        prompt = f"""参考文書:
{context}

質問: {query}

上記の参考文書の内容に基づいて、質問に答えてください。

答え:"""

        try:
            # Prefer `predict`; fall back to `generate_content` for models
            # that only offer that method.
            if hasattr(self.llm, 'predict'):
                response = self.llm.predict(prompt)
            else:
                response = self.llm.generate_content(prompt)
            return response.text if hasattr(response, 'text') else str(response)
        except Exception as e:
            return f"LLMによる回答生成中にエラーが発生しました: {str(e)}"


In [None]:
# Usage example
def example_usage():
    """Build a three-document demo graph and print its structure.

    No embedding model or LLM is attached, so random vectors stand in for
    real embeddings and no query is actually executed.
    """
    # Initialize GraphRAG (pass a real embedding_model and llm in practice)
    graph_rag = GraphRAG()

    sample_texts = (
        "これは最初の文書です。人工知能について説明しています。",
        "これは二番目の文書です。機械学習について詳しく述べています。",
        "これは三番目の文書です。深層学習の応用について書かれています。",
    )

    # Placeholder embeddings; 384 is just an example dimension.
    doc_embeddings = [np.random.rand(384) for _ in sample_texts]

    # Add documents with ids 0, 1, 2
    for doc_id, (body, vector) in enumerate(zip(sample_texts, doc_embeddings)):
        graph_rag.add_document(doc_id, body, vector)

    # Add relations
    for source, target, kind in ((0, 1, "reference"), (1, 2, "related")):
        graph_rag.add_relation(source, target, kind)

    # Example query text; it is not executed because no models are attached.
    query = "機械学習について教えてください"

    # For demonstration without actual models:
    print("GraphRAG setup completed!")
    node_count = graph_rag.G.number_of_nodes()
    edge_count = graph_rag.G.number_of_edges()
    print(f"Graph has {node_count} nodes and {edge_count} edges")

    # Show graph structure
    print("\nGraph structure:")
    for node_id, attrs in graph_rag.G.nodes(data=True):
        print(f"Node {node_id}: {attrs['text'][:50]}...")

    for left, right, attrs in graph_rag.G.edges(data=True):
        print(f"Edge {left}-{right}: {attrs['type']}")

# Run the demo when this cell/module is executed directly.
if __name__ == "__main__":
    example_usage()

In [None]:
# ============================================================================
# 6. GRAPHRAG IMPLEMENTATION
# ============================================================================

class JapaneseGraphRAG:
    """GraphRAG system for Japanese documents backed by Neo4j.

    Documents and the noun entities found in them are stored as nodes, with
    ``CONTAINS`` edges from documents to entities and LLM-extracted
    ``RELATED`` edges between entities. Queries combine vector similarity on
    documents with similarity on entities.

    The instance owns a Neo4j driver; call ``close()`` (or use the instance
    as a context manager) to release it deterministically.
    """

    def __init__(self, config: VertexAIConfig, neo4j_uri: str, neo4j_user: str, neo4j_password: str):
        """Create the processing components and open the Neo4j driver.

        Args:
            config: Supplies the embedding and generative model names.
            neo4j_uri: Connection URI of the Neo4j server.
            neo4j_user: Neo4j user name.
            neo4j_password: Neo4j password.
        """
        self.config = config
        self.text_processor = JapaneseTextProcessor()
        self.embeddings = VertexAIEmbeddings(config)
        self.llm = GenerativeModel(config.generative_model)

        # Neo4j connection
        self.driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))

    def close(self):
        """Close the Neo4j driver; safe to call more than once."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            self.driver = None
            driver.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __del__(self):
        # Last-resort cleanup. Errors are suppressed on purpose: finalizers
        # can run during interpreter shutdown when the driver is half torn
        # down, and raising here would only produce noise.
        try:
            self.close()
        except Exception:
            pass

    def create_graph_from_documents(self, documents: List[Document]):
        """Rebuild the knowledge graph from ``documents``.

        WARNING: this first deletes EVERY node and relationship in the
        connected database (``MATCH (n) DETACH DELETE n``).

        Args:
            documents: Documents (typically chunks) to store. Document ids
                are ``doc_<index>``.
        """
        with self.driver.session() as session:
            # Clear existing graph
            session.run("MATCH (n) DETACH DELETE n")

            for i, doc in enumerate(documents):
                logger.info(f"Processing document {i+1}/{len(documents)}")
                doc_id = f"doc_{i}"

                # Extract entities
                entities = self.text_processor.extract_entities(doc.page_content)

                # Create document node
                doc_embedding = self.embeddings.embed_query(doc.page_content)
                session.run("""
                    CREATE (d:Document {
                        id: $doc_id,
                        content: $content,
                        embedding: $embedding,
                        metadata: $metadata
                    })
                """, doc_id=doc_id, content=doc.page_content,
                    embedding=doc_embedding, metadata=json.dumps(doc.metadata))

                # The extractor yields one entry per occurrence. Collapse to
                # one entry per surface form (last type wins, as repeated
                # MERGE+SET did) so each entity is embedded and linked once.
                unique_entities = {entity['text']: entity for entity in entities}

                for entity in unique_entities.values():
                    entity_embedding = self.embeddings.embed_query(entity['text'])

                    # Create entity node
                    session.run("""
                        MERGE (e:Entity {text: $text})
                        SET e.type = $type,
                            e.embedding = $embedding
                    """, text=entity['text'], type=entity['type'], embedding=entity_embedding)

                    # MERGE keeps a single CONTAINS edge per document/entity.
                    session.run("""
                        MATCH (d:Document {id: $doc_id})
                        MATCH (e:Entity {text: $entity_text})
                        MERGE (d)-[:CONTAINS]->(e)
                    """, doc_id=doc_id, entity_text=entity['text'])

                # Extract relationships between entities using LLM
                relationships = self._extract_relationships(doc.page_content, entities)

                for rel in relationships:
                    # Relationships naming unknown entities match nothing
                    # and are silently skipped by the MATCH clauses.
                    session.run("""
                        MATCH (e1:Entity {text: $entity1})
                        MATCH (e2:Entity {text: $entity2})
                        CREATE (e1)-[:RELATED {type: $rel_type, confidence: $confidence}]->(e2)
                    """, entity1=rel['entity1'], entity2=rel['entity2'],
                        rel_type=rel['type'], confidence=rel['confidence'])

    @staticmethod
    def _parse_relationships(response_text: str) -> List[Dict[str, Any]]:
        """Parse ``entity1|type|entity2|confidence`` lines from LLM output.

        Fields are stripped of surrounding whitespace. Lines with fewer than
        four fields or an empty entity/type are ignored. A confidence that is
        not a number in ``[0, 1]`` is replaced by 0.5, so one malformed line
        never discards the other relationships.
        """
        relationships = []

        for line in response_text.split('\n'):
            if '|' not in line:
                continue

            # Tolerate table-style output such as "| A | rel | B | 0.9 |".
            parts = [part.strip() for part in line.strip().strip('|').split('|')]
            if len(parts) < 4 or not (parts[0] and parts[1] and parts[2]):
                continue

            try:
                confidence = float(parts[3])
            except ValueError:
                confidence = 0.5
            # Also rejects nan/inf, for which the comparison is False.
            if not 0.0 <= confidence <= 1.0:
                confidence = 0.5

            relationships.append({
                'entity1': parts[0],
                'type': parts[1],
                'entity2': parts[2],
                'confidence': confidence
            })

        return relationships

    def _extract_relationships(self, text: str, entities: List[Dict]) -> List[Dict]:
        """Ask the LLM for relationships between the entities of ``text``.

        Args:
            text: Source text.
            entities: Entity dicts with at least a ``text`` key; duplicates
                are collapsed and only the first 10 distinct ones are sent.

        Returns:
            Relationship dicts with ``entity1``, ``type``, ``entity2`` and
            ``confidence``. Empty if fewer than two distinct entities exist
            or the LLM call fails.
        """
        distinct = list(dict.fromkeys(e['text'] for e in entities))
        if len(distinct) < 2:
            return []

        entity_list = distinct[:10]  # Limit to avoid token limits

        prompt = f"""
以下のテキストから、エンティティ間の関係を抽出してください。

テキスト: {text}

エンティティ: {', '.join(entity_list)}

関係を以下の形式で出力してください：
エンティティ1|関係の種類|エンティティ2|信頼度(0-1)

例：
東京|位置|日本|0.9
"""

        try:
            response = self.llm.generate_content(prompt)
            return self._parse_relationships(response.text)

        except Exception as e:
            logger.error(f"Error extracting relationships: {e}")
            return []

    def hybrid_search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Search documents by direct similarity and via similar entities.

        Both Cypher queries call ``gds.similarity.cosine``.
        NOTE(review): presumably this needs the Graph Data Science plugin on
        the server — confirm.

        Args:
            query: Query text.
            top_k: Limit applied to each sub-search and to the final result.

        Returns:
            At most ``top_k`` result dicts, one per document id (the hit with
            the highest similarity wins), best first. ``source`` is
            ``'vector'`` or ``'graph'``; graph hits carry ``entity`` and
            ``entity_type`` instead of ``metadata``.
        """
        query_embedding = self.embeddings.embed_query(query)

        with self.driver.session() as session:
            # Vector similarity search on documents
            vector_results = session.run("""
                MATCH (d:Document)
                WITH d, gds.similarity.cosine(d.embedding, $query_embedding) AS similarity
                ORDER BY similarity DESC
                LIMIT $top_k
                RETURN d.id as doc_id, d.content as content, similarity, d.metadata as metadata
            """, query_embedding=query_embedding, top_k=top_k)

            # Graph-based search for entities
            entity_results = session.run("""
                MATCH (e:Entity)
                WITH e, gds.similarity.cosine(e.embedding, $query_embedding) AS similarity
                ORDER BY similarity DESC
                LIMIT $top_k
                MATCH (d:Document)-[:CONTAINS]->(e)
                RETURN DISTINCT d.id as doc_id, d.content as content, 
                       e.text as entity, e.type as entity_type, similarity
            """, query_embedding=query_embedding, top_k=top_k)

            # Combine results
            results = []

            for record in vector_results:
                results.append({
                    'doc_id': record['doc_id'],
                    'content': record['content'],
                    'similarity': record['similarity'],
                    'metadata': json.loads(record['metadata']),
                    'source': 'vector'
                })

            for record in entity_results:
                results.append({
                    'doc_id': record['doc_id'],
                    'content': record['content'],
                    'entity': record['entity'],
                    'entity_type': record['entity_type'],
                    'similarity': record['similarity'],
                    'source': 'graph'
                })

            # Keep the best-scoring hit per document.
            unique_results = {}
            for result in results:
                doc_id = result['doc_id']
                best = unique_results.get(doc_id)
                if best is None or result['similarity'] > best['similarity']:
                    unique_results[doc_id] = result

            return sorted(unique_results.values(), key=lambda x: x['similarity'], reverse=True)[:top_k]

    def query(self, question: str, top_k: int = 5) -> Dict[str, Any]:
        """Answer ``question`` using hybrid search results as context.

        Args:
            question: The user's question.
            top_k: Number of search results to use.

        Returns:
            Dict with ``question``, ``answer``, ``sources`` (the search
            results) and ``context``. If generation fails, ``answer`` is an
            apology message and ``sources`` / ``context`` are empty.
        """
        # Get hybrid search results
        search_results = self.hybrid_search(question, top_k)

        # Prepare context from search results (content truncated to 500 chars)
        context_parts = []
        for result in search_results:
            context_parts.append(f"文書ID: {result['doc_id']}")
            context_parts.append(f"内容: {result['content'][:500]}...")
            if 'entity' in result:
                context_parts.append(f"関連エンティティ: {result['entity']} ({result['entity_type']})")
            context_parts.append("---")

        context = "\n".join(context_parts)

        # Create prompt for GraphRAG
        prompt = f"""
以下の文書とグラフ情報を参考にして、質問に答えてください。

参考情報:
{context}

質問: {question}

回答:
"""

        try:
            response = self.llm.generate_content(prompt)

            return {
                "question": question,
                "answer": response.text,
                "sources": search_results,
                "context": context
            }

        except Exception as e:
            logger.error(f"Error generating GraphRAG response: {e}")
            return {
                "question": question,
                "answer": "申し訳ございませんが、回答を生成できませんでした。",
                "sources": [],
                "context": ""
            }


In [None]:
# ============================================================================
# 7. MAIN PIPELINE CLASS
# ============================================================================

class JapaneseRAGPipeline:
    """Front door for the Japanese RAG system and the optional GraphRAG system.

    A ``JapaneseRAGSystem`` is always created. A ``JapaneseGraphRAG`` is only
    created when a non-empty Neo4j configuration is supplied; otherwise
    ``graph_rag`` is ``None`` and the GraphRAG methods raise ``ValueError``.
    """

    def __init__(self, config: VertexAIConfig, neo4j_config: Dict = None):
        """Create the RAG system and, if configured, the GraphRAG system.

        Args:
            config: Configuration object handed to both subsystems.
            neo4j_config: Optional mapping with the keys ``'uri'``, ``'user'``
                and ``'password'``. A falsy value disables GraphRAG.
        """
        self.config = config
        self.rag_system = JapaneseRAGSystem(config)

        if not neo4j_config:
            self.graph_rag = None
            return

        uri = neo4j_config['uri']
        user = neo4j_config['user']
        password = neo4j_config['password']
        self.graph_rag = JapaneseGraphRAG(config, uri, user, password)

    def _load_and_split(self, document_paths):
        """Load the documents at ``document_paths`` and return their chunks."""
        loaded = self.rag_system.load_documents(document_paths)
        return self.rag_system.split_documents(loaded)

    def setup_rag(self, document_paths: List[str]):
        """Load, split and index the documents into the RAG vector store."""
        logger.info("Setting up RAG system...")
        chunks = self._load_and_split(document_paths)
        self.rag_system.build_vectorstore(chunks)
        logger.info("RAG system setup complete")

    def setup_graph_rag(self, document_paths: List[str]):
        """Load and split the documents, then build the knowledge graph.

        Raises:
            ValueError: If the pipeline was created without Neo4j settings.
        """
        if not self.graph_rag:
            raise ValueError("Neo4j configuration not provided")

        logger.info("Setting up GraphRAG system...")
        chunks = self._load_and_split(document_paths)
        self.graph_rag.create_graph_from_documents(chunks)
        logger.info("GraphRAG system setup complete")

    def query_rag(self, question: str) -> Dict[str, Any]:
        """Return the RAG system's response to ``question``."""
        return self.rag_system.query(question)

    def query_graph_rag(self, question: str) -> Dict[str, Any]:
        """Return the GraphRAG system's response to ``question``.

        Raises:
            ValueError: If GraphRAG is not available on this pipeline.
        """
        if self.graph_rag:
            return self.graph_rag.query(question)
        raise ValueError("GraphRAG not initialized")

    def compare_systems(self, question: str) -> Dict[str, Any]:
        """Run ``question`` through both systems and bundle the responses.

        The GraphRAG entry is ``None`` when GraphRAG is not available. The
        timestamp is taken after both queries have finished.
        """
        rag_answer = self.query_rag(question)
        if self.graph_rag:
            graph_answer = self.query_graph_rag(question)
        else:
            graph_answer = None

        return {
            "question": question,
            "rag_response": rag_answer,
            "graph_rag_response": graph_answer,
            "timestamp": datetime.now().isoformat(),
        }

# ============================================================================
# 8. EXAMPLE USAGE
# ============================================================================

def example_usage():
    """Walk through an end-to-end run of the RAG and GraphRAG pipelines.

    Builds a pipeline from placeholder project and Neo4j settings, sets up
    both systems from two sample document paths, then prints the answer of
    each system for three sample Japanese questions.
    """
    vertex_config = VertexAIConfig(
        project_id="your-project-id",
        location="us-central1",
        embedding_model="textembedding-gecko",
        generative_model="gemini-1.5-pro-001",
    )

    # Connection settings for Neo4j; the URI may also point at Neo4j Aura.
    graph_db_settings = {
        "uri": "bolt://localhost:7687",
        "user": "neo4j",
        "password": "your-password",
    }

    pipeline = JapaneseRAGPipeline(vertex_config, graph_db_settings)

    # Replace with your own Japanese documents (.txt and .pdf paths).
    sample_paths = ["japanese_document1.txt", "japanese_document2.pdf"]

    pipeline.setup_rag(sample_paths)
    pipeline.setup_graph_rag(sample_paths)

    sample_questions = [
        "この文書の主要なテーマは何ですか？",
        "登場人物の関係性について教えてください",
        "重要な出来事はいつ起こりましたか？",
    ]
    divider = "-" * 80

    for q in sample_questions:
        print(f"\n質問: {q}")

        # Plain vector-store RAG answer
        plain = pipeline.query_rag(q)
        print(f"RAG回答: {plain['answer']}")

        # Graph-backed answer, only when GraphRAG was configured
        if pipeline.graph_rag:
            graph = pipeline.query_graph_rag(q)
            print(f"GraphRAG回答: {graph['answer']}")

        print(divider)

# Uncomment to run example
# example_usage()

# ============================================================================
# 9. DEPLOYMENT UTILITIES
# ============================================================================

class RAGDeployment:
    """Placeholder namespace for production deployment helpers.

    Every method here is a no-op that returns ``None``. Note that a second
    ``RAGDeployment`` class defined later in this file rebinds the name.
    """

    @staticmethod
    def create_vertex_ai_endpoint(config: VertexAIConfig, model_name: str):
        """Create a Vertex AI endpoint for the RAG service (not implemented)."""
        # A real implementation would containerize the RAG service and
        # deploy that container to Vertex AI Endpoints.
        return None

    @staticmethod
    def setup_monitoring(config: VertexAIConfig):
        """Set up monitoring for the RAG system (not implemented)."""
        # Intended home for the Cloud Monitoring configuration.
        return None

    @staticmethod
    def create_api_gateway(config: VertexAIConfig):
        """Create an API Gateway for the RAG service (not implemented)."""
        # Intended home for the API Gateway that exposes the RAG endpoints.
        return None

# Example production setup
def production_setup():
    """Build and return a RAG-only pipeline from placeholder production settings.

    No Neo4j configuration is passed, so the returned pipeline has no
    GraphRAG system. Note that a later ``production_setup`` definition in
    this file rebinds the name.
    """
    # asia-northeast1 is the Tokyo region, chosen for Japanese content.
    # Further production-specific configuration would be added before returning.
    return JapaneseRAGPipeline(
        VertexAIConfig(
            project_id="your-production-project",
            location="asia-northeast1",
            staging_bucket="gs://your-production-bucket",
        )
    )

In [None]:
# ============================================================================
# 9. DEPLOYMENT UTILITIES (COMPLETE)
# ============================================================================

from google.cloud import aiplatform
from google.cloud import monitoring_v3
from google.cloud import apigateway_v1

class RAGDeployment:
    """Utilities for deploying RAG systems to production on Vertex AI."""

    @staticmethod
    def create_vertex_ai_endpoint(
        config: VertexAIConfig,
        model_display_name: str,
        serving_image_uri: str,
        machine_type: str = "n1-standard-4",
    ):
        """Upload a serving container as a model and deploy it to a new endpoint.

        Args:
            config: Supplies ``project_id``, ``location`` and
                ``staging_bucket`` for ``aiplatform.init``.
            model_display_name: Display name of the uploaded model; also the
                prefix of the endpoint and deployed-model display names.
            serving_image_uri: Path to the container image in GCR / Artifact
                Registry.
            machine_type: Machine type for the deployment. Defaults to the
                previously hard-coded ``"n1-standard-4"``.
                NOTE(review): GPU machine types such as ``"a2-highgpu-1g"``
                probably also need accelerator arguments on ``deploy``, which
                this helper does not expose - confirm before using one.

        Returns:
            The ``aiplatform.Endpoint`` the model was deployed to.
        """
        aiplatform.init(
            project=config.project_id,
            location=config.location,
            staging_bucket=config.staging_bucket,
        )

        # Model upload (container only, no separate artifacts)
        model = aiplatform.Model.upload(
            display_name=model_display_name,
            serving_container_image_uri=serving_image_uri,
            artifact_uri=None,
            sync=True,
        )
        logger.info("Model uploaded: %s", model.resource_name)

        # Endpoint creation
        endpoint = aiplatform.Endpoint.create(
            display_name=f"{model_display_name}-endpoint",
            sync=True,
        )
        logger.info("Endpoint created: %s", endpoint.resource_name)

        # Deploy model to endpoint, routing all traffic to this deployment.
        # NOTE(review): the "0" key is assumed to denote the model being
        # deployed by this call - confirm against the SDK documentation.
        endpoint.deploy(
            model=model,
            deployed_model_display_name=f"{model_display_name}-deployed",
            machine_type=machine_type,
            traffic_split={"0": 100},
        )
        logger.info("Model deployed to endpoint: %s", endpoint.resource_name)
        return endpoint

    @staticmethod
    def setup_monitoring(config: VertexAIConfig):
        """Create a Cloud Monitoring metric client for the project.

        This is a placeholder: no custom metrics or alerting policies are
        created. See https://cloud.google.com/monitoring/docs

        Returns:
            The ``monitoring_v3.MetricServiceClient``. It used to be created
            and discarded; returning it lets callers actually use it.
        """
        client = monitoring_v3.MetricServiceClient()
        project_name = f"projects/{config.project_id}"
        logger.info("Monitoring client set up for project %s", project_name)
        return client

    @staticmethod
    def create_api_gateway(config: VertexAIConfig, api_name: str):
        """Start creation of an API Gateway API resource (stub).

        Only the API resource is requested, under the project's ``global``
        location. Routing REST traffic to a Vertex AI endpoint still needs an
        OpenAPI spec, set up via Cloud Console or IaC tools.

        Args:
            config: Supplies ``project_id``.
            api_name: Used as both the API ID and its display name.

        Returns:
            Whatever ``client.create_api`` returns, so that callers can wait
            for the creation to finish. It used to be discarded.
        """
        client = apigateway_v1.ApiGatewayServiceClient()
        parent = f"projects/{config.project_id}/locations/global"
        api = apigateway_v1.Api(display_name=api_name)
        operation = client.create_api(parent=parent, api_id=api_name, api=api)
        logger.info("API Gateway creation initiated: %s", api_name)
        return operation

# ============================================================================
# 10. EXAMPLE FASTAPI ENDPOINT FOR ONLINE INFERENCE
# ============================================================================

from fastapi import FastAPI, Request
from pydantic import BaseModel
import uvicorn

# Application object; the startup hook and the route decorators below
# register their handlers on it.
app = FastAPI()

# Request body shared by the query endpoints below.
class QueryRequest(BaseModel):
    # Question text forwarded to the pipeline's query methods.
    question: str

# Module-level pipeline shared by all request handlers. It stays None until
# the startup hook below assigns it.
pipeline = None  # Will be assigned in startup event

@app.on_event("startup")
def startup_event():
    """Create the shared pipeline once, when the application starts.

    The settings are placeholders to adjust for production. No Neo4j
    configuration is passed, so the pipeline has no GraphRAG system.
    """
    global pipeline
    pipeline = JapaneseRAGPipeline(
        VertexAIConfig(
            project_id="your-production-project",
            location="asia-northeast1",
            staging_bucket="gs://your-production-bucket",
        )
    )

@app.post("/rag")
async def rag_endpoint(query: QueryRequest):
    """REST API endpoint for traditional RAG"""
    # The docstring above is left unchanged because FastAPI publishes it as
    # the operation description.
    import asyncio  # function-scope import keeps this block self-contained

    # pipeline.query_rag is a synchronous call. Running it directly inside
    # this coroutine would stall the event loop, and with it every other
    # request, until the answer is ready, so it runs on the default executor.
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(None, pipeline.query_rag, query.question)
    return result

@app.post("/graph_rag")
async def graph_rag_endpoint(query: QueryRequest):
    """REST API endpoint for GraphRAG"""
    # The docstring above is left unchanged because FastAPI publishes it as
    # the operation description.
    import asyncio  # function-scope imports keep this block self-contained
    from fastapi import HTTPException

    # The startup hook in this file builds the pipeline without Neo4j
    # settings, in which case query_graph_rag raises ValueError and the
    # client would only see an opaque 500. Report the condition explicitly.
    if pipeline is None or not pipeline.graph_rag:
        raise HTTPException(status_code=503, detail="GraphRAG not initialized")

    # query_graph_rag is synchronous; run it on the default executor so the
    # event loop stays free to serve other requests in the meantime.
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(
        None, pipeline.query_graph_rag, query.question
    )
    return result

# Example: run the API locally for testing
# if __name__ == "__main__":
#     uvicorn.run(app, host="0.0.0.0", port=8000)

# ============================================================================
# 11. PRODUCTION SETUP & CHECKLIST (TIPS)
# ============================================================================

def production_setup():
    """Build and return a RAG + GraphRAG pipeline from placeholder settings.

    All values below are placeholders. In a real deployment the Neo4j
    settings would be loaded from the environment or a secret manager
    instead of being written in source.
    """
    # asia-northeast1 is the Tokyo region, chosen for Japanese content.
    vertex_config = VertexAIConfig(
        project_id="your-production-project",
        location="asia-northeast1",
        staging_bucket="gs://your-production-bucket",
    )
    graph_db_settings = dict(
        uri="bolt://<YOUR_NEO4J_HOST>:7687",
        user="neo4j",
        password="<your-password>",
    )
    return JapaneseRAGPipeline(vertex_config, graph_db_settings)

# ============================================================================
# 12. INFERENCE FROM JUPYTER OR OTHER CLIENTS
# ============================================================================

import requests

def query_api(
    question: str,
    endpoint_url: str = "http://localhost:8000/rag",
    timeout: float = 60.0,
):
    """Call the REST API of the RAG pipeline and return the decoded JSON body.

    Args:
        question: Question text, sent as ``{"question": question}``.
        endpoint_url: URL of the endpoint to POST to.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled server
            would hang the caller forever.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    resp = requests.post(endpoint_url, json={"question": question}, timeout=timeout)
    return resp.json()

# ============================================================================
# 13. NOTES FOR PRODUCTIONIZATION
# ============================================================================
"""
- For **Vertex AI Endpoints**, you’ll usually build a Docker image serving your FastAPI app (or Flask, etc.) and deploy it as a custom model endpoint.
- For **RAG pipelines**, you might precompute and store vector indices, then load them at startup for best performance.
- Use **Vertex AI Vector Search** or **FAISS** (in-memory, for small scale) as the vector backend.
- Use **Cloud Scheduler**, **Pub/Sub**, or **Workflows** for retraining, batch jobs, or updating vector/graph indices.
- For **API Gateway**, use OpenAPI spec to proxy public requests to Vertex AI endpoints.
- Consider **GCP Secret Manager** for sensitive credentials (Neo4j, API keys).
- Use **Google Cloud Monitoring** for log/metric aggregation, alerting.
- For scale: use "a2-highgpu-1g" machines for LLM endpoints, "n2-standard-16" for vector/embedding-heavy workloads.
"""

# ============================================================================
# END OF PIPELINE
# ============================================================================

