# VertexRec Pipeline Demo

This notebook demonstrates the VertexRec ML pipeline execution, including data validation, feature engineering, model training, and evaluation.

## Table of Contents
1. [Pipeline Setup](#pipeline-setup)
2. [Data Validation](#data-validation)
3. [Feature Engineering](#feature-engineering)
4. [Model Training](#model-training)
5. [Model Evaluation](#model-evaluation)
6. [Pipeline Results](#pipeline-results)


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
import warnings
# Ignore every warning category from here on so the demo output stays readable.
# NOTE(review): this also hides warnings raised by the pipeline modules imported
# in later cells — consider narrowing the filter when debugging.
warnings.filterwarnings('ignore')

# Add pipeline modules to path
# Both entries are relative paths, so the project imports in later cells
# presumably rely on the notebook being run from its own directory — confirm.
sys.path.append('../pipelines')
sys.path.append('../scripts')

print("Libraries imported successfully!")


## 1. Pipeline Setup <a id="pipeline-setup"></a>


In [None]:
# Pipeline configuration: cloud settings first, then local input/output paths.
PROJECT_ID = "your-gcp-project-id"  # Replace with your project ID
REGION = "us-central1"
BUCKET_NAME = f"vertexrec-data-{PROJECT_ID}"

# Input CSVs all live side by side under the data directory.
DATA_DIR = Path("../data")
USERS_DATA, ITEMS_DATA, INTERACTIONS_DATA = (
    DATA_DIR / csv_name
    for csv_name in ("users.csv", "items.csv", "interactions.csv")
)

# Every pipeline stage writes below this directory; create it if it is missing.
OUTPUT_DIR = Path("../output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Echo the effective configuration, one setting per line.
print(
    "Pipeline configuration set up successfully!",
    f"Project ID: {PROJECT_ID}",
    f"Region: {REGION}",
    f"Bucket: {BUCKET_NAME}",
    f"Data directory: {DATA_DIR}",
    f"Output directory: {OUTPUT_DIR}",
    sep="\n",
)


## 2. Data Validation <a id="data-validation"></a>


In [None]:
# Run data validation
from data_validation.validate_data import DataValidator

# One validator instance is reused for all three datasets; it is constructed
# with the "validation" folder under OUTPUT_DIR as its only argument.
validator = DataValidator(str(OUTPUT_DIR / "validation"))

# Validate each dataset. Every call is unpacked into a (valid flag, anomalies) pair.
print("Validating users data...")
users_valid, users_anomalies = validator.validate_users_data(str(USERS_DATA))

print("Validating items data...")
items_valid, items_anomalies = validator.validate_items_data(str(ITEMS_DATA))

print("Validating interactions data...")
interactions_valid, interactions_anomalies = validator.validate_interactions_data(str(INTERACTIONS_DATA))

# Print validation results.
# The status marks are real Unicode check/cross characters; the earlier literals
# were mojibake (the UTF-8 bytes of these symbols decoded with the wrong codec).
print("\n=== VALIDATION RESULTS ===")
print(f"Users data: {'✓ Valid' if users_valid else '✗ Invalid'}")
print(f"Items data: {'✓ Valid' if items_valid else '✗ Invalid'}")
print(f"Interactions data: {'✓ Valid' if interactions_valid else '✗ Invalid'}")

# The overall verdict only passes when every individual dataset passed.
if not all([users_valid, items_valid, interactions_valid]):
    print("\nValidation failed! Check anomaly files for details.")
else:
    print("\nAll data validation passed successfully!")


## 3. Feature Engineering <a id="feature-engineering"></a>


In [None]:
# Feature engineering stage: load the raw tables, derive features, persist them.
from feature_engineering.feature_engineering import FeatureEngineer

# The engineer is constructed with the "features" folder under OUTPUT_DIR.
feature_engineer = FeatureEngineer(str(OUTPUT_DIR / "features"))

# Load the three raw tables (paths passed as strings: users, items, interactions).
print("Loading data for feature engineering...")
users_df, items_df, interactions_df = feature_engineer.load_data(
    *map(str, (USERS_DATA, ITEMS_DATA, INTERACTIONS_DATA))
)

# Derive one feature table per entity type.
print("Engineering user features...")
user_features = feature_engineer.engineer_user_features(
    users_df, interactions_df
)

print("Engineering item features...")
item_features = feature_engineer.engineer_item_features(
    items_df, interactions_df
)

print("Engineering interaction features...")
interaction_features = feature_engineer.engineer_interaction_features(
    interactions_df,
    users_df,
    items_df,
)

# Persist the feature tables, then build the summary from the same three tables.
print("Saving engineered features...")
engineered = (user_features, item_features, interaction_features)
feature_engineer.save_features(*engineered)
summary = feature_engineer.create_feature_summary(*engineered)

# Report how many features each table contributes.
print("\n=== FEATURE ENGINEERING RESULTS ===")
print(f"User features: {summary['user_features']['count']} features")
print(f"Item features: {summary['item_features']['count']} features")
print(f"Interaction features: {summary['interaction_features']['count']} features")

# `display` is the notebook-provided rich renderer (not imported explicitly).
print("\nSample user features:")
display(user_features.head())


## 4. Model Training <a id="model-training"></a>


In [None]:
# TF Recommenders training stage (demo-sized settings).
from training.tf_recommenders_trainer import TFRecommendersTrainer

print("Training TF Recommenders model...")

# Model artifacts go under <output>/models/tf_recommenders.
tf_trainer = TFRecommendersTrainer(
    output_dir=str(OUTPUT_DIR.joinpath("models", "tf_recommenders")),
    learning_rate=0.01,
    embedding_dim=32,  # Smaller for demo
)

# Train straight from the raw CSV paths.
tf_model = tf_trainer.train(
    epochs=5,  # Fewer epochs for demo
    validation_split=0.2,
    users_path=str(USERS_DATA),
    items_path=str(ITEMS_DATA),
    interactions_path=str(INTERACTIONS_DATA),
)

print("TF Recommenders model training completed!")


In [None]:
# XGBoost ranking training stage (demo-sized settings).
from training.xgboost_trainer import XGBoostRankingTrainer

print("Training XGBoost model...")

# Model artifacts go under <output>/models/xgboost.
xgb_trainer = XGBoostRankingTrainer(
    output_dir=str(OUTPUT_DIR.joinpath("models", "xgboost")),
)

# Deliberately small hyper-parameters so the demo finishes quickly.
model_params = dict(
    n_estimators=50,  # Fewer trees for demo
    max_depth=4,
    learning_rate=0.1,
)

# Train straight from the raw CSV paths with the demo parameters.
xgb_model = xgb_trainer.train(
    model_params=model_params,
    validation_split=0.2,
    users_path=str(USERS_DATA),
    items_path=str(ITEMS_DATA),
    interactions_path=str(INTERACTIONS_DATA),
)

print("XGBoost model training completed!")


## 5. Model Evaluation <a id="model-evaluation"></a>


In [None]:
# Evaluation stage: score placeholder recommendations against held-out data.
from evaluation.evaluation_metrics import RecommendationEvaluator

print("Evaluating models...")
evaluator = RecommendationEvaluator(str(OUTPUT_DIR / "evaluation"))

# Note the argument/return order here: interactions, users, items.
interactions_df, users_df, items_df = evaluator.load_data(
    *map(str, (INTERACTIONS_DATA, USERS_DATA, ITEMS_DATA))
)

# Let the evaluator decide the train/test split.
train_df, test_df = evaluator.split_data(interactions_df)

# Create dummy recommendations for demo (in real pipeline, use trained models).
# Each sampled user is "recommended" up to 10 distinct items taken from the
# test rows that belong to *other* users.
print("Generating recommendations for evaluation...")
recommendations = {
    uid: test_df.loc[test_df['user_id'] != uid, 'item_id'].unique()[:10].tolist()
    for uid in test_df['user_id'].unique()[:100]  # Limit for demo
}

# Score the recommendations at cutoffs 5 and 10; the item count is the
# number of rows in items_df.
results = evaluator.evaluate_recommendations(
    recommendations,
    test_df,
    items_df,
    len(items_df),
    k_values=[5, 10],
)

# Persist the metrics, then render and show the text report.
evaluator.save_results(results)
report = evaluator.generate_report(results)
print(report)


## 6. Pipeline Results <a id="pipeline-results"></a>


In [None]:
# Display pipeline results summary


def _format_metric(metrics, key, missing="N/A"):
    """Return ``metrics[key]`` rendered with four decimals, or a fallback.

    Applying ``:.4f`` directly to ``metrics.get(key, 'N/A')`` raises
    ``ValueError`` whenever the key is absent, because the ``'N/A'`` string
    does not accept the ``f`` format code. This helper formats the value
    only when it actually supports that format spec.

    Args:
        metrics: Mapping of metric name to value (anything with ``.get``).
        key: Metric name to look up.
        missing: Text returned when the key is absent or its value is None.

    Returns:
        The formatted number, ``missing`` when there is no value, or
        ``str(value)`` when the value cannot be formatted as a float.
    """
    value = metrics.get(key)
    if value is None:
        return missing
    try:
        return f"{value:.4f}"
    except (TypeError, ValueError):
        # Non-numeric value (e.g. a string or nested structure): show it as-is.
        return str(value)


print("=== VERTEXREC PIPELINE DEMO COMPLETED ===")

# NOTE(review): these status lines are printed unconditionally; they do not
# reflect the validation flags computed earlier in the notebook.
print("\nPipeline Components:")
print("✓ Data Validation - All datasets validated successfully")
print("✓ Feature Engineering - User, item, and interaction features created")
print("✓ Model Training - TF Recommenders and XGBoost models trained")
print("✓ Model Evaluation - Comprehensive evaluation metrics computed")

# Output locations, one per pipeline stage, all under OUTPUT_DIR.
print("\nOutput Files Generated:")
print(f"📁 {OUTPUT_DIR / 'validation'} - Data validation results")
print(f"📁 {OUTPUT_DIR / 'features'} - Engineered features")
print(f"📁 {OUTPUT_DIR / 'models'} - Trained models")
print(f"📁 {OUTPUT_DIR / 'evaluation'} - Evaluation results")

# Headline metrics; a missing metric prints "N/A" instead of raising.
print("\nKey Metrics:")
print(f"📊 Recall@10: {_format_metric(results, 'recall@10')}")
print(f"📊 NDCG@10: {_format_metric(results, 'ndcg@10')}")
print(f"📊 MRR: {_format_metric(results, 'mrr')}")
print(f"📊 Coverage: {_format_metric(results, 'coverage')}")

print("\nNext Steps:")
print("1. Deploy models to Vertex AI Endpoints")
print("2. Set up Cloud Run API service")
print("3. Configure monitoring and alerting")
print("4. Implement CI/CD pipeline")
print("5. Deploy to production environment")

print("\n🎉 Pipeline demo completed successfully!")
