# Hive-LSTM Siamese Integration Testing Notebook

This notebook allows you to test the complete Hive data extraction, LSTM Siamese text similarity matching, and result storage workflow before deploying to Kubeflow.

## Workflow Overview:
1. **Setup & Configuration** - Configure connections and parameters
2. **Data Extraction** - Extract data from Hive table to CSV
3. **Data Preprocessing** - Convert to LSTM Siamese format
4. **Text Similarity Matching** - Run LSTM Siamese matching
5. **Result Analysis** - Analyze matching results
6. **Save to Hive** - Store results back to Hive (not exercised by the cells below, which end by writing a local workflow summary)

## 1. Setup & Installation

In [None]:
# Required packages (pandas, numpy, matplotlib, seaborn) must already be installed in the kernel environment
# Import libraries
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import subprocess
import logging
from typing import List, Dict, Any, Tuple
import warnings
import sys
# Silence library warnings so the cell output stays readable.
# NOTE(review): this blanket filter also hides deprecation warnings; narrow it
# if a specific warning turns out to matter.
warnings.filterwarnings('ignore')

# Add current directory to path for local imports.
# Guarded so that re-running this cell does not append duplicate entries.
if '.' not in sys.path:
    sys.path.append('.')

# Set up logging: INFO level, timestamped messages, one module-level logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print("✓ Setup completed!")

## 2. Configuration

In [None]:
# Configuration parameters

# Hive connection settings. Each value can be overridden through an environment
# variable (HIVE_HOST, HIVE_PORT, HIVE_DATABASE, HIVE_USERNAME) so the notebook
# can be pointed at another cluster or account without editing this cell.
# The defaults are the values that were previously hard-coded here.
HIVE_CONFIG = {
    'host': os.environ.get('HIVE_HOST', '172.17.235.21'),
    'port': int(os.environ.get('HIVE_PORT', '10000')),
    'database': os.environ.get('HIVE_DATABASE', 'preprocessed_analytics'),
    'username': os.environ.get('HIVE_USERNAME', 'lhimer')
}

# Tables, local working files and extraction options used by the later cells.
# 'sample_size' and 'matching_mode' are forwarded to the extractor;
# 'output_table' is only recorded in the workflow summary.
DATA_CONFIG = {
    'input_table': 'preprocessed_analytics.model_reference',
    'output_table': 'results.lstm_siamese_matches',
    'temp_dir': './temp_notebook_lstm',
    'input_csv': './temp_notebook_lstm/input_data.csv',
    'output_csv': './temp_notebook_lstm/output_results.csv',
    'model_path': './temp_notebook_lstm/siamese_model.h5',
    'sample_size': 1000,
    'matching_mode': 'auto'
}

# Hyper-parameters handed unchanged to SiameseMatcher in the training cell.
SIAMESE_CONFIG = {
    'EMBEDDING_DIM': 300,
    'MAX_SEQUENCE_LENGTH': 100,
    'NUMBER_LSTM': 50,
    'RATE_DROP_LSTM': 0.25,
    'NUMBER_DENSE_UNITS': 50,
    'ACTIVATION_FUNCTION': 'relu',
    'RATE_DROP_DENSE': 0.25,
    'VALIDATION_SPLIT': 0.2,
    'EPOCHS': 10,
    'BATCH_SIZE': 64
}

# Make sure the working directory exists before any cell writes into it.
os.makedirs(DATA_CONFIG['temp_dir'], exist_ok=True)
print("📋 Configuration loaded successfully!")

## 3. Data Extraction Test

In [None]:
# Test data extraction: pull a sample from Hive into the input CSV, or fall
# back to a small hand-written dataset when Hive (or the extractor module) is
# not available, so that the later cells always have an input file.
print("🔄 Testing data extraction...")

try:
    from hive_siamese_data_extractor import HiveSiameseDataExtractor

    extractor = HiveSiameseDataExtractor(
        host=HIVE_CONFIG['host'],
        port=HIVE_CONFIG['port'],
        username=HIVE_CONFIG['username'],
        database=HIVE_CONFIG['database']
    )

    if not extractor.connect():
        print("❌ Hive connection failed!")
        # Raise so the fallback below still writes an input CSV; previously a
        # failed connection left the next cell without any data to read.
        raise ConnectionError(
            f"could not connect to Hive at {HIVE_CONFIG['host']}:{HIVE_CONFIG['port']}"
        )

    print("✅ Hive connection successful!")

    try:
        # Extract sample data
        output_path = extractor.extract_and_convert(
            table_name=DATA_CONFIG['input_table'],
            output_path=DATA_CONFIG['input_csv'],
            sample_limit=DATA_CONFIG['sample_size'],
            matching_mode=DATA_CONFIG['matching_mode']
        )

        print(f"✅ Data extracted to: {output_path}")

        # Show sample data
        df = pd.read_csv(output_path)
        print(f"\n📊 Sample data ({len(df)} rows):")
        display(df.head())
    finally:
        # Always release the Hive connection, even when extraction fails.
        extractor.disconnect()

except Exception as e:
    print(f"❌ Error during extraction: {e}")
    print("💡 Creating sample data for testing...")

    # Create sample data. The set contains both similar (1) and dissimilar (0)
    # pairs: with positive labels only, the model has nothing to separate.
    sample_data = {
        'sentences1': [
            'John Smith works at Microsoft',
            'Mary Johnson is a teacher',
            'The quick brown fox jumps',
            'John Smith works at Microsoft',
            'Mary Johnson is a teacher',
            'The quick brown fox jumps'
        ],
        'sentences2': [
            'Jon Smith employed by Microsoft',
            'Maria Johnson teaches students',
            'A fast brown fox leaps',
            'The weather is sunny today',
            'Stock prices fell sharply',
            'She enjoys classical music'
        ],
        'is_similar': [1, 1, 1, 0, 0, 0]
    }

    df = pd.DataFrame(sample_data)
    df.to_csv(DATA_CONFIG['input_csv'], index=False)
    print(f"✅ Sample data created: {DATA_CONFIG['input_csv']}")
    display(df)

## 4. Model Training Test

In [None]:
# Test LSTM Siamese model training: train on the input CSV, save the model,
# score the same pairs and write the scored pairs to the output CSV.
print("🔄 Testing LSTM Siamese model...")

# Scores strictly above this value are reported as a predicted match.
SIMILARITY_THRESHOLD = 0.5

try:
    from siamese_matcher import SiameseMatcher

    # Load data
    df = pd.read_csv(DATA_CONFIG['input_csv'])
    sentences1 = df['sentences1'].tolist()
    sentences2 = df['sentences2'].tolist()
    labels = df['is_similar'].tolist()

    print(f"📊 Training data: {len(sentences1)} pairs")

    # Initialize matcher
    matcher = SiameseMatcher(SIAMESE_CONFIG)

    # Train model (if enough data)
    if len(sentences1) > 2:
        print("🚀 Training model...")
        history = matcher.train(sentences1, sentences2, labels)

        # Save model
        matcher.save_model(DATA_CONFIG['model_path'])
        print(f"💾 Model saved to: {DATA_CONFIG['model_path']}")

        # Make predictions
        print("🔮 Making predictions...")
        # Normalise to a flat NumPy array: the comparison below needs an
        # ndarray, and a list or an (n, 1) array must line up with the rows.
        # NOTE(review): the return type of SiameseMatcher.predict is not
        # visible here — confirm it yields one score per pair.
        predictions = np.asarray(matcher.predict(sentences1, sentences2)).ravel()

        # Save results
        results_df = df.copy()
        results_df['similarity_score'] = predictions
        results_df['prediction'] = (predictions > SIMILARITY_THRESHOLD).astype(int)
        results_df['model_type'] = 'lstm_siamese'

        results_df.to_csv(DATA_CONFIG['output_csv'], index=False)
        print(f"💾 Results saved to: {DATA_CONFIG['output_csv']}")

        print("\n📊 Results:")
        display(results_df)

        print("✅ Model training and prediction completed!")

    else:
        print("⚠️  Not enough data for training (need more than 2 pairs)")

except Exception as e:
    # Best-effort cell: report the failure and let the notebook continue.
    print(f"❌ Error during model training: {e}")
    print("💡 This might be expected with very small datasets")

## 5. Results Analysis

In [None]:
# Summarise and visualise the results file written by the training cell.
output_csv_path = DATA_CONFIG['output_csv']

if not os.path.exists(output_csv_path):
    print("❌ No results file found")
else:
    print("📊 Analyzing results...")

    results_df = pd.read_csv(output_csv_path)
    total_pairs = len(results_df)
    has_scores = 'similarity_score' in results_df.columns
    has_predictions = 'prediction' in results_df.columns

    # --- Text summary -----------------------------------------------------
    print(f"\n📋 Results Summary:")
    print(f"  Total pairs: {total_pairs}")

    if has_scores:
        scores = results_df['similarity_score']
        print(f"  Average similarity: {scores.mean():.3f}")
        print(f"  Max similarity: {scores.max():.3f}")
        print(f"  Min similarity: {scores.min():.3f}")

    if has_predictions:
        matches = results_df['prediction'].sum()
        print(f"  Predicted matches: {matches}")
        print(f"  Match rate: {matches/total_pairs:.2%}")

    # --- Visualization (needs scores and at least two rows) ----------------
    if has_scores and total_pairs > 1:
        plt.figure(figsize=(10, 4))

        # Left panel: histogram of the similarity scores
        plt.subplot(1, 2, 1)
        plt.hist(scores, bins=10, alpha=0.7, edgecolor='black')
        plt.xlabel('Similarity Score')
        plt.ylabel('Frequency')
        plt.title('Distribution of Similarity Scores')

        # Right panel: share of predicted matches vs. non-matches
        if has_predictions:
            non_matches = total_pairs - matches
            plt.subplot(1, 2, 2)
            plt.pie(
                [matches, non_matches],
                labels=['Matches', 'Non-matches'],
                autopct='%1.1f%%',
                startangle=90,
            )
            plt.title('Match Distribution')

        plt.tight_layout()
        plt.show()

    print("\n📋 Sample Results:")
    display(results_df.head())

## 6. Generate Summary

In [None]:
# Generate workflow summary: record the configuration used, which expected
# artifacts exist on disk, and an overall status derived from them.
print("📋 Generating workflow summary...")

# Artifacts the earlier cells are expected to produce.
# NOTE(review): files left over from a previous run also count as present.
files_to_check = [
    DATA_CONFIG['input_csv'],
    DATA_CONFIG['output_csv'],
    DATA_CONFIG['model_path']
]
files_created = [file_path for file_path in files_to_check if os.path.exists(file_path)]
missing_files = [file_path for file_path in files_to_check if file_path not in files_created]

summary = {
    'timestamp': datetime.now().isoformat(),
    'model_type': 'lstm_siamese',
    'configuration': {
        'hive_table': DATA_CONFIG['input_table'],
        'output_table': DATA_CONFIG['output_table'],
        'embedding_dim': SIAMESE_CONFIG['EMBEDDING_DIM'],
        'max_sequence_length': SIAMESE_CONFIG['MAX_SEQUENCE_LENGTH'],
        'sample_size': DATA_CONFIG['sample_size']
    },
    'files_created': files_created,
    # Only report success when every expected artifact was actually produced;
    # the status used to be 'completed' even when earlier cells had failed.
    'status': 'completed' if not missing_files else 'incomplete'
}

# Save summary
summary_path = os.path.join(DATA_CONFIG['temp_dir'], 'workflow_summary.json')
with open(summary_path, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"💾 Summary saved to: {summary_path}")

if missing_files:
    # Do not claim production readiness when an expected output is absent.
    print("\n⚠️ LSTM Siamese Workflow Testing finished with missing outputs:")
    for file_path in missing_files:
        print(f"  ❌ {file_path}")
    print("\n📋 Next Steps:")
    print("1. 🔍 Fix the failing step above and re-run the notebook")
else:
    print("\n🎉 LSTM Siamese Workflow Testing Complete!")
    print("\n📋 Next Steps:")
    print("1. ✅ Test completed - ready for production deployment")
    print("2. 🔧 Configure Kubeflow pipeline parameters")
    print("3. 🐳 Build Docker image")
    print("4. 🚀 Deploy to Kubeflow")

print(f"\n📁 Generated Files:")
for file_path in summary['files_created']:
    size = os.path.getsize(file_path)
    print(f"  ✅ {file_path} ({size} bytes)")

print("\n📊 Configuration Summary:")
print(json.dumps(summary, indent=2))