In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import numpy as np
from sklearn.model_selection import KFold
import os


In [2]:
# Load the three embedding tables (test, train and validation) from CSV.
# NOTE(review): these are absolute paths under one user's home directory, so the
# cell only runs on a machine with the same layout -- consider a configurable
# data directory.
test_df = pd.read_csv('/Users/shash/github/sms/data/gemini_embeds/test_embeds.csv')
train1_df = pd.read_csv('/Users/shash/github/sms/data/gemini_embeds/train_embeds.csv')
val_df = pd.read_csv('/Users/shash/github/sms/data/gemini_embeds/val_embeds.csv')

# Stack the train and validation rows into a single frame.
# pd.concat is called without ignore_index, so each input keeps its own row
# labels and the combined index may contain repeated labels.
train_df = pd.concat([train1_df, val_df])


In [3]:
def parse_embedding_string(embedding_str):
    """Convert a serialized embedding into a numpy array.

    Args:
        embedding_str: Either a string of comma-separated numbers, optionally
            wrapped in square brackets and surrounding whitespace
            (e.g. ``"[0.1, -0.2, 0.3]"``), or any non-string value that
            ``np.array`` accepts.

    Returns:
        np.ndarray: the parsed floats. A blank string or ``"[]"`` yields an
        empty float array. Non-string input is handed to ``np.array`` as is.

    Raises:
        ValueError: If a comma-separated token cannot be converted to float.
    """
    if not isinstance(embedding_str, str):
        return np.array(embedding_str)

    # Trim whitespace first so that brackets preceded/followed by spaces or a
    # newline are still removed, then trim again inside the brackets.
    body = embedding_str.strip().strip('[]').strip()
    if not body:
        # ''.split(',') would give [''] and float('') raises, so an empty
        # embedding is returned explicitly.
        return np.array([], dtype=float)

    # float() tolerates whitespace around each token.
    return np.array([float(token) for token in body.split(',')])

def preprocess_data(df, is_training=True):
    """
    Build model inputs (and labels, when requested and present) from a frame.

    Args:
        df: DataFrame with an 'Embeddings' column; each cell is passed through
            parse_embedding_string.
        is_training: When True and the frame has an 'EventType' column, that
            column's values are returned as labels.

    Returns:
        (X, y): X is np.array over the parsed embeddings; y is the 'EventType'
        values, or None when labels are not requested or not available.
    """
    # Parse every cell of the embedding column and stack the results
    features = np.array(list(map(parse_embedding_string, df['Embeddings'])))

    wants_labels = is_training and 'EventType' in df.columns
    labels = df['EventType'].values if wants_labels else None

    return features, labels


In [4]:
def create_attention_model(embedding_dim):
    """Build the (uncompiled) binary classifier used on Gemini embeddings.

    Args:
        embedding_dim: Length of each input embedding vector.

    Returns:
        A Keras Model mapping an (embedding_dim,) input to a single sigmoid
        output.
    """
    inputs = layers.Input(shape=(embedding_dim,))

    # Gate the input: bias-free projection -> softmax -> element-wise product
    scores = layers.Dense(embedding_dim, use_bias=False)(inputs)
    weights = layers.Activation('softmax')(scores)
    x = layers.Multiply()([inputs, weights])

    # Classification head: three Dense/BatchNorm/Dropout stages of shrinking width
    for units, drop_rate in ((2048, 0.5), (128, 0.5), (64, 0.3)):
        x = layers.Dense(units, activation='relu')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Dropout(drop_rate)(x)

    outputs = layers.Dense(1, activation='sigmoid')(x)
    return Model(inputs=inputs, outputs=outputs)

def get_callbacks(model_prefix):
    """Return the training callbacks for one model.

    Creates the 'models/gemini_attention' directory if needed. All three
    callbacks monitor 'val_accuracy'.

    Args:
        model_prefix: Prefix for the checkpoint file name.

    Returns:
        list: [EarlyStopping, ReduceLROnPlateau, ModelCheckpoint].
    """
    os.makedirs('models/gemini_attention', exist_ok=True)

    # Stop after 10 epochs without improvement and roll back to the best weights
    early_stop = EarlyStopping(
        monitor='val_accuracy',
        patience=10,
        restore_best_weights=True,
        verbose=1,
    )

    # Halve the learning rate after 3 stagnant epochs, never below 1e-7
    lr_schedule = ReduceLROnPlateau(
        monitor='val_accuracy',
        factor=0.5,
        patience=3,
        min_lr=1e-7,
        verbose=1,
    )

    # Keep only the best-scoring model on disk
    checkpoint = ModelCheckpoint(
        f'models/gemini_attention/{model_prefix}_best_model.keras',
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1,
    )

    return [early_stop, lr_schedule, checkpoint]

In [5]:
def train_ensemble_model(train_df, n_splits=5, epochs=50, batch_size=32):
    """Train one attention model per cross-validation fold, split by MatchID.

    Folds are formed over the unique MatchID values (shuffled KFold with
    random_state=42), so all rows of a match land on the same side of a split.

    Args:
        train_df: DataFrame with 'MatchID', 'Embeddings' and 'EventType'
            columns.
        n_splits: Number of folds, and therefore number of models trained.
        epochs: Maximum epochs per model (early stopping may end sooner).
        batch_size: Mini-batch size passed to ``model.fit``.

    Returns:
        (models, histories, fold_metrics):
            models: the trained Keras models, one per fold.
            histories: the ``History.history`` dict of each fit.
            fold_metrics: per-fold dicts with 'val_loss', 'val_accuracy' and
                'val_auc' from ``model.evaluate`` on the fold's validation rows.

    Raises:
        KeyError: If a required column is missing from ``train_df``.
    """
    # Fail early with a clear message instead of passing y=None to model.fit
    required = ('MatchID', 'Embeddings', 'EventType')
    missing = [col for col in required if col not in train_df.columns]
    if missing:
        raise KeyError(f"train_df is missing required column(s): {missing}")

    # Get unique MatchIDs
    unique_matches = train_df['MatchID'].unique()

    # Initialize KFold
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Lists to store models and histories
    models = []
    histories = []
    fold_metrics = []

    # Parse all embedding strings once; each fold then selects its rows with a
    # boolean mask instead of re-parsing the same strings on every iteration.
    X_all, y_all = preprocess_data(train_df)
    embedding_dim = X_all.shape[1]
    print(f"Embedding dimension: {embedding_dim}")

    # Train models
    for fold, (train_idx, val_idx) in enumerate(kf.split(unique_matches)):
        print(f"\nTraining fold {fold + 1}/{n_splits}")

        # Split data based on MatchID (positional masks, so repeated index
        # labels in train_df do not matter)
        train_mask = train_df['MatchID'].isin(unique_matches[train_idx]).to_numpy()
        val_mask = train_df['MatchID'].isin(unique_matches[val_idx]).to_numpy()

        X_train, y_train = X_all[train_mask], y_all[train_mask]
        X_val, y_val = X_all[val_mask], y_all[val_mask]

        print(f"Training data shape: {X_train.shape}")
        print(f"Validation data shape: {X_val.shape}")

        # Create and compile model
        model = create_attention_model(embedding_dim)
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            # Name the metric explicitly: an auto-generated name may receive a
            # numeric suffix for later instances, which would make the history
            # keys differ from fold to fold.
            metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
        )

        # Train model
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=get_callbacks(f'fold_{fold}'),
            verbose=1
        )

        # Evaluate on validation set; order follows compile(): loss, accuracy, AUC
        val_metrics = model.evaluate(X_val, y_val, verbose=0)
        fold_metrics.append({
            'val_loss': val_metrics[0],
            'val_accuracy': val_metrics[1],
            'val_auc': val_metrics[2]
        })

        models.append(model)
        histories.append(history.history)

    return models, histories, fold_metrics

import pandas as pd

def save_fold_predictions(models, data_df, output_path, is_train=True):
    """
    Generate and save predictions from each fold model.

    Rows are collapsed to one per 'ID' (first value of each column), every
    model scores every resulting row, and the per-model scores plus their mean
    are written to CSV. Because all models score all rows, the scores are not
    out-of-fold predictions.

    Args:
        models: List of trained attention models
        data_df: DataFrame with ID, Embeddings, MatchID and PeriodID columns
            (and EventType when labels should be included)
        output_path: Path to save the predictions; a missing parent directory
            is created
        is_train: Boolean indicating if this is training data; when True and
            an 'EventType' column exists, it is copied to the output

    Returns:
        DataFrame with columns ID, PeriodID, model_<i>_pred for each model,
        EventType (only when included) and ensemble_pred.
    """
    include_labels = is_train and 'EventType' in data_df.columns

    # One aggregation pass: take the first value per ID for every needed column
    agg_spec = {
        'Embeddings': 'first',  # Take first embedding since they should be the same per ID
        'MatchID': 'first',
        'PeriodID': 'first'
    }
    if include_labels:
        agg_spec['EventType'] = 'first'

    grouped_data = data_df.groupby('ID').agg(agg_spec).reset_index()

    all_preds = pd.DataFrame({'ID': grouped_data['ID']})
    all_preds['PeriodID'] = grouped_data['PeriodID'].astype(int)

    # Preprocess data once for all models
    X, _ = preprocess_data(grouped_data, is_training=False)

    # Get predictions from each model
    for i, model in enumerate(models):
        preds = model.predict(X)
        all_preds[f'model_{i}_pred'] = preds.flatten()

    # Add true labels for training data
    if include_labels:
        all_preds['EventType'] = grouped_data['EventType']

    # Calculate ensemble predictions as the mean of the per-model columns
    pred_columns = [col for col in all_preds.columns if col.endswith('_pred')]
    all_preds['ensemble_pred'] = all_preds[pred_columns].mean(axis=1)

    # to_csv does not create directories, so make sure the target one exists.
    # Only done for path-like targets; anything else is passed to to_csv as is.
    if isinstance(output_path, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(output_path))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Save predictions
    all_preds.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

    return all_preds

def predict_with_ensemble(models, test_df):
    """Average the predictions of all ensemble members.

    Args:
        models: Iterable of trained models exposing ``predict``.
        test_df: DataFrame with an 'Embeddings' column.

    Returns:
        The element-wise mean (np.mean over axis 0) of the members'
        prediction arrays.
    """
    # Labels are not needed here, only the parsed embeddings
    features, _ = preprocess_data(test_df, is_training=False)

    # One prediction array per ensemble member
    per_model = [member.predict(features) for member in models]

    return np.mean(per_model, axis=0)

In [6]:
# Fit one model per MatchID-grouped fold
models, histories, fold_metrics = train_ensemble_model(
    train_df=train_df, n_splits=5, epochs=50, batch_size=32
)

# Mean of each validation metric over the folds
avg_metrics = {
    name: np.mean([scores[name] for scores in fold_metrics])
    for name in fold_metrics[0]
}

print("\nAverage metrics across folds:")
for metric, value in avg_metrics.items():
    print(f"{metric}: {value:.4f}")

# For predictions on new data:
# predictions = predict_with_ensemble(models, test_df)


Embedding dimension: 768

Training fold 1/5
Training data shape: (1560, 768)
Validation data shape: (487, 768)
Epoch 1/50
[1m44/49[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 6ms/step - accuracy: 0.5235 - auc: 0.5257 - loss: 0.8750
Epoch 1: val_accuracy improved from -inf to 0.56879, saving model to models/gemini_attention/fold_0_best_model.keras
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5243 - auc: 0.5284 - loss: 0.8734 - val_accuracy: 0.5688 - val_auc: 0.5000 - val_loss: 0.6891 - learning_rate: 0.0010
Epoch 2/50
[1m45/49[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 6ms/step - accuracy: 0.5902 - auc: 0.6288 - loss: 0.7712
Epoch 2: val_accuracy did not improve from 0.56879
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5906 - auc: 0.6291 - loss: 0.7699 - val_accuracy: 0.5688 - val_auc: 0.5000 - val_loss: 0.6907 - learning_rate: 0.0010
Epoch 3/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━

In [7]:
# Per-model and ensemble scores for the training rows (labels kept when present)
train_predictions = save_fold_predictions(
    models,
    train_df,
    'final_features/gemini_attention/train_predictions.csv',
    is_train=True,
)

# Same export for the test rows, without labels
test_predictions = save_fold_predictions(
    models,
    test_df,
    'final_features/gemini_attention/test_predictions.csv',
    is_train=False,
)

[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Predictions saved to final_features/gemini_attention/train_predictions.csv
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Predictions saved to final_features/gemini_attention/test_predictions.csv
