In [10]:
from sklearn.preprocessing import StandardScaler
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
import pandas as pd
import numpy as np

In [None]:
# ---------------------------------------------------------------------------
# Load the per-model prediction tables (one train CSV and one test CSV each)
# and the time-series feature tables.
# ---------------------------------------------------------------------------

# GloVe word-attention model
train_word_glove_attention = pd.read_csv(
    'final_features/word_attention_glove/train_predictions.csv'
)
test_word_glove_attention = pd.read_csv(
    'final_features/word_attention_glove/test_predictions.csv'
)

# RoBERTa (512) attention model
train_roberta_512 = pd.read_csv(
    'final_features/512_RoBERTa_attention/train_predictions.csv'
)
test_roberta_512 = pd.read_csv(
    'final_features/512_RoBERTa_attention/test_predictions.csv'
)

# Longformer period-level attention model
train_longformer_period = pd.read_csv(
    'final_features/period_longformer_att/train_predictions.csv'
)
test_longformer_period = pd.read_csv(
    'final_features/period_longformer_att/test_predictions.csv'
)

# Gemini attention model. Its CSVs carry a PeriodID column that is removed
# right after loading.
# NOTE(review): presumably dropped so it does not clash with PeriodID from
# another table during the later ID-based merge -- confirm.
train_gemini_period = pd.read_csv(
    'final_features/gemini_attention/train_predictions.csv'
).drop(columns=['PeriodID'])
test_gemini_period = pd.read_csv(
    'final_features/gemini_attention/test_predictions.csv'
).drop(columns=['PeriodID'])

# Time-series features
train_time_features = pd.read_csv(
    'final_features/time_series/train_time_features.csv'
)
test_time_features = pd.read_csv(
    'final_features/time_series/test_time_features.csv'
)

def merge_model_predictions(model_predictions_dict, is_train=True):
    """
    Inner-join the prediction tables of several models into one DataFrame.

    The tables are joined left to right in the dictionary's iteration order.
    The join keys are ``ID`` and, for training data, ``EventType`` as well.
    The dictionary keys (model names) are not used in the result.

    Args:
        model_predictions_dict: Mapping of model name -> prediction DataFrame.
            Must contain at least one entry (an empty mapping raises
            ``IndexError``).
        is_train: When True, ``EventType`` is added to the join keys.

    Returns:
        The joined DataFrame. With a single entry, that entry's DataFrame is
        returned as-is (same object, no copy).
    """
    join_keys = ['ID', 'EventType'] if is_train else ['ID']

    frames = list(model_predictions_dict.values())

    # Seed the result with the first table, then fold the others in.
    combined = frames[0]
    for next_frame in frames[1:]:
        combined = pd.merge(combined, next_frame, on=join_keys, how='inner')

    return combined


# Collect the per-model tables for each split and join them into one frame.
train_predictions = dict(
    word_glove_attention=train_word_glove_attention,
    roberta_512=train_roberta_512,
    longformer_period=train_longformer_period,
    gemini_period=train_gemini_period,
)
train_models = merge_model_predictions(train_predictions, is_train=True)

test_predictions = dict(
    word_glove_attention=test_word_glove_attention,
    roberta_512=test_roberta_512,
    longformer_period=test_longformer_period,
    gemini_period=test_gemini_period,
)
test_models = merge_model_predictions(test_predictions, is_train=False)

# Apply log(1 + x) to the listed time-series columns of both splits.
# NOTE(review): log1p is undefined for values <= -1 (yields NaN / -inf);
# confirm that none of these columns (tweet_momentum in particular) can go
# that low.
columns_to_transform = [
    'tweet_count',
    'rolling_mean_tweets',
    'rolling_std_tweets',
    'relative_frequency',
    'tweet_momentum',
]
train_time_features[columns_to_transform] = (
    train_time_features[columns_to_transform].apply(lambda column: np.log1p(column))
)
test_time_features[columns_to_transform] = (
    test_time_features[columns_to_transform].apply(lambda column: np.log1p(column))
)

# Remove the label column from the training time-series table.
train_time_features = train_time_features.drop(columns=['EventType'])

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

def prepare_data(word_glove_df, time_features_df):
    """
    Join model predictions with time-series features and order the rows.

    The two frames are inner-joined on the ``ID`` and ``PeriodID`` columns
    (pandas' default join type), then sorted by ``MatchID`` and ``PeriodID``
    so the rows of each match are contiguous and in period order.

    Args:
        word_glove_df: DataFrame containing ``ID`` and ``PeriodID``.
        time_features_df: DataFrame containing ``ID`` and ``PeriodID``.

    Returns:
        The joined, sorted DataFrame. The index is not reset, so it keeps the
        pre-sort labels. ``MatchID`` must be present in the joined result.
    """
    join_keys = ['ID', 'PeriodID']
    time_order = ['MatchID', 'PeriodID']

    joined = pd.merge(word_glove_df, time_features_df, on=join_keys)
    return joined.sort_values(time_order)

def create_sequences(data, match_ids, sequence_length=10, is_test=False):
    """
    Cut each match into sliding windows of ``sequence_length`` consecutive rows.

    Feature columns are every column of ``data`` except ``ID``, ``MatchID``,
    ``PeriodID`` and ``EventType``. Rows are used in the order in which they
    appear in ``data``; no sorting is done here.

    Training mode (``is_test=False``):
        A match with ``k`` rows yields ``max(0, k - sequence_length + 1)``
        windows. Each window is identified by (and, if ``EventType`` exists,
        labelled with) its LAST row.

    Test mode (``is_test=True``):
        ``sequence_length - 1`` all-zero rows are prepended to every match, so
        each real row is the last row of exactly one window. Window ``i`` is
        identified by the ``ID`` of real row ``i``; no labels are collected.

    Args:
        data: DataFrame with at least ``ID`` and ``MatchID`` columns.
        match_ids: Iterable of ``MatchID`` values to process, in output order.
        sequence_length: Number of rows per window.
        is_test: Selects test mode (zero padding, no labels).

    Returns:
        Tuple ``(X, y, sequence_ids)``:
        * ``X`` -- array of shape ``(n_windows, sequence_length, n_features)``.
          This 3-D shape is kept even when no window was produced, so callers
          can still read ``X.shape[2]``.
        * ``y`` -- 1-D array of ``EventType`` values, or ``None`` when no
          label was collected.
        * ``sequence_ids`` -- list with the ``ID`` associated with each window.
    """
    # Loop-invariant: derive the feature list and label availability once.
    non_feature_cols = ('ID', 'MatchID', 'PeriodID', 'EventType')
    feature_cols = [col for col in data.columns if col not in non_feature_cols]
    has_labels = 'EventType' in data.columns
    last_offset = sequence_length - 1

    X = []
    y = []
    sequence_ids = []

    for match_id in match_ids:
        match_data = data[data['MatchID'] == match_id]
        match_sequence = match_data[feature_cols].values

        if is_test:
            # Left-pad with zeros so the first real row already closes a window.
            padding = np.zeros((last_offset, len(feature_cols)))
            match_sequence = np.vstack([padding, match_sequence])

        n_windows = max(0, len(match_sequence) - sequence_length + 1)
        for start in range(n_windows):
            X.append(match_sequence[start:start + sequence_length])

            if is_test:
                # Because of the padding, window `start` ends on real row `start`.
                sequence_ids.append(match_data['ID'].iloc[start])
            else:
                end_row = start + last_offset
                sequence_ids.append(match_data['ID'].iloc[end_row])
                if has_labels:
                    y.append(match_data['EventType'].iloc[end_row])

    if X:
        X_out = np.array(X)
    else:
        # Keep the rank-3 contract for empty results instead of shape (0,).
        X_out = np.empty((0, sequence_length, len(feature_cols)))

    y_out = np.array(y) if y else None
    return X_out, y_out, sequence_ids

def build_model(input_shape, num_classes):
    """
    Build and compile the stacked-LSTM classifier.

    Architecture: three LSTM layers (256 -> 128 -> 64 units), each followed by
    batch normalisation and dropout, then two ReLU dense layers (128, 64) with
    batch normalisation and dropout, and a softmax output layer.

    The input shape is declared with an explicit ``Input`` layer rather than
    the ``input_shape=`` argument on the first LSTM; Keras warns about the
    latter in Sequential models.

    Args:
        input_shape: Shape of one sample, passed as
            ``(sequence_length, num_features)`` by the caller.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled ``Sequential`` model (Adam with learning rate 0.001,
        sparse categorical cross-entropy loss, accuracy metric).
    """
    model = Sequential([
        tf.keras.Input(shape=input_shape),

        # First LSTM layer
        LSTM(256, return_sequences=True),
        BatchNormalization(),
        Dropout(0.3),

        # Second LSTM layer
        LSTM(128, return_sequences=True),
        BatchNormalization(),
        Dropout(0.3),

        # Third LSTM layer: returns only its final output, not the sequence
        LSTM(64),
        BatchNormalization(),
        Dropout(0.3),

        # Dense layers
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),

        Dense(num_classes, activation='softmax')
    ])

    # The sparse loss expects integer class labels rather than one-hot vectors.
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

def get_callbacks(model_path='models/meta_models/lstm_wgat.keras'):
    """
    Assemble the Keras callbacks used during training.

    All three callbacks monitor ``val_accuracy``:
    * early stopping with patience 20, restoring the best weights;
    * learning-rate halving with patience 5, floored at 1e-7;
    * checkpointing of the best model only, written to ``model_path``.

    Args:
        model_path: File path handed to ``ModelCheckpoint``.

    Returns:
        List ``[EarlyStopping, ReduceLROnPlateau, ModelCheckpoint]``.
    """
    monitored_metric = 'val_accuracy'

    stop_when_stalled = EarlyStopping(
        monitor=monitored_metric,
        patience=20,
        restore_best_weights=True,
        verbose=1,
    )
    halve_lr_on_plateau = ReduceLROnPlateau(
        monitor=monitored_metric,
        factor=0.5,
        patience=5,
        min_lr=1e-7,
        verbose=1,
    )
    keep_best_checkpoint = ModelCheckpoint(
        model_path,
        monitor=monitored_metric,
        save_best_only=True,
        verbose=1,
    )

    return [stop_when_stalled, halve_lr_on_plateau, keep_best_checkpoint]

def _fill_missing_predictions(submission_df, test_data):
    """
    Append a prediction row for every test ``ID`` absent from ``submission_df``.

    A missing ID receives the first model prediction made for another row of
    the same ``MatchID``; if that match has no prediction at all, the most
    frequent predicted ``EventType`` overall is used instead.

    Args:
        submission_df: DataFrame with ``ID`` and ``EventType`` columns.
        test_data: DataFrame with ``ID`` and ``MatchID`` columns.

    Returns:
        ``submission_df`` unchanged when nothing is missing, otherwise a new
        DataFrame with the extra rows appended.
    """
    all_test_ids = test_data['ID'].unique()
    missing_ids = set(all_test_ids) - set(submission_df['ID'])
    if not missing_ids:
        return submission_df

    print(f"Found {len(missing_ids)} missing IDs. Adding them with predictions...")

    # Map every test ID to its match. The lookup goes through ID values
    # because submission_df and test_data do not share index labels, so a
    # boolean mask built on one frame cannot be used to index the other.
    id_to_match = test_data.drop_duplicates('ID').set_index('ID')['MatchID']

    # First existing prediction per match, in submission order.
    first_prediction_by_match = {}
    predicted_matches = submission_df['ID'].map(id_to_match)
    for match_id, event_type in zip(predicted_matches, submission_df['EventType']):
        first_prediction_by_match.setdefault(match_id, event_type)

    fallback_prediction = submission_df['EventType'].mode()[0]

    filled_ids = list(missing_ids)
    filled_events = [
        first_prediction_by_match.get(id_to_match[missing_id], fallback_prediction)
        for missing_id in filled_ids
    ]

    # Build all extra rows at once instead of concatenating inside a loop.
    filled_rows = pd.DataFrame({'ID': filled_ids, 'EventType': filled_events})
    return pd.concat([submission_df, filled_rows], ignore_index=True)


def train_and_predict(train_word_glove, train_time_features,
                     test_word_glove, test_time_features,
                     sequence_length=10,
                     val_size=0.3,
                     epochs=100,
                     batch_size=8,
                     random_state=42):
    """
    Train the LSTM on windowed training data and predict every test row.

    Steps: join predictions with time features, label-encode ``EventType``,
    split whole matches into train/validation sets, build sliding windows,
    fit the model, predict on zero-padded test windows, and assemble a
    submission table.

    Args:
        train_word_glove: Training prediction table (joined on ``ID`` and
            ``PeriodID`` with ``train_time_features``).
        train_time_features: Training time-series feature table.
        test_word_glove: Test prediction table.
        test_time_features: Test time-series feature table.
        sequence_length: Number of rows per window.
        val_size: Fraction of matches held out for validation.
        epochs: Maximum number of training epochs.
        batch_size: Training batch size.
        random_state: Seed for the match-level train/validation split.

    Returns:
        Tuple ``(submission_df, model)``: a DataFrame with ``ID`` and
        ``EventType`` (original label values) sorted by ``ID``, and the
        fitted Keras model.
    """
    # Prepare training data
    train_df = prepare_data(train_word_glove, train_time_features)

    # Encode labels as consecutive integers for the sparse loss
    label_encoder = LabelEncoder()
    train_df['EventType'] = label_encoder.fit_transform(train_df['EventType'])

    # Split by match so no match contributes windows to both sets
    unique_matches = train_df['MatchID'].unique()
    train_matches, val_matches = train_test_split(
        unique_matches,
        test_size=val_size,
        random_state=random_state
    )

    train_data = train_df[train_df['MatchID'].isin(train_matches)]
    val_data = train_df[train_df['MatchID'].isin(val_matches)]

    # Create sequences
    X_train, y_train, _ = create_sequences(train_data, train_matches, sequence_length)
    X_val, y_val, _ = create_sequences(val_data, val_matches, sequence_length)

    # Get input shape and number of classes
    num_features = X_train.shape[2]
    num_classes = len(label_encoder.classes_)

    # Build and train model
    model = build_model((sequence_length, num_features), num_classes)
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=get_callbacks(),
        verbose=1
    )

    # Prepare test data
    test_data = prepare_data(test_word_glove, test_time_features)
    test_match_ids = test_data['MatchID'].unique()

    # Test windows are zero-padded so every test row gets its own prediction
    X_test, _, test_sequence_ids = create_sequences(
        test_data,
        test_match_ids,
        sequence_length,
        is_test=True
    )

    # Make predictions and map class indices back to the original labels
    predictions = model.predict(X_test)
    predicted_classes = label_encoder.inverse_transform(predictions.argmax(axis=1))

    submission_df = pd.DataFrame({
        'ID': test_sequence_ids,
        'EventType': predicted_classes
    })

    # Ensure all test IDs are present
    submission_df = _fill_missing_predictions(submission_df, test_data)

    # Sort by ID to maintain consistency
    submission_df = submission_df.sort_values('ID').reset_index(drop=True)

    return submission_df, model


In [15]:
# Train on the merged model predictions plus time features, then predict test.
submission_df, trained_model = train_and_predict(
    train_word_glove=train_models,
    train_time_features=train_time_features,
    test_word_glove=test_models,
    test_time_features=test_time_features,
)

Epoch 1/100


  super().__init__(**kwargs)


[1m161/163[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - accuracy: 0.5702 - loss: 0.9667
Epoch 1: val_accuracy improved from -inf to 0.55372, saving model to models/meta_models/lstm_wgat.keras
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.5705 - loss: 0.9653 - val_accuracy: 0.5537 - val_loss: 0.7697 - learning_rate: 0.0010
Epoch 2/100
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6599 - loss: 0.6828
Epoch 2: val_accuracy did not improve from 0.55372
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.6600 - loss: 0.6827 - val_accuracy: 0.5537 - val_loss: 0.6986 - learning_rate: 0.0010
Epoch 3/100
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7498 - loss: 0.5784
Epoch 3: val_accuracy improved from 0.55372 to 0.65620, saving model to models/meta_models/lstm_wgat.keras
[1m163/163[0m [32m━━━━━━━━━━━

In [None]:

# Summarise the submission: number of rows and the share of each predicted class.
print(
    "\nPrediction Statistics:",
    f"Number of predictions: {len(submission_df)}",
    "Distribution of predictions:",
    submission_df['EventType'].value_counts(normalize=True),
    sep="\n",
)



Prediction Statistics:
Number of predictions: 516
Distribution of predictions:
EventType
1    0.637597
0    0.362403
Name: proportion, dtype: float64
