In [1]:
from transformers import LongformerTokenizer, LongformerModel
from torch.utils.data import Dataset
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.models import load_model
import os
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import pandas as pd
import torch


In [2]:
# Locations of the pickled train / test tweet frames.
# NOTE: pd.read_pickle unpickles arbitrary objects - only point these at trusted files.
train_path = '/kaggle/input/subevent-data/final_data/train_df.pkl'
test_path = '/kaggle/input/subevent-data/final_data/test_df.pkl'

# Load both frames in (train, test) order.
train_df, test_df = (pd.read_pickle(path) for path in (train_path, test_path))

# Quick sanity check on the loaded row / column counts.
print('shapes after this : ', train_df.shape, test_df.shape)

shapes after this :  (1778619, 6) (443811, 5)


In [3]:
class TweetDataset(Dataset):
    """Custom dataset for tweet data using Longformer.

    Every item is tokenized on access and returned as a dict with the keys
    ``input_ids``, ``attention_mask`` and ``global_attention_mask``.
    """
    def __init__(self, texts, max_length=4096):
        """
        Args:
            texts: Indexable, sized collection of strings (one entry per item).
            max_length: Length each text is truncated / padded to.
        """
        self.tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
        self.texts = texts
        self.max_length = max_length

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize ``self.texts[idx]`` and return its tensors without the batch axis."""
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        # Longformer requires global attention mask
        global_attention_mask = torch.zeros_like(encoding['attention_mask'])
        # Set global attention on the first ([CLS]) token only
        global_attention_mask[:, 0] = 1

        # squeeze(0) removes only the leading batch axis. A bare squeeze()
        # would also drop the sequence axis whenever it has length 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'global_attention_mask': global_attention_mask.squeeze(0)
        }


In [4]:

def get_longformer_embeddings(text, model, tokenizer, device, max_length=4096):
    """Encode a single text and return the hidden state of its first token.

    Args:
        text: Text handed to ``tokenizer``.
        model: Callable model whose output exposes ``last_hidden_state``.
        tokenizer: Callable tokenizer returning a mapping of tensors that
            includes ``attention_mask``.
        device: Torch device the inputs are moved to before the forward pass.
        max_length: Truncation / padding length passed to the tokenizer.

    Returns:
        NumPy array holding the first-token vector of the first (only) sequence.
    """
    tokenized = tokenizer(
        text,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

    # Global attention is enabled for position 0 ([CLS]) only.
    global_mask = torch.zeros_like(tokenized['attention_mask'])
    global_mask[:, 0] = 1
    global_mask = global_mask.to(device)

    # Move every tokenizer output onto the target device.
    model_inputs = {}
    for name, tensor in tokenized.items():
        model_inputs[name] = tensor.to(device)

    with torch.no_grad():
        result = model(global_attention_mask=global_mask, **model_inputs)
        # Slice out the [CLS] position and bring it back to host memory.
        cls_vectors = result.last_hidden_state[:, 0, :].cpu().numpy()

    # Batch size is one, so hand back the single row.
    return cls_vectors[0]

def precompute_all_embeddings(grouped_df, batch_size=8, max_length=4096):
    """Precompute Longformer embeddings for all periods.

    Loads ``allenai/longformer-base-4096`` (on GPU when available) and encodes
    the ``Tweet`` column of ``grouped_df`` one row at a time.

    Args:
        grouped_df: DataFrame with a ``Tweet`` column (one text per row).
        batch_size: Accepted for interface compatibility but currently unused;
            rows are encoded individually.
        max_length: Truncation / padding length forwarded to the tokenizer.

    Returns:
        ``np.ndarray`` of shape ``(len(grouped_df), hidden_size)`` whose rows
        follow the row order of ``grouped_df``.
    """
    print("Precomputing Longformer embeddings for all periods...")

    # Initialize model and tokenizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = LongformerModel.from_pretrained("allenai/longformer-base-4096").to(device)
    tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

    model.eval()

    n_periods = len(grouped_df)
    # Read the embedding width from the loaded model's config instead of
    # hard-coding 768, so the buffer always matches the checkpoint.
    embeddings = np.zeros((n_periods, model.config.hidden_size))

    for idx, tweet_text in enumerate(grouped_df['Tweet']):
        if idx % 100 == 0:
            print(f"Processing period {idx}/{n_periods}")

        embeddings[idx] = get_longformer_embeddings(
            tweet_text,
            model,
            tokenizer,
            device,
            max_length=max_length
        )

    return embeddings

def preprocess_data(df, is_training=True):
    """Preprocess the data by grouping tweets by period.

    Args:
        df: DataFrame with at least ``ID`` and ``Tweet`` columns. ``EventType``,
            ``MatchID`` and ``PeriodID`` are carried through when present.
        is_training: When True and ``EventType`` exists, the first label of
            each ``ID`` group is kept in the grouped frame.

    Returns:
        Tuple ``(grouped_df, tweet_stats)``:
            * ``grouped_df`` - one row per ``ID`` with all tweets joined by a
              single space, plus the first value of each available metadata
              column.
            * ``tweet_stats`` - one row per ``ID`` with ``tweet_count``,
              ``avg_tweet_length`` (mean whitespace-token count per tweet) and
              the first value of each available metadata column.
    """
    # MatchID / PeriodID are optional. Resolve them once so the grouped frame
    # and the statistics frame agree on which metadata columns they carry;
    # previously the statistics raised KeyError when either was missing.
    optional_meta = [col for col in ('MatchID', 'PeriodID') if col in df.columns]

    # Aggregation for the per-period text frame
    agg_dict = {'Tweet': lambda x: ' '.join(x)}
    if 'EventType' in df.columns and is_training:
        agg_dict['EventType'] = 'first'
    for col in optional_meta:
        agg_dict[col] = 'first'

    # Group tweets by ID
    grouped_df = df.groupby('ID').agg(agg_dict).reset_index()

    # Named aggregation yields flat, explicitly named columns, so no manual
    # renaming of a MultiIndex is needed afterwards.
    stats_spec = {
        'tweet_count': ('Tweet', 'count'),
        'avg_tweet_length': ('Tweet', lambda x: np.mean([len(t.split()) for t in x])),
    }
    for col in optional_meta:
        stats_spec[col] = (col, 'first')

    tweet_stats = df.groupby('ID').agg(**stats_spec).reset_index()

    return grouped_df, tweet_stats

def create_attention_model(input_dim, hidden_dims=(2048, 512, 128, 64)):
    """Create neural network model with additional attention layer.

    The flat input vector is split into chunks of 64 values, a softmax over
    the chunks produces one weight per chunk, and the weighted chunks are
    summed into a single 64-wide vector that feeds the dense stack.

    Args:
        input_dim: Length of the input vector; must be a positive multiple of 64.
        hidden_dims: Widths of the successive Dense layers. Each is followed by
            BatchNormalization and Dropout(0.5).

    Returns:
        An uncompiled Keras ``Model`` with a single sigmoid output unit.

    Raises:
        ValueError: If ``input_dim`` is not a positive multiple of 64.
    """
    chunk_size = 64
    # Fail early with a readable message instead of an opaque Reshape error.
    if input_dim <= 0 or input_dim % chunk_size != 0:
        raise ValueError(
            f"input_dim must be a positive multiple of {chunk_size}, got {input_dim}"
        )
    n_chunks = input_dim // chunk_size

    # Input layer
    inputs = layers.Input(shape=(input_dim,))

    # Reshape inputs to add sequence dimension: (batch, n_chunks, chunk_size)
    x = layers.Reshape((n_chunks, chunk_size))(inputs)

    # One score per chunk, normalised with softmax across chunks
    attention = layers.Dense(1, use_bias=False)(x)
    attention = layers.Flatten()(attention)
    attention_weights = layers.Activation('softmax')(attention)
    # Broadcast each chunk weight across the chunk's features:
    # (batch, n_chunks) -> (batch, chunk_size, n_chunks) -> (batch, n_chunks, chunk_size)
    attention_weights = layers.RepeatVector(chunk_size)(attention_weights)
    attention_weights = layers.Permute([2, 1])(attention_weights)

    # Apply attention weights and sum over the chunk axis
    attended = layers.Multiply()([x, attention_weights])
    attended = layers.Lambda(lambda x: tf.reduce_sum(x, axis=1))(attended)

    # Dense layers with regularization
    for dim in hidden_dims:
        attended = layers.Dense(dim, activation='relu')(attended)
        attended = layers.BatchNormalization()(attended)
        attended = layers.Dropout(0.5)(attended)

    outputs = layers.Dense(1, activation='sigmoid')(attended)

    model = Model(inputs=inputs, outputs=outputs)
    return model

def get_callbacks(model_prefix, model_dir='models_roberta_no_att'):
    """Create callbacks for training.

    Args:
        model_prefix: Prefix of the checkpoint file name; the best model is
            written to ``{model_dir}/{model_prefix}_best_model.keras``.
        model_dir: Directory for the checkpoint. The default keeps the
            historical location so existing consumers of the saved files are
            unaffected. NOTE(review): this differs from the
            ``models/longformer`` directory created in ``train_models`` -
            pass ``model_dir`` explicitly to unify them.

    Returns:
        List of ``[EarlyStopping, ReduceLROnPlateau, ModelCheckpoint]``, all
        monitoring ``val_accuracy``.
    """
    checkpoint_path = f'{model_dir}/{model_prefix}_best_model.keras'

    # Accuracy is better when higher; state the direction explicitly instead
    # of relying on name-based 'auto' inference.
    stop_early = EarlyStopping(
        monitor='val_accuracy',
        mode='max',
        patience=10,
        restore_best_weights=True,
        verbose=1
    )
    shrink_lr = ReduceLROnPlateau(
        monitor='val_accuracy',
        mode='max',
        factor=0.5,
        patience=5,
        min_lr=1e-7,
        verbose=1
    )
    keep_best = ModelCheckpoint(
        checkpoint_path,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
        verbose=1
    )
    return [stop_early, shrink_lr, keep_best]

In [5]:

def train_models(train_df, n_splits=5, epochs=30, batch_size=32, max_length=4096, use_attention=True):
    """Train models using cross-validation based on MatchID.

    Tweets are grouped per ``ID``, embedded once with Longformer, and one
    classifier is trained per fold. Folds are formed over unique ``MatchID``
    values, so all periods of a match land on the same side of the split.

    Args:
        train_df: Raw training frame; must contain ``ID``, ``Tweet``,
            ``EventType`` and ``MatchID``.
        n_splits: Number of KFold splits over the unique matches.
        epochs: Maximum epochs per fold (early stopping may end sooner).
        batch_size: Keras training batch size; also forwarded to
            ``precompute_all_embeddings``.
        max_length: Tokenizer truncation / padding length for the embeddings.
        use_attention: Must be True; no other architecture is implemented.

    Returns:
        Dict with keys ``models``, ``histories``, ``train_preds`` (out-of-fold
        predictions), ``grouped_df``, ``tweet_stats``, ``fold_data`` and
        ``embeddings``.

    Raises:
        NotImplementedError: If ``use_attention`` is False.
    """
    # Fail before the expensive embedding pass. Previously this path only
    # printed "NO" and then crashed on an unbound `model` inside the fold loop.
    if not use_attention:
        raise NotImplementedError(
            "use_attention=False is not supported: only the attention model is implemented"
        )

    # Create models directory
    # NOTE(review): checkpoints are written by get_callbacks to its own path,
    # which is not this directory.
    os.makedirs('models/longformer', exist_ok=True)

    # Preprocess all data first
    grouped_df, tweet_stats = preprocess_data(train_df)

    # Precompute all embeddings once; rows align with grouped_df rows
    print("Precomputing embeddings for all data...")
    all_embeddings = precompute_all_embeddings(grouped_df, batch_size=batch_size, max_length=max_length)

    # Folds are drawn over matches rather than rows
    unique_matches = train_df['MatchID'].unique()
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Per-fold results
    models = []
    histories = []
    all_predictions = []
    fold_data = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(unique_matches)):
        print(f"\nTraining fold {fold + 1}/{n_splits}")

        # Row masks over grouped_df for this fold's matches
        train_mask = grouped_df['MatchID'].isin(unique_matches[train_idx])
        val_mask = grouped_df['MatchID'].isin(unique_matches[val_idx])

        # Index the embedding matrix with plain boolean arrays so selection is
        # positional and independent of the pandas index.
        X_train = all_embeddings[train_mask.to_numpy()]
        X_val = all_embeddings[val_mask.to_numpy()]

        y_train = grouped_df.loc[train_mask, 'EventType'].values
        y_val = grouped_df.loc[val_mask, 'EventType'].values

        # Create and compile model
        model = create_attention_model(input_dim=X_train.shape[1])
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Train model
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=get_callbacks(f'fold_{fold}'),
            verbose=1
        )

        # Out-of-fold predictions for this fold's validation rows
        val_preds = model.predict(X_val)
        fold_predictions = pd.DataFrame({
            'ID': grouped_df.loc[val_mask, 'ID'].values,
            'fold_pred': val_preds.flatten(),
            'true_label': y_val,
            'fold': fold
        })

        # Store results
        models.append(model)
        histories.append(history.history)
        all_predictions.append(fold_predictions)
        fold_data.append({
            'train_indices': train_idx,
            'val_indices': val_idx,
            'train_mask': train_mask,
            'val_mask': val_mask
        })

    # Combine all out-of-fold predictions
    train_preds = pd.concat(all_predictions, axis=0).reset_index(drop=True)

    return {
        'models': models,
        'histories': histories,
        'train_preds': train_preds,
        'grouped_df': grouped_df,
        'tweet_stats': tweet_stats,
        'fold_data': fold_data,
        'embeddings': all_embeddings
    }

In [6]:
def generate_predictions(df, embeddings, models):
    """Generate predictions from all models for given data.

    Args:
        df: DataFrame with an ``ID`` column, row-aligned with ``embeddings``.
        embeddings: Feature matrix passed to each model's ``predict``.
        models: Sequence of fitted models exposing ``predict``.

    Returns:
        DataFrame with columns ``ID``, ``fold_{i}_pred`` for each model and
        ``mean_pred`` (row-wise mean of the fold predictions).

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        raise ValueError("No models provided!")

    # Predictions are row-aligned with `df`, so assemble the columns
    # positionally. The earlier merge-on-ID approach multiplied rows whenever
    # an ID occurred more than once.
    columns = {'ID': df['ID'].to_numpy()}
    pred_columns = []

    for fold, model in enumerate(models):
        print(f"Generating predictions for fold {fold}")
        column_name = f'fold_{fold}_pred'
        columns[column_name] = np.asarray(model.predict(embeddings)).flatten()
        pred_columns.append(column_name)

    final_preds = pd.DataFrame(columns)

    # Average over the tracked prediction columns only, rather than every
    # column whose name happens to contain "pred".
    final_preds['mean_pred'] = final_preds[pred_columns].mean(axis=1)

    return final_preds

def create_full_predictions(trained_models, train_df, test_df, load_embeddings=True, embeddings_path='longformer_embeddings_train.npy'):
    """Create comprehensive predictions for both train and test data using Longformer embeddings.

    Args:
        trained_models: Non-empty sequence of fitted models.
        train_df: Raw training frame (grouped per ``ID`` internally).
        test_df: Raw test frame (grouped per ``ID`` internally).
        load_embeddings: When True, reuse cached ``.npy`` embeddings if the
            files exist; otherwise always recompute and overwrite the caches.
        embeddings_path: Cache file for the training embeddings (expected to
            end in ``.npy``). The test cache lives next to it with ``_test``
            inserted before the extension.

    Returns:
        Dict with ``train_predictions``, ``test_predictions`` (None when the
        test set is empty), ``train_grouped`` and ``test_grouped``.

    Raises:
        ValueError: If no models are given, or a cached embedding file does
            not match the number of grouped rows.
    """
    # Use provided trained models
    models = trained_models
    if not models:
        raise ValueError("No models provided!")

    # Process train data
    print("Processing training data...")
    train_grouped, _ = preprocess_data(train_df, is_training=True)

    # Process test data
    print("Processing test data...")
    test_grouped, _ = preprocess_data(test_df, is_training=False)
    has_test = len(test_grouped) > 0

    # Derive the test cache path once so reads and writes always agree.
    # Previously one branch wrote a file name the other branch never read.
    stem, ext = os.path.splitext(embeddings_path)
    test_embeddings_path = f'{stem}_test{ext}'

    # Training embeddings: load from cache or compute and cache
    if load_embeddings and os.path.exists(embeddings_path):
        print("Loading pre-computed Longformer embeddings...")
        all_embeddings = np.load(embeddings_path)
        if len(all_embeddings) < len(train_grouped):
            raise ValueError(
                f"Cached embeddings in {embeddings_path!r} have {len(all_embeddings)} rows "
                f"but {len(train_grouped)} are required; delete the file or pass load_embeddings=False"
            )
        train_embeddings = all_embeddings[:len(train_grouped)]
    else:
        print("Computing embeddings with Longformer...")
        train_embeddings = precompute_all_embeddings(train_grouped, max_length=4096)
        # Write to the path the caller asked for, not a hard-coded name.
        np.save(embeddings_path, train_embeddings)

    # Test embeddings: same load-or-compute policy
    test_embeddings = None
    if has_test:
        if load_embeddings and os.path.exists(test_embeddings_path):
            test_embeddings = np.load(test_embeddings_path)
            if len(test_embeddings) != len(test_grouped):
                raise ValueError(
                    f"Cached embeddings in {test_embeddings_path!r} have {len(test_embeddings)} rows "
                    f"but {len(test_grouped)} are required; delete the file or pass load_embeddings=False"
                )
        else:
            print("Computing test embeddings with Longformer...")
            test_embeddings = precompute_all_embeddings(test_grouped, max_length=4096)
            np.save(test_embeddings_path, test_embeddings)

    # Generate predictions
    print("Generating training predictions...")
    train_predictions = generate_predictions(train_grouped, train_embeddings, models)

    # Add true labels to training predictions
    train_predictions = train_predictions.merge(
        train_grouped[['ID', 'EventType']],
        on='ID',
        how='left'
    )

    # Generate test predictions if test data exists
    test_predictions = None
    if has_test:
        print("Generating test predictions...")
        test_predictions = generate_predictions(test_grouped, test_embeddings, models)

    return {
        'train_predictions': train_predictions,
        'test_predictions': test_predictions,
        'train_grouped': train_grouped,
        'test_grouped': test_grouped
    }

In [7]:
# Train one classifier per cross-validation fold on precomputed Longformer embeddings.
results = train_models(train_df, n_splits=5, epochs=30, batch_size=8, max_length=4096, use_attention=True)

# Access the trained models
models = results['models']
train_val_preds = results['train_preds']
grouped_df = results['grouped_df']
tweet_stats = results['tweet_stats']
longformer_embeddings = results['embeddings']
# Cache the training embeddings so create_full_predictions can load them
# from its default embeddings_path instead of recomputing.
np.save('longformer_embeddings_train.npy', longformer_embeddings)

# Generate all predictions
prediction_results = create_full_predictions(
    trained_models=models,
    train_df=train_df,
    test_df=test_df
)

# Access predictions
train_preds = prediction_results['train_predictions']
test_preds = prediction_results['test_predictions']

# Save predictions
train_preds.to_csv('train_predictions.csv', index=False)
# test_predictions is None when the grouped test set is empty; skip the
# export then instead of failing on None.to_csv.
if test_preds is not None:
    test_preds.to_csv('test_predictions.csv', index=False)

Precomputing embeddings for all data...
Precomputing Longformer embeddings for all periods...


config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Processing period 0/2047


model.safetensors:   0%|          | 0.00/597M [00:00<?, ?B/s]

Processing period 100/2047
Processing period 200/2047
Processing period 300/2047
Processing period 400/2047
Processing period 500/2047
Processing period 600/2047
Processing period 700/2047
Processing period 800/2047
Processing period 900/2047
Processing period 1000/2047
Processing period 1100/2047
Processing period 1200/2047
Processing period 1300/2047
Processing period 1400/2047
Processing period 1500/2047
Processing period 1600/2047
Processing period 1700/2047
Processing period 1800/2047
Processing period 1900/2047
Processing period 2000/2047

Training fold 1/5
Epoch 1/30


I0000 00:00:1733963081.794106      88 service.cc:145] XLA service 0x79b86c013740 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733963081.794170      88 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0


[1m 70/191[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.5235 - loss: 0.9439

I0000 00:00:1733963089.399457      88 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.5454 - loss: 0.8971
Epoch 1: val_accuracy improved from -inf to 0.46731, saving model to models_roberta_no_att/fold_0_best_model.keras
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 42ms/step - accuracy: 0.5455 - loss: 0.8969 - val_accuracy: 0.4673 - val_loss: 0.6979 - learning_rate: 0.0010
Epoch 2/30
[1m181/191[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 2ms/step - accuracy: 0.5712 - loss: 0.7948
Epoch 2: val_accuracy improved from 0.46731 to 0.61346, saving model to models_roberta_no_att/fold_0_best_model.keras
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5724 - loss: 0.7920 - val_accuracy: 0.6135 - val_loss: 0.6707 - learning_rate: 0.0010
Epoch 3/30
[1m187/191[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.5933 - loss: 0.7032
Epoch 3: val_accuracy improved from 0.61346 to 0.61923, saving mo