In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import KFold
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.models import load_model
import os


In [None]:
# Locations of the pickled train / test tweet frames.
train_path, test_path = 'final_data/train_df.pkl', 'final_data/test_df.pkl'

# NOTE(review): unpickling can execute code stored in the file, so these pickles
# must come from a trusted source.
train_df = pd.read_pickle(train_path)
test_df = pd.read_pickle(test_path)

# Cell output: the (rows, columns) shape of each frame.
(train_df.shape, test_df.shape)

In [None]:
class TweetDataset(Dataset):
    """Torch dataset that tokenizes one text per item.

    Args:
        texts: Indexable collection of strings; ``texts[idx]`` is tokenized
            lazily each time the item is requested.
        max_length: Every item is truncated / padded to exactly this many tokens.
    """

    def __init__(self, texts, max_length=512):
        self.tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base")
        self.texts = texts
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize ``texts[idx]`` and return its 1-D tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'``, each of shape
            ``(max_length,)``.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        # The tokenizer returns (1, max_length) tensors. Drop only that leading
        # batch axis: a bare squeeze() would also drop the token axis when
        # max_length == 1 and hand back 0-d tensors.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }


def split_long_text(text, max_length=512, tokenizer=None):
    """Split a text into whitespace-joined chunks that fit in ``max_length`` tokens.

    A text whose full tokenization (special tokens included) already fits is
    returned unchanged as a single-element list. Otherwise words are packed
    greedily into chunks. Token counts are measured per word in isolation, so
    chunk sizes are approximate.

    Args:
        text: The string to split.
        max_length: Token limit per chunk, counting the special tokens the
            tokenizer adds around each chunk.
        tokenizer: Tokenizer to measure with; when None the
            "cardiffnlp/twitter-roberta-base" tokenizer is loaded.

    Returns:
        list[str]: One or more chunks. A single word longer than the budget
        still becomes its own (over-long) chunk.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base")

    # Tokenize without truncation to get the full length (special tokens included)
    tokens = tokenizer(text, truncation=False, return_tensors='pt')
    if tokens['input_ids'].size(1) <= max_length:
        return [text]

    # encode() wraps every input in 2 special tokens. The same 2 are added again
    # when a chunk is tokenized later, so they must be reserved in the budget;
    # otherwise a "full" chunk is max_length + 2 tokens long and gets truncated.
    special_tokens = 2
    budget = max(max_length - special_tokens, 1)

    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        word_tokens = len(tokenizer.encode(word)) - special_tokens

        # Close the running chunk when this word would overflow the budget
        if current_chunk and current_length + word_tokens > budget:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(word)
        current_length += word_tokens

    # Add final chunk
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    # A text with no whitespace-separated words yields no chunks; fall back to
    # the raw text so callers always receive at least one element.
    return chunks or [text]




def get_roberta_embeddings_for_long_text(text, model, tokenizer, device, max_length=512, batch_size=8):
    """Embed a text of any length as the mean of its chunks' position-0 vectors.

    The text is cut into chunks with ``split_long_text``. Chunks are run through
    ``model`` ``batch_size`` at a time, the hidden state at sequence position 0
    is kept for each chunk, and the per-chunk vectors are averaged.

    Args:
        text: The string to embed.
        model: Model whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer used both for chunking and for encoding the chunks.
        device: Device the encoded batch is moved to before the forward pass.
        max_length: Token length every chunk is truncated / padded to.
        batch_size: Number of chunks per forward pass.

    Returns:
        1-D numpy array: the unweighted mean over all chunk vectors.
    """
    pieces = split_long_text(text, max_length, tokenizer)

    per_batch_vectors = []
    start = 0
    while start < len(pieces):
        window = pieces[start:start + batch_size]
        start += batch_size

        model_inputs = tokenizer(
            window,
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_tensors='pt'
        ).to(device)

        # Inference only: no autograd graph is needed
        with torch.no_grad():
            hidden_states = model(**model_inputs).last_hidden_state
            first_position = hidden_states[:, 0, :].cpu().numpy()
            per_batch_vectors.append(first_position)

    # Stack to (n_chunks, hidden_dim), then collapse the chunk axis
    stacked = np.vstack(per_batch_vectors)
    return stacked.mean(axis=0)

def precompute_all_embeddings(grouped_df, batch_size=8, max_length=512):
    """Compute one RoBERTa embedding per row of ``grouped_df``.

    Loads the "cardiffnlp/twitter-roberta-base" model and tokenizer, puts the
    model in eval mode on CUDA when available (CPU otherwise), and embeds each
    row's ``Tweet`` text with ``get_roberta_embeddings_for_long_text``.

    Args:
        grouped_df: DataFrame with a ``Tweet`` column holding the text to embed.
        batch_size: Number of text chunks per forward pass.
        max_length: Token length of each chunk.

    Returns:
        numpy array of shape ``(len(grouped_df), 768)``, rows in frame order.
    """
    print("Precomputing RoBERTa embeddings for all periods...")

    checkpoint = "cardiffnlp/twitter-roberta-base"
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = AutoModel.from_pretrained(checkpoint).to(device)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model.eval()

    n_periods = len(grouped_df)
    # 768 is the RoBERTa base embedding dim
    embeddings = np.zeros((n_periods, 768))

    for position, record in enumerate(grouped_df.itertuples()):
        # Progress line every 100 rows
        if position % 100 == 0:
            print(f"Processing period {position}/{n_periods}")

        embeddings[position] = get_roberta_embeddings_for_long_text(
            record.Tweet,
            model,
            tokenizer,
            device,
            max_length=max_length,
            batch_size=batch_size
        )

    return embeddings

def preprocess_data(df, is_training=True):
    """Collapse a tweet-level frame to one row per ``ID`` and compute tweet statistics.

    Args:
        df: DataFrame with at least ``ID`` and ``Tweet`` columns. ``EventType``,
            ``MatchID`` and ``PeriodID`` are carried through when present.
        is_training: When True and ``EventType`` exists, the first ``EventType``
            of each ID is kept as the label column.

    Returns:
        tuple ``(grouped_df, tweet_stats)``:
            grouped_df: one row per ID with all of its tweets joined by a space
                in ``Tweet``, plus the first value of each optional column.
            tweet_stats: one row per ID with ``tweet_count`` (non-null tweets),
                ``avg_tweet_length`` (mean whitespace-separated word count) and,
                when present in ``df``, ``MatchID`` / ``PeriodID``.
    """
    optional_columns = [col for col in ('MatchID', 'PeriodID') if col in df.columns]

    # Aggregation for the grouped text frame
    agg_dict = {'Tweet': lambda x: ' '.join(x)}
    if 'EventType' in df.columns and is_training:
        agg_dict['EventType'] = 'first'
    for col in optional_columns:
        agg_dict[col] = 'first'

    # Group tweets by ID
    grouped_df = df.groupby('ID').agg(agg_dict).reset_index()

    # Per-ID tweet statistics. MatchID / PeriodID are optional here exactly as
    # they are above; requiring them unconditionally raised KeyError on frames
    # that do not carry them.
    stats_agg = {'Tweet': ['count', lambda x: np.mean([len(t.split()) for t in x])]}
    stats_columns = ['ID', 'tweet_count', 'avg_tweet_length']
    for col in optional_columns:
        stats_agg[col] = 'first'
        stats_columns.append(col)

    tweet_stats = df.groupby('ID').agg(stats_agg).reset_index()
    # Flatten the two-level column index produced by the list aggregation
    tweet_stats.columns = stats_columns

    return grouped_df, tweet_stats

In [None]:

def create_attention_model(input_dim, hidden_dims=[2048, 512, 128, 64]):
    """Build a sigmoid classifier that attention-pools the input vector before an MLP.

    The flat input is reshaped into ``input_dim // 64`` slots of width 64. A
    bias-free ``Dense(1)`` scores each slot, a softmax over the slots turns the
    scores into weights, and the weighted slots are summed into one 64-wide
    vector. That vector then passes through one Dense(relu) -> BatchNormalization
    -> Dropout(0.5) stage per entry of ``hidden_dims`` and a final sigmoid unit.

    Args:
        input_dim: Length of the input feature vector. The Reshape layer needs
            it to be a multiple of 64.
        hidden_dims: Width of each hidden Dense layer, in order.

    Returns:
        An uncompiled Keras ``Model`` mapping ``(batch, input_dim)`` to ``(batch, 1)``.
    """
    slot_width = 64
    n_slots = input_dim // slot_width

    inputs = layers.Input(shape=(input_dim,))

    # View the flat vector as a short sequence: (batch, n_slots, slot_width)
    slots = layers.Reshape((n_slots, slot_width))(inputs)

    # One scalar score per slot, normalised across slots
    scores = layers.Dense(1, use_bias=False)(slots)          # (batch, n_slots, 1)
    scores = layers.Flatten()(scores)                        # (batch, n_slots)
    weights = layers.Activation('softmax')(scores)           # (batch, n_slots)

    # Repeat each slot weight across the slot width so it can scale elementwise
    weights = layers.RepeatVector(slot_width)(weights)       # (batch, slot_width, n_slots)
    weights = layers.Permute([2, 1])(weights)                # (batch, n_slots, slot_width)

    # Weighted sum over the slot axis
    weighted = layers.Multiply()([slots, weights])
    features = layers.Lambda(lambda t: tf.reduce_sum(t, axis=1))(weighted)

    # Regularised dense stack
    for width in hidden_dims:
        features = layers.Dense(width, activation='relu')(features)
        features = layers.BatchNormalization()(features)
        features = layers.Dropout(0.5)(features)

    probability = layers.Dense(1, activation='sigmoid')(features)

    return Model(inputs=inputs, outputs=probability)


def create_model(input_dim, hidden_dims=[512, 256, 128]):
    """Build a plain feed-forward sigmoid classifier.

    Each entry of ``hidden_dims`` adds a Dense(relu) -> BatchNormalization ->
    Dropout(0.3) stage; a single sigmoid unit produces the output.

    Args:
        input_dim: Length of the input feature vector.
        hidden_dims: Width of each hidden Dense layer, in order.

    Returns:
        An uncompiled Keras ``Model`` mapping ``(batch, input_dim)`` to ``(batch, 1)``.
    """
    inputs = layers.Input(shape=(input_dim,))

    hidden = inputs
    for units in hidden_dims:
        hidden = layers.Dense(units, activation='relu')(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Dropout(0.3)(hidden)

    probability = layers.Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=inputs, outputs=probability)

def get_callbacks(model_prefix, model_dir='models_roberta_no_att'):
    """Create the training callbacks for one model.

    All three callbacks watch ``val_accuracy``:
      * EarlyStopping: stop after 10 epochs without improvement and restore the
        best weights.
      * ReduceLROnPlateau: halve the learning rate after 5 epochs without
        improvement, never going below 1e-7.
      * ModelCheckpoint: keep only the best model at
        ``{model_dir}/{model_prefix}_best_model.keras``.

    Args:
        model_prefix: Prefix of the checkpoint file name (e.g. ``'fold_0'``).
        model_dir: Directory the checkpoint is written to. The default keeps the
            original hard-coded location; pass another directory to keep runs
            with different settings apart.

    Returns:
        list of Keras callbacks, in the order listed above.
    """
    # Ensure the checkpoint directory exists before training starts
    os.makedirs(model_dir, exist_ok=True)
    checkpoint_path = f'{model_dir}/{model_prefix}_best_model.keras'

    # Accuracy improves upwards; state the direction explicitly rather than
    # relying on the callbacks inferring it from the metric name.
    monitor = 'val_accuracy'
    mode = 'max'

    callbacks = [
        EarlyStopping(
            monitor=monitor,
            mode=mode,
            patience=10,
            restore_best_weights=True,
            verbose=1
        ),
        ReduceLROnPlateau(
            monitor=monitor,
            mode=mode,
            factor=0.5,
            patience=5,
            min_lr=1e-7,
            verbose=1
        ),
        ModelCheckpoint(
            checkpoint_path,
            monitor=monitor,
            mode=mode,
            save_best_only=True,
            verbose=1
        )
    ]
    return callbacks

In [None]:

def train_models(train_df, n_splits=5, epochs=30, batch_size=32, max_length=512, use_attention=True, embedding_batch_size=None):
    """Train one classifier per cross-validation fold, splitting folds by MatchID.

    Tweets are grouped per ID with ``preprocess_data`` and embedded once with
    ``precompute_all_embeddings``. The unique ``MatchID`` values are then split
    with a shuffled ``KFold`` (seed 42), so all rows of a match fall on the same
    side of each split.

    Args:
        train_df: Tweet-level frame with ``ID``, ``Tweet``, ``EventType`` and
            ``MatchID`` columns.
        n_splits: Number of folds.
        epochs: Maximum epochs per fold (callbacks may stop earlier).
        batch_size: Mini-batch size for Keras ``fit``.
        max_length: Token length used when embedding the text.
        use_attention: Build ``create_attention_model`` when True, otherwise
            ``create_model``.
        embedding_batch_size: Number of text chunks per RoBERTa forward pass.
            ``None`` (default) reuses ``batch_size``, which is the original
            behaviour; set it to tune embedding memory use independently of the
            classifier's training batch size.

    Returns:
        dict with keys:
            'models': the trained Keras model of each fold.
            'histories': ``history.history`` dict of each fold.
            'train_preds': DataFrame of out-of-fold predictions with columns
                ``ID``, ``fold_pred``, ``true_label``, ``fold``.
            'grouped_df', 'tweet_stats': outputs of ``preprocess_data``.
            'fold_data': per fold, the KFold index arrays over the unique
                matches and the boolean row masks over ``grouped_df``.
            'embeddings': the precomputed embedding matrix.
    """
    # Create models directory. Checkpoints are written to the path built in
    # get_callbacks, which is a different directory.
    os.makedirs('models', exist_ok=True)

    if embedding_batch_size is None:
        embedding_batch_size = batch_size

    # Preprocess all data first
    grouped_df, tweet_stats = preprocess_data(train_df)

    # Precompute all embeddings once; every fold slices this matrix
    print("Precomputing embeddings for all data...")
    all_embeddings = precompute_all_embeddings(
        grouped_df,
        batch_size=embedding_batch_size,
        max_length=max_length
    )

    # Folds are defined over matches, not over rows
    unique_matches = train_df['MatchID'].unique()
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    models = []
    histories = []
    all_predictions = []
    fold_data = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(unique_matches)):
        print(f"\nTraining fold {fold + 1}/{n_splits}")

        # Split data based on MatchID
        train_matches = unique_matches[train_idx]
        val_matches = unique_matches[val_idx]

        # Boolean row masks over grouped_df (same row order as all_embeddings)
        train_mask = grouped_df['MatchID'].isin(train_matches)
        val_mask = grouped_df['MatchID'].isin(val_matches)

        # Use pre-computed embeddings
        X_train = all_embeddings[train_mask.to_numpy()]
        X_val = all_embeddings[val_mask.to_numpy()]

        y_train = grouped_df.loc[train_mask, 'EventType'].values
        y_val = grouped_df.loc[val_mask, 'EventType'].values

        # Create and compile model
        if use_attention:
            model = create_attention_model(input_dim=X_train.shape[1])
        else:
            model = create_model(input_dim=X_train.shape[1])

        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Train model
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=get_callbacks(f'fold_{fold}'),
            verbose=1
        )

        # Out-of-fold predictions for the held-out matches
        val_preds = model.predict(X_val)
        fold_predictions = pd.DataFrame({
            'ID': grouped_df.loc[val_mask, 'ID'].values,
            'fold_pred': val_preds.flatten(),
            'true_label': y_val,
            'fold': fold
        })

        # Store results
        models.append(model)
        histories.append(history.history)
        all_predictions.append(fold_predictions)
        fold_data.append({
            'train_indices': train_idx,
            'val_indices': val_idx,
            'train_mask': train_mask,
            'val_mask': val_mask
        })

    # Combine all predictions
    train_preds = pd.concat(all_predictions, axis=0).reset_index(drop=True)

    return {
        'models': models,
        'histories': histories,
        'train_preds': train_preds,
        'grouped_df': grouped_df,
        'tweet_stats': tweet_stats,
        'fold_data': fold_data,
        'embeddings': all_embeddings
    }

In [None]:
# np.save does not create missing directories. Create the output directory up
# front so a missing folder cannot fail the cell after the full training run.
os.makedirs('embeddings', exist_ok=True)

# Train models and get all intermediary data
results = train_models(train_df, n_splits=5, epochs=30, batch_size=8, max_length=512, use_attention=True)

# Access different components
models = results['models']
train_val_preds = results['train_preds']
grouped_df = results['grouped_df']
tweet_stats = results['tweet_stats']
roberta_embeddings = results['embeddings']

# Cache the train embeddings so later cells can load them instead of recomputing
np.save('embeddings/roberta_embeddings_train.npy', roberta_embeddings)


In [None]:
def generate_predictions(df, embeddings, models):
    """Score ``embeddings`` with every model and average the results.

    Args:
        df: DataFrame with an ``ID`` column, one row per row of ``embeddings``
            and in the same order.
        embeddings: Feature matrix passed to each model's ``predict``.
        models: Non-empty sequence of objects exposing ``predict(embeddings)``.

    Returns:
        DataFrame (fresh 0..n-1 index) with columns ``ID``, one
        ``fold_{i}_pred`` per model, and ``mean_pred`` (row-wise mean of the
        fold columns).

    Raises:
        ValueError: If ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("generate_predictions needs at least one model")

    # Every model scores the same rows in the same order, so the columns can be
    # assembled positionally. Joining per-fold frames on ID would multiply rows
    # whenever an ID occurs more than once.
    final_preds = pd.DataFrame({'ID': df['ID'].to_numpy()})
    pred_columns = []

    for fold, model in enumerate(models):
        print(f"Generating predictions for fold {fold}")
        column = f'fold_{fold}_pred'
        final_preds[column] = np.asarray(model.predict(embeddings)).ravel()
        pred_columns.append(column)

    # Calculate mean prediction across folds
    final_preds['mean_pred'] = final_preds[pred_columns].mean(axis=1)

    return final_preds

def _test_embeddings_cache_path(embeddings_path):
    """Return the test-embedding cache path: ``<root>_test<ext>`` of the train path."""
    root, ext = os.path.splitext(embeddings_path)
    return f'{root}_test{ext}'


def _save_embeddings(path, embeddings):
    """Save ``embeddings`` to ``path``, creating the parent directory when missing."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    np.save(path, embeddings)


def create_full_predictions(trained_models, train_df, test_df, load_embeddings=True, embeddings_path='embeddings/roberta_embeddings_train.npy'):
    """Create predictions from every trained model for both train and test data.

    Both frames are grouped with ``preprocess_data``. Embeddings are loaded from
    the cache when allowed and usable, otherwise computed with
    ``precompute_all_embeddings`` and written back to the same cache paths so the
    next call can reuse them.

    Args:
        trained_models: Non-empty sequence of trained models.
        train_df: Tweet-level training frame (must yield an ``EventType`` column).
        test_df: Tweet-level test frame; may be empty.
        load_embeddings: When True, reuse cached embeddings if present.
        embeddings_path: Cache file for the train embeddings. Test embeddings
            are cached next to it with ``_test`` inserted before the extension.

    Returns:
        dict with keys 'train_predictions' (including the true ``EventType``),
        'test_predictions' (``None`` when the test frame has no rows),
        'train_grouped' and 'test_grouped'.

    Raises:
        ValueError: If ``trained_models`` is empty.
    """
    models = trained_models
    if not models:
        raise ValueError("No saved models found!")

    # Process train data
    print("Processing training data...")
    train_grouped, _ = preprocess_data(train_df, is_training=True)

    # Process test data
    print("Processing test data...")
    test_grouped, _ = preprocess_data(test_df, is_training=False)

    has_test_rows = len(test_grouped) > 0
    test_embeddings_path = _test_embeddings_cache_path(embeddings_path)
    test_embeddings = None

    # Decide whether the train cache can be used
    use_cache = load_embeddings and os.path.exists(embeddings_path)
    if use_cache:
        print("Loading pre-computed embeddings...")
        all_embeddings = np.load(embeddings_path)
        # A cache with fewer rows than the grouped frame cannot belong to this
        # data; recompute instead of failing later on a length mismatch.
        if len(all_embeddings) < len(train_grouped):
            print("Cached embeddings do not cover the training data; recomputing...")
            use_cache = False

    if use_cache:
        train_embeddings = all_embeddings[:len(train_grouped)]
        if has_test_rows:
            if os.path.exists(test_embeddings_path):
                cached_test = np.load(test_embeddings_path)
                # Only trust a test cache with exactly one row per test ID
                if len(cached_test) == len(test_grouped):
                    test_embeddings = cached_test
            if test_embeddings is None:
                print("Computing test embeddings...")
                test_embeddings = precompute_all_embeddings(test_grouped)
                _save_embeddings(test_embeddings_path, test_embeddings)
    else:
        print("Computing embeddings...")
        train_embeddings = precompute_all_embeddings(train_grouped)
        # Write to the paths the cache branch reads; the previous hard-coded
        # 'roberta_embeddings*.npy' files were never picked up again.
        _save_embeddings(embeddings_path, train_embeddings)
        if has_test_rows:
            test_embeddings = precompute_all_embeddings(test_grouped)
            _save_embeddings(test_embeddings_path, test_embeddings)

    # Generate predictions
    print("Generating training predictions...")
    train_predictions = generate_predictions(train_grouped, train_embeddings, models)

    # Add true labels to training predictions
    train_predictions = train_predictions.merge(
        train_grouped[['ID', 'EventType']],
        on='ID',
        how='left'
    )

    # Generate test predictions if test data exists
    test_predictions = None
    if has_test_rows:
        print("Generating test predictions...")
        test_predictions = generate_predictions(test_grouped, test_embeddings, models)

    return {
        'train_predictions': train_predictions,
        'test_predictions': test_predictions,
        'train_grouped': train_grouped,
        'test_grouped': test_grouped
    }

In [None]:
# Score the train and test data with every fold model
results = create_full_predictions(
    trained_models=models,
    train_df=train_df,
    test_df=test_df,
)

# Pull the two prediction tables out of the result dict
train_preds, test_preds = results['train_predictions'], results['test_predictions']


In [None]:
# Persist the prediction tables as CSV text. to_csv does not create missing
# directories, so make sure the target folder exists first.
output_dir = 'final_features/512_RoBERTa_attention'
os.makedirs(output_dir, exist_ok=True)

train_preds.to_csv(f'{output_dir}/train_predictions', index=False)

# Test predictions get their own file; writing them to the train path would
# overwrite the train predictions. test_preds is None when the test frame had
# no rows.
if test_preds is not None:
    test_preds.to_csv(f'{output_dir}/test_predictions', index=False)