In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
from transformers import BertTokenizer, BertModel
import torch

# Load the pretrained BERT tokenizer and encoder once at module level so every
# call to generate_embeddings() reuses the same objects. Both are loaded from
# the same checkpoint name so the vocabulary matches the encoder weights.
_BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(_BERT_CHECKPOINT)

# SECTION 1: Data Extraction
def extract_sentences_from_matres(matres_file, sentences_column='bodytext'):
    """Load the non-null values of one text column from a MATRES CSV file.

    Args:
        matres_file: Path (or any object accepted by ``pandas.read_csv``)
            of the CSV file to read.
        sentences_column: Name of the column holding the sentences.
            Defaults to ``'bodytext'``, the column this script has always
            read.

    Returns:
        A list with the column's values in file order, with missing (NaN)
        entries removed.

    Raises:
        KeyError: If ``sentences_column`` is not a column of the CSV. The
            message lists the columns that are available.
    """
    print(f"Processing file: {matres_file}")
    matres_data = pd.read_csv(matres_file)

    # Kept for debugging: shows which columns the file actually provides.
    print(matres_data.columns)

    if sentences_column not in matres_data.columns:
        # Fail with an actionable message instead of pandas' bare KeyError.
        raise KeyError(
            f"Column {sentences_column!r} not found in {matres_file}; "
            f"available columns: {list(matres_data.columns)}"
        )

    sentences = matres_data[sentences_column].dropna().tolist()

    print(f"Extracted {len(sentences)} sentences from {matres_file}")

    return sentences

# SECTION 2: Embedding Generation
def generate_embeddings(sentences, batch_size=32):
    """Encode sentences with BERT and mean-pool the last hidden states.

    Sentences are tokenized and run through the module-level ``tokenizer``
    and ``bert_model`` in batches. Each sentence embedding is the mean of
    its last-layer token vectors; padding positions are excluded from the
    mean through the attention mask, so a sentence's embedding does not
    depend on how long its batch neighbours are (up to floating-point
    rounding).

    Args:
        sentences: Iterable of strings to embed.
        batch_size: Number of sentences sent through the model per forward
            pass. Must be at least 1.

    Returns:
        A 2-D numpy array with one row per sentence and one column per
        hidden unit of the model. For empty input the array has zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    sentences = list(sentences)
    print(f"Generating embeddings for {len(sentences)} sentences")

    if not sentences:
        # np.vstack([]) raises, so build the empty result explicitly.
        embeddings = np.empty((0, bert_model.config.hidden_size), dtype=np.float32)
        print(f"Generated embeddings shape: {embeddings.shape}")
        return embeddings

    pooled_batches = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():  # Inference only: no gradients needed
            hidden = bert_model(**inputs).last_hidden_state
            # Zero out padding tokens, then divide by each sentence's real
            # token count so padding never dilutes the mean.
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled = (hidden * mask).sum(dim=1) / token_counts
        pooled_batches.append(pooled.numpy())

    embeddings = np.vstack(pooled_batches)
    print(f"Generated embeddings shape: {embeddings.shape}")
    return embeddings

# SECTION 3: Temporal Cue Extraction
def extract_temporal_cues(sentences, temporal_keywords):
    """Label each sentence 1 if it contains a temporal keyword, else 0.

    Keywords are matched case-insensitively and only as whole words or
    phrases, so "After lunch" matches the keyword "after" while
    "afternoon" and "worthwhile" do not match "after" / "while".

    Args:
        sentences: Iterable of sentence strings.
        temporal_keywords: Iterable of keyword strings or multi-word
            phrases. Empty strings are ignored; with no usable keyword
            every sentence is labelled 0.

    Returns:
        A 1-D integer numpy array with one 0/1 label per sentence, in
        input order.
    """
    import re  # local import: `re` is not part of the file's import block

    sentences = list(sentences)
    print(f"Extracting temporal cues for {len(sentences)} sentences")

    keywords = [keyword for keyword in temporal_keywords if keyword]
    if keywords:
        # One compiled alternation is built once and reused for every
        # sentence. The lookarounds enforce whole-word matching even for
        # keywords that start or end with a non-word character.
        alternation = "|".join(re.escape(keyword) for keyword in keywords)
        cue_pattern = re.compile(rf"(?<!\w)(?:{alternation})(?!\w)", re.IGNORECASE)
        relevance_scores = [1 if cue_pattern.search(sentence) else 0 for sentence in sentences]
    else:
        relevance_scores = [0] * len(sentences)

    print(f"Relevance scores: {relevance_scores[:5]}")  # Print sample of relevance scores
    # Explicit dtype keeps the result integer-typed even when it is empty.
    return np.array(relevance_scores, dtype=int)

# SECTION 4: Main Workflow
def main(matres_file, temporal_keywords):
    """Train and try out a keyword-relevance classifier on BERT embeddings.

    Reads sentences from ``matres_file``, embeds them, derives 0/1 labels
    from ``temporal_keywords``, fits a small fully connected network on an
    80/20 train/validation split, and prints the predicted label for one
    hard-coded example sentence.

    Args:
        matres_file: Path of the MATRES CSV file to load.
        temporal_keywords: Keywords used to derive the binary labels.
    """
    # Data: sentences -> embeddings (features) and keyword labels (targets).
    sentences = extract_sentences_from_matres(matres_file)
    features = np.asarray(generate_embeddings(sentences))
    labels = np.asarray(extract_temporal_cues(sentences, temporal_keywords))

    # Hold out 20% of the examples for validation; the seed fixes the split.
    X_train, X_val, y_train, y_val = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # Model: a feed-forward (dense) network with dropout and a sigmoid
    # output that yields a probability for the "relevant" class.
    classifier = models.Sequential([
        layers.Input(shape=(features.shape[1],)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid'),
    ])
    classifier.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    classifier.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val))

    # Smoke test: embed one example sentence and threshold the probability
    # at 0.5 to obtain a 0/1 label.
    example = "The meeting will take place after lunch."
    probability = classifier.predict(generate_embeddings([example]))
    label = int(probability[0][0] > 0.5)
    print(f"Predicted relevance score (1: relevant, 0: not relevant): {label}")

# SECTION 5: Execution
# Entry point: runs the full pipeline when the file is executed as a script
# (or as a notebook cell, where __name__ is also "__main__").
if __name__ == "__main__":
    matres_file = 'MATRES.csv'  # Path to the MATRES CSV file, relative to the working directory
    temporal_keywords = ['after', 'before', 'during', 'until', 'while', 'as soon as', 'when']  # Words/phrases whose presence marks a sentence as temporally relevant
    main(matres_file, temporal_keywords)




Processing file: MATRES.csv
Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at',
       'can_the_verb_span_stylecolorblueverb_span_be_anchored_in_time',
       'can_the_verb_span_stylecolorblueverb_span_be_anchored_in_time:confidence',
       'after', 'before', 'bodytext',
       'can_the_verb_span_stylecolorblueverb_span_be_anchored_in_time_gold',
       'docid', 'eventid', 'verb'],
      dtype='object')
Extracted 1188 sentences from MATRES.csv
Generating embeddings for 1188 sentences
Generated embeddings shape: (1188, 768)
Extracting temporal cues for 1188 sentences
Relevance scores: [0, 0, 0, 0, 0]
Epoch 1/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.6661 - loss: 0.6283 - val_accuracy: 0.7353 - val_loss: 0.5681
Epoch 2/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7649 - loss: 0.5542 - val_accuracy: 0.7353 - val_loss: 0.5458
Epoch 3/10
[1m30/30[0m [3