In [28]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models

# Step 1: Extract sentences from MATRES dataset
def extract_sentences_from_matres(matres_file, sentences_column='bodytext'):
    """Load a MATRES CSV file and return the non-missing values of its text column.

    Args:
        matres_file: Path (or file-like object) handed straight to
            ``pandas.read_csv``.
        sentences_column: Name of the column that holds the sentence text.
            Defaults to ``'bodytext'``.

    Returns:
        list: The values of ``sentences_column`` in file order, with missing
        (NaN) entries removed.

    Raises:
        KeyError: If ``sentences_column`` is not one of the CSV's columns.
    """
    print(f"Processing file: {matres_file}")
    matres_data = pd.read_csv(matres_file)  # Load the CSV file

    print(matres_data.columns)  # Display column names for debugging

    # Fail with an actionable message instead of pandas' bare KeyError: 'name'.
    if sentences_column not in matres_data.columns:
        raise KeyError(
            f"Column {sentences_column!r} not found in {matres_file}; "
            f"available columns: {list(matres_data.columns)}"
        )

    # Rows without text cannot be embedded or labelled, so drop them here.
    sentences = matres_data[sentences_column].dropna().tolist()

    print(f"Extracted {len(sentences)} sentences from {matres_file}")

    return sentences

# Step 2: Generate embeddings (mock implementation for now)
def generate_embeddings(sentences, embedding_dim=300, seed=None):
    """Return one mock (uniform random) embedding vector per sentence.

    This is a placeholder: the sentence text is never inspected, only the
    number of sentences is used. Replace it with a real encoder (e.g. a
    pre-trained model like BERT) for meaningful features.

    Args:
        sentences: Sized collection of sentences; only ``len(sentences)``
            is used.
        embedding_dim: Width of each embedding vector. Defaults to 300.
        seed: Optional seed. When given, the vectors are drawn from a
            dedicated ``np.random.default_rng(seed)`` so the result is
            reproducible; when ``None`` the global NumPy random state is
            used, exactly as before.

    Returns:
        numpy.ndarray: Array of shape ``(len(sentences), embedding_dim)``
        with values in ``[0, 1)``.
    """
    sentence_count = len(sentences)
    print(f"Generating embeddings for {sentence_count} sentences")

    if seed is None:
        # Legacy path: keeps honouring any np.random.seed() set by the caller.
        embeddings = np.random.rand(sentence_count, embedding_dim)
    else:
        embeddings = np.random.default_rng(seed).random((sentence_count, embedding_dim))

    print(f"Generated embeddings shape: {embeddings.shape}")
    return embeddings

# Step 3: Extract temporal cues (mock implementation for now)
def extract_temporal_cues(sentences):
    """Return a mock binary relevance label (0 or 1) for every sentence.

    Placeholder implementation: the labels are drawn at random from the
    global NumPy random state and do not depend on the sentence text. A real
    implementation would analyze each sentence for temporal cues.

    Args:
        sentences: Sized collection of sentences; only its length is used.

    Returns:
        numpy.ndarray: 1-D integer array of length ``len(sentences)``
        containing only 0s and 1s.
    """
    sentence_count = len(sentences)
    print(f"Extracting temporal cues for {sentence_count} sentences")

    # Upper bound is exclusive, so this yields labels from {0, 1}.
    relevance_scores = np.random.randint(low=0, high=2, size=sentence_count)

    # Show only the first few labels as a quick sanity check.
    preview = relevance_scores[:5]
    print(f"Relevance scores: {preview}")
    return relevance_scores

# Step 4: Read the sentence texts out of the MATRES CSV file
matres_sentences = extract_sentences_from_matres('MATRES.csv')

# Step 5: Build the (mock) feature vectors, one row per sentence
embeddings = generate_embeddings(matres_sentences)

# Step 6: Build the (mock) binary relevance labels, one per sentence
relevance_scores = extract_temporal_cues(matres_sentences)

# Step 7: Materialize features and labels as NumPy arrays for Keras
X = np.array(embeddings)  # Input embeddings
y = np.array(relevance_scores)  # Binary temporal relevance labels (1 or 0)

# Step 8: Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 9: Build the classifier.
# NOTE: despite the original "LSTM" label, this is a plain feed-forward
# network (Dense + Dropout layers) over fixed-size sentence embeddings.
model = models.Sequential([
    layers.Input(shape=(X.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid'),  # Single probability for binary classification
])

# Step 10: Configure optimizer, loss and reported metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Step 11: Fit for 10 epochs, reporting validation metrics after each one
model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val))

# Step 12: Score one example sentence.
# The embedding is still the random mock, so the prediction carries no meaning yet.
test_sentence = "The meeting will take place after lunch."
test_embedding = generate_embeddings([test_sentence])
predicted_relevance = model.predict(test_embedding)
predicted_label = int(predicted_relevance[0][0] > 0.5)  # Threshold the sigmoid output at 0.5
print(f"Predicted relevance score (1: relevant, 0: not relevant): {predicted_label}")


Processing file: MATRES.csv
Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at',
       'can_the_verb_span_stylecolorblueverb_span_be_anchored_in_time',
       'can_the_verb_span_stylecolorblueverb_span_be_anchored_in_time:confidence',
       'after', 'before', 'bodytext',
       'can_the_verb_span_stylecolorblueverb_span_be_anchored_in_time_gold',
       'docid', 'eventid', 'verb'],
      dtype='object')
Extracted 1188 sentences from MATRES.csv
Generating embeddings for 1188 sentences
Generated embeddings shape: (1188, 300)
Extracting temporal cues for 1188 sentences
Relevance scores: [1 1 0 0 0]
Epoch 1/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.4752 - loss: 0.8157 - val_accuracy: 0.4748 - val_loss: 0.7114
Epoch 2/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5239 - loss: 0.7033 - val_accuracy: 0.4874 - val_loss: 0.6963
Epoch 3/10
[1m30/30[0m [32m━━