In [4]:
import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

# Function to parse a TML file (TimeML format) and extract events, T-LINKs, and TIMEX3
def parse_tml_with_context(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    events = []
    tlinks = []
    timex3s = []

    # Extract events and TIMEX3
    for s in root.iter('TEXT'):
        sentence_text = s.text
        for event in s.iter('EVENT'):
            event_id = event.attrib['eid']
            event_text = event.text
            events.append({'EVENT ID': event_id, 'EVENT Text': event_text, 'Context Sentence': sentence_text})

        for timex in s.iter('TIMEX3'):
            timex_id = timex.attrib['tid']
            timex_text = timex.text
            timex3s.append({'TIMEX3 ID': timex_id, 'TIMEX3 Text': timex_text})

    # Extract T-LINKs
    for tlink in root.iter('TLINK'):
        event_id_1 = tlink.attrib.get('eventInstanceID')
        event_id_2 = tlink.attrib.get('relatedToEventInstance')
        relation = tlink.attrib.get('relType')

        if event_id_1 and event_id_2:
            tlinks.append({'Event ID 1': event_id_1, 'Event ID 2': event_id_2, 'Relation': relation})

    events_df = pd.DataFrame(events)
    timex3_df = pd.DataFrame(timex3s)
    tlinks_df = pd.DataFrame(tlinks)

    return events_df, timex3_df, tlinks_df

# Load the datasets
timebank_events_df, timebank_timex3_df, timebank_tlinks_df = parse_tml_with_context('TimeBank.tml')
timeeval3_events_df, timeeval3_timex3_df, timeeval3_tlinks_df = parse_tml_with_context('TimeEval3.tml')

# Combine the datasets
combined_events_df = pd.concat([timebank_events_df, timeeval3_events_df], ignore_index=True)
combined_timex3_df = pd.concat([timebank_timex3_df, timeeval3_timex3_df], ignore_index=True)
combined_tlinks_df = pd.concat([timebank_tlinks_df, timeeval3_tlinks_df], ignore_index=True)

# Prepare input data for events
tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined_events_df['EVENT Text'].tolist())  # Fit on event texts
event_sequences = tokenizer.texts_to_sequences(combined_events_df['EVENT Text'].tolist())  # Convert texts to sequences

# Pad event sequences
padded_event_sequences = pad_sequences(event_sequences, padding='post', dtype='int32')  # Pad event sequences

# Create encoded labels
relation_mapping = {relation: idx for idx, relation in enumerate(combined_tlinks_df['Relation'].unique())}
encoded_labels = []
for index, row in combined_events_df.iterrows():
    # Find the corresponding relation for the event
    relation = combined_tlinks_df[
        (combined_tlinks_df['Event ID 1'] == row['EVENT ID']) |
        (combined_tlinks_df['Event ID 2'] == row['EVENT ID'])
    ]['Relation']

    if not relation.empty:
        encoded_labels.append(relation_mapping[relation.values[0]])
    else:
        # Assign a valid class index for "no relation"
        encoded_labels.append(len(relation_mapping))  # Adjust if needed

# Convert to a numpy array
encoded_labels = np.array(encoded_labels)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(padded_event_sequences, encoded_labels, test_size=0.2, random_state=42)

# Define model parameters
vocab_size = len(tokenizer.word_index) + 1  # Size of the vocabulary
embedding_dim = 128  # Dimension of the embedding layer
max_length = X_train.shape[1]  # Maximum length of the input sequences
num_classes = len(relation_mapping) + 1  # Number of classes including "no relation"

# Build the LSTM model with Bidirectional LSTM for better performance
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(Bidirectional(LSTM(64, return_sequences=True, kernel_regularizer=l2(0.001))))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64, kernel_regularizer=l2(0.001))))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax', kernel_regularizer=l2(0.001)))  # +1 for "no relation" class

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping])

# Function to prepare event sequences for a given sentence
def prepare_event_sequences(events, tokenizer, maxlen):
    # Convert events to sequences
    event_sequences = tokenizer.texts_to_sequences(events)
    # Pad sequences
    padded_sequences = pad_sequences(event_sequences, padding='post', maxlen=maxlen)
    return padded_sequences

# Function to order events based on model predictions
def predict_event_order(sentence, model, tokenizer, maxlen):
    # Simple event extraction from the input sentence (for demonstration purposes)
    events = sentence.split(", ")  # You can implement more sophisticated extraction methods

    # Tokenize and pad the event sequences
    event_sequences = prepare_event_sequences(events, tokenizer, maxlen)

    # Make predictions (probabilities for each class)
    predictions = model.predict(event_sequences)

    # Get the predicted order by selecting the highest probability for each event
    predicted_order = np.argsort(np.argmax(predictions, axis=1))  # Sort by predicted indices

    # Return events in the predicted order
    ordered_events = [events[i] for i in predicted_order]
    return ordered_events

# Define a sample sentence with unordered events
sample_sentence = "Alice woke up, made breakfast, attended a meeting, and then went for a run."

# Predict and print the ordered events
ordered_events = predict_event_order(sample_sentence, model, tokenizer, max_length)

print("\nOriginal Events:")
print(sample_sentence)

print("\nPredicted Events in Correct Temporal Order:")
for event in ordered_events:
    print(f"- {event}")


Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 797ms/step - accuracy: 0.2339 - loss: 2.6373 - val_accuracy: 1.0000 - val_loss: 2.5976
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 1.0000 - loss: 2.5946 - val_accuracy: 1.0000 - val_loss: 2.5560
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 1.0000 - loss: 2.5526 - val_accuracy: 1.0000 - val_loss: 2.5152
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 1.0000 - loss: 2.5120 - val_accuracy: 1.0000 - val_loss: 2.4749
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 1.0000 - loss: 2.4701 - val_accuracy: 1.0000 - val_loss: 2.4350
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 1.0000 - loss: 2.4316 - val_accuracy: 1.0000 - val_loss: 2.3953
Epoch 7/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━