In [12]:
import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

# Function to parse a TML file (TimeML format) and extract events, T-LINKs, and TIMEX3
def parse_tml_with_context(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    events = []
    tlinks = []
    timex3s = []

    # Extract events and TIMEX3
    for s in root.iter('TEXT'):
        sentence_text = s.text
        for event in s.iter('EVENT'):
            event_id = event.attrib['eid']
            event_text = event.text
            events.append({'EVENT ID': event_id, 'EVENT Text': event_text, 'Context Sentence': sentence_text})

        for timex in s.iter('TIMEX3'):
            timex_id = timex.attrib['tid']
            timex_text = timex.text
            timex3s.append({'TIMEX3 ID': timex_id, 'TIMEX3 Text': timex_text})

    # Extract T-LINKs
    for tlink in root.iter('TLINK'):
        event_id_1 = tlink.attrib.get('eventInstanceID')
        event_id_2 = tlink.attrib.get('relatedToEventInstance')
        relation = tlink.attrib.get('relType')

        if event_id_1 and event_id_2:
            tlinks.append({'Event ID 1': event_id_1, 'Event ID 2': event_id_2, 'Relation': relation})

    events_df = pd.DataFrame(events)
    timex3_df = pd.DataFrame(timex3s)
    tlinks_df = pd.DataFrame(tlinks)

    return events_df, timex3_df, tlinks_df

# Load the datasets
timebank_events_df, timebank_timex3_df, timebank_tlinks_df = parse_tml_with_context('TimeBank.tml')
timeeval3_events_df, timeeval3_timex3_df, timeeval3_tlinks_df = parse_tml_with_context('TimeEval3.tml')

# Combine the datasets
combined_events_df = pd.concat([timebank_events_df, timeeval3_events_df], ignore_index=True)
combined_timex3_df = pd.concat([timebank_timex3_df, timeeval3_timex3_df], ignore_index=True)
combined_tlinks_df = pd.concat([timebank_tlinks_df, timeeval3_tlinks_df], ignore_index=True)

# Tokenization and padding
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        # Tokenize and pad sequences
        tokenized_text = self.tokenizer(text)
        # Ensure padding length matches max_length
        if len(tokenized_text) < self.max_length:
            padded_text = np.pad(tokenized_text, (0, self.max_length - len(tokenized_text)), 'constant')
        else:
            padded_text = tokenized_text[:self.max_length]
        return torch.tensor(padded_text, dtype=torch.long), torch.tensor(label, dtype=torch.long)

# Prepare input data for events
class Tokenizer:
    def __init__(self):
        self.word_index = {}
        self.index_word = {}

    def fit_on_texts(self, texts):
        words = set()
        for text in texts:
            for word in text.split():
                words.add(word)

        self.word_index = {word: i + 1 for i, word in enumerate(words)}
        self.index_word = {i + 1: word for i, word in enumerate(words)}

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            sequence = [self.word_index.get(word, 0) for word in text.split()]
            sequences.append(sequence)
        return sequences

# Initialize and fit the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined_events_df['EVENT Text'].tolist())
event_sequences = tokenizer.texts_to_sequences(combined_events_df['EVENT Text'].tolist())

# Create encoded labels
relation_mapping = {relation: idx for idx, relation in enumerate(combined_tlinks_df['Relation'].unique())}
encoded_labels = []
for index, row in combined_events_df.iterrows():
    relation = combined_tlinks_df[
        (combined_tlinks_df['Event ID 1'] == row['EVENT ID']) |
        (combined_tlinks_df['Event ID 2'] == row['EVENT ID'])
    ]['Relation']

    if not relation.empty:
        encoded_labels.append(relation_mapping[relation.values[0]])
    else:
        encoded_labels.append(len(relation_mapping))  # Adjust if needed

# Convert to a numpy array
encoded_labels = np.array(encoded_labels)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(combined_events_df['EVENT Text'].tolist(), encoded_labels, test_size=0.2, random_state=42)

# Create datasets
max_length = max(len(seq) for seq in event_sequences)  # Calculate maximum sequence length
train_dataset = TextDataset(X_train, y_train, tokenizer=tokenizer.texts_to_sequences, max_length=max_length)
val_dataset = TextDataset(X_val, y_val, tokenizer=tokenizer.texts_to_sequences, max_length=max_length)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.dropout(x)
        x = self.fc(x[:, -1, :])  # Get the last time step output
        return x

# Define model parameters
vocab_size = len(tokenizer.word_index) + 1  # Size of the vocabulary
embedding_dim = 128  # Dimension of the embedding layer
hidden_dim = 64  # Number of hidden units
num_classes = len(relation_mapping) + 1  # Number of classes including "no relation"

# Create model
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_classes)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for texts, labels in train_loader:
        optimizer.zero_grad()

        # Reshape the input to be 2D: (batch_size, sequence_length)
        outputs = model(texts.view(texts.size(0), -1))  # Flatten to ensure it is 2D
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation step
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            outputs = model(texts.view(texts.size(0), -1))  # Ensure it is 2D
            val_loss += criterion(outputs, labels).item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Val Loss: {val_loss/len(val_loader):.4f}')

# Function to order events based on model predictions
def predict_event_order(sentence, model, tokenizer, maxlen):
    events = sentence.split(", ")  # Simple event extraction from the input sentence
    event_sequences = tokenizer.texts_to_sequences(events)

    # Prepare padded sequences
    padded_sequences = np.array([np.pad(seq, (0, maxlen - len(seq)), 'constant') if len(seq) < maxlen else seq[:maxlen] for seq in event_sequences])
    with torch.no_grad():
        predictions = model(torch.tensor(padded_sequences, dtype=torch.long))

    # Get the predicted order by selecting the highest probability for each event
    predicted_order = np.argsort(np.argmax(predictions.numpy(), axis=1))

    # Return events in the predicted order
    ordered_events = [events[i] for i in predicted_order]
    return ordered_events

# Define a sample sentence with unordered events
sample_sentence = "Alice went for a meeting, woke up, made breakfast, attended a meeting, and then went for a run."

# Predict and print the ordered events
ordered_events = predict_event_order(sample_sentence, model, tokenizer, max_length)

print("\nOriginal Events:")
print(sample_sentence)

print("\nPredicted Events in Correct Temporal Order:")
print(ordered_events)


Epoch [1/10], Loss: 1.6779, Val Loss: 1.4991
Epoch [2/10], Loss: 1.3449, Val Loss: 1.2096
Epoch [3/10], Loss: 1.0538, Val Loss: 0.9443
Epoch [4/10], Loss: 0.8205, Val Loss: 0.7113
Epoch [5/10], Loss: 0.6195, Val Loss: 0.5176
Epoch [6/10], Loss: 0.5162, Val Loss: 0.3657
Epoch [7/10], Loss: 0.3279, Val Loss: 0.2533
Epoch [8/10], Loss: 0.2390, Val Loss: 0.1739
Epoch [9/10], Loss: 0.1541, Val Loss: 0.1198
Epoch [10/10], Loss: 0.1051, Val Loss: 0.0839

Original Events:
Alice went for a meeting, woke up, made breakfast, attended a meeting, and then went for a run.

Predicted Events in Correct Temporal Order:
['Alice went for a meeting', 'woke up', 'made breakfast', 'attended a meeting', 'and then went for a run.']


In [9]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# After validation step
model.eval()
val_predictions = []
val_labels = []

with torch.no_grad():
    for texts, labels in val_loader:
        outputs = model(texts.view(texts.size(0), -1))
        _, predicted = torch.max(outputs.data, 1)

        val_predictions.extend(predicted.numpy())
        val_labels.extend(labels.numpy())

# Calculate metrics
accuracy = accuracy_score(val_labels, val_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(val_labels, val_predictions, average='weighted')

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')



Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
