In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

# TimeBank source documents, numbered 1 through 4
files = [f'TimeBank{index}.tml' for index in range(1, 5)]

# Accumulators shared by all files (two independent, initially empty lists)
all_events, all_timexes = [], []

# Function to extract events and TIMEX3 from each .tml file
def extract_from_tml(file_path):
    """Collect EVENT and TIMEX3 annotations from one TimeML (.tml) file.

    Args:
        file_path: File name or file object accepted by ``ET.parse``.

    Returns:
        A pair ``(events, timexes)`` in document order. ``events`` is a list
        of ``(class, text)`` tuples and ``timexes`` is a list of
        ``(type, value, text)`` tuples. A missing attribute is ``None``; an
        element without text yields ``""``.

    Raises:
        xml.etree.ElementTree.ParseError: If the input is not well-formed XML.
    """
    def full_text(elem):
        # ``elem.text`` stops at the first child tag, so an annotation that
        # wraps nested markup would be truncated (or come back empty).
        # ``itertext`` concatenates the text of the whole subtree instead.
        return "".join(elem.itertext()).strip()

    root = ET.parse(file_path).getroot()

    events = []
    timexes = []

    for elem in root.iter():
        if elem.tag == "EVENT":
            events.append((elem.attrib.get("class"), full_text(elem)))
        elif elem.tag == "TIMEX3":
            timexes.append(
                (elem.attrib.get("type"), elem.attrib.get("value"), full_text(elem))
            )

    return events, timexes

# Parse every TimeBank file and pool its annotations into the shared lists
for file in files:
    events, timexes = extract_from_tml(file)
    all_events += events
    all_timexes += timexes

# Wrap the pooled tuples in DataFrames for easier exploration
event_columns = ['EventClass', 'EventText']
timex_columns = ['TimeType', 'TimeValue', 'TimexText']
events_df = pd.DataFrame(all_events, columns=event_columns)
timexes_df = pd.DataFrame(all_timexes, columns=timex_columns)

# Show the first rows of each frame
print("Events DataFrame:\n", events_df.head())
print("Timexes DataFrame:\n", timexes_df.head())


Events DataFrame:
    EventClass   EventText
0  OCCURRENCE     turning
1  OCCURRENCE  assistance
2  OCCURRENCE     helping
3  OCCURRENCE      fallen
4  OCCURRENCE        lost
Timexes DataFrame:
    TimeType   TimeValue                   TimexText
0      DATE  1998-01-08                    19980108
1  DURATION         P1W                        week
2  DURATION         P1D  the last twenty four hours
3  DURATION         P5Y                   five year
4  DURATION         P4Y                   four year


**Events DataFrame**


EventClass: Type of event (e.g., "OCCURRENCE").

EventText: The text span annotated as the event (e.g., "turning").


**Timexes DataFrame**

TimeType: Type of temporal expression (e.g., "DATE" or "DURATION").

TimeValue: Normalized value taken from the TIMEX3 `value` attribute (e.g., 1998-01-08 for a date, or P1W for a one-week duration).

TimexText: The text span annotated as the temporal expression (e.g., "19980108" or "the last twenty four hours").

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Combine EventText and TimexText so both share a single vocabulary
event_texts = events_df['EventText'].tolist()
timex_texts = timexes_df['TimexText'].tolist()
all_texts = event_texts + timex_texts

# Initialize and fit the tokenizer.
# An explicit out-of-vocabulary token keeps unseen words as a placeholder index;
# without it, texts_to_sequences silently drops every word not seen during fitting,
# which would strip most of any new sentence passed in later.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(all_texts)

# Convert EventText and TimexText to integer sequences
event_sequences = tokenizer.texts_to_sequences(event_texts)
timex_sequences = tokenizer.texts_to_sequences(timex_texts)

# Longest sequence across both sets; default=0 avoids a ValueError from max()
# when either DataFrame is empty
max_length = max(
    max((len(seq) for seq in event_sequences), default=0),
    max((len(seq) for seq in timex_sequences), default=0),
)

# Pad (after the tokens) so every row has max_length entries
padded_event_sequences = pad_sequences(event_sequences, maxlen=max_length, padding='post')
padded_timex_sequences = pad_sequences(timex_sequences, maxlen=max_length, padding='post')



In [3]:
from datetime import datetime
import re

# Reference "now" used by the date comparison in determine_validity.
# datetime.now() without a tz argument returns a naive local-time datetime;
# it is captured once, when this cell runs.
current_date = datetime.now()

# Function to determine if an event has decayed
def determine_validity(event_date, reference_date=None):
    """Label a normalized TIMEX3 value as still valid (1) or decayed (0).

    Args:
        event_date: TIMEX3 ``value`` string, e.g. ``"1998-01-08"``,
            ``"1998-01-08T13:00:00"`` or a duration such as ``"P1W"``.
            ``None``, empty strings and non-string values yield 0.
        reference_date: Datetime that calendar dates are compared against.
            When omitted, the notebook-level ``current_date`` is used.

    Returns:
        1 when the value is a calendar date no more than 365 days before the
        reference date, or a duration expressed in years or months, or a
        duration of at most 30 days or at most 4 weeks. 0 in every other
        case, including values that cannot be interpreted.
    """
    if not isinstance(event_date, str) or not event_date:
        return 0

    # Calendar date, optionally followed by a HH:MM:SS time
    date_match = re.match(r'\d{4}-\d{2}-\d{2}(T\d{2}:\d{2}:\d{2})?', event_date)
    if date_match:
        event_datetime = None
        # The pattern only anchors the start, so the value may carry a suffix
        # that is not ISO 8601 (e.g. "1998-01-08TNI"). Try the whole value
        # first, then fall back to just the matched date/time prefix instead
        # of letting fromisoformat raise.
        for candidate in (event_date, date_match.group(0)):
            try:
                event_datetime = datetime.fromisoformat(candidate)
                break
            except ValueError:
                continue
        if event_datetime is None:
            return 0  # Looks like a date but is not one (e.g. month 13)

        if reference_date is None:
            reference_date = current_date
        return 1 if (reference_date - event_datetime).days <= 365 else 0

    # Duration such as P1W or P5Y
    duration_match = re.match(r'P(\d+)([YMDW])', event_date)
    if duration_match:
        duration_value = int(duration_match.group(1))
        duration_unit = duration_match.group(2)

        if duration_unit in ('Y', 'M'):  # Years and months: always valid
            return 1
        if duration_unit == 'D':  # Days: valid up to and including 30
            return 1 if duration_value <= 30 else 0
        return 1 if duration_value <= 4 else 0  # Weeks: valid up to and including 4

    return 0  # Neither a recognizable date nor a recognizable duration

# Create labels for events based on the closest TIMEX
def get_validity_labels(events_df, timexes_df):
    """Build one validity label (0 or 1) per row of ``events_df``.

    No event-to-TIMEX alignment is performed here: every event receives the
    label of the FIRST row of ``timexes_df`` (its ``TimeValue`` passed through
    ``determine_validity``), so all returned labels are identical. Replace
    this with a real event/TIMEX mapping before using the labels as training
    targets.

    Args:
        events_df: DataFrame of events; only its row count is used.
        timexes_df: DataFrame with a ``TimeValue`` column.

    Returns:
        A list of ints with ``len(events_df)`` entries. All entries are 0
        when ``timexes_df`` is empty.
    """
    if timexes_df.empty:
        shared_label = 0  # No valid TIMEX found
    else:
        # The label does not depend on the event, so compute it once rather
        # than once per row.
        shared_label = determine_validity(timexes_df.iloc[0]['TimeValue'])

    return [shared_label] * len(events_df)

# Compute one label per event, then attach them as a new column
validity_labels = get_validity_labels(events_df, timexes_df)
events_df['Validity'] = validity_labels

# Report the row count next to the label count
n_events = len(events_df)
n_labels = len(events_df['Validity'])
print("Events DataFrame Length:", n_events)
print("Labels Length:", n_labels)


Events DataFrame Length: 184
Labels Length: 184


In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Imported here because this cell's import line does not include Input
from tensorflow.keras.layers import Input

# Parameters
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding
embedding_dim = 100
max_length = padded_event_sequences.shape[1]

# Build the model.
# The input shape is declared with an explicit Input layer: Embedding's
# `input_length` argument is deprecated in Keras 3 and only triggers a warning.
model = Sequential([
    Input(shape=(max_length,)),
    Embedding(vocab_size, embedding_dim),
    LSTM(128, return_sequences=False),
    Dense(1, activation='sigmoid'),  # Binary classification
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])




In [8]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sentence to classify
new_sentence = "The project was completed in June 2022, and the results were submitted for review."

# Encode with the fitted tokenizer, then pad to the model's input length
sequence = tokenizer.texts_to_sequences([new_sentence])
padded_sequence = pad_sequences(sequence, padding='post', maxlen=max_length)

# Run the model on the single padded row.
# NOTE(review): no model.fit(...) call is visible in this notebook; confirm the
# model was actually trained before relying on this prediction.
prediction = model.predict(padded_sequence)

# Threshold the single sigmoid output at 0.5
if prediction[0][0] > 0.5:
    validity = 'valid'
else:
    validity = 'decayed'

# Report the outcome
print(f"The information is {validity}.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
The information is decayed.
