# TimeEval-3 dataset
Events:

Each event has:
Event ID: Unique identifier (e.g., e1, e2, etc.)

Event Text: The verb or action (e.g., "dipped," "falling," etc.)

Parent Sentence: The full sentence containing the event for context.
TIMEX3 Expressions:

Each TIMEX3 expression has:

TIMEX3 ID: Unique identifier (e.g., t1, t2, etc.)
TIMEX3 Text: The temporal expression (e.g., "Tuesday," "three months," etc.)

Parent Sentence: The full sentence containing the TIMEX3 for context.

In [28]:
from bs4 import BeautifulSoup
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to extract events, TIMEX3, and T-LINKS from a TimeEval-3 TML file
def extract_timeeval3_data(file_path):
    with open(file_path, 'r') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'lxml-xml')  # Parse as XML

    # Extract events and their IDs along with context
    events = []
    for event in soup.find_all(lambda tag: tag.name == 'EVENT'):
        eid = event['eid']
        event_text = event.text.strip()  # Get event text
        # Get the parent <TEXT> element
        text = event.find_parent('TEXT')
        context = text.text.strip() if text else 'No context found'
        events.append((eid, event_text, context))

    # Extract temporal expressions (TIMEX3) and their IDs along with context
    timexes = []
    for timex in soup.find_all(lambda tag: tag.name == 'TIMEX3'):
        tid = timex['tid']
        timex_text = timex.text.strip()  # Get TIMEX3 text
        # Get the parent <TEXT> element
        text = timex.find_parent('TEXT')
        context = text.text.strip() if text else 'No context found'
        timexes.append((tid, timex_text, context))

    # Extract T-LINKS
    tlinks = []
    for tlink in soup.find_all('TLINK'):
        tlink_id = tlink.get('lid', None)
        event_id_1 = tlink.get('eventInstanceID', None)
        event_id_2 = tlink.get('relatedToEventInstance', None)
        time_id = tlink.get('relatedToTime', None)
        rel_type = tlink.get('relType', None)

        if event_id_2:
            # Event to Event Relation
            tlinks.append((tlink_id, event_id_1, event_id_2, None, rel_type))
        elif time_id:
            # Event to Time Relation
            tlinks.append((tlink_id, event_id_1, None, time_id, rel_type))

    return events, timexes, tlinks

# Example usage
tml_file_path = 'TimeEval3.tml'  # Ensure this path is correct
events, timexes, tlinks = extract_timeeval3_data(tml_file_path)

# Combine the text data from events and TIMEX3 expressions for tokenization
texts = [text for _, text, _ in events] + [text for _, text, _ in timexes]

# Initialize the Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(texts)

# Pad sequences to a uniform length
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Display tokenized and padded sequences
print("Tokenized and Padded Sequences:")
print(padded_sequences)

# Convert the extracted data into DataFrames for easier analysis (optional)
events_df = pd.DataFrame(events, columns=['Event ID', 'Event Text', 'Parent Sentence'])
timexes_df = pd.DataFrame(timexes, columns=['TIMEX3 ID', 'TIMEX3 Text', 'Parent Sentence'])
tlinks_df = pd.DataFrame(tlinks, columns=['T-LINK ID', 'Event ID 1', 'Event ID 2', 'Time ID', 'Relation'])

# Display the extracted DataFrames
print("Events DataFrame:")
print(events_df.head())  # Display the first few rows of the events DataFrame

print("\nTIMEX3 DataFrame:")
print(timexes_df.head())  # Display the first few rows of the TIMEX3 DataFrame

print("\nT-LINKS DataFrame:")
print(tlinks_df.head())  # Display the first few rows of the T-LINKS DataFrame

# Check if data was extracted
if not events and not timexes and not tlinks:
    print("No events, timexes, or t-links found.")
else:
    # Print the extracted events, timexes, and t-links
    print("Events:")
    for eid, event_text, context in events:
        print(f"Event ID: {eid}, Event Text: {event_text}, Parent Sentence: {context}")

    print("\nTemporal Expressions (TIMEX3):")
    for tid, timex_text, context in timexes:
        print(f"TIMEX3 ID: {tid}, TIMEX3 Text: {timex_text}, Parent Sentence: {context}")

    print("\nT-LINKS:")
    for tlink_id, event_id_1, event_id_2, time_id, rel_type in tlinks:
        if event_id_2:
            print(f"T-LINK ID: {tlink_id}, Event ID 1: {event_id_1}, Event ID 2: {event_id_2}, Relation: {rel_type}")
        elif time_id:
            print(f"T-LINK ID: {tlink_id}, Event ID 1: {event_id_1}, Time ID: {time_id}, Relation: {rel_type}")



Tokenized and Padded Sequences:
[[ 2  0  0]
 [ 3  0  0]
 [ 4  0  0]
 [ 5  0  0]
 [ 1  0  0]
 [ 1  0  0]
 [ 6  0  0]
 [ 7  0  0]
 [ 8  0  0]
 [ 9  0  0]
 [10  0  0]
 [11  0  0]
 [12  0  0]
 [13  0  0]
 [14  0  0]
 [15  0  0]
 [16  0  0]
 [17  0  0]
 [18  0  0]
 [19  0  0]
 [20  0  0]
 [21  0  0]
 [22  0  0]
 [23  0  0]
 [24  0  0]
 [25 26 27]
 [28  0  0]
 [29 30  0]]
Events DataFrame:
  Event ID Event Text                                    Parent Sentence
0       e1     dipped  Malaysian share prices dipped 1.1 percent by m...
1       e2    falling  Malaysian share prices dipped 1.1 percent by m...
2       e3     whammy  Malaysian share prices dipped 1.1 percent by m...
3       e4    reeling  Malaysian share prices dipped 1.1 percent by m...
4       e6       said  Malaysian share prices dipped 1.1 percent by m...

TIMEX3 DataFrame:
  TIMEX3 ID     TIMEX3 Text                                    Parent Sentence
0        t0  April 1 , 1997                                   No context foun

# Preprocessing and Tokenising the TimeBank dataset

In [27]:
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Ensure that you have downloaded the required NLTK resources
nltk.download('punkt')

def extract_events_timex3_tlinks_from_timebank(file_path, max_length=50):
    # Read the TimeBank.tml file
    with open(file_path, 'r', encoding='utf-8') as file:
        xml_content = file.read()

    # Parse the XML content
    soup = BeautifulSoup(xml_content, 'xml')

    # Prepare lists to hold extracted data
    events = []
    timex3_data = []
    tlinks_data = []
    contexts = []

    # Iterate over each <TEXT> tag
    for text_tag in soup.find_all('TEXT'):
        context = text_tag.text.strip()
        contexts.append(context)  # Collect contexts for later processing

        # Extract events
        for event in text_tag.find_all('EVENT'):
            event_id = event['eid']
            event_text = event.text.strip()
            events.append({
                'EVENT ID': event_id,
                'EVENT Text': event_text,
                'Parent Sentence': context
            })

        # Extract TIMEX3
        for timex in text_tag.find_all('TIMEX3'):
            timex_id = timex['tid']
            timex_text = timex.text.strip()
            timex3_data.append({
                'TIMEX3 ID': timex_id,
                'TIMEX3 Text': timex_text,
                'Parent Sentence': context
            })

    # Extract T-LINKS
    for tlink in soup.find_all('TLINK'):
        tlink_id = tlink.get('lid', None)
        event_id_1 = tlink.get('eventInstanceID', None)
        event_id_2 = tlink.get('relatedToEventInstance', None)
        time_id = tlink.get('relatedToTime', None)
        rel_type = tlink.get('relType', None)

        # Determine if it's event-event or event-time
        if event_id_2:
            # Event to Event Relation
            tlinks_data.append({
                'T-LINK ID': tlink_id,
                'Event ID 1': event_id_1,
                'Event ID 2': event_id_2,
                'Relation': rel_type
            })
        elif time_id:
            # Event to Time Relation
            tlinks_data.append({
                'T-LINK ID': tlink_id,
                'Event ID 1': event_id_1,
                'Time ID': time_id,
                'Relation': rel_type
            })

    # Tokenization
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(contexts)
    sequences = tokenizer.texts_to_sequences(contexts)

    # Padding
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

    # Create DataFrames
    events_df = pd.DataFrame(events)
    timex3_df = pd.DataFrame(timex3_data)
    tlinks_df = pd.DataFrame(tlinks_data)

    return events_df, timex3_df, tlinks_df, padded_sequences, tokenizer

# Specify the path to your TimeBank.tml file
file_path = 'TimeBank.tml'  # Adjust this path as necessary

# Extract events, TIMEX3 data, T-LINKS, padded sequences, and tokenizer
events_df, timex3_df, tlinks_df, padded_sequences, tokenizer = extract_events_timex3_tlinks_from_timebank(file_path)

# Display the results
print("Events DataFrame:")
print(events_df)
print("\nTIMEX3 DataFrame:")
print(timex3_df)
print("\nT-LINKS DataFrame:")
print(tlinks_df)

# Check if T-LINKS are empty
if tlinks_df.empty:
    print("\nNo T-LINKS found in the dataset. Please verify the dataset for <TLINK> tags.")
else:
    print("\nPadded Sequences:")
    print(padded_sequences)


Events DataFrame:
   EVENT ID  EVENT Text                                    Parent Sentence
0        e1    watching  NEW YORK _ A Brooklyn woman who was watching h...
1        e2      killed  NEW YORK _ A Brooklyn woman who was watching h...
2        e4     emptied  NEW YORK _ A Brooklyn woman who was watching h...
3        e6        said  NEW YORK _ A Brooklyn woman who was watching h...
4        e7    appeared  NEW YORK _ A Brooklyn woman who was watching h...
5        e9    finished  NEW YORK _ A Brooklyn woman who was watching h...
6       e49    cleaning  NEW YORK _ A Brooklyn woman who was watching h...
7       e10     waiting  NEW YORK _ A Brooklyn woman who was watching h...
8       e11         dry  NEW YORK _ A Brooklyn woman who was watching h...
9       e12        said  NEW YORK _ A Brooklyn woman who was watching h...
10      e13       looks  NEW YORK _ A Brooklyn woman who was watching h...
11      e14    panicked  NEW YORK _ A Brooklyn woman who was watching h...
12     

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Preprocessing and Tokenising mc-taco dataset

In [34]:
import pandas as pd
import nltk
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the MC-TACO dataset
file_path = 'mc-taco.tsv'  # Update this path if needed
df = pd.read_csv(file_path, sep='\t', header=None)

# Filter by type "Event Ordering"
event_ordering_df = df[df[2] == "Event Ordering"]

# Preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

# Apply preprocessing
event_ordering_df['cleaned_text'] = event_ordering_df[1].apply(preprocess_text)

# Tokenize the text
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(event_ordering_df['cleaned_text'])
sequences = tokenizer.texts_to_sequences(event_ordering_df['cleaned_text'])

# Filter out empty sequences
sequences = [seq for seq in sequences if seq]

# Check if there are any non-empty sequences
if sequences:
    # Padding the sequences
    max_length = max(len(seq) for seq in sequences)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

    # Display all data along with tokens and padded sequences
    print("Original Text | Cleaned Text | Tokens | Padded Sequences")
    print("-------------------------------------------------------")
    for index, (original, cleaned, seq, padded_seq) in enumerate(zip(event_ordering_df[1], event_ordering_df['cleaned_text'], sequences, padded_sequences)):
        print(f"{original} | {cleaned} | {seq} | {padded_seq}")
else:
    print("No non-empty sequences found after preprocessing.")


No non-empty sequences found after preprocessing.


In [38]:
import pandas as pd

# Load the MC-TACO dataset
file_path = 'mc-taco.tsv'  # Update this path if needed
df = pd.read_csv(file_path, sep='\t', header=None)

# Display the entire dataset before filtering
print("Dataset Preview Before Filtering:")
print(df.head())

# Drop the first column (if it's not needed)
df = df.drop(columns=[0])

# Remove rows where the values are 'Stationarity' or 'Event Duration'
df = df[~df[1].isin(['Stationarity', 'Event Duration'])]

# Display the entire dataset after filtering
print("\nDataset Preview After Filtering:")
print(df.head())

# Optionally, display the shape of the dataset
print("\nDataset Shape After Filtering:")
print(df.shape)


Dataset Preview Before Filtering:
                                                   0  \
0  Islam later emerged as the majority religion d...   
1  Islam later emerged as the majority religion d...   
2  Islam later emerged as the majority religion d...   
3  Islam later emerged as the majority religion d...   
4  Islam later emerged as the majority religion d...   

                                                   1                 2    3  \
0              Is Islam still the majority religion?  it sometimes was   no   
1              Is Islam still the majority religion?               yes  yes   
2              Is Islam still the majority religion?      it never was   no   
3  How long has a significant Christian minority ...           a month   no   
4  How long has a significant Christian minority ...   a billion weeks   no   

                4  
0    Stationarity  
1    Stationarity  
2    Stationarity  
3  Event Duration  
4  Event Duration  

Dataset Preview After Filtering:
