# Preprocessing and Tokenising the TimeBank dataset

In [38]:
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Ensure that you have downloaded the required NLTK resources
nltk.download('punkt')

def extract_events_timex3_tlinks_from_timebank(file_path, max_length=50):
    # Read the TimeBank.tml file
    with open(file_path, 'r', encoding='utf-8') as file:
        xml_content = file.read()

    # Parse the XML content
    soup = BeautifulSoup(xml_content, 'xml')

    # Prepare lists to hold extracted data
    events = []
    timex3_data = []
    tlinks_data = []
    contexts = []

    # Iterate over each <TEXT> tag
    for text_tag in soup.find_all('TEXT'):
        context = text_tag.text.strip()
        contexts.append(context)  # Collect contexts for later processing

        # Extract events
        for event in text_tag.find_all('EVENT'):
            event_id = event['eid']
            event_text = event.text.strip()
            events.append({
                'EVENT ID': event_id,
                'EVENT Text': event_text,
                'Parent Sentence': context
            })

        # Extract TIMEX3
        for timex in text_tag.find_all('TIMEX3'):
            timex_id = timex['tid']
            timex_text = timex.text.strip()
            timex3_data.append({
                'TIMEX3 ID': timex_id,
                'TIMEX3 Text': timex_text,
                'Parent Sentence': context
            })

    # Extract T-LINKS
    for tlink in soup.find_all('TLINK'):
        tlink_id = tlink.get('lid', None)
        event_id_1 = tlink.get('eventInstanceID', None)
        event_id_2 = tlink.get('relatedToEventInstance', None)
        time_id = tlink.get('relatedToTime', None)
        rel_type = tlink.get('relType', None)

        # Determine if it's event-event or event-time
        if event_id_2:
            # Event to Event Relation
            tlinks_data.append({
                'T-LINK ID': tlink_id,
                'Event ID 1': event_id_1,
                'Event ID 2': event_id_2,
                'Relation': rel_type
            })
        elif time_id:
            # Event to Time Relation
            tlinks_data.append({
                'T-LINK ID': tlink_id,
                'Event ID 1': event_id_1,
                'Time ID': time_id,
                'Relation': rel_type
            })

    # Tokenization
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(contexts)
    sequences = tokenizer.texts_to_sequences(contexts)

    # Padding
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

    # Create DataFrames
    events_df = pd.DataFrame(events)
    timex3_df = pd.DataFrame(timex3_data)
    tlinks_df = pd.DataFrame(tlinks_data)

    return events_df, timex3_df, tlinks_df, padded_sequences, tokenizer

# Specify the path to your TimeBank.tml file
file_path = 'TimeBank.tml'  # Adjust this path as necessary

# Extract events, TIMEX3 data, T-LINKS, padded sequences, and tokenizer
events_df, timex3_df, tlinks_df, padded_sequences, tokenizer = extract_events_timex3_tlinks_from_timebank(file_path)

# Display the results with clear column headers
print("Events DataFrame:")
if not events_df.empty:
    print(events_df.to_string(index=False))
else:
    print("No events found.")

print("\nTIMEX3 DataFrame:")
if not timex3_df.empty:
    print(timex3_df.to_string(index=False))
else:
    print("No TIMEX3 data found.")

print("\nT-LINKS DataFrame:")
if not tlinks_df.empty:
    print(tlinks_df.to_string(index=False))
else:
    print("No T-LINKS found in the dataset. Please verify the dataset for <TLINK> tags.")

# Display padded sequences with headers
if padded_sequences.size > 0:
    print("\nPadded Sequences:")
    print(padded_sequences)
else:
    print("\nNo padded sequences available.")


Events DataFrame:
EVENT ID EVENT Text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Preprocessing and Tokenising TimeEval-3

In [39]:
from bs4 import BeautifulSoup
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to extract events, TIMEX3, and T-LINKS from a TimeEval-3 TML file
def extract_timeeval3_data(file_path):
    with open(file_path, 'r') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'lxml-xml')  # Parse as XML

    # Extract events and their IDs along with context
    events = []
    for event in soup.find_all(lambda tag: tag.name == 'EVENT'):
        eid = event['eid']
        event_text = event.text.strip()  # Get event text
        # Get the parent <TEXT> element
        text = event.find_parent('TEXT')
        context = text.text.strip() if text else 'No context found'
        events.append((eid, event_text, context))

    # Extract temporal expressions (TIMEX3) and their IDs along with context
    timexes = []
    for timex in soup.find_all(lambda tag: tag.name == 'TIMEX3'):
        tid = timex['tid']
        timex_text = timex.text.strip()  # Get TIMEX3 text
        # Get the parent <TEXT> element
        text = timex.find_parent('TEXT')
        context = text.text.strip() if text else 'No context found'
        timexes.append((tid, timex_text, context))

    # Extract T-LINKS
    tlinks = []
    for tlink in soup.find_all('TLINK'):
        tlink_id = tlink.get('lid', None)
        event_id_1 = tlink.get('eventInstanceID', None)
        event_id_2 = tlink.get('relatedToEventInstance', None)
        time_id = tlink.get('relatedToTime', None)
        rel_type = tlink.get('relType', None)

        if event_id_2:
            # Event to Event Relation
            tlinks.append((tlink_id, event_id_1, event_id_2, None, rel_type))
        elif time_id:
            # Event to Time Relation
            tlinks.append((tlink_id, event_id_1, None, time_id, rel_type))

    return events, timexes, tlinks

# Example usage
tml_file_path = 'TimeEval3.tml'  # Ensure this path is correct
events, timexes, tlinks = extract_timeeval3_data(tml_file_path)

# Combine the text data from events and TIMEX3 expressions for tokenization
texts = [text for _, text, _ in events] + [text for _, text, _ in timexes]

# Initialize the Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(texts)

# Pad sequences to a uniform length
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Display tokenized and padded sequences
print("Tokenized and Padded Sequences:")
print(padded_sequences)

# Convert the extracted data into DataFrames for easier analysis (optional)
events_df = pd.DataFrame(events, columns=['Event ID', 'Event Text', 'Parent Sentence'])
timexes_df = pd.DataFrame(timexes, columns=['TIMEX3 ID', 'TIMEX3 Text', 'Parent Sentence'])
tlinks_df = pd.DataFrame(tlinks, columns=['T-LINK ID', 'Event ID 1', 'Event ID 2', 'Time ID', 'Relation'])

# Display the extracted DataFrames with column names
print("Events DataFrame:")
print(events_df.head())  # Display the first few rows of the events DataFrame

print("\nTIMEX3 DataFrame:")
print(timexes_df.head())  # Display the first few rows of the TIMEX3 DataFrame

print("\nT-LINKS DataFrame:")
print(tlinks_df.head())  # Display the first few rows of the T-LINKS DataFrame

# Check if data was extracted
if not events and not timexes and not tlinks:
    print("No events, timexes, or t-links found.")
else:
    # Print the extracted events, timexes, and t-links with headers
    print("\nEvents:")
    print("{:<10} {:<50} {:<50}".format("Event ID", "Event Text", "Parent Sentence"))  # Header
    for eid, event_text, context in events:
        print("{:<10} {:<50} {:<50}".format(eid, event_text, context))

    print("\nTemporal Expressions (TIMEX3):")
    print("{:<10} {:<50} {:<50}".format("TIMEX3 ID", "TIMEX3 Text", "Parent Sentence"))  # Header
    for tid, timex_text, context in timexes:
        print("{:<10} {:<50} {:<50}".format(tid, timex_text, context))

    print("\nT-LINKS:")
    print("{:<10} {:<15} {:<15} {:<15} {:<15}".format("T-LINK ID", "Event ID 1", "Event ID 2", "Time ID", "Relation"))  # Header
    for tlink_id, event_id_1, event_id_2, time_id, rel_type in tlinks:
        if event_id_2:
            print("{:<10} {:<15} {:<15} {:<15} {:<15}".format(tlink_id, event_id_1, event_id_2, 'N/A', rel_type))
        elif time_id:
            print("{:<10} {:<15} {:<15} {:<15} {:<15}".format(tlink_id, event_id_1, 'N/A', time_id, rel_type))


Tokenized and Padded Sequences:
[[ 2  0  0]
 [ 3  0  0]
 [ 4  0  0]
 [ 5  0  0]
 [ 1  0  0]
 [ 1  0  0]
 [ 6  0  0]
 [ 7  0  0]
 [ 8  0  0]
 [ 9  0  0]
 [10  0  0]
 [11  0  0]
 [12  0  0]
 [13  0  0]
 [14  0  0]
 [15  0  0]
 [16  0  0]
 [17  0  0]
 [18  0  0]
 [19  0  0]
 [20  0  0]
 [21  0  0]
 [22  0  0]
 [23  0  0]
 [24  0  0]
 [25 26 27]
 [28  0  0]
 [29 30  0]]
Events DataFrame:
  Event ID Event Text                                    Parent Sentence
0       e1     dipped  Malaysian share prices dipped 1.1 percent by m...
1       e2    falling  Malaysian share prices dipped 1.1 percent by m...
2       e3     whammy  Malaysian share prices dipped 1.1 percent by m...
3       e4    reeling  Malaysian share prices dipped 1.1 percent by m...
4       e6       said  Malaysian share prices dipped 1.1 percent by m...

TIMEX3 DataFrame:
  TIMEX3 ID     TIMEX3 Text                                    Parent Sentence
0        t0  April 1 , 1997                                   No context foun

# Preprocessing and Tokenising mc-taco dataset

In [35]:
import pandas as pd
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the MCTACO dataset
mctaco_df = pd.read_csv('mctaco.tsv', sep='\t', header=None, names=["Question Context", "Question", "Answer", "Label"])

# Ensure all values in 'Question Context', 'Question', and 'Answer' are strings and handle NaNs
mctaco_df['Question Context'] = mctaco_df['Question Context'].fillna('').astype(str)
mctaco_df['Question'] = mctaco_df['Question'].fillna('').astype(str)
mctaco_df['Answer'] = mctaco_df['Answer'].fillna('').astype(str)

# Tokenize the 'Question Context', 'Question', and 'Answer' columns
mctaco_df['Tokenized Question Context'] = mctaco_df['Question Context'].apply(word_tokenize)
mctaco_df['Tokenized Question'] = mctaco_df['Question'].apply(word_tokenize)
mctaco_df['Tokenized Answer'] = mctaco_df['Answer'].apply(word_tokenize)

# Enhanced function to extract Temporal Ordering from the Label
def enhanced_extract_temporal_ordering(label):
    temporal_ordering_indicators = [
        "before", "after", "first", "next", "then", "sequence",
        "order", "subsequently", "preceding", "following", "chronology"
    ]
    return any(indicator in label.lower() for indicator in temporal_ordering_indicators)

# Apply the function to extract 'Temporal Ordering' property
mctaco_df['Temporal Ordering'] = mctaco_df['Label'].apply(enhanced_extract_temporal_ordering)

# Drop the 'Label' column
mctaco_df = mctaco_df.drop(columns=['Label'])

# Initialize the tokenizer and fit on the tokenized text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(mctaco_df['Tokenized Question Context'] + mctaco_df['Tokenized Question'] + mctaco_df['Tokenized Answer'])

# Convert the tokenized columns into sequences
mctaco_df['Question Context Sequences'] = tokenizer.texts_to_sequences(mctaco_df['Tokenized Question Context'])
mctaco_df['Question Sequences'] = tokenizer.texts_to_sequences(mctaco_df['Tokenized Question'])
mctaco_df['Answer Sequences'] = tokenizer.texts_to_sequences(mctaco_df['Tokenized Answer'])

# Pad the sequences
max_len_context = max(len(seq) for seq in mctaco_df['Question Context Sequences'])
max_len_question = max(len(seq) for seq in mctaco_df['Question Sequences'])
max_len_answer = max(len(seq) for seq in mctaco_df['Answer Sequences'])

mctaco_df['Padded Question Context Sequences'] = list(pad_sequences(mctaco_df['Question Context Sequences'], maxlen=max_len_context, padding='post'))
mctaco_df['Padded Question Sequences'] = list(pad_sequences(mctaco_df['Question Sequences'], maxlen=max_len_question, padding='post'))
mctaco_df['Padded Answer Sequences'] = list(pad_sequences(mctaco_df['Answer Sequences'], maxlen=max_len_answer, padding='post'))

# Rename columns for clarity
mctaco_df = mctaco_df[
    [
        'Question Context',
        'Question',
        'Answer',
        'Tokenized Question Context',
        'Tokenized Question',
        'Tokenized Answer',
        'Temporal Ordering',
        'Question Context Sequences',
        'Question Sequences',
        'Answer Sequences',
        'Padded Question Context Sequences',
        'Padded Question Sequences',
        'Padded Answer Sequences'
    ]
]

# Display the entire DataFrame with clear column names
print("\nPreprocessed and Tokenized Dataset with Temporal Ordering and Padded Sequences:")
print(mctaco_df.head())  # Displaying only the first few rows for brevity

# Optionally, display the shape of the final DataFrame
print("\nFinal Dataset Shape:")
print(mctaco_df.shape)



Preprocessed and Tokenized Dataset with Temporal Ordering and Padded Sequences:
                                                                Question Context  \
Durer's father died in 1502, and his mother die...  How long was his mother ill?   
Durer's father died in 1502, and his mother die...  How long was his mother ill?   
Durer's father died in 1502, and his mother die...  How long was his mother ill?   
Durer's father died in 1502, and his mother die...  How long was his mother ill?   
Durer's father died in 1502, and his mother die...  How long was his mother ill?   

                                                                      Question  \
Durer's father died in 1502, and his mother die...  she was ill for 30 seconds   
Durer's father died in 1502, and his mother die...               six centuries   
Durer's father died in 1502, and his mother die...    she was ill for 90 years   
Durer's father died in 1502, and his mother die...                    6 months   
Dure

Verifying if we're actualy extracting temporal ordering and its not false everywhere

In [36]:

print(mctaco_df[mctaco_df['Temporal Ordering'] == False].head(10))


print(mctaco_df[mctaco_df['Temporal Ordering'] == True].head(10))


                                                                Question Context  \
Durer's father died in 1502, and his mother die...  How long was his mother ill?   
Durer's father died in 1502, and his mother die...  How long was his mother ill?   
Durer's father died in 1502, and his mother die...  How long was his mother ill?   
Durer's father died in 1502, and his mother die...  How long was his mother ill?   
Durer's father died in 1502, and his mother die...  How long was his mother ill?   
Durer's father died in 1502, and his mother die...  How long was his mother ill?   
Durer's father died in 1502, and his mother die...  How long was his mother ill?   
Durer's father died in 1502, and his mother die...  How long was his mother ill?   
Durer's father died in 1502, and his mother die...  How long was his mother ill?   
Durer's father died in 1502, and his mother die...  How long was his mother ill?   

                                                                      Quest