# About WikiEvents:



# Problem Posing

You want the model to read a sentence or sequence of sentences that contain multiple events and predict the chronological order in which those events occurred. The challenge is not just to extract events but to sequence them correctly based on temporal information in the text.

Summary of Inputs/Outputs

Input: A sentence with multiple events and temporal markers.

Output: The actual events in the correct chronological order.

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
import pandas as pd
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the WikiEvent Dataset
def load_data(input_file, target_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        input_sentences = f.readlines()

    with open(target_file, 'r', encoding='utf-8') as f:
        target_events = f.readlines()

    return input_sentences, target_events

# Tokenize the data
def tokenize_data(input_sentences, target_events):
    tokenized_inputs = [word_tokenize(sentence.lower()) for sentence in input_sentences]
    tokenized_targets = [events.strip().split(',') for events in target_events]  # Comma-separated events

    return tokenized_inputs, tokenized_targets

# Create Mappings for Events
def create_mappings(tokenized_targets):
    event_set = set(event for events in tokenized_targets for event in events)
    event_to_index = {event: i for i, event in enumerate(event_set)}
    index_to_event = {i: event for event, i in event_to_index.items()}

    return event_to_index, index_to_event

# Encode Targets
def encode_targets(tokenized_targets, event_to_index):
    encoded_targets = [[event_to_index.get(event, 0) for event in events] for events in tokenized_targets]
    return encoded_targets

# Function to convert input sequences to indices
def input_to_index(sequence, vocab):
    return [vocab.get(word, 0) for word in sequence]  # 0 for unknown words

# Pad Sequences
def pad_sequences_data(tokenized_inputs, encoded_targets, vocab, event_to_index):
    # Find the maximum length between inputs and targets
    max_input_length = max(len(seq) for seq in tokenized_inputs)
    max_target_length = max(len(seq) for seq in encoded_targets)
    max_sequence_length = max(max_input_length, max_target_length)

    # Pad both input and target sequences to the same max length
    padded_inputs = pad_sequences([input_to_index(seq, vocab) for seq in tokenized_inputs],
                                  padding='post', maxlen=max_sequence_length)
    padded_targets = pad_sequences(encoded_targets,
                                   padding='post', maxlen=max_sequence_length)

    # Reshape padded_targets to be (num_samples, max_target_length, 1)
    padded_targets = padded_targets[..., None]  # Add an extra dimension

    return padded_inputs, padded_targets, max_sequence_length  # Return max_sequence_length

# Main Function to Execute Preprocessing
def preprocess_wikievent(input_file, target_file):
    # Step 1: Load the data
    input_sentences, target_events = load_data(input_file, target_file)

    # Create DataFrames for raw data
    input_df = pd.DataFrame({'Input Sentences': input_sentences})
    target_df = pd.DataFrame({'Target Events': target_events})

    # Print DataFrame attributes
    print("Input Sentences DataFrame Attributes:")
    print(f"Shape: {input_df.shape}")
    print("Columns:", input_df.columns.tolist())
    print("\nSample Data:\n", input_df.head())

    print("\nTarget Events DataFrame Attributes:")
    print(f"Shape: {target_df.shape}")
    print("Columns:", target_df.columns.tolist())
    print("\nSample Data:\n", target_df.head())

    # Step 2: Tokenization
    tokenized_inputs, tokenized_targets = tokenize_data(input_sentences, target_events)

    # Step 3: Create Mappings
    event_to_index, index_to_event = create_mappings(tokenized_targets)

    # Step 4: Encode Targets
    encoded_targets = encode_targets(tokenized_targets, event_to_index)

    # Step 5: Padding
    padded_inputs, padded_targets, max_sequence_length = pad_sequences_data(
        tokenized_inputs, encoded_targets, event_to_index, event_to_index
    )

    # Create DataFrames for padded inputs and targets
    padded_inputs_df = pd.DataFrame(padded_inputs)
    padded_targets_flat = padded_targets.reshape(padded_targets.shape[0], padded_targets.shape[1])
    padded_targets_df = pd.DataFrame(padded_targets_flat)

    # Print DataFrame attributes for padded inputs and targets
    print("\nPadded Input Sequences DataFrame Attributes:")
    print(f"Shape: {padded_inputs_df.shape}")
    print("Columns:", padded_inputs_df.columns.tolist())
    print("\nSample Data:\n", padded_inputs_df.head())

    print("\nPadded Target Sequences DataFrame Attributes:")
    print(f"Shape: {padded_targets_df.shape}")
    print("Columns:", padded_targets_df.columns.tolist())
    print("\nSample Data:\n", padded_targets_df.head())

    return max_sequence_length, event_to_index  # Return necessary values for the next steps

# Example Usage
input_file = 'wikievent.src'  # Change to your actual input file path
target_file = 'wikievent.tgt'  # Change to your actual target file path

max_sequence_length, event_to_index = preprocess_wikievent(input_file, target_file)


Input Sentences DataFrame Attributes:
Shape: (30000, 1)
Columns: ['Input Sentences']

Sample Data:
                                      Input Sentences
0  Philippines ENTITYSEP invasion of the Philippi...
1  Battle of Mill Springs ENTITYSEP Mill Springs ...
2  Paraguayan War ENTITYSEP Brazil ENTITYSEP Urug...
3  Leoš Janáček ENTITYSEP The Makropulos Affair E...
4  Lucas Oil Stadium ENTITYSEP Wisconsin ENTITYSE...

Target Events DataFrame Attributes:
Shape: (30000, 1)
Columns: ['Target Events']

Sample Data:
                                        Target Events
0  The first invasion of the Philippines was at B...
1  The John D. Dingell , Jr. Conservation , Manag...
2  The conflict between Brazil and Uruguay was se...
3  Between 1923 and 1925 , Leoš Janáček adapted t...
4  In the Big Ten Championship held on December 3...

Padded Input Sequences DataFrame Attributes:
Shape: (30000, 304)
Columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 2

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the WikiEvent Dataset
def load_data(input_file, target_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        input_sentences = f.readlines()

    with open(target_file, 'r', encoding='utf-8') as f:
        target_events = f.readlines()

    return input_sentences, target_events

# Tokenize the data
def tokenize_data(input_sentences, target_events):
    tokenized_inputs = [word_tokenize(sentence.lower()) for sentence in input_sentences]
    tokenized_targets = [events.strip().split(',') for events in target_events]  # Comma-separated events

    return tokenized_inputs, tokenized_targets

# Create Mappings for Events
def create_mappings(tokenized_targets):
    event_set = set(event for events in tokenized_targets for event in events)
    event_to_index = {event: i for i, event in enumerate(event_set)}
    index_to_event = {i: event for event, i in event_to_index.items()}

    return event_to_index, index_to_event

# Encode Targets
def encode_targets(tokenized_targets, event_to_index):
    encoded_targets = [[event_to_index.get(event, 0) for event in events] for events in tokenized_targets]
    return encoded_targets

# Function to convert input sequences to indices
def input_to_index(sequence, vocab):
    return [vocab.get(word, 0) for word in sequence]  # 0 for unknown words

# Pad Sequences
def pad_sequences_data(tokenized_inputs, encoded_targets, vocab):
    # Find the maximum length between inputs and targets
    max_input_length = max(len(seq) for seq in tokenized_inputs)
    max_target_length = max(len(seq) for seq in encoded_targets)
    max_sequence_length = max(max_input_length, max_target_length)

    # Pad both input and target sequences to the same max length
    padded_inputs = pad_sequences([input_to_index(seq, vocab) for seq in tokenized_inputs],
                                  padding='post', maxlen=max_sequence_length)
    padded_targets = pad_sequences(encoded_targets,
                                   padding='post', maxlen=max_sequence_length)

    # Reshape padded_targets to be (num_samples, max_target_length, 1)
    padded_targets = padded_targets[..., None]  # Add an extra dimension

    return padded_inputs, padded_targets, max_sequence_length  # Return max_sequence_length

# Main Function to Execute Preprocessing
def preprocess_wikievent(input_file, target_file):
    # Step 1: Load the data
    input_sentences, target_events = load_data(input_file, target_file)

    # Step 2: Tokenization
    tokenized_inputs, tokenized_targets = tokenize_data(input_sentences, target_events)

    # Step 3: Create Mappings
    event_to_index, index_to_event = create_mappings(tokenized_targets)

    # Step 4: Encode Targets
    encoded_targets = encode_targets(tokenized_targets, event_to_index)

    # Step 5: Padding
    padded_inputs, padded_targets, max_sequence_length = pad_sequences_data(
        tokenized_inputs, encoded_targets, event_to_index
    )

    return padded_inputs, padded_targets, max_sequence_length, event_to_index  # Return necessary values for the next steps

# Example Usage
input_file = 'wikievent.src'  # Change to your actual input file path
target_file = 'wikievent.tgt'  # Change to your actual target file path

# Preprocess the WikiEvent data
padded_inputs, padded_targets, max_sequence_length, event_to_index = preprocess_wikievent(input_file, target_file)

# Split the dataset
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(padded_inputs, padded_targets, test_size=0.2, random_state=42)

# Create and compile the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

def create_model(vocab_size, embedding_dim, max_sequence_length):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
    model.add(LSTM(128, return_sequences=True))
    model.add(Dense(vocab_size, activation='softmax'))  # Assuming you want to output probabilities for each event

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Train the model
model = create_model(len(event_to_index), embedding_dim=64, max_sequence_length=max_sequence_length)

# Fit the model with the correct target shape
history = model.fit(X_train, y_train.reshape(y_train.shape[0], y_train.shape[1]),
                    validation_data=(X_test, y_test.reshape(y_test.shape[0], y_test.shape[1])),
                    epochs=10, batch_size=32)


Epoch 1/10
[1m 38/750[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m5:01:01[0m 25s/step - accuracy: 0.8828 - loss: 9.2158

In [1]:
# Predict and return the temporal order of events (including compound events)
def predict_temporal_order(input_sentence, model, vocab, index_to_event, max_sequence_length):
    # Tokenize and pad the input sentence
    tokenized_input = word_tokenize(input_sentence.lower())
    input_indices = pad_sequences([input_to_index(tokenized_input, vocab)], maxlen=max_sequence_length, padding='post')

    # Predict the output sequence
    predictions = model.predict(input_indices)
    predicted_indices = predictions.argmax(axis=-1)[0]  # Get the index with the highest probability

    # Convert indices back to events
    predicted_events = [index_to_event.get(idx, '<UNK>') for idx in predicted_indices if idx != 0]  # Skip padding (0)

    return predicted_events


In [2]:
# Example sentence with compound events
input_sentence = "The president arrived at the meeting, gave a speech, and then left the venue."

# Predict the temporal order of events
predicted_events = predict_temporal_order(input_sentence, model, event_to_index, index_to_event, max_sequence_length)

print("Predicted Temporal Order of Events:", predicted_events)


NameError: name 'model' is not defined