In [10]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv('MATRES.csv')

# Select relevant columns
data = data[['bodytext', 'after', 'before', 'verb']]

# Drop any rows with NaN values in the relevant columns
data = data.dropna(subset=['bodytext', 'after', 'before', 'verb'])

# Define labels based on the 'after' and 'before' columns
data['label'] = np.where(data['after'] == 'yes', 1, 0)  # Assuming 'yes' indicates a valid match

# Initialize the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['bodytext'].tolist() + data['verb'].tolist())

# Tokenize and pad sequences
def preprocess_data(data):
    context_sequences = tokenizer.texts_to_sequences(data['bodytext'])
    padded_context = pad_sequences(context_sequences, padding='post')
    return padded_context, data['label'].values

# Preprocess the data
padded_context, labels = preprocess_data(data)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_context, labels, test_size=0.2, random_state=42)

# Set parameters
embedding_dim = 100
lstm_units = 64

# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim))
model.add(LSTM(lstm_units))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Function to predict validity
def predict_validity(context, target):
    # Preprocess input
    context_seq = tokenizer.texts_to_sequences([context])
    target_seq = tokenizer.texts_to_sequences([target])
    padded_context = pad_sequences(context_seq, padding='post')

    # Create input data for prediction
    input_data = padded_context  # Only use context for prediction in this example

    # Make a prediction
    prediction = model.predict(input_data)
    return 1 if prediction[0][0] > 0.5 else 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label'] = np.where(data['after'] == 'yes', 1, 0)  # Assuming 'yes' indicates a valid match


Epoch 1/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 96ms/step - accuracy: 0.9666 - loss: 0.3219 - val_accuracy: 1.0000 - val_loss: 8.6249e-04
Epoch 2/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 83ms/step - accuracy: 1.0000 - loss: 0.0011 - val_accuracy: 1.0000 - val_loss: 2.5489e-04
Epoch 3/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 81ms/step - accuracy: 1.0000 - loss: 5.1793e-04 - val_accuracy: 1.0000 - val_loss: 1.7609e-04
Epoch 4/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 126ms/step - accuracy: 1.0000 - loss: 4.3179e-04 - val_accuracy: 1.0000 - val_loss: 1.3280e-04
Epoch 5/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 106ms/step - accuracy: 1.0000 - loss: 2.9979e-04 - val_accuracy: 1.0000 - val_loss: 1.0694e-04
Epoch 6/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 82ms/step - accuracy: 1.0000 - loss: 2.4898e-04 - val_accuracy: 1.0000 - val_loss: 8.9

In [11]:
# Example usage
context_example = "The meeting was rescheduled to next week."
target_example = "The event will take place on Friday."
validity = predict_validity(context_example, target_example)
print(f"Context: '{context_example}'")
print(f"Target: '{target_example}'")
print(f"Predicted validity (1: valid, 0: invalid): {validity}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step
Context: 'The meeting was rescheduled to next week.'
Target: 'The event will take place on Friday.'
Predicted validity (1: valid, 0: invalid): 0
