Setup and Imports

In [48]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

# Load data
file_path = '../data/test_cases.xlsx'
df = pd.read_excel(file_path)

# Input sentences and the four target columns, captured before the label
# encoding below overwrites the target columns in `df`.
X = df['Input_Sentence']
y_location = df['Expected_Destination']
y_start_date = df['Expected_Start_Date']
y_end_date = df['Expected_End_Date']
y_num_people = df['Expected_Number_of_People']

# Encode labels: every target column in `df` is replaced by integer class ids.
# The fitted encoders are kept so predicted ids can be mapped back to the
# original values with `inverse_transform`.
label_encoders = {}
for col in ['Expected_Destination', 'Expected_Start_Date', 'Expected_End_Date', 'Expected_Number_of_People']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Split the data.
# A single call applies the same shuffled train/test indices to every array,
# so the sentences and all four label sets are guaranteed to stay row-aligned
# (instead of relying on four separate calls sharing the same random_state).
(
    X_train, X_test,
    y_location_train, y_location_test,
    y_start_date_train, y_start_date_test,
    y_end_date_train, y_end_date_test,
    y_num_people_train, y_num_people_test,
) = train_test_split(
    X,
    df['Expected_Destination'],
    df['Expected_Start_Date'],
    df['Expected_End_Date'],
    df['Expected_Number_of_People'],
    test_size=0.2,
    random_state=42,
)

# Tokenize and pad sequences.
# The vocabulary is fitted on the training sentences only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Determine the max sequence length from the longest training sentence and
# pad every training sequence (at the end) up to that length.
X_train_seq = tokenizer.texts_to_sequences(X_train)
max_sequence_length = max(len(seq) for seq in X_train_seq)
X_train_seq = pad_sequences(X_train_seq, padding='post', maxlen=max_sequence_length)

Model Training

In [49]:
def preprocess_text(texts):
    """Convert raw sentences into padded integer sequences for the models.

    Args:
        texts: An iterable of sentences (list, pandas Series, ...). A single
            bare string is also accepted and treated as one sentence;
            without that guard the tokenizer would iterate over its
            characters and return one sequence per character.

    Returns:
        The array produced by `pad_sequences`: one row per sentence, padded
        at the end to `max_sequence_length`.
    """
    if isinstance(texts, str):
        texts = [texts]
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, padding='post', maxlen=max_sequence_length)

# Test sentences go through the same tokenizer/padding as the training data.
X_test_seq = preprocess_text(X_test)

# Define the LSTM model
def create_lstm_model(output_dim):
    """Build and compile a stacked-LSTM classifier.

    The network is Embedding -> LSTM -> Dropout -> LSTM -> Dropout -> Dense,
    sized from the notebook-level `tokenizer` and `max_sequence_length`.

    Args:
        output_dim: Number of units in the final softmax layer (one per class).

    Returns:
        A compiled `Sequential` model using the Adam optimizer and sparse
        categorical cross-entropy, reporting accuracy.
    """
    # word_index ids start at 1, so one extra slot covers the padding id 0.
    vocabulary_size = len(tokenizer.word_index) + 1

    layer_stack = [
        Embedding(input_dim=vocabulary_size, output_dim=128, input_length=max_sequence_length),
        # First LSTM returns the full sequence so the second LSTM can consume it.
        LSTM(128, return_sequences=True),
        Dropout(0.2),
        LSTM(128),
        Dropout(0.2),
        Dense(output_dim, activation='softmax'),
    ]

    classifier = Sequential(layer_stack)
    classifier.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# Train and evaluate the model
def train_and_evaluate(X_train_seq, y_train, X_test_seq, y_test, output_dim, model_name,
                       epochs=20, batch_size=32, validation_split=0.2, patience=3):
    """Train a fresh LSTM classifier, report its accuracy, and return it.

    Args:
        X_train_seq: Padded training sequences.
        y_train: Integer class ids for the training sequences.
        X_test_seq: Padded test sequences.
        y_test: Integer class ids for the test sequences.
        output_dim: Number of classes (size of the softmax layer).
        model_name: Label used in the printed test-accuracy line.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size passed to `fit`.
        validation_split: Fraction of the training data held out by `fit`
            for validation; early stopping monitors its loss.
        patience: Epochs without `val_loss` improvement before training stops.

    Returns:
        The trained model. `restore_best_weights=True` is set on the
        early-stopping callback.
    """
    model = create_lstm_model(output_dim)
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
    history = model.fit(
        X_train_seq,
        y_train,
        validation_split=validation_split,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stopping],
    )

    # Print validation accuracy for each epoch that actually ran
    for epoch, val_acc in enumerate(history.history['val_accuracy'], start=1):
        print(f'Epoch {epoch}: Validation Accuracy = {val_acc}')

    # Evaluate the model on the test set (the loss value is not reported)
    _, accuracy = model.evaluate(X_test_seq, y_test)
    print(f'{model_name} Model Test Accuracy: {accuracy}')

    return model

# Train one classifier per target variable. All four share the same padded
# inputs; only the labels, class count and display name differ.
location_model = train_and_evaluate(
    X_train_seq=X_train_seq,
    y_train=y_location_train,
    X_test_seq=X_test_seq,
    y_test=y_location_test,
    output_dim=len(label_encoders['Expected_Destination'].classes_),
    model_name='Location',
)
start_date_model = train_and_evaluate(
    X_train_seq=X_train_seq,
    y_train=y_start_date_train,
    X_test_seq=X_test_seq,
    y_test=y_start_date_test,
    output_dim=len(label_encoders['Expected_Start_Date'].classes_),
    model_name='Start Date',
)
end_date_model = train_and_evaluate(
    X_train_seq=X_train_seq,
    y_train=y_end_date_train,
    X_test_seq=X_test_seq,
    y_test=y_end_date_test,
    output_dim=len(label_encoders['Expected_End_Date'].classes_),
    model_name='End Date',
)
num_people_model = train_and_evaluate(
    X_train_seq=X_train_seq,
    y_train=y_num_people_train,
    X_test_seq=X_test_seq,
    y_test=y_num_people_test,
    output_dim=len(label_encoders['Expected_Number_of_People'].classes_),
    model_name='Number of People',
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 1: Validation Accuracy = 0.5613750219345093
Epoch 2: Validation Accuracy = 0.9748125076293945
Epoch 3: Validation Accuracy = 0.9868749976158142
Epoch 4: Validation Accuracy = 0.9959999918937683
Epoch 5: Validation Accuracy = 1.0
Epoch 6: Validation Accuracy = 1.0
Epoch 7: Validation Accuracy = 1.0
Epoch 8: Validation Accuracy = 1.0
Epoch 9: Validation Accuracy = 1.0
Epoch 10: Validation Accuracy = 1.0
Epoch 11: Validation Accuracy = 1.0
Epoch 12: Validation Accuracy = 1.0
Epoch 13: Validation Accuracy = 1.0
Epoch 14: Validation Accuracy = 0.9956874847412109
Location Model Test Accuracy: 1.0
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 1: Validation Accuracy = 0.5714374780654907
Epoch 2: Validation Accuracy = 0.9574375152587891
Epoch 3: Valid

Save Models

In [50]:
# Write each trained classifier to its own HDF5 file in the working directory.
for h5_path, trained_model in (
    ('location_model.h5', location_model),
    ('start_date_model.h5', start_date_model),
    ('end_date_model.h5', end_date_model),
    ('num_people_model.h5', num_people_model),
):
    trained_model.save(h5_path)

Load Saved Models and Run an Example Prediction

In [51]:
# Reload the four classifiers from their HDF5 files
location_model = tf.keras.models.load_model('location_model.h5')
start_date_model = tf.keras.models.load_model('start_date_model.h5')
end_date_model = tf.keras.models.load_model('end_date_model.h5')
num_people_model = tf.keras.models.load_model('num_people_model.h5')

# Example prediction: one sentence, wrapped in a list to form a batch of one
input_sentence = "Let's have an adventure to Cambridge from 3/19 to October 5th with 3 buddies"
input_seq = preprocess_text([input_sentence])

# Class probabilities from each model for the single input row
location_probs = location_model.predict(input_seq)
start_date_probs = start_date_model.predict(input_seq)
end_date_probs = end_date_model.predict(input_seq)
num_people_probs = num_people_model.predict(input_seq)

# Most likely class id per model ([0] selects the only row in the batch)
location_pred = np.argmax(location_probs, axis=-1)[0]
start_date_pred = np.argmax(start_date_probs, axis=-1)[0]
end_date_pred = np.argmax(end_date_probs, axis=-1)[0]
num_people_pred = np.argmax(num_people_probs, axis=-1)[0]

# Map class ids back to the original label values with the fitted encoders
destination_encoder = label_encoders['Expected_Destination']
start_date_encoder = label_encoders['Expected_Start_Date']
end_date_encoder = label_encoders['Expected_End_Date']
num_people_encoder = label_encoders['Expected_Number_of_People']

location = destination_encoder.inverse_transform([location_pred])[0]
start_date = start_date_encoder.inverse_transform([start_date_pred])[0]
end_date = end_date_encoder.inverse_transform([end_date_pred])[0]
num_people = num_people_encoder.inverse_transform([num_people_pred])[0]

print(f"Location: {location}")
print(f"Start Date: {start_date}")
print(f"End Date: {end_date}")
print(f"Number of People: {num_people}")

Location:  Cambridge
Start Date:  3/19
End Date:  October 5th
Number of People: 3


In [53]:


# Load models
location_model = tf.keras.models.load_model('location_model.h5')
start_date_model = tf.keras.models.load_model('start_date_model.h5')
end_date_model = tf.keras.models.load_model('end_date_model.h5')
num_people_model = tf.keras.models.load_model('num_people_model.h5')

# Define max sequence length as per the model's expected input.
# Prefer the length recorded in the loaded model's input shape so it cannot
# drift from what the model was trained with; 22 (the previously hard-coded
# value) is kept only as a fallback when the model does not expose a fixed
# input length.
try:
    _model_input_shape = location_model.input_shape
except (AttributeError, RuntimeError):
    _model_input_shape = None

if (
    _model_input_shape is not None
    and len(_model_input_shape) > 1
    and _model_input_shape[1] is not None
):
    max_sequence_length = int(_model_input_shape[1])
else:
    max_sequence_length = 22

# No tokenizer file is loaded in this cell: the `tokenizer` and
# `label_encoders` objects fitted earlier in this kernel are reused, so the
# setup cell must have been run first.
_fitted_tokenizer = tokenizer


# Preprocess text function
def preprocess_text(texts, tokenizer=None):
    """Tokenize and pad sentences to `max_sequence_length`.

    Args:
        texts: An iterable of sentences. A single bare string is treated as
            one sentence rather than being iterated character by character.
        tokenizer: Fitted tokenizer to use. Optional so that earlier
            single-argument calls such as `preprocess_text([sentence])` keep
            working after this cell has run; defaults to the tokenizer that
            was in scope when this cell executed.

    Returns:
        The padded array from `pad_sequences`; sequences are padded and
        truncated at the end (`padding='post'`, `truncating='post'`).
    """
    if tokenizer is None:
        tokenizer = _fitted_tokenizer
    if isinstance(texts, str):
        texts = [texts]
    # Tokenize the text
    sequences = tokenizer.texts_to_sequences(texts)
    # Pad the sequences
    padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')
    return padded_sequences


# Example prediction
input_sentence = "Let's have an adventure to Cambridge from 3/19 to October 5th with 3 buddies"
input_seq = preprocess_text([input_sentence], tokenizer)

# Predict using the models: argmax over the softmax outputs gives the class id,
# and [0] selects the single sentence in the batch.
location_pred = np.argmax(location_model.predict(input_seq), axis=-1)[0]
start_date_pred = np.argmax(start_date_model.predict(input_seq), axis=-1)[0]
end_date_pred = np.argmax(end_date_model.predict(input_seq), axis=-1)[0]
num_people_pred = np.argmax(num_people_model.predict(input_seq), axis=-1)[0]

# Map class ids back to the original label values
location = label_encoders['Expected_Destination'].inverse_transform([location_pred])[0]
start_date = label_encoders['Expected_Start_Date'].inverse_transform([start_date_pred])[0]
end_date = label_encoders['Expected_End_Date'].inverse_transform([end_date_pred])[0]
num_people = label_encoders['Expected_Number_of_People'].inverse_transform([num_people_pred])[0]

print(f"Location: {location}")
print(f"Start Date: {start_date}")
print(f"End Date: {end_date}")
print(f"Number of People: {num_people}")


Location:  Cambridge
Start Date:  3/19
End Date:  October 5th
Number of People: 3
