In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import pandas as pd

atis_train_path = '/content/drive/MyDrive/Thesis/ML Engineer Files/atis_train.csv'
atis_test_path = '/content/drive/MyDrive/Thesis/ML Engineer Files/atis_test.csv'

atis_train = pd.read_csv(atis_train_path)
atis_test = pd.read_csv(atis_test_path)

atis_train_head = atis_train.head()
atis_test_head = atis_test.head()

atis_train_head, atis_test_head



(   id       intent                                               text  \
 0   0       flight  i want to fly from boston at 838 am and arrive...   
 1   1       flight  what flights are available from pittsburgh to ...   
 2   2  flight_time  what is the arrival time in san francisco for ...   
 3   3      airfare            cheapest airfare from tacoma to orlando   
 4   4      airfare  round trip fares from pittsburgh to philadelph...   
 
                                                slots  
 0  O O O O O B-fromloc.city_name O B-depart_time....  
 1  O O O O O B-fromloc.city_name O B-toloc.city_n...  
 2  O O O B-flight_time I-flight_time O B-fromloc....  
 3  B-cost_relative O O B-fromloc.city_name O B-to...  
 4  B-round_trip I-round_trip O O B-fromloc.city_n...  ,
    id   intent                                               text  \
 0   0   flight  i would like to find a flight from charlotte t...   
 1   1  airfare  on april first i need a ticket from tacoma to ...   
 2   2 

In [9]:
def manual_preprocess_data(df):
    processed_data = []

    for _, row in df.iterrows():
        text = row['text']
        slots = row['slots'].split()


        tokens = text.split()

        if len(tokens) != len(slots):
            print(f"Misalignment in sentence: {text}")
            continue

        processed_data.append((tokens, slots))

    return processed_data

# Preprocess the training data
preprocessed_train_data = manual_preprocess_data(atis_train)
preprocessed_train_data[:5]


[(['i',
   'want',
   'to',
   'fly',
   'from',
   'boston',
   'at',
   '838',
   'am',
   'and',
   'arrive',
   'in',
   'denver',
   'at',
   '1110',
   'in',
   'the',
   'morning'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'B-fromloc.city_name',
   'O',
   'B-depart_time.time',
   'I-depart_time.time',
   'O',
   'O',
   'O',
   'B-toloc.city_name',
   'O',
   'B-arrive_time.time',
   'O',
   'O',
   'B-arrive_time.period_of_day']),
 (['what',
   'flights',
   'are',
   'available',
   'from',
   'pittsburgh',
   'to',
   'baltimore',
   'on',
   'thursday',
   'morning'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'B-fromloc.city_name',
   'O',
   'B-toloc.city_name',
   'O',
   'B-depart_date.day_name',
   'B-depart_time.period_of_day']),
 (['what',
   'is',
   'the',
   'arrival',
   'time',
   'in',
   'san',
   'francisco',
   'for',
   'the',
   '755',
   'am',
   'flight',
   'leaving',
   'washington'],
  ['O',
   'O',
   'O',
   'B-flight_time',
   'I-flight_time',
  

In [10]:
!pip install tensorflow




In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

token_vocab = set([token for sentence, _ in preprocessed_train_data for token in sentence])
slot_label_vocab = set([slot for _, slots in preprocessed_train_data for slot in slots])

token_tokenizer = Tokenizer(filters='')
slot_label_tokenizer = Tokenizer(filters='')
token_tokenizer.fit_on_texts(token_vocab)
slot_label_tokenizer.fit_on_texts(slot_label_vocab)


train_sentences = token_tokenizer.texts_to_sequences([sentence for sentence, _ in preprocessed_train_data])
train_slot_labels = slot_label_tokenizer.texts_to_sequences([slots for _, slots in preprocessed_train_data])

MAX_LEN = max([len(sentence) for sentence in train_sentences])
train_sentences = pad_sequences(train_sentences, maxlen=MAX_LEN, padding='post')
train_slot_labels = pad_sequences(train_slot_labels, maxlen=MAX_LEN, padding='post')
num_slot_labels = len(slot_label_tokenizer.word_index) + 1

train_slot_labels = np.array([np.eye(num_slot_labels)[np.clip(slots, 0, num_slot_labels-1)] for slots in train_slot_labels])




In [13]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense


EMBEDDING_DIM = 50
LSTM_UNITS = 64

input_layer = Input(shape=(MAX_LEN,))
model = Embedding(input_dim=len(token_tokenizer.word_index) + 1, output_dim=EMBEDDING_DIM, input_length=MAX_LEN)(input_layer)
model = Bidirectional(LSTM(units=LSTM_UNITS, return_sequences=True))(model)
model = TimeDistributed(Dense(len(slot_label_tokenizer.word_index) + 1, activation='softmax'))(model)
ner_model = Model(input_layer, model)
ner_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

ner_model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 46)]              0         
                                                                 
 embedding (Embedding)       (None, 46, 50)            44500     
                                                                 
 bidirectional (Bidirection  (None, 46, 128)           58880     
 al)                                                             
                                                                 
 time_distributed (TimeDist  (None, 46, 124)           15996     
 ributed)                                                        
                                                                 
Total params: 119376 (466.31 KB)
Trainable params: 119376 (466.31 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
ner_model.fit(train_sentences, train_slot_labels, batch_size=32, epochs=5, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7a28975b10c0>

In [15]:
# Preprocess the test data
preprocessed_test_data = manual_preprocess_data(atis_test)

# Convert test sentences and slot labels to sequences of IDs
test_sentences = token_tokenizer.texts_to_sequences([sentence for sentence, _ in preprocessed_test_data])
test_slot_labels = slot_label_tokenizer.texts_to_sequences([slots for _, slots in preprocessed_test_data])

# Pad the sequences
test_sentences = pad_sequences(test_sentences, maxlen=MAX_LEN, padding='post')
test_slot_labels = pad_sequences(test_slot_labels, maxlen=MAX_LEN, padding='post')

# Convert slot labels to categorical format
test_slot_labels = np.array([np.eye(len(slot_label_tokenizer.word_index) + 1)[slots] for slots in test_slot_labels])


In [16]:
test_loss, test_accuracy = ner_model.evaluate(test_sentences, test_slot_labels)
print(f"Test Accuracy: {test_accuracy}")


Test Accuracy: 0.9606115221977234


In [17]:
# Make predictions on the test data
test_predictions = ner_model.predict(test_sentences)




In [18]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Flatten the predictions and true labels
flat_test_predictions = [pred for sentence in np.argmax(test_predictions, axis=-1) for pred in sentence]
flat_true_test_slot_labels = [true_label for sentence in np.argmax(test_slot_labels, axis=-1) for true_label in sentence]

# Initialize LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(list(slot_label_tokenizer.word_index.keys()))

# Get target names
target_names = label_encoder.inverse_transform(range(len(slot_label_tokenizer.word_index)))

# Generate classification report
report = classification_report(flat_true_test_slot_labels, flat_test_predictions, target_names=target_names, labels=range(len(slot_label_tokenizer.word_index)), zero_division=0)

print(report)


                              precision    recall  f1-score   support

             b-aircraft_code       0.99      1.00      1.00     31886
              b-airline_code       0.00      0.00      0.00         1
              b-airline_name       0.00      0.00      0.00        31
              b-airport_code       0.00      0.00      0.00         6
              b-airport_name       0.00      0.00      0.00        17
 b-arrive_date.date_relative       0.00      0.00      0.00        10
      b-arrive_date.day_name       0.00      0.00      0.00         0
    b-arrive_date.day_number       0.00      0.00      0.00         9
    b-arrive_date.month_name       0.00      0.00      0.00        31
b-arrive_date.today_relative       0.00      0.00      0.00         3
      b-arrive_time.end_time       0.00      0.00      0.00         0
    b-arrive_time.period_mod       0.00      0.00      0.00         4
 b-arrive_time.period_of_day       0.00      0.00      0.00         0
    b-arrive_time.s

In [20]:
ner_model.save('/content/drive/MyDrive/Thesis/ML Engineer Files/ner_model_atis.h5')


In [21]:
from tensorflow.keras.models import load_model

loaded_model = load_model('/content/drive/MyDrive/Thesis/ML Engineer Files/ner_model_atis.h5')


In [26]:
def extract_return_date_time(model, input_text, token_tokenizer, slot_label_tokenizer, max_len):
    # Tokenize and pad the input text
    sequence = token_tokenizer.texts_to_sequences([input_text.split()])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')

    # Predict using the model
    prediction = model.predict(padded_sequence)
    prediction_labels = np.argmax(prediction, axis=-1)[0]

    tokens = input_text.split()
    predicted_slots = [slot_label_tokenizer.index_word.get(label, 'O') for label in prediction_labels[:len(tokens)]]


    # Extracting return date and time entities
    return_entities = []
    for token, slot_label in zip(tokens, predicted_slots):
        if 'return_date' in slot_label or 'return_time' in slot_label:
            return_entities.append(token)

    return ' '.join(return_entities) if return_entities else "No return date/time detected."

# Example usage
input_text = "I want to book a flight to Dhaka tomorrow and return  Friday at 11 AM."
return_info = extract_return_date_time(loaded_model, input_text, token_tokenizer, slot_label_tokenizer, MAX_LEN)
print(return_info)


No return date/time detected.
