# Task 02 - kodelabs

## Import dependencies

In [37]:
import re

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

## Load and preprocess the dataset

In [38]:
# Load the labelled utterances from the CSV file in the working directory.
dataset = pd.read_csv("dataset.csv")
# Normalise the text column: lower-case it and strip every character that is
# neither a word character nor whitespace. The pattern is a raw string so that
# \w and \s reach the regex engine untouched instead of being parsed as
# (invalid) Python string escapes, which emits a SyntaxWarning.
dataset['Examples'] = dataset['Examples'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

  dataset['Examples'] = dataset['Examples'].str.lower().str.replace('[^\w\s]', '', regex=True)


## Tokenize the text and encode the labels

In [39]:
# Fit the word-level vocabulary on the cleaned example texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataset['Examples'])

# Convert the texts to integer sequences once and reuse the result both for
# finding the longest sequence and for padding (the corpus was previously
# tokenized twice).
X_seq = tokenizer.texts_to_sequences(dataset['Examples'])
max_len = max(len(seq) for seq in X_seq)
# Pad at the end of each sequence so every row has length max_len.
X_pad = pad_sequences(X_seq, maxlen=max_len, padding='post')

# Map the intent names to integer class ids for sparse categorical training.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(dataset['Intent'])


## Build, train, and test the model

In [40]:
# Sizes derived from the fitted preprocessing objects.
# Tokenizer word indices start at 1, so one extra slot is needed for index 0.
vocab_size = len(tokenizer.word_index) + 1
num_classes = len(label_encoder.classes_)

# Stacked-LSTM intent classifier: embedding -> two LSTM layers -> regularised
# dense head -> softmax over the intent classes.
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_len))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(256, activation='relu', kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# Labels are integer class ids, hence the sparse variant of the loss.
optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the full padded dataset for 100 epochs with batches of 100 samples.
model.fit(X_pad, y_encoded, batch_size=100, epochs=100)

def classify_intent(text: str, threshold: float = 0.7):
    """Predict the intent of *text*, falling back when the model is unsure.

    Args:
        text: Raw utterance to classify.
        threshold: Minimum top-class probability required to accept the
            prediction.

    Returns:
        A ``(intent, confidence)`` tuple. ``intent`` is the decoded label when
        ``confidence >= threshold``; otherwise it is the NLU fallback message.
        ``confidence`` is the highest probability output by the model.
    """
    # Apply the same cleaning as the training column: lower-case, then drop
    # every character that is not a word character or whitespace. The previous
    # str.replace() call treated the pattern as a literal substring, so
    # punctuation was never actually removed at inference time.
    text = re.sub(r'[^\w\s]', '', text.lower())
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')

    # Run the model once and reuse the probabilities for both the confidence
    # and the predicted class, instead of predicting twice on the same input.
    probabilities = model.predict(padded_sequence)[0]
    confidence = np.max(probabilities)
    if confidence < threshold:
        return "NLU fallback: Intent could not be confidently determined", confidence

    prediction = np.argmax(probabilities)
    intent = label_encoder.inverse_transform([prediction])[0]
    return intent, confidence

# Smoke-test the classifier on a few hand-written utterances and print the
# predicted intent (or the fallback message) with its confidence.
test_texts = [
    "This is the reason for it",
    "Stay hydrated",
    "How about we experiment with new ideas?",
    "This isn't what I ordered",
]
for text in test_texts:
    intent, confidence = classify_intent(text)
    # One print call per sample; the trailing empty item yields the blank
    # separator line between samples.
    print(
        f"Text: '{text}'",
        f"Intent: {intent}, Confidence: {confidence:.4f}",
        "",
        sep="\n",
    )


Epoch 1/100


  text = text.lower().replace('[^\w\s]', '')


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step - accuracy: 0.0882 - loss: 2.8127
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.3037 - loss: 2.5523
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.4231 - loss: 2.1479
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.4719 - loss: 1.7485
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.5511 - loss: 1.5211
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.6712 - loss: 1.1472
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.7241 - loss: 0.9256
Epoch 8/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.8156 - loss: 0.7362
Epoch 9/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0