# Data Cleaning

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("data_cs.csv")

In [3]:
df

Unnamed: 0,flags,utterance,category,intent
0,BILC,"I don't have an online account, what do I have...",ACCOUNT,create_account
1,BILQZ,can you tell me if i can regisger two accounts...,ACCOUNT,create_account
2,BPLC,"I have no online account, open one, please",ACCOUNT,create_account
3,BIPLD,"could you ask an agent how to open an account,...",ACCOUNT,create_account
4,BLQC,"i want an online account, create one",ACCOUNT,create_account
...,...,...,...,...
21529,BILQC,"i have no shipping address, what do i have to ...",SHIPPING,set_up_shipping_address
21530,BLQC,I have no shipping address and I want to set o...,SHIPPING,set_up_shipping_address
21531,BIQC,"i want to set up my shipping address, what do ...",SHIPPING,set_up_shipping_address
21532,BILC,"I don't have a shipping address, can you set o...",SHIPPING,set_up_shipping_address


In [4]:
df["intent"].unique()

array(['create_account', 'delete_account', 'edit_account',
       'recover_password', 'registration_problems', 'switch_account',
       'check_cancellation_fee', 'contact_customer_service',
       'contact_human_agent', 'delivery_options', 'delivery_period',
       'complaint', 'review', 'check_invoices', 'get_invoice',
       'newsletter_subscription', 'cancel_order', 'change_order',
       'place_order', 'track_order', 'check_payment_methods',
       'payment_issue', 'check_refund_policy', 'get_refund',
       'track_refund', 'change_shipping_address',
       'set_up_shipping_address'], dtype=object)

In [5]:
df["category"].unique()

array(['ACCOUNT', 'CANCELLATION_FEE', 'CONTACT', 'DELIVERY', 'FEEDBACK',
       'INVOICES', 'NEWSLETTER', 'ORDER', 'PAYMENT', 'REFUNDS',
       'SHIPPING'], dtype=object)

In [6]:
df[df["intent"] == "review"]

Unnamed: 0,flags,utterance,category,intent
9245,BILC,"I'm happy with the service, can I submit a rev...",FEEDBACK,review
9246,BILC,"I'm happy with the service, what can I do to f...",FEEDBACK,review
9247,BILC,"I want to file a comment for a service, what c...",FEEDBACK,review
9248,BICZ,"I'm happy with the service , can I file a review?",FEEDBACK,review
9249,BILC,"I'm happy with the service, what do I have to ...",FEEDBACK,review
...,...,...,...,...
9820,BILCZ,"I'm hqppy with the service, what should I do t...",FEEDBACK,review
9821,BILC,"I'm happy with the service, what should I do t...",FEEDBACK,review
9822,BIPLQD,can u ask an agent if i could write a comment?,FEEDBACK,review
9823,BILQD,can u ask an agent how i can leave an opinion ...,FEEDBACK,review


In [7]:
df.isna().sum()

flags        0
utterance    0
category     0
intent       0
dtype: int64

In [8]:
df_cleaned = df.drop(["flags","category"],axis=1)
df_cleaned

Unnamed: 0,utterance,intent
0,"I don't have an online account, what do I have...",create_account
1,can you tell me if i can regisger two accounts...,create_account
2,"I have no online account, open one, please",create_account
3,"could you ask an agent how to open an account,...",create_account
4,"i want an online account, create one",create_account
...,...,...
21529,"i have no shipping address, what do i have to ...",set_up_shipping_address
21530,I have no shipping address and I want to set o...,set_up_shipping_address
21531,"i want to set up my shipping address, what do ...",set_up_shipping_address
21532,"I don't have a shipping address, can you set o...",set_up_shipping_address


# Intent Recognition

In [10]:
# Import Libraries
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [11]:
unique_intents = ['create_account', 'delete_account', 'edit_account',
       'recover_password', 'registration_problems', 'switch_account',
       'check_cancellation_fee', 'contact_customer_service',
       'contact_human_agent', 'delivery_options', 'delivery_period',
       'complaint', 'review', 'check_invoices', 'get_invoice',
       'newsletter_subscription', 'cancel_order', 'change_order',
       'place_order', 'track_order', 'check_payment_methods',
       'payment_issue', 'check_refund_policy', 'get_refund',
       'track_refund', 'change_shipping_address',
       'set_up_shipping_address']

In [12]:
intents = df_cleaned["intent"]

label_encoder=LabelEncoder()
# Fit the encoder to the 'intent' column
label_encoder.fit(intents)

# Transform the categorical labels into integers
intents = label_encoder.transform(intents)
print(intents[0])
print(label_encoder.inverse_transform([intents[0]]))
num_classes = len(set(intents))
intents = to_categorical(intents, num_classes=num_classes) # transform labels


intents   

10
['create_account']


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [13]:
texts = df_cleaned["utterance"]

max_words = 100
tokenizer = Tokenizer(num_words=max_words) 

tokenizer.fit_on_texts(texts) #fit/load the dataset texts

sequences = tokenizer.texts_to_sequences(texts) #transform the text into sequences -> text are turned into ids that corellate to the ids of that word in the vocab

X = pad_sequences(sequences, maxlen=100)

In [14]:
X = np.array(X) #turn into np arr
intents = np.array(intents) #turn into np arr

X_train, X_test, y_train, y_test = train_test_split(X, intents, test_size=0.2, random_state=11)

# Split the training set further into train and validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=11)

In [None]:
# #reduced size for testing only

# X = np.array(X) #turn into np arr
# intents = np.array(intents) #turn into np arr

# # Generate a random permutation of indices for shuffling
# np.random.seed(42)
# indices = np.random.permutation(len(X))

# # Shuffle both datasets using the same permutation
# X_shuffled = X[indices][:1000]
# intents_shuffled = intents[indices][:1000]

# X_train, X_test, y_train, y_test = train_test_split(X_shuffled, intents_shuffled, test_size=0.2, random_state=11) #split data with train_test_split

In [15]:
# Create RNN Model Ver1:  Random init
model_rnn = Sequential() # Create an empty Sequential model

model_rnn.add(Embedding(input_dim=max_words, output_dim=128, input_length=X.shape[1])) #Embedding layer, turn inputs(sequences) into vectors, with weights set random
model_rnn.add(Bidirectional(LSTM(64, return_sequences=True)))                          #Bidirectional layer, process words in both directions, so it can see the forward and backward dependencies
model_rnn.add(Bidirectional(LSTM(32)))                                                 #Bidirectional layer, Reduce size to 32 (to reduce complexity)
model_rnn.add(Dense(num_classes, activation='softmax'))                                #Dense layer/Activation layer with softmax(since its multiclass)

model_rnn.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])    #Compile the model, add loss function(diff between prediction and target) and optimizer adam(Adaptive Moment estimation)





In [16]:
# Train Model
model_rnn.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test)) #Train the data, epochs = how many epochs, batch_size = how big the sample used, validation data = data used for evaluation

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1d958d9fed0>

In [23]:
model_rnn.save("model.keras")

In [18]:
loaded_model = tf.keras.models.load_model("model.keras")

In [20]:
loss, accuracy = loaded_model.evaluate(X_val, y_val)
print("------------------------Random------------------------")
print(f'Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}')  #Hasil metrik evaluasi


------------------------Random------------------------
Test loss: 0.0895, Test accuracy: 0.9761


In [21]:
user_texts = [
    "I want to contact a human agent",
    "Can you tell me the status of my order #12345? It was supposed to arrive today.",
    "can i get a refund",
    "I tried to make a purchase, but my payment was declined. What should I do?",
    "Do you offer express shipping? How much does it cost?",
    "help",
    "I'm having trouble logging into my account. Can you help?",
    "Do you sell gift cards? If so, how can I purchase one?",
    "Is the PlayStation 5 in stock? I tried ordering it, but it's showing out of stock.",
    "I dont understand your UI, it is confusing me, where to submit a review or comment?"
  ]

X = pad_sequences(sequences, maxlen=100)
predictions = model_rnn.predict(pad_sequences(tokenizer.texts_to_sequences(user_texts), maxlen=100))
for text, prediction in zip(user_texts,predictions): 
    predicted_class = np.argmax(prediction)
    predicted_class = label_encoder.inverse_transform([predicted_class])
    for num in prediction:
        print("{:.3f}".format(num),end = "|")
    print()
    print(text)
    print(predicted_class)
    print("--------------------------------")

0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.999|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|
I want to contact a human agent
['contact_human_agent']
--------------------------------
0.001|0.007|0.000|0.000|0.001|0.000|0.001|0.000|0.000|0.000|0.000|0.000|0.000|0.001|0.000|0.001|0.000|0.000|0.000|0.001|0.000|0.000|0.000|0.000|0.000|0.979|0.006|
Can you tell me the status of my order #12345? It was supposed to arrive today.
['track_order']
--------------------------------
0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.001|0.000|0.000|0.000|0.000|0.000|0.002|0.994|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|
can i get a refund
['get_refund']
--------------------------------
0.000|0.000|0.000|0.000|0.000|0.001|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.998|0.000|0.000|0.000|0.000|0.000|0.000|0.000|0.000|
I tried to make a purchase, but my payment was declined. What s

In [None]:
import pickle

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('label_encoder.pickle', 'wb') as handle:
    pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=32ab40da-eddc-4f38-b99a-9e8daf16c87f' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>