In [13]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [14]:
filename='intents_labeled_by_9_first_try'

train_file = f'/home/azureuser/projects/aerius/ai/myapp/chatbot/datas/{filename}.csv'
data = pd.read_csv(train_file, delimiter = ',')
queries = data['query'].tolist()
intents = data['label'].tolist()

print(data.head(5))
print('len of queries = ', len(queries))
print('len of intents = ', len(intents))

                              query    intent  label
0  아침도 아니고 밤 12시 30분에 결제한 건데도 그런가요?  주문_취소_확인      7
1             실수로 취소하면 재주문해야 하는 거죠?  주문_취소_확인      7
2                     택배비 따로 추가되나요?  배송_비용_질문      5
3                          택배비 있나요?  배송_비용_질문      5
4                        택배비 따로 들어요  배송_비용_질문      5
len of queries =  203069
len of intents =  203069


In [15]:
train_data = data.sample(frac=0.7, random_state=42)
temp_data = data.drop(train_data.index)
val_data = temp_data.sample(frac=0.66, random_state=42)
test_data = temp_data.drop(val_data.index)

# Oversampling the minority classes in training data
max_size = train_data['label'].value_counts().max()
lst = [train_data]
for class_index, group in train_data.groupby('label'):
    lst.append(group.sample(max_size-len(group), replace=True))
train_data_oversampled = pd.concat(lst)

# Checking the distribution after oversampling
oversampled_distribution = train_data_oversampled['label'].value_counts()

oversampled_distribution

label
0    30947
1    30947
6    30947
4    30947
5    30947
8    30947
7    30947
3    30947
2    30947
Name: count, dtype: int64

In [16]:
queries = train_data_oversampled['query'].tolist()
intents = train_data_oversampled['label'].tolist()

print(train_data_oversampled.head(5))
print('len of queries = ', len(queries))
print('len of intents = ', len(intents))

                                     query    intent  label
89804                     M싸이는 판매 예정 없으세요?  제품_재고_질문      0
119566                   31 A 원단이 무겁지 않나요?  제품_정보_질문      1
134364    특대는 허리가 32 이상 되는 남자가 입어야 되는 건가여?  제품_정보_질문      1
181223  마스크 트랩 택 제거 하고 싶은데, 혹시 여기서도 가능한가요?  매장_이용_질문      6
3797             90B 에 팬티는 100으로 구매할수 없나요?  제품_구성_질문      4
len of queries =  278523
len of intents =  278523


In [17]:
encoder = LabelEncoder()
encoder.fit(intents)

encoded_intents = encoder.transform(intents)
num_classes = len(encoder.classes_)
print(f"The number of unique labels is {num_classes}")

The number of unique labels is 9


In [18]:
from ai.myapp.chatbot.utils.Preprocess import Preprocess
p = Preprocess()

words = []
for sentence in queries:
    if isinstance(sentence, str):  # Only process if the sentence is a string
        preprocessed = p.delete_intent_trash_tags(sentence=sentence)
        word_list, _ = p.divide_words_tags(preprocessed)
        words.extend(word_list)
    else:
        print(f"Found non-string value: {sentence}")


# Initialize the tokenizer
p.initialize_tokenizer(words)

# Convert the queries into sequences
sequences = []
for sentence in queries:
    sequence = p.text_to_sequence(sentence)
    sequences.append(sequence)

None


In [19]:
from ai.backend.settings import INTENT_MAX_SEQ_LEN
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=INTENT_MAX_SEQ_LEN, padding='post')

ds = tf.data.Dataset.from_tensor_slices((padded_seqs, encoded_intents))
ds = ds.shuffle(len(queries))

train_size = int(len(padded_seqs)*0.7)
val_size = int(len(padded_seqs)*0.2)
test_size = int(len(padded_seqs)*0.1)

train_ds = ds.take(train_size).batch(20)
val_ds = ds.skip(train_size).take(val_size).batch(20)
test_ds = ds.skip(train_size + val_size).take(test_size).batch(20)

dropout_prob = 0.5
EMB_SIZE = 128
EPOCH = 5
VOCAB_SIZE = len(p.tokenizer.word_index) + 1

input_layer = Input(shape=(INTENT_MAX_SEQ_LEN,))
embedding_layer = Embedding(VOCAB_SIZE, EMB_SIZE, input_length=INTENT_MAX_SEQ_LEN)(input_layer)
dropout_emb = Dropout(rate=dropout_prob)(embedding_layer)

conv1 = Conv1D(filters = 128,
               kernel_size = 3,
               padding = 'valid',
               activation = tf.nn.relu)(dropout_emb)
pool1 = GlobalMaxPool1D()(conv1)

conv2 = Conv1D(filters = 128,
               kernel_size = 4,
               padding = 'valid',
               activation = tf.nn.relu)(dropout_emb)
pool2 = GlobalMaxPool1D()(conv2)

conv3 = Conv1D(filters = 128,
               kernel_size = 5,
               padding = 'valid',
               activation = tf.nn.relu)(dropout_emb)
pool3 = GlobalMaxPool1D()(conv3)

concat = concatenate([pool1, pool2, pool3])

hidden = Dense(128, activation=tf.nn.relu)(concat)
dropout_hidden = Dropout(rate=dropout_prob)(hidden)
logits = Dense(num_classes, name='logits')(dropout_hidden)
predictions = Dense(num_classes, activation=tf.nn.softmax)(logits)

model = Model(inputs=input_layer, outputs=predictions)
model.compile(optimizer = 'adam',
              loss = 'sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(train_ds, validation_data = val_ds, epochs = EPOCH, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f94a98fc0d0>

In [20]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# 1. 모델 예측
y_pred = model.predict(test_ds)
y_pred_classes = np.argmax(y_pred, axis=1)

# test_ds_resampled에서 라벨만 추출
y_true = np.concatenate([y for x, y in test_ds], axis=0)

# 2. 성능 지표 계산
precision = precision_score(y_true, y_pred_classes, average='weighted', zero_division=1)
recall = recall_score(y_true, y_pred_classes, average='weighted')
f1 = f1_score(y_true, y_pred_classes, average='weighted')
accuracy = accuracy_score(y_true, y_pred_classes)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

Precision: 0.1145
Recall: 0.1142
F1 Score: 0.1120
Accuracy: 0.1142


In [21]:
loss, accuracy = model.evaluate(test_ds, verbose=0)

print('Test loss:', loss)
print('Test accuracy:', accuracy)

Test loss: 1.065037727355957
Test accuracy: 0.6251256465911865


In [22]:
loss, accuracy = model.evaluate(test_ds, verbose=1)
print(f'Accuracy: {accuracy * 100}')
print(f'loss: {loss}')

model.save(f'/home/azureuser/projects/aerius/ai/myapp/chatbot/models/{filename}.h5')

Accuracy: 62.41203546524048
loss: 1.0693109035491943


  saving_api.save_model(
