In [9]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [10]:
filename='230814_intent_labeled_by_7_under_sampling'

train_file = f'/home/azureuser/projects/aerius/ai/myapp/chatbot/datas/{filename}.csv'
data = pd.read_csv(train_file, delimiter = ',')
queries = data['query'].tolist()
intents = data['label'].tolist()

print(data.head(5))
print('len of queries = ', len(queries))
print('len of intents = ', len(intents))

                              query intent  label
0  아침도 아니고 밤 12시 30분에 결제한 건데도 그런가요?     주문      0
1             실수로 취소하면 재주문해야 하는 거죠?     주문      0
2                     택배비 따로 추가되나요?     배송      1
3                          택배비 있나요?     배송      1
4                        택배비 따로 들어요     배송      1
len of queries =  176625
len of intents =  176625


In [11]:
class_distribution = data['label'].value_counts()
print("Initial class distribution:\n", class_distribution)

Initial class distribution:
 label
6    62199
4    37356
1    35110
3    20243
5    17249
2     3894
0      574
Name: count, dtype: int64


In [12]:
min_class_size = class_distribution.min()

undersampled_data_list = [data[data['label'] == class_label].sample(min_class_size, random_state=42) 
                         for class_label in class_distribution.index]

undersampled_data = pd.concat(undersampled_data_list)

undersampled_class_distribution = undersampled_data['label'].value_counts()
print("\nUndersampled class distribution:\n", undersampled_class_distribution)




Undersampled class distribution:
 label
6    574
4    574
1    574
3    574
5    574
2    574
0    574
Name: count, dtype: int64


In [13]:
queries = undersampled_data['query'].tolist()
intents = undersampled_data['label'].tolist()

print(undersampled_data.head(5))
print('len of queries = ', len(queries))
print('len of intents = ', len(intents))

                          query intent  label
97549      옷이 사진과 너무 다른 거 아닌가요?  제품_기타      6
6587        나이트슬립 종류별로 살 수 있나요?  제품_기타      6
176207     목걸이에다가 제 이름 각인해 주세요.  제품_기타      6
85364   지금 품절된 이 자켓은 언제쯤 들어올까요?  제품_기타      6
61397       베이직 보드복 카키색 재입고됩니까?  제품_기타      6
len of queries =  4018
len of intents =  4018


In [14]:
encoder = LabelEncoder()
encoder.fit(intents)

encoded_intents = encoder.transform(intents)
num_classes = len(encoder.classes_)
print(f"The number of unique labels is {num_classes}")

The number of unique labels is 7


In [15]:
from ai.myapp.chatbot.utils.Preprocess import Preprocess
p = Preprocess()

words = []
for sentence in queries:
    if isinstance(sentence, str):  # Only process if the sentence is a string
        preprocessed = p.delete_intent_trash_tags(sentence=sentence)
        word_list, _ = p.divide_words_tags(preprocessed)
        words.extend(word_list)
    else:
        print(f"Found non-string value: {sentence}")


# Initialize the tokenizer
p.initialize_tokenizer(words)

# Convert the queries into sequences
sequences = []
for sentence in queries:
    sequence = p.text_to_sequence(sentence)
    sequences.append(sequence)

None


In [16]:
from ai.backend.settings import INTENT_MAX_SEQ_LEN
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=INTENT_MAX_SEQ_LEN, padding='post')

ds = tf.data.Dataset.from_tensor_slices((padded_seqs, encoded_intents))
ds = ds.shuffle(len(queries))

train_size = int(len(padded_seqs)*0.7)
val_size = int(len(padded_seqs)*0.2)
test_size = int(len(padded_seqs)*0.1)

train_ds = ds.take(train_size).batch(20)
val_ds = ds.skip(train_size).take(val_size).batch(20)
test_ds = ds.skip(train_size + val_size).take(test_size).batch(20)

dropout_prob = 0.5
EMB_SIZE = 128
EPOCH = 5
VOCAB_SIZE = len(p.tokenizer.word_index) + 1

input_layer = Input(shape=(INTENT_MAX_SEQ_LEN,))
embedding_layer = Embedding(VOCAB_SIZE, EMB_SIZE, input_length=INTENT_MAX_SEQ_LEN)(input_layer)
dropout_emb = Dropout(rate=dropout_prob)(embedding_layer)

conv1 = Conv1D(filters = 64,
               kernel_size = 3,
               padding = 'valid',
               activation = tf.nn.relu)(dropout_emb)
pool1 = GlobalMaxPool1D()(conv1)

conv2 = Conv1D(filters = 64,
               kernel_size = 4,
               padding = 'valid',
               activation = tf.nn.relu)(dropout_emb)
pool2 = GlobalMaxPool1D()(conv2)


concat = concatenate([pool1, pool2])

hidden = Dense(64, activation=tf.nn.relu)(concat)
dropout_hidden = Dropout(rate=dropout_prob)(hidden)
logits = Dense(num_classes, name='logits')(dropout_hidden)
predictions = Dense(num_classes, activation=tf.nn.softmax)(logits)

model = Model(inputs=input_layer, outputs=predictions)
model.compile(optimizer = 'adam',
              loss = 'sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(train_ds, validation_data = val_ds, epochs = EPOCH, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f2d8c567550>

In [17]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# 1. 모델 예측
y_pred = model.predict(test_ds)
y_pred_classes = np.argmax(y_pred, axis=1)

# test_ds_resampled에서 라벨만 추출
y_true = np.concatenate([y for x, y in test_ds], axis=0)

# 2. 성능 지표 계산
precision = precision_score(y_true, y_pred_classes, average='weighted', zero_division=1)
recall = recall_score(y_true, y_pred_classes, average='weighted')
f1 = f1_score(y_true, y_pred_classes, average='weighted')
accuracy = accuracy_score(y_true, y_pred_classes)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

Precision: 0.1443
Recall: 0.1471
F1 Score: 0.1396
Accuracy: 0.1471


In [18]:
loss, accuracy = model.evaluate(test_ds, verbose=0)

print('Test loss:', loss)
print('Test accuracy:', accuracy)

Test loss: 1.2314119338989258
Test accuracy: 0.5735660791397095


In [19]:
loss, accuracy = model.evaluate(test_ds, verbose=1)
print(f'Accuracy: {accuracy * 100}')
print(f'loss: {loss}')

model.save(f'/home/azureuser/projects/aerius/ai/myapp/chatbot/models/{filename}_lower_dense.h5')

Accuracy: 57.605987787246704
loss: 1.2345458269119263


  saving_api.save_model(
