In [11]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [12]:
filename='230817_intent_labeled_by_7'

train_file = f'/home/azureuser/projects/aerius/ai/myapp/chatbot/datas/{filename}.csv'
data = pd.read_csv(train_file, delimiter = ',')
queries = data['query'].tolist()
intents = data['label'].tolist()

print(data.head(5))
print('len of queries = ', len(queries))
print('len of intents = ', len(intents))

        query intent  label
0   배송비용 있나요?     배송    0.0
1   수선비 얼마지요?     배송    0.0
2     무료배송이죠?     배송    0.0
3  배송비는 얼마에요?     배송    0.0
4   아 배송비는 요?     배송    0.0
len of queries =  93099
len of intents =  93099


In [13]:
train_data = data.sample(frac=0.7, random_state=42)
temp_data = data.drop(train_data.index)
val_data = temp_data.sample(frac=0.66, random_state=42)
test_data = temp_data.drop(val_data.index)

# Oversampling the minority classes in training data
max_size = train_data['label'].value_counts().max()
lst = [train_data]
for class_index, group in train_data.groupby('label'):
    lst.append(group.sample(max_size-len(group), replace=True))
train_data_oversampled = pd.concat(lst)

# Checking the distribution after oversampling
oversampled_distribution = train_data_oversampled['label'].value_counts()

oversampled_distribution

label
0.0    20949
4.0    20949
2.0    20949
3.0    20949
1.0    20949
Name: count, dtype: int64

In [14]:
queries = train_data_oversampled['query'].tolist()
intents = train_data_oversampled['label'].tolist()

print(train_data_oversampled.head(5))
print('len of queries = ', len(queries))
print('len of intents = ', len(intents))

                                                   query intent  label
9129        급하게 입어야 하는데, 오늘 앙고라 니트 가디건 주문하면 내일 받을 수 있나요?     배송    0.0
80619                    긴팔 티셔츠는 요긴하게 잘 쓰여서 다른 싸이즈도 봐주세요  제품_재고    4.0
86234  같은 옷인데 안감이 누비와 양털이 두 종류가 있던데 지금 판매하는 속안감이 어떤 종...  제품_재고    4.0
90648                           선물세트 중에 설화수에서 나온 것도 있나요?  제품_재고    4.0
36173                                   혹시 언제 환불 처리하셨어요?     AS    2.0
len of queries =  104745
len of intents =  104745


In [15]:
encoder = LabelEncoder()
encoder.fit(intents)

encoded_intents = encoder.transform(intents)
num_classes = len(encoder.classes_)
print(f"The number of unique labels is {num_classes}")

The number of unique labels is 5


In [16]:
from ai.myapp.chatbot.utils.Preprocess import Preprocess
p = Preprocess()

words = []
for sentence in queries:
    if isinstance(sentence, str):  # Only process if the sentence is a string
        preprocessed = p.delete_intent_trash_tags(sentence=sentence)
        word_list, _ = p.divide_words_tags(preprocessed)
        words.extend(word_list)
    else:
        print(f"Found non-string value: {sentence}")


# Initialize the tokenizer
p.initialize_tokenizer(words)

# Convert the queries into sequences
sequences = []
for sentence in queries:
    sequence = p.text_to_sequence(sentence)
    sequences.append(sequence)

None


In [17]:
from ai.backend.settings import INTENT_MAX_SEQ_LEN
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=INTENT_MAX_SEQ_LEN, padding='post')

ds = tf.data.Dataset.from_tensor_slices((padded_seqs, encoded_intents))
ds = ds.shuffle(len(queries))

train_size = int(len(padded_seqs)*0.7)
val_size = int(len(padded_seqs)*0.2)
test_size = int(len(padded_seqs)*0.1)

train_ds = ds.take(train_size).batch(20)
val_ds = ds.skip(train_size).take(val_size).batch(20)
test_ds = ds.skip(train_size + val_size).take(test_size).batch(20)

dropout_prob = 0.5
EMB_SIZE = 128
EPOCH = 5
VOCAB_SIZE = len(p.tokenizer.word_index) + 1

input_layer = Input(shape=(INTENT_MAX_SEQ_LEN,))
embedding_layer = Embedding(VOCAB_SIZE, EMB_SIZE, input_length=INTENT_MAX_SEQ_LEN)(input_layer)
dropout_emb = Dropout(rate=dropout_prob)(embedding_layer)

conv1 = Conv1D(filters = 64,
               kernel_size = 3,
               padding = 'valid',
               activation = tf.nn.relu)(dropout_emb)
pool1 = GlobalMaxPool1D()(conv1)

conv2 = Conv1D(filters = 64,
               kernel_size = 4,
               padding = 'valid',
               activation = tf.nn.relu)(dropout_emb)
pool2 = GlobalMaxPool1D()(conv2)


concat = concatenate([pool1, pool2])

hidden = Dense(64, activation=tf.nn.relu)(concat)
dropout_hidden = Dropout(rate=dropout_prob)(hidden)
logits = Dense(num_classes, name='logits')(dropout_hidden)
predictions = Dense(num_classes, activation=tf.nn.softmax)(logits)

model = Model(inputs=input_layer, outputs=predictions)
model.compile(optimizer = 'adam',
              loss = 'sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(train_ds, validation_data = val_ds, epochs = EPOCH, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fc78821a730>

In [18]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# 1. 모델 예측
y_pred = model.predict(test_ds)
y_pred_classes = np.argmax(y_pred, axis=1)

# test_ds_resampled에서 라벨만 추출
y_true = np.concatenate([y for x, y in test_ds], axis=0)

# 2. 성능 지표 계산
precision = precision_score(y_true, y_pred_classes, average='weighted', zero_division=1)
recall = recall_score(y_true, y_pred_classes, average='weighted')
f1 = f1_score(y_true, y_pred_classes, average='weighted')
accuracy = accuracy_score(y_true, y_pred_classes)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

Precision: 0.1998
Recall: 0.2007
F1 Score: 0.1993
Accuracy: 0.2007


In [19]:
loss, accuracy = model.evaluate(test_ds, verbose=0)

print('Test loss:', loss)
print('Test accuracy:', accuracy)

Test loss: 0.5617595911026001
Test accuracy: 0.7744892239570618


In [20]:
loss, accuracy = model.evaluate(test_ds, verbose=1)
print(f'Accuracy: {accuracy * 100}')
print(f'loss: {loss}')

model.save(f'/home/azureuser/projects/aerius/ai/myapp/chatbot/models/models/{filename}_2.h5')

Accuracy: 77.83082127571106
loss: 0.5673361420631409


  saving_api.save_model(
