In [12]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [13]:
filename='230817_intent_labeled_by_6'

train_file = f'/home/azureuser/projects/aerius/ai/myapp/chatbot/datas/{filename}.csv'
data = pd.read_csv(train_file, delimiter = ',')
queries = data['query'].tolist()
intents = data['label'].tolist()

print(data.head(5))
print('len of queries = ', len(queries))
print('len of intents = ', len(intents))

                              query intent  label
0  아침도 아니고 밤 12시 30분에 결제한 건데도 그런가요?     주문    0.0
1             실수로 취소하면 재주문해야 하는 거죠?     주문    0.0
2                     택배비 따로 추가되나요?     배송    1.0
3                          택배비 있나요?     배송    1.0
4                        택배비 따로 들어요     배송    1.0
len of queries =  79617
len of intents =  79617


In [14]:
train_data = data.sample(frac=0.7, random_state=42)
temp_data = data.drop(train_data.index)
val_data = temp_data.sample(frac=0.66, random_state=42)
test_data = temp_data.drop(val_data.index)

# Oversampling the minority classes in training data
max_size = train_data['label'].value_counts().max()
lst = [train_data]
for class_index, group in train_data.groupby('label'):
    lst.append(group.sample(max_size-len(group), replace=True))
train_data_oversampled = pd.concat(lst)

# Checking the distribution after oversampling
oversampled_distribution = train_data_oversampled['label'].value_counts()

oversampled_distribution

label
1.0    24399
3.0    24399
4.0    24399
2.0    24399
5.0    24399
0.0    24399
Name: count, dtype: int64

In [15]:
queries = train_data_oversampled['query'].tolist()
intents = train_data_oversampled['label'].tolist()

print(train_data_oversampled.head(5))
print('len of queries = ', len(queries))
print('len of intents = ', len(intents))

                                                   query intent  label
31684  3만원 이상 구매해서 오늘 상품받았는데요. 3만원 이상 구매 시 슬리브리스 주신다고...     배송    1.0
5771                                          언제쯤 반품됩니까?     AS    3.0
78326                                   모델이 입은 사이즈 몇이예요?     제품    4.0
44699           추후 카드 취소와 관련된 사항 진행될 때 따로 안내해 주시는 게 있나요?     AS    3.0
57292                   어제 산 슬랙스 그린색으로 교환하려면 얼마 더 내야 해요?     AS    3.0
len of queries =  146394
len of intents =  146394


In [16]:
encoder = LabelEncoder()
encoder.fit(intents)

encoded_intents = encoder.transform(intents)
num_classes = len(encoder.classes_)
print(f"The number of unique labels is {num_classes}")

The number of unique labels is 6


In [17]:
# from ai.myapp.chatbot.utils.Preprocess import Preprocess

# # Preprocessing 객체 초기화
# p = Preprocess()

# # 첫 번째 문장 가져와서 처리
# sentence = queries[0]
# preprocessed = p.delete_intent_trash_tags(sentence=sentence)
# print("After delete_intent_trash_tags:", preprocessed)

# # 단어 리스트 생성
# words = []
# for sentence in queries:
#     if isinstance(sentence, str):  # Only process if the sentence is a string
#         preprocessed = p.delete_intent_trash_tags(sentence=sentence)
#         word_list, _ = p.divide_words_tags(preprocessed)
#         words.extend(word_list)
#     else:
#         print(f"Found non-string value: {sentence}")

# # words 리스트 확인
# print("Words List:", words[:30])  # 첫 30개의 단어만 출력
# print("Words List Length:", len(words))  # words 리스트의 길이 출력

# # 토크나이저 초기화
# p.initialize_tokenizer(words)

# # 토크나이저의 단어사전 확인
# print("Word Index:", p.tokenizer.word_index)

# # queries의 각 문장을 시퀀스로 변환
# sequences = []
# for sentence1 in queries:
#     sequence = p.text_to_sequence(sentence1)
#     sequences.append(sequence)
#     print("Original Sentence:", sentence1)
#     print("Converted Sequence:", sequence)

# # 변환된 시퀀스 확인
# print("Converted Sequences:", sequences[:20])  # 첫 20개의 시퀀스만 출력


In [18]:
from ai.myapp.chatbot.utils.Preprocess import Preprocess

# Step 1: Initialize Preprocessing object
p = Preprocess()

# Step 2: Preprocess the sentences
preprocessed_queries = []
for sentence in queries:
    if isinstance(sentence, str):  # Only process if the sentence is a string
        preprocessed = p.delete_intent_trash_tags(sentence=sentence)
        preprocessed_queries.append(preprocessed)
    else:
        print(f"Found non-string value: {sentence}")

# Print the preprocessed queries for debugging
print("Preprocessed Queries:")
print(preprocessed_queries[:5])

# Step 3: Extract words from the preprocessed queries
words = []
for preprocessed in preprocessed_queries:
    word_list, _ = p.divide_words_tags(preprocessed)
    words.extend(word_list)

# Print the extracted words for debugging
print("Extracted Words:")
print(words[:5])

# Step 4: Initialize the tokenizer with the extracted words
p.initialize_tokenizer(words)

# Step 5: Convert the preprocessed queries into sequences
sequences = []
for preprocessed in preprocessed_queries:
    word_list, _ = p.divide_words_tags(preprocessed)
    sentence = ' '.join(word_list)  # Join the words to form a sentence
    sequence = p.text_to_sequence(sentence)
    sequences.append(sequence)

# Print the converted sequences for debugging
print("Converted Sequences:")
print(sequences[:5])

# Print the tokenizer's word index for debugging
# print("Word Index:")
# print(p.tokenizer.word_index)


None
Preprocessed Queries:
[[('3', 'SN'), ('만원', 'NNP'), ('이상', 'NNG'), ('구매', 'NNG'), ('아서', 'EC'), ('오늘', 'NNG'), ('상품', 'NNG'), ('받', 'VV'), ('았', 'EP'), ('는데요', 'EF'), ('.', 'SF'), ('3', 'SN'), ('만원', 'NNP'), ('이상', 'NNG'), ('구매', 'NNG'), ('시', 'NNB'), ('슬리브리스', 'NA'), ('주', 'VX'), ('시', 'EP'), ('ㄴ다고', 'EC'), ('쓰', 'VV'), ('어', 'EC'), ('지', 'VX'), ('어', 'EC'), ('있', 'VV'), ('는데', 'EC'), ('왜', 'MAG'), ('저', 'NP'), ('안', 'MAG'), ('오', 'VV'), ('걸', 'VV'), ('ㄴ가요', 'EF'), ('?', 'SF')], [('언제', 'NP'), ('쯤', 'NNB'), ('반품', 'NNG'), ('ㅂ니까', 'EF'), ('?', 'SF')], [('모델', 'NNG'), ('입', 'VV'), ('사이즈', 'NNG'), ('몇', 'NR'), ('이', 'VCP'), ('예요', 'EF'), ('?', 'SF')], [('추후', 'NNG'), ('카드', 'NNG'), ('취소', 'NNP'), ('관련', 'NNG'), ('사항', 'NNG'), ('진행', 'NNG'), ('때', 'NNG'), ('따로', 'MAG'), ('안내', 'NNG'), ('아', 'EC'), ('주시', 'NNP'), ('게', 'NNG'), ('있', 'VX'), ('나요', 'EF'), ('?', 'SF')], [('어제', 'MAG'), ('살', 'VV'), ('슬랙스', 'NA'), ('그린', 'NNP'), ('색', 'NNG'), ('교환', 'NNG'), ('려면', 'EC'), ('얼마', 'NNG'), ('

In [19]:
print(words[:30])
print(sequences[:20])
print(sequence)


['3', '만원', '이상', '구매', '아서', '오늘', '상품', '받', '았', '는데요', '.', '3', '만원', '이상', '구매', '시', '슬리브리스', '주', '시', 'ㄴ다고', '쓰', '어', '지', '어', '있', '는데', '왜', '저', '안', '오']
[[95, 979, 282, 47, 52, 65, 34, 32, 7, 109, 95, 979, 282, 47, 11, 8638, 15, 11, 234, 210, 10, 43, 10, 2, 4, 55, 181, 20, 37, 33, 12], [17, 85, 45, 110], [296, 35, 5, 74, 3, 206], [4599, 531, 72, 1525, 872, 494, 148, 164, 604, 23, 424, 24, 2, 1], [123, 137, 505, 1132, 189, 36, 218, 168, 82, 458, 9, 62], [8639, 47, 7, 4, 629, 88, 36, 23, 15, 11, 19], [65, 6, 29, 17, 32, 14, 2, 43, 10], [989, 5, 552, 25], [17, 157, 60, 62], [1751, 10, 60, 495, 661, 23, 15, 19], [990, 1428, 5, 74, 269, 61, 1], [958, 1443, 702, 6, 70, 18, 27, 103, 49, 1382, 49, 3, 838, 16, 26, 17, 18, 27, 1], [123, 6, 7, 4, 94, 17, 9, 23, 15, 11, 110], [182, 228, 54, 6, 3, 44, 70, 94, 103, 49, 3324, 17, 42, 10, 835, 33, 12], [157, 17, 8, 33, 12], [94, 288, 263, 10, 43, 48], [34, 14, 51, 9, 23, 60, 11, 26, 4, 1975, 166, 16, 13, 67, 124, 20, 8, 10, 2, 557, 53,

In [20]:
tokenizer_json = p.tokenizer.to_json()
with open(f'/home/azureuser/projects/aerius/ai/myapp/chatbot/models/tokenizers/{filename}_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(tokenizer_json)

In [21]:
print(sequence[:50])

[247, 17, 215, 14, 2, 43]


In [22]:
from ai.backend.settings import INTENT_MAX_SEQ_LEN
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=INTENT_MAX_SEQ_LEN, padding='post')

ds = tf.data.Dataset.from_tensor_slices((padded_seqs, encoded_intents))
ds = ds.shuffle(len(queries))

train_size = int(len(padded_seqs)*0.7)
val_size = int(len(padded_seqs)*0.2)
test_size = int(len(padded_seqs)*0.1)

train_ds = ds.take(train_size).batch(20)
val_ds = ds.skip(train_size).take(val_size).batch(20)
test_ds = ds.skip(train_size + val_size).take(test_size).batch(20)

dropout_prob = 0.5
EMB_SIZE = 128
EPOCH = 5
VOCAB_SIZE = len(p.tokenizer.word_index) + 1

input_layer = Input(shape=(INTENT_MAX_SEQ_LEN,))
embedding_layer = Embedding(VOCAB_SIZE, EMB_SIZE, input_length=INTENT_MAX_SEQ_LEN)(input_layer)
dropout_emb = Dropout(rate=dropout_prob)(embedding_layer)

conv1 = Conv1D(filters = 128,
               kernel_size = 3,
               padding = 'valid',
               activation = tf.nn.relu)(dropout_emb)
pool1 = GlobalMaxPool1D()(conv1)

conv2 = Conv1D(filters = 128,
               kernel_size = 4,
               padding = 'valid',
               activation = tf.nn.relu)(dropout_emb)
pool2 = GlobalMaxPool1D()(conv2)

conv3 = Conv1D(filters = 128,
               kernel_size = 5,
               padding = 'valid',
               activation = tf.nn.relu)(dropout_emb)
pool3 = GlobalMaxPool1D()(conv3)


concat = concatenate([pool1, pool2, pool3])

hidden = Dense(128, activation=tf.nn.relu)(concat)
dropout_hidden = Dropout(rate=dropout_prob)(hidden)
logits = Dense(num_classes, name='logits')(dropout_hidden)
predictions = Dense(num_classes, activation=tf.nn.softmax)(logits)

model = Model(inputs=input_layer, outputs=predictions)
model.compile(optimizer = 'adam',
              loss = 'sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(train_ds, validation_data = val_ds, epochs = EPOCH, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fb6ba636fd0>

In [23]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# 1. 모델 예측
y_pred = model.predict(test_ds)
y_pred_classes = np.argmax(y_pred, axis=1)

# test_ds_resampled에서 라벨만 추출
y_true = np.concatenate([y for x, y in test_ds], axis=0)

# 2. 성능 지표 계산
precision = precision_score(y_true, y_pred_classes, average='weighted', zero_division=1)
recall = recall_score(y_true, y_pred_classes, average='weighted')
f1 = f1_score(y_true, y_pred_classes, average='weighted')
accuracy = accuracy_score(y_true, y_pred_classes)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

Precision: 0.1658
Recall: 0.1657
F1 Score: 0.1657
Accuracy: 0.1657


In [24]:
loss, accuracy = model.evaluate(test_ds, verbose=0)

print('Test loss:', loss)
print('Test accuracy:', accuracy)

Test loss: 0.07064834982156754
Test accuracy: 0.9763644933700562


In [25]:
loss, accuracy = model.evaluate(test_ds, verbose=1)
print(f'Accuracy: {accuracy * 100}')
print(f'loss: {loss}')

model.save(f'/home/azureuser/projects/aerius/ai/myapp/chatbot/models/models/{filename}_oversampling.h5')

Accuracy: 97.68427014350891
loss: 0.06529298424720764


  saving_api.save_model(
