# 모델 선택 및 학습
## 모델 선택

In [1]:
import os
import tensorflow as tf
# GPU 메모리 사용 점진적 할당 설정
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

In [2]:
# 테스트 환경 속 설정 변수
epochs = 20
batch_size = 32
validation_split = 0.2

In [3]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# 파일 경로 설정
train_file_path = 'Data/processed_train2.csv'
test_file_path = 'Data/processed_test2.csv'
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)


# 텍스트 정제 함수
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # 다중 공백 제거
    text = re.sub(r'[^\w\s]', '', text)  # 특수 문자 제거
    return text.strip()

train_data['cleaned_conversation'] = train_data['conversation'].apply(clean_text)
train_data['text_length'] = train_data['cleaned_conversation'].apply(len)

# 토크나이저 설정
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['cleaned_conversation'])

# 시퀀스 변환 및 패딩
sequences = tokenizer.texts_to_sequences(train_data['cleaned_conversation'])
max_length = 300
X_data = pad_sequences(sequences, maxlen=max_length, padding='post')

# 레이블 인코딩
label_encoder = LabelEncoder()
y_data = label_encoder.fit_transform(train_data['class'])

# 결과 확인
print('토크나이저 단어 개수:', len(tokenizer.word_index))
print('X_data 크기:', X_data.shape)
print('y_data 크기:', y_data.shape)

토크나이저 단어 개수: 43385
X_data 크기: (4552, 300)
y_data 크기: (4552,)


In [4]:
padded_sequences = X_data
labels = y_data

## klue/roberta-base

In [5]:
from transformers import pipeline
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

# 로버타 모델과 토그나이저 설정
pipe = pipeline('text-classification', model='klue/roberta-base-nli')

tokenizer = pipe.tokenizer

# 데이터 전처리 함수
def encode_texts(texts, max_length=128):
    inputs = tokenizer(texts.tolist(), max_length=max_length, padding='max_length', truncation=True, return_tensors='tf')
    return inputs.data

# 데이터 로드 및 전처리
texts = train_data['cleaned_conversation'].tolist()
labels = train_data['class'].tolist()

X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

X_train_encoded = encode_texts(X_train)
X_val_encoded = encode_texts(X_val)

# 모델 구성
input_ids = Input(shape=(128,), name='input_ids', dtype='int32')
attention_mask = Input(shape=(128,), name='attention_mask', dtype='int32')

roberta_output = pipe(input_ids, attention_mask)

output = Dense(7, activation='softmax')(roberta_output[0][:, 0, :])

model = Model(inputs=[input_ids, attention_mask], outputs=output)

# 모델 컴파일
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 300)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 300, 128)          5553408   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                49408     
_________________________________________________________________
output (Dense)               (None, 4)                 260       
Total params: 5,603,076
Trainable params: 5,603,076
Non-trainable params: 0
_________________________________________________________________


In [6]:
# 모델 학습
history = model.fit(
    [X_train_encoded.data, X_train_encoded.attention_mask], 
    np.array(y_train), 
    validation_data=([X_val_encoded.data, X_val_encoded.attention_mask], np.array(y_val)), 
    epochs=epochs, 
    batch_size=batch_size
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [7]:
model_name = 'robera_base'
model.save(f'Model/{model_name}.h5')

In [14]:
from transformers import AutoTokenizer
import datetime

time = datetime.datetime.now().strftime('%d%H%M')

def create_submission_file(model_name, test_data, tokenizer, label_encoder, max_length):
    # 토크나이저 타입 확인 및 시퀀스 변환
    if isinstance(tokenizer, AutoTokenizer):
        # Transformers 토크나이저 사용
        test_texts = test_data['text'].tolist()
        X_test = tokenizer.batch_encode_plus(
            test_texts,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='tf'
        )['input_ids']
    elif isinstance(tokenizer, Tokenizer):
        # Keras Tokenizer 사용
        X_test = tokenizer.texts_to_sequences(test_data['text'].tolist())
        X_test = pad_sequences(X_test, maxlen=max_length, padding='post')
    else:
        raise ValueError("지원되지 않는 토크나이저 타입입니다.")
    
    # 모델 로드
    model = tf.keras.models.load_model(f'Model/{model_name}.h5')
    
    # 예측 생성
    test_predictions = model.predict(X_test)
    test_predicted_labels = tf.argmax(test_predictions, axis=1).numpy()
    
    # 라벨 디코딩
    test_predicted_labels = label_encoder.inverse_transform(test_predicted_labels)
    
    # 서브미션 파일 생성
    submission = pd.DataFrame({'index': test_data.index, 'class': test_predicted_labels})
    submission_filename = f'Output/{time}_{model_name}_{epochs}epochs_submission.csv'
    submission.to_csv(submission_filename, index=False)
    print(f'Submission file created: {submission_filename}')

create_submission_file(model_name, test_data, tokenizer, label_encoder, max_length)

Submission file created: Output/250810_model_bi_20epochs_submission.csv
