# 모델 선택 및 학습
## 모델 선택

In [1]:
"""
import os
import tensorflow as tf
# GPU 메모리 사용 점진적 할당 설정
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)
"""

import os
import tensorflow as tf


# 토크나이저 관련 경고 무시하기 위하여 설정
os.environ["TOKENIZERS_PARALLELISM"] = 'true'

# device 지정
if tf.config.list_physical_devices('GPU'):
    print("GPU is available.")
else:
    print("GPU is not available.")

GPU is available.


In [3]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.17.3-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.9 MB)
     |████████████████████████████████| 6.9 MB 5.7 MB/s            
Collecting setproctitle
  Downloading setproctitle-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-2.6.0-py2.py3-none-any.whl (296 kB)
     |████████████████████████████████| 296 kB 66.1 MB/s            
[?25hCollecting gitpython!=3.1.29,>=1.0.0
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
     |████████████████████████████████| 207 kB 66.4 MB/s            
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.11-py3-none-any.whl (62 kB)
     |████████████████████████████████| 62 kB 2.0 MB/s             
Collecting urllib3<1.27,>=1.21.1
  Downloading url

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import wandb
import torch
import random

# Function to set the seed for reproducibility
def set_seed(seed_value=42):
    """Set seed for reproducibility."""
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)  # if you are using multi-GPU.
    random.seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # The below two lines are for deterministic algorithm behavior in CUDA
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set the seed
set_seed()

In [5]:
# 테스트 환경 속 설정 변수
epochs = 20
batch_size = 8
validation_split = 0.2

In [10]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# 파일 경로 설정
train_file_path = 'Data/processed_train2.csv'
test_file_path = 'Data/processed_test2.csv'

train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

"""
# 텍스트 정제 함수
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # 다중 공백 제거
    text = re.sub(r'[^\w\s]', '', text)  # 특수 문자 제거
    return text.strip()


train_data['cleaned_conversation'] = train_data['conversation'].apply(clean_text)
train_data['text_length'] = train_data['cleaned_conversation'].apply(len)

# 토크나이저 설정
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['cleaned_conversation'])

# 시퀀스 변환 및 패딩
sequences = tokenizer.texts_to_sequences(train_data['cleaned_conversation'])
max_length = 300
X_data = pad_sequences(sequences, maxlen=max_length, padding='post')

# 레이블 인코딩
label_encoder = LabelEncoder()
y_data = label_encoder.fit_transform(train_data['class'])

# 결과 확인
print('토크나이저 단어 개수:', len(tokenizer.word_index))
print('X_data 크기:', X_data.shape)
print('y_data 크기:', y_data.shape)
"""

"\n# 텍스트 정제 함수\ndef clean_text(text):\n    text = re.sub(r'\\s+', ' ', text)  # 다중 공백 제거\n    text = re.sub(r'[^\\w\\s]', '', text)  # 특수 문자 제거\n    return text.strip()\n\n\ntrain_data['cleaned_conversation'] = train_data['conversation'].apply(clean_text)\ntrain_data['text_length'] = train_data['cleaned_conversation'].apply(len)\n\n# 토크나이저 설정\ntokenizer = Tokenizer()\ntokenizer.fit_on_texts(train_data['cleaned_conversation'])\n\n# 시퀀스 변환 및 패딩\nsequences = tokenizer.texts_to_sequences(train_data['cleaned_conversation'])\nmax_length = 300\nX_data = pad_sequences(sequences, maxlen=max_length, padding='post')\n\n# 레이블 인코딩\nlabel_encoder = LabelEncoder()\ny_data = label_encoder.fit_transform(train_data['class'])\n\n# 결과 확인\nprint('토크나이저 단어 개수:', len(tokenizer.word_index))\nprint('X_data 크기:', X_data.shape)\nprint('y_data 크기:', y_data.shape)\n"

In [11]:
"""
padded_sequences = X_data
labels = y_data
"""
train_data.head()

Unnamed: 0,conversation,class
0,지금 당장 뉴스 기사 내가 불러준 대로 보도 해 사실 확인이 되지 않은 기사는 낼...,0.0
1,이 버러지 같은 게 너 내가 누군 줄 알아 손님 욕하시면 안 됩니다 어디서 말...,1.0
2,공책 돌려받길 원하면 빨리 뛰어봐 이 굼벵아 빨리 내놔 빨릐 내놓아야 빨리 ...,1.0
3,사장님 저기 말할게 있는데요 뭔데 임마 아니 우리 게임 회사는 전체이용가 게임이잖아...,3.0
4,죽고 싶어서 환장했어 왜 이렇게 말을 안 들어 죄송해요 한 번만 봐주세요 ...,0.0


In [13]:
test_data.head()

Unnamed: 0,text
0,아가씨 담배 한 갑 주소 네 4 500원입니다 어 네 지갑 어디 갔지 에이 버스...
1,우리 팀에서 다른 팀으로 갈 사람 없나 그럼 영지씨가 가는 건 어때 네 제가...
2,너 오늘 그게 뭐야 네 제가 뭘 잘못했나요 제대로 좀 하지 네 똑바로 좀 하지 행...
3,이거 들어봐 와 이 노래 진짜 좋다 그치 요즘 이것만 들어 진짜 너무 좋다 내가 요...
4,아무튼 앞으로 네가 내 와이파이야 응 와이파이 온 켰어 반말 주인님이라고도 ...


## BERT MODEL

In [29]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

train_texts = train_data['conversation']
test_texts = test_data['text']
labels = train_data['class']

# 레이블 인코딩
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# 학습 데이터와 검증 데이터로 나누기
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, labels_encoded, test_size=0.1, random_state=42)

# datasets 라이브러리를 사용하여 데이터셋 생성
train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
val_dataset = Dataset.from_dict({'text': val_texts, 'label': val_labels})
test_dataset = Dataset.from_dict({'text': test_texts})

# 토큰화
train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)
test_encodings = tokenize_function(test_texts)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# 토크나이저 초기화 및 데이터 전처리
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)


# 데이터셋에 전처리 함수 적용
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

# Data Collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /aiffel/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /aiffel/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.76ea01b4b85ac16e2cec55c398cba7a943d89ab21dfdd973f6630a152e4b9aed
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /aiffel/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535c

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [30]:
model = BertForSequenceClassification\
        .from_pretrained('bert-base-uncased', num_labels=5)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /aiffel/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embeddi

In [31]:
training_args = TrainingArguments(
    output_dir='./Model',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',

    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
    logging_steps=10,
    ## ----
    report_to="tensorboard",
)

PyTorch: setting up devices


In [32]:
# 평가용 함수 정의
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

In [37]:
# 훈련 루프
# Select the first N samples from the tokenized training dataset
# subset_train_dataset = train_encodings.select(range(6000)) # 1/2 data for time saving

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 4096
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 384


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.2709,1.188664,0.416667,0.344141,0.605407,0.416667


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 456
  Batch size = 8


In [None]:
# 모델 저장하기
# Specify the directory where you want to save your model
output_dir = './bert-topic-cls'

# Save the model
model.save_pretrained(output_dir)
# Save the tokenizer
tokenizer.save_pretrained(output_dir)

In [None]:
# Evaluate the model
results = trainer.evaluate()

In [None]:
print( results )

In [None]:
# Predictions to get the confusion matrix
predictions = trainer.predict(tokenized_datasets['test'])
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

label_map = {
    'LABEL_0': 'A',
    'LABEL_1': 'B',
    'LABEL_2': 'C',
    'LABEL_3': 'D'
    'LABEL_4': 'E'
}

cm = confusion_matrix(predictions.label_ids, preds)

# label_map to labels
labels = [label_map[f'LABEL_{i}'] for i in range(len(label_map))]

# Confusion Matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix with Label Names')
plt.show()

In [None]:
model_name = 'robera_base'
model.save(f'Model/{model_name}.h5')

In [None]:
from transformers import AutoTokenizer
import datetime

time = datetime.datetime.now().strftime('%d%H%M')

def create_submission_file(model_name, test_data, tokenizer, label_encoder, max_length):
    # 토크나이저 타입 확인 및 시퀀스 변환
    if isinstance(tokenizer, AutoTokenizer):
        # Transformers 토크나이저 사용
        test_texts = test_data['text'].tolist()
        X_test = tokenizer.batch_encode_plus(
            test_texts,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='tf'
        )['input_ids']
    elif isinstance(tokenizer, Tokenizer):
        # Keras Tokenizer 사용
        X_test = tokenizer.texts_to_sequences(test_data['text'].tolist())
        X_test = pad_sequences(X_test, maxlen=max_length, padding='post')
    else:
        raise ValueError("지원되지 않는 토크나이저 타입입니다.")
    
    # 모델 로드
    model = tf.keras.models.load_model(f'Model/{model_name}.h5')
    
    # 예측 생성
    test_predictions = model.predict(X_test)
    test_predicted_labels = tf.argmax(test_predictions, axis=1).numpy()
    
    # 라벨 디코딩
    test_predicted_labels = label_encoder.inverse_transform(test_predicted_labels)
    
    # 서브미션 파일 생성
    submission = pd.DataFrame({'index': test_data.index, 'class': test_predicted_labels})
    submission_filename = f'Output/{time}_{model_name}_{epochs}epochs_submission.csv'
    submission.to_csv(submission_filename, index=False)
    print(f'Submission file created: {submission_filename}')

create_submission_file(model_name, test_data, tokenizer, label_encoder, max_length)