# 모델 선택 및 학습
## 모델 선택

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/MyDrive/대학/활동/아이펠/git/jo

/content/drive/MyDrive/대학/활동/아이펠/git/jo


In [43]:
mkdir Output

In [3]:
"""
import os
import tensorflow as tf
# GPU 메모리 사용 점진적 할당 설정
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)
"""

import os
import tensorflow as tf


# 토크나이저 관련 경고 무시하기 위하여 설정
os.environ["TOKENIZERS_PARALLELISM"] = 'true'

# device 지정
if tf.config.list_physical_devices('GPU'):
    print("GPU is available.")
else:
    print("GPU is not available.")

GPU is available.


In [4]:
!pip install -qqq seaborn # for evaluation visualization
!pip install -qqq wandb   # for logging
!pip install -qqq datasets # huggingface's lib.
!pip install -qqq transformers==4.39.2
!pip install -qqq accelerate==0.28.0
!pip install -qqq shortuuid

!pip install -U accelerate
!pip install tensorboard

Collecting accelerate
  Using cached accelerate-0.31.0-py3-none-any.whl (309 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.28.0
    Uninstalling accelerate-0.28.0:
      Successfully uninstalled accelerate-0.28.0
Successfully installed accelerate-0.31.0


In [5]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
# from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import wandb
import torch
import random

# Function to set the seed for reproducibility
def set_seed(seed_value=42):
    """Set seed for reproducibility."""
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)  # if you are using multi-GPU.
    random.seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # The below two lines are for deterministic algorithm behavior in CUDA
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set the seed
set_seed()

In [58]:
# 테스트 환경 속 설정 변수
epochs = 20
batch_size = 8
validation_split = 0.2
max_length = 300

In [7]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# 파일 경로 설정
train_file_path = 'Data/processed_train2.csv'

train_data = pd.read_csv(train_file_path)

"\n# 텍스트 정제 함수\ndef clean_text(text):\n    text = re.sub(r'\\s+', ' ', text)  # 다중 공백 제거\n    text = re.sub(r'[^\\w\\s]', '', text)  # 특수 문자 제거\n    return text.strip()\n\n\ntrain_data['cleaned_conversation'] = train_data['conversation'].apply(clean_text)\ntrain_data['text_length'] = train_data['cleaned_conversation'].apply(len)\n\n# 토크나이저 설정\ntokenizer = Tokenizer()\ntokenizer.fit_on_texts(train_data['cleaned_conversation'])\n\n# 시퀀스 변환 및 패딩\nsequences = tokenizer.texts_to_sequences(train_data['cleaned_conversation'])\nmax_length = 300\nX_data = pad_sequences(sequences, maxlen=max_length, padding='post')\n\n# 레이블 인코딩\nlabel_encoder = LabelEncoder()\ny_data = label_encoder.fit_transform(train_data['class'])\n\n# 결과 확인\nprint('토크나이저 단어 개수:', len(tokenizer.word_index))\nprint('X_data 크기:', X_data.shape)\nprint('y_data 크기:', y_data.shape)\n"

In [8]:
train_data.head()

Unnamed: 0,conversation,class
0,지금 당장 뉴스 기사 내가 불러준 대로 보도 해 사실 확인이 되지 않은 기사는 낼...,0.0
1,이 버러지 같은 게 너 내가 누군 줄 알아 손님 욕하시면 안 됩니다 어디서 말...,1.0
2,공책 돌려받길 원하면 빨리 뛰어봐 이 굼벵아 빨리 내놔 빨릐 내놓아야 빨리 ...,1.0
3,사장님 저기 말할게 있는데요 뭔데 임마 아니 우리 게임 회사는 전체이용가 게임이잖아...,3.0
4,죽고 싶어서 환장했어 왜 이렇게 말을 안 들어 죄송해요 한 번만 봐주세요 ...,0.0


In [9]:
test_data.head()

Unnamed: 0,text
0,아가씨 담배 한 갑 주소 네 4 500원입니다 어 네 지갑 어디 갔지 에이 버스...
1,우리 팀에서 다른 팀으로 갈 사람 없나 그럼 영지씨가 가는 건 어때 네 제가...
2,너 오늘 그게 뭐야 네 제가 뭘 잘못했나요 제대로 좀 하지 네 똑바로 좀 하지 행...
3,이거 들어봐 와 이 노래 진짜 좋다 그치 요즘 이것만 들어 진짜 너무 좋다 내가 요...
4,아무튼 앞으로 네가 내 와이파이야 응 와이파이 온 켰어 반말 주인님이라고도 ...


## BERT MODEL

In [59]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

model_name = 'bert-base-uncased'

train_texts = train_data['conversation']
test_texts = test_data['text']
labels = train_data['class']

# 레이블 인코딩
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# 학습 데이터와 검증 데이터로 나누기
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, labels_encoded, test_size=0.1, random_state=42)

# datasets 라이브러리를 사용하여 데이터셋 생성
train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
val_dataset = Dataset.from_dict({'text': val_texts, 'label': val_labels})
test_dataset = Dataset.from_dict({'text': test_texts})

# 토크나이저 초기화 및 데이터 전처리
tokenizer = BertTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=300)

# 토큰화
train_encodings = tokenize_function(train_dataset)
val_encodings = tokenize_function(val_dataset)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# 데이터셋에 전처리 함수 적용
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

# Data Collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



Map:   0%|          | 0/4096 [00:00<?, ? examples/s]

Map:   0%|          | 0/456 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [60]:
model = BertForSequenceClassification\
        .from_pretrained(model_name, num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [61]:
training_args = TrainingArguments(
    output_dir='./Model',
    num_train_epochs=8,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',

    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
    logging_steps=10,
    ## ----
    report_to="tensorboard",
)

In [62]:
# 평가용 함수 정의
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

In [63]:
# 훈련 루프
# Select the first N samples from the tokenized training dataset
# subset_train_dataset = train_encodings.select(range(6000)) # 1/2 data for time saving

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [64]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.349,1.273094,0.394737,0.300402,0.268681,0.394737
2,1.1852,1.187087,0.41886,0.321393,0.512079,0.41886
3,0.8416,0.900535,0.649123,0.634874,0.724377,0.649123
4,0.676,0.646187,0.758772,0.75487,0.768908,0.758772
5,0.6173,0.509125,0.833333,0.830743,0.837938,0.833333
6,0.4677,0.476939,0.839912,0.840377,0.843237,0.839912
7,0.2872,0.540239,0.857456,0.857951,0.860161,0.857456
8,0.1956,0.536737,0.842105,0.842014,0.844054,0.842105


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1024, training_loss=0.7324150792555884, metrics={'train_runtime': 899.8139, 'train_samples_per_second': 36.416, 'train_steps_per_second': 1.138, 'total_flos': 5051868335308800.0, 'train_loss': 0.7324150792555884, 'epoch': 8.0})

In [65]:
import datetime

time = datetime.datetime.now().strftime('%H%M')

# 모델 저장하기
# Specify the directory where you want to save your model
output_dir = f'./Model/{model_name}_{time}/'

# Save the model
model.save_pretrained(output_dir)
# Save the tokenizer
tokenizer.save_pretrained(output_dir)

('./Model/bert-base-uncased_1735/tokenizer_config.json',
 './Model/bert-base-uncased_1735/special_tokens_map.json',
 './Model/bert-base-uncased_1735/vocab.txt',
 './Model/bert-base-uncased_1735/added_tokens.json')

In [66]:
# Evaluate the model
results = trainer.evaluate()

In [67]:
print( results )

{'eval_loss': 0.5367370843887329, 'eval_accuracy': 0.8421052631578947, 'eval_f1': 0.8420135915958454, 'eval_precision': 0.844053643664037, 'eval_recall': 0.8421052631578947, 'eval_runtime': 3.9624, 'eval_samples_per_second': 115.081, 'eval_steps_per_second': 14.385, 'epoch': 8.0}


In [68]:
# Predictions to get the confusion matrix
predictions = trainer.predict(tokenized_datasets['test'])
preds = np.argmax(predictions.predictions, axis=-1)

In [69]:
predictions

# Check the available columns in your test dataset
print(predictions)

PredictionOutput(predictions=array([[ 1.9356124 , -0.9576371 ,  3.3543768 , -2.295366  , -3.107792  ],
       [-1.4816536 , -0.9343721 , -0.8909223 ,  5.5184135 , -1.3941809 ],
       [-1.0034852 , -0.7130526 , -0.6092054 ,  5.3040376 , -2.19336   ],
       ...,
       [-0.666922  , -0.9330128 ,  4.4561467 , -1.7061614 , -1.6794215 ],
       [ 4.3397064 , -0.23564854, -0.67706364, -1.7294174 , -2.4549055 ],
       [ 3.9849417 , -0.8383164 ,  1.0388612 , -2.3464298 , -2.9913821 ]],
      dtype=float32), label_ids=None, metrics={'test_runtime': 4.3907, 'test_samples_per_second': 113.878, 'test_steps_per_second': 14.349})


In [71]:
test_file_path = 'Data/processed_test2.csv'
test_data = pd.read_csv(test_file_path)

inputs = tokenizer(test_dataset['text'], padding=True, truncation=True, max_length=300, return_tensors="pt")

import torch
# Move inputs to the same device as the model
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Make prediction
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(-1).cpu().numpy()  # Get the predicted class (index)

# 서브미션 파일 생성
submission = pd.DataFrame({'index': test_data.index, 'class': predictions})
submission_filename = f'Output/{time}_{model_name}_{epochs}epochs_submission.csv'
submission.to_csv(submission_filename, index=False)
print(f'Submission file created: {submission_filename}')

Submission file created: Output/1735_bert-base-uncased_20epochs_submission.csv
