In [None]:
import os
from google.colab import drive
# Mount Google Drive at /content/drive/ so later cells can read and write files under it.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Uncomment on a fresh runtime where the package is missing:
# %pip install transformers



In [None]:
# Uncomment on a fresh runtime where the package is missing:
# %pip install tensorflow_addons



In [None]:
import os
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import urllib.request
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, \
                            roc_auc_score, confusion_matrix, classification_report, \
                            matthews_corrcoef, cohen_kappa_score, log_loss

In [None]:
# Checkpoint identifier shared by the tokenizer and the classifier.
MODEL_NAME = "klue/bert-base"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# from_pt=True loads the weights from the PyTorch checkpoint; num_labels sizes the classification head.
model = TFBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    from_pt=True,
    num_labels=6,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the labelled dataset from the mounted Drive folder.
dataset = pd.read_csv('/content/drive/MyDrive/SC/data/final_data.csv')

In [None]:
# Summarize the columns, dtypes and non-null counts of the loaded frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  2400 non-null   object
 1   emotion  2400 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 37.6+ KB


In [None]:
# Separate the text column (model input) from the emotion column (target label).
X_data, y_data = dataset['content'], dataset['emotion']

In [None]:
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Stratify on the label so both splits keep the same class proportions.
split_options = dict(test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_data)
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, **split_options)

In [None]:
# Report how many samples ended up in the training and test splits.
print(f"훈련 입력 데이터 개수: {len(X_train)}")
print(f"테스트 입력 데이터 개수: {len(X_test)}")

훈련 입력 데이터 개수: 1920
테스트 입력 데이터 개수: 480


In [None]:
# Class proportions in the training split.
y_train.value_counts(normalize = True)

3    0.166667
2    0.166667
5    0.166667
0    0.166667
4    0.166667
1    0.166667
Name: emotion, dtype: float64

In [None]:
# Class proportions in the test split.
y_test.value_counts(normalize = True)

4    0.166667
5    0.166667
2    0.166667
3    0.166667
0    0.166667
1    0.166667
Name: emotion, dtype: float64

In [None]:
# Fixed sequence length: convert_data pads or truncates every text to this many token ids.
MAX_SEQ_LEN = 64

In [None]:
def convert_data(X_data, y_data):
    """Tokenize texts and build the BERT input arrays together with the label array.

    Args:
        X_data: Iterable of raw text strings.
        y_data: Iterable of labels, paired element-wise with ``X_data``.

    Returns:
        A tuple ``([tokens, masks, segments], targets)`` of NumPy arrays.
        ``tokens`` holds the padded/truncated token ids, ``masks`` holds 1 for
        real tokens and 0 for padding, ``segments`` is all zeros, and
        ``targets`` holds the labels in input order. The first three arrays
        have one row of ``MAX_SEQ_LEN`` entries per text.
    """
    # Ask the tokenizer for its padding id instead of assuming it is 0.
    pad_id = tokenizer.pad_token_id
    tokens, masks, segments, targets = [], [], [], []

    for X, y in tqdm(zip(X_data, y_data)):
        token = tokenizer.encode(X, truncation = True, padding = 'max_length', max_length = MAX_SEQ_LEN)

        # Build the mask position by position so it always lines up with the
        # actual padding, rather than counting zeros and assuming they trail.
        mask = [int(token_id != pad_id) for token_id in token]

        # Every text is encoded as a single sequence, so all segment ids are 0.
        segment = [0]*MAX_SEQ_LEN

        tokens.append(token)
        masks.append(mask)
        segments.append(segment)
        targets.append(y)

    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)
    targets = np.array(targets)

    return [tokens, masks, segments], targets

In [None]:
# Encode the training split into BERT input arrays and a label array.
train_x, train_y = convert_data(X_train, y_train)

1920it [00:00, 5572.85it/s]


In [None]:
# Encode the test split into BERT input arrays and a label array.
test_x, test_y = convert_data(X_test, y_test)

480it [00:00, 5755.26it/s]


In [None]:
# The three Keras inputs share one shape and dtype; only the layer name differs.
input_spec = dict(shape=(MAX_SEQ_LEN,), dtype=tf.int32)
token_inputs = tf.keras.layers.Input(name='input_word_ids', **input_spec)
mask_inputs = tf.keras.layers.Input(name='input_masks', **input_spec)
segment_inputs = tf.keras.layers.Input(name='input_segment', **input_spec)

# Feed token ids, attention mask and segment ids to the BERT classifier, in that order.
bert_inputs = [token_inputs, mask_inputs, segment_inputs]
bert_outputs = model(bert_inputs)

In [None]:
# Display the symbolic output object returned by the BERT classifier.
bert_outputs

TFSequenceClassifierOutput(loss=None, logits=<KerasTensor: shape=(None, 6) dtype=float32 (created by layer 'tf_bert_for_sequence_classification_1')>, hidden_states=None, attentions=None)

In [None]:
# Take the logits tensor from the classifier output; it is the only populated field here.
bert_output = bert_outputs.logits

In [None]:
DROPOUT_RATE = 0.5
NUM_CLASS = 6

# Head stacked on the BERT classifier output: dropout, then a softmax layer over NUM_CLASS classes.
head_initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02)
dropout = tf.keras.layers.Dropout(DROPOUT_RATE)(bert_output)
sentiment_layer = tf.keras.layers.Dense(
    NUM_CLASS,
    activation='softmax',
    kernel_initializer=head_initializer,
)(dropout)

# End-to-end model from the three input layers to the class probabilities.
sentiment_model = tf.keras.Model(
    inputs=[token_inputs, mask_inputs, segment_inputs],
    outputs=sentiment_layer,
)

In [None]:
OPTIMIZER_NAME = 'RAdam'
LEARNING_RATE = 5e-5
TOTAL_STEPS = 10000
MIN_LR = 1e-5
WARMUP_PROPORTION = 0.1
EPSILON = 1e-8
CLIPNORM = 1.0

# Rectified Adam configured entirely from the constants above.
optimizer = tfa.optimizers.RectifiedAdam(
    learning_rate=LEARNING_RATE,
    total_steps=TOTAL_STEPS,
    warmup_proportion=WARMUP_PROPORTION,
    min_lr=MIN_LR,
    epsilon=EPSILON,
    clipnorm=CLIPNORM,
)

# Labels are integer class ids and the model emits softmax probabilities,
# hence the sparse categorical cross-entropy with its default settings.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
sentiment_model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [None]:
MIN_DELTA = 1e-3
PATIENCE = 5

# Stop on the same signal the ModelCheckpoint callback uses to pick the best
# model (validation loss). Monitoring val_accuracy here could end training
# while val_loss was still improving every epoch.
early_stopping = EarlyStopping(
    monitor = "val_loss",
    mode = "min",
    min_delta = MIN_DELTA,
    patience = PATIENCE)

In [None]:
cd /content/drive/MyDrive/SC

/content/drive/MyDrive/SC


In [None]:
# Checkpoint file path, relative to the current working directory.
BEST_MODEL_NAME = './model/best_model.h5'

In [None]:
# Write the model to BEST_MODEL_NAME only when validation loss reaches a new minimum.
model_checkpoint = ModelCheckpoint(
    BEST_MODEL_NAME,
    save_best_only=True,
    monitor="val_loss",
    mode="min",
    verbose=1,
)

In [None]:
# Callbacks handed to fit: early stopping plus best-model checkpointing.
callbacks = [early_stopping, model_checkpoint]

In [None]:
# Upper bound on training epochs; early stopping may end training sooner.
EPOCHS = 100
# Number of samples per gradient update.
BATCH_SIZE = 32
# Backward-compatible alias for the misspelled name still referenced by the fit cell.
BATCH_SZIE = BATCH_SIZE

In [None]:
# Train with the test split as validation data; the callbacks list is applied during training.
sentiment_model.fit(
    x=train_x,
    y=train_y,
    batch_size=BATCH_SZIE,
    epochs=EPOCHS,
    shuffle=True,
    validation_data=(test_x, test_y),
    callbacks=callbacks,
)

Epoch 1/100
Epoch 1: val_loss improved from inf to 0.69194, saving model to ./model/best_model.h5


  saving_api.save_model(


Epoch 2/100
Epoch 2: val_loss improved from 0.69194 to 0.61319, saving model to ./model/best_model.h5
Epoch 3/100
Epoch 3: val_loss improved from 0.61319 to 0.54192, saving model to ./model/best_model.h5
Epoch 4/100
Epoch 4: val_loss improved from 0.54192 to 0.47851, saving model to ./model/best_model.h5
Epoch 5/100
Epoch 5: val_loss improved from 0.47851 to 0.42198, saving model to ./model/best_model.h5
Epoch 6/100
Epoch 6: val_loss improved from 0.42198 to 0.37241, saving model to ./model/best_model.h5


<keras.src.callbacks.History at 0x7dc030503310>

### 예측값 계산

In [None]:
# Reload the saved checkpoint; the BERT classifier class is passed as a custom
# object so Keras can resolve it while deserializing.
bert_custom_objects = {'TFBertForSequenceClassification': TFBertForSequenceClassification}
sentiment_model_best = tf.keras.models.load_model(
    BEST_MODEL_NAME,
    custom_objects=bert_custom_objects,
)

In [None]:
# Per-class scores for every test sample, then the index of the highest score as the label.
predicted_value = sentiment_model_best.predict(test_x)
predicted_label = predicted_value.argmax(axis=1)