In [1]:
!pip install transformers



In [2]:
!pip install tensorflow_addons

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow_addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.23.0 typeguard-2.13.3


In [3]:
import os
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import urllib.request
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, \
                            roc_auc_score, confusion_matrix, classification_report, \
                            matthews_corrcoef, cohen_kappa_score, log_loss


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [4]:
MODEL_NAME = "klue/bert-base"
model = TFBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=9, from_pt=True)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

In [5]:
dataset = pd.read_csv('new.csv', encoding = 'utf-8-sig')

Unnamed: 0.1,Unnamed: 0,category,Keyword,title,topic
0,0,Economy,노선,천안시 무단 방치된 개인형 이동장치 1천23대 견인,1
1,1,Economy,노선,컨콜 포스코홀딩스 CEO 교체돼도 중장기 전략 변함없어,1
2,2,Economy,노선,ESG 평가서 LS ELECTRIC삼성물산 개선세 돋보여 투자360,1
3,3,Economy,노선,52조 작년 국세 급감절반 가까이는 법인세 구멍,1
4,4,Economy,노선,이석희 SK온 흑자 낼 때까지 연봉 20 반납,1


In [6]:
from sklearn.preprocessing import LabelEncoder

In [19]:
category_encoders = {}

# Assuming your DataFrame is named 'dataset'
for category in range(6):
    label_encoder = LabelEncoder()
    unique_keywords = dataset.loc[dataset['category'] == category, 'Keyword'].unique()
    label_encoder.fit(unique_keywords)
    category_encoders[category] = label_encoder

# Create a new column 'encoded_keyword' in the DataFrame
dataset['encoded_keyword'] = None

# Apply label encoding for each category and replace 'None' with the corresponding topic values
for category in range(6):
    mask = dataset['category'] == category
    dataset.loc[mask, 'encoded_keyword'] = category_encoders[category].transform(dataset.loc[mask, 'Keyword'].values) + 1

# Replace 'None' values in 'encoded_keyword' with corresponding 'topic' values
dataset['encoded_keyword'] = dataset['encoded_keyword'].combine_first(dataset['topic'])

In [21]:
X_data = dataset['title']
y_data = dataset['encoded_keyword']

In [22]:
TEST_SIZE = 0.2
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=42
)

In [23]:
MAX_SEQ_LEN = 64
def convert_data(X_data, y_data):
    tokens, masks, segments, targets = [], [], [], []

    for X, y in tqdm(zip(X_data, y_data)):
        # token: 입력 문장 토큰화
        token = tokenizer.encode(X, truncation = True, padding = 'max_length', max_length = MAX_SEQ_LEN)

        num_zeros = token.count(0)
        mask = [1] * (MAX_SEQ_LEN - num_zeros) + [0] * num_zeros

        # segment: 문장 전후관계 구분: 오직 한 문장이므로 모두 0으로 초기화
        segment = [0]*MAX_SEQ_LEN

        tokens.append(token)
        masks.append(mask)
        segments.append(segment)
        targets.append(y)

    # numpy array로 저장
    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)
    targets = np.array(targets)

    return [tokens, masks, segments], targets

In [24]:
train_x, train_y = convert_data(X_train, y_train)
test_x, test_y = convert_data(X_test, y_test)

764it [00:00, 4786.18it/s]
192it [00:00, 4617.37it/s]


In [25]:
# token, mask, segment 입력 정의
token_inputs = tf.keras.layers.Input((MAX_SEQ_LEN,), dtype = tf.int32, name = 'input_word_ids')
mask_inputs = tf.keras.layers.Input((MAX_SEQ_LEN,), dtype = tf.int32, name = 'input_masks')
segment_inputs = tf.keras.layers.Input((MAX_SEQ_LEN,), dtype = tf.int32, name = 'input_segment')
bert_outputs = model([token_inputs, mask_inputs, segment_inputs])

In [26]:
bert_output = bert_outputs[0]
DROPOUT_RATE = 0.5
NUM_CLASS = 9
dropout = tf.keras.layers.Dropout(DROPOUT_RATE)(bert_output)


optimized_layer = tf.keras.layers.Dense(NUM_CLASS, activation='softmax', kernel_initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02))(dropout)
optimized_model = tf.keras.Model([token_inputs, mask_inputs, segment_inputs], optimized_layer)

In [27]:
OPTIMIZER_NAME = 'RAdam'
LEARNING_RATE = 5e-5
TOTAL_STEPS = 10000
MIN_LR = 1e-5
WARMUP_PROPORTION = 0.1
EPSILON = 1e-8
CLIPNORM = 1.0
optimizer = tfa.optimizers.RectifiedAdam(learning_rate = LEARNING_RATE,
                                          total_steps = TOTAL_STEPS,
                                          warmup_proportion = WARMUP_PROPORTION,
                                          min_lr = MIN_LR,
                                          epsilon = EPSILON,
                                          clipnorm = CLIPNORM)

In [28]:
optimized_model.compile(optimizer = optimizer,
                        loss = tf.keras.losses.SparseCategoricalCrossentropy(),
                        metrics = ['accuracy'])
MIN_DELTA = 1e-3
PATIENCE = 5

early_stopping = EarlyStopping(
    monitor = "val_accuracy",
    min_delta = MIN_DELTA,
    patience = PATIENCE)

In [29]:
BEST_MODEL_NAME = './model/best_model.h5'
model_checkpoint = ModelCheckpoint(
    filepath = BEST_MODEL_NAME,
    monitor = "val_loss",
    mode = "min",
    save_best_only = True,
    verbose = 1
)
callbacks = [early_stopping, model_checkpoint]

In [30]:
train_y = train_y - 1
test_y = test_y - 1

In [31]:
EPOCHS = 100
BATCH_SZIE = 32

optimized_model.fit(train_x, train_y,
                    epochs = EPOCHS,
                    shuffle = True,
                    batch_size = BATCH_SZIE,
                    validation_data = (test_x, test_y),
                    callbacks = callbacks
                    )

Epoch 1/100
Epoch 1: val_loss improved from inf to 2.19802, saving model to ./model/best_model.h5


  saving_api.save_model(


Epoch 2/100
Epoch 2: val_loss improved from 2.19802 to 2.19797, saving model to ./model/best_model.h5
Epoch 3/100
Epoch 3: val_loss improved from 2.19797 to 2.19782, saving model to ./model/best_model.h5
Epoch 4/100
Epoch 4: val_loss improved from 2.19782 to 2.19766, saving model to ./model/best_model.h5
Epoch 5/100
Epoch 5: val_loss improved from 2.19766 to 2.19748, saving model to ./model/best_model.h5
Epoch 6/100
Epoch 6: val_loss improved from 2.19748 to 2.19703, saving model to ./model/best_model.h5
Epoch 7/100
Epoch 7: val_loss improved from 2.19703 to 2.19654, saving model to ./model/best_model.h5
Epoch 8/100
Epoch 8: val_loss improved from 2.19654 to 2.19618, saving model to ./model/best_model.h5
Epoch 9/100
Epoch 9: val_loss improved from 2.19618 to 2.19601, saving model to ./model/best_model.h5
Epoch 10/100
Epoch 10: val_loss improved from 2.19601 to 2.19600, saving model to ./model/best_model.h5
Epoch 11/100
Epoch 11: val_loss improved from 2.19600 to 2.19578, saving model t

<keras.src.callbacks.History at 0x77fe82d5eaa0>

In [32]:
optimized_model_best = tf.keras.models.load_model(BEST_MODEL_NAME,
                                                  custom_objects={'TFBertForSequenceClassification': TFBertForSequenceClassification})

In [38]:
import os
from sklearn.metrics import classification_report

metrics_directory = 'metric'
if not os.path.exists(metrics_directory):
    os.makedirs(metrics_directory)

CL_REPORT_FILE = os.path.join(metrics_directory, 'classification_report.csv')

# Save the classification report to CSV
cl_report_df.to_csv(CL_REPORT_FILE)
print(cl_report_df)

              precision  recall  f1-score  support
0                 0.125   0.789     0.216    19.00
1                 0.129   0.148     0.138    27.00
2                 0.158   0.300     0.207    20.00
3                 0.000   0.000     0.000    29.00
4                 0.000   0.000     0.000    18.00
5                 0.000   0.000     0.000    23.00
6                 0.000   0.000     0.000    18.00
7                 0.000   0.000     0.000    14.00
8                 0.000   0.000     0.000    24.00
accuracy          0.130   0.130     0.130     0.13
macro avg         0.046   0.138     0.062   192.00
weighted avg      0.047   0.130     0.062   192.00
