In [26]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from tqdm import tqdm
from transformers import TFAutoModel, AutoTokenizer, BertTokenizer
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

import warnings
warnings.filterwarnings(action='ignore')

In [28]:
# Directory holding the competition CSVs and the name of the training file.
file_path = '/kaggle/input/sentence/'
file_name = 'train.csv'

# Both files live in the same directory; build the two full paths, then read them.
train_csv, test_csv = (f'{file_path}{name}' for name in (file_name, 'test.csv'))
df = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)
df.head()

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label
0,TRAIN_00000,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실
1,TRAIN_00001,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
2,TRAIN_00002,정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30%에서 37%까지...,사실형,긍정,미래,확실,사실형-긍정-미래-확실
3,TRAIN_00003,"서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만, 하루 만에 ...",사실형,긍정,과거,확실,사실형-긍정-과거-확실
4,TRAIN_00004,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실


In [29]:
# Keep only Latin letters, digits, Hangul syllables and whitespace; strip everything else.
# Note that this also removes '.' and '%', so a token such as "0.75%" becomes "075".
# The pattern is a raw string so `\s` reaches the regex engine verbatim instead of being
# parsed (with a warning) as an invalid Python string escape.
df['문장'] = df['문장'].apply(lambda x: re.sub(r'[^A-Za-z0-9가-힣\s]', '', x))

In [30]:
# Easy Data Augmentation
# 참고: https://github.com/catSirup/KorEDA/blob/master/eda.py
import random

def random_swap(words, n=3):
    """Return a copy of `words` after `n` random pairwise swap attempts.

    Args:
        words: List of tokens; it is copied first, so the caller's list is not modified.
        n: Number of times `swap_word` is applied to the copy.

    Returns:
        The copied list after all swap attempts.
    """
    shuffled = words.copy()
    attempts_left = n
    while attempts_left > 0:
        shuffled = swap_word(shuffled)
        attempts_left -= 1
    return shuffled

def swap_word(new_words):
    """Swap two randomly chosen positions of `new_words` in place.

    A first index is drawn, then a second index is re-drawn until it differs
    from the first. After four unsuccessful draws the list is returned
    unchanged, so a call is not guaranteed to swap anything.

    Args:
        new_words: List of tokens; it is mutated in place.

    Returns:
        The same list object that was passed in.
    """
    # With fewer than two items there is nothing to swap; on an empty list
    # `random.randint(0, -1)` would raise ValueError.
    if len(new_words) < 2:
        return new_words

    random_idx_1 = random.randint(0, len(new_words)-1)
    random_idx_2 = random_idx_1
    counter = 0

    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(new_words)-1)
        counter += 1
        # Give up after a few collisions rather than looping indefinitely.
        if counter > 3:
            return new_words

    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words


def augment_data(sentence, alpha_rs=0.1, num_aug=9):
    """Create random-swap augmentations of a space-separated sentence.

    Args:
        sentence: Input text; it is split on single spaces and empty tokens are dropped.
        alpha_rs: Fraction of the word count used as the number of swap attempts
            per augmented sentence (at least one attempt is always requested).
        num_aug: When >= 1, the maximum number of sentences returned. When < 1,
            it is divided by the number of candidates to get a keep-probability
            applied to each candidate independently.

    Returns:
        A shuffled list of augmented sentences, each re-joined with single spaces.
        ``int(num_aug / 4) + 1`` candidates are generated before trimming, so the
        defaults yield three sentences.
    """
    # Truthiness test instead of `is not ""`: identity comparison against a
    # literal depends on string interning and raises a SyntaxWarning.
    words = [word for word in sentence.split(' ') if word]
    num_words = len(words)

    num_new_per_technique = int(num_aug/4) + 1
    n_rs = max(1, int(alpha_rs*num_words))

    # rs (random swap)
    augmented_sentences = []
    for _ in range(num_new_per_technique):
        if num_words > 1:
            a_words = random_swap(words, n_rs)
        else:
            # Zero or one word: nothing can be swapped, and swapping on an
            # empty list would fail, so reuse the tokens as they are.
            a_words = words
        augmented_sentences.append(" ".join(a_words))

    random.shuffle(augmented_sentences)

    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

    return augmented_sentences

In [31]:
# One list of augmented variants per training sentence (default augment_data settings).
augmentation = df['문장'].apply(augment_data)
augmentation[0]

['075포인트 금리 만에 1994년 이후 28년 인상은 처음이다',
 '075포인트 1994년 인상은 금리 이후 28년 만에 처음이다',
 '075포인트 금리 인상은 1994년 이후 만에 28년 처음이다']

In [32]:
# Stack the original frame with three augmented copies, where copy `i` has its sentence
# column replaced by the i-th augmented variant of each row (augment_data's defaults
# produce three variants per sentence).
# `DataFrame.append` was removed in pandas 2.0, so the pieces are collected in a list and
# concatenated once with `pd.concat`; the original row index is preserved, as with `append`.
frames = [df.copy()]
for i in range(3):
    temp = df.copy()
    temp['문장'] = [variants[i] for variants in augmentation]
    frames.append(temp)
df_temp = pd.concat(frames)

In [33]:
# Remove rows that are exact duplicates (keeping the first occurrence), shuffle all
# remaining rows, then renumber the index from zero.
deduplicated = df_temp.drop_duplicates(keep='first')
shuffled = deduplicated.sample(frac=1)
train = shuffled.reset_index(drop=True)
train

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label
0,TRAIN_06645,14년간 공화국에서 근무한 그는 당연히 공화주의자였다,사실형,긍정,과거,확실,사실형-긍정-과거-확실
1,TRAIN_05640,갈라버스에 참여한 한국 블록체인 모두 NFT와 P2E라는 게임들은 기반 기술을 적용했다,사실형,긍정,과거,확실,사실형-긍정-과거-확실
2,TRAIN_11251,3회 이상 음주운전으로 적발된 사람도 7만4913명이나 됐다,사실형,긍정,과거,확실,사실형-긍정-과거-확실
3,TRAIN_15779,눈 속을 뒤지며 사흘 동안 동생을 찾아 헤매다 수색팀에 사라졌고 동생은 영영 구조됐...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
4,TRAIN_13344,작품 선정 기준은 국내 트위터 실시간 총 트렌드의 노출organic impressi...,사실형,긍정,현재,확실,사실형-긍정-현재-확실
...,...,...,...,...,...,...,...
64763,TRAIN_10799,그는 임금에 대한 충성이 여러 신하 중에서도 각별했다,추론형,긍정,과거,확실,추론형-긍정-과거-확실
64764,TRAIN_07491,부영 측은 별다른 입장이 없다고 했지만 용산구 실시계획인가를 두고 제2의 소송을 제...,추론형,부정,현재,불확실,추론형-부정-현재-불확실
64765,TRAIN_10484,추천위 신설을 놓고 평가가 갈린다,추론형,긍정,현재,확실,추론형-긍정-현재-확실
64766,TRAIN_12113,또 선 자세는 골반과 척추 기립근을 잡아 곧게 자세를 주어 유지하고 디스크 발생 가...,사실형,긍정,미래,확실,사실형-긍정-미래-확실


In [34]:
# Define the tokenizer
# `model_ckpt` is the checkpoint identifier handed to `from_pretrained`; the same name is
# reused further down to load the encoder weights.
model_ckpt = 'snunlp/KR-Medium'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [35]:
# Build the dataset
# Sequence length passed to the tokenizer as `max_length` for both train and test data.
MAX_LEN = 200

def bert_tokenizer(sent, max_len):
    """Encode one sentence with the module-level `tokenizer`.

    Args:
        sent: The sentence text to encode.
        max_len: Length every encoded sequence is padded or truncated to.

    Returns:
        A tuple ``(input_ids, attention_mask, token_type_ids)`` taken from the
        tokenizer's output dictionary.
    """
    encoded_dict = tokenizer.encode_plus(
        text = sent,
        add_special_tokens = True,
        max_length = max_len,
        # `pad_to_max_length=True` is deprecated; this is the documented replacement.
        padding = 'max_length',
        truncation = True,
        return_attention_mask = True,
        # Requested explicitly because the key is read unconditionally below.
        return_token_type_ids = True,
    )

    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    token_type_id = encoded_dict['token_type_ids']

    return input_id, attention_mask, token_type_id

# Build model inputs
def build_data(doc, max_len):
    """Tokenize every sentence in `doc` and stack the results into integer arrays.

    Args:
        doc: Iterable of sentence strings (iterated once, with a tqdm progress bar).
        max_len: Sequence length forwarded to `bert_tokenizer`.

    Returns:
        A tuple ``(x_ids, x_msk, x_typ)`` of NumPy int arrays holding the input ids,
        attention masks and token type ids, one row per sentence.
    """
    encoded = [bert_tokenizer(sent, max_len) for sent in tqdm(doc)]

    # Each entry of `encoded` is (input_id, attention_mask, token_type_id).
    x_ids = np.array([triple[0] for triple in encoded], dtype=int)
    x_msk = np.array([triple[1] for triple in encoded], dtype=int)
    x_typ = np.array([triple[2] for triple in encoded], dtype=int)

    return x_ids, x_msk, x_typ

In [36]:
# Input data: [x_ids, x_msk, x_typ]
data = build_data(train['문장'], MAX_LEN)


def _fit_one_hot(column):
    """Fit a fresh OneHotEncoder on one column of `train`; return (encoder, dense one-hot array)."""
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(train[column].values.reshape(-1, 1)).toarray()
    return encoder, encoded


# Label: [[type_label], [polar_label], [tense_label], [certain_label]]
# The fitted encoders are kept so predictions can be mapped back to label strings later.
type_ohe, types = _fit_one_hot('유형')
polar_ohe, polars = _fit_one_hot('극성')
tense_ohe, tenses = _fit_one_hot('시제')
certain_ohe, certains = _fit_one_hot('확실성')

label = [types, polars, tenses, certains]

100%|██████████| 64768/64768 [00:13<00:00, 4723.54it/s]


In [37]:
from transformers import TFBertModel
# Load the encoder for `model_ckpt`; `from_pt=True` initialises the TF model from a
# PyTorch checkpoint (see the conversion message in the cell output).
bert = TFAutoModel.from_pretrained(model_ckpt, from_pt=True)
# NOTE(review): the alternative below references `check_point`, which is not defined in
# the visible cells - it would need `model_ckpt` to run.
# bert = TFBertModel.from_pretrained(check_point, from_pt=True)
# Keep the encoder trainable so it is updated together with the classification heads.
bert.trainable = True

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'bert.embeddings.position_ids', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [38]:
# Custom multi-head model definition
class CustomModel(tf.keras.Model):
    """Shared encoder followed by four independent softmax classification heads.

    The encoder is the module-level `bert` object. Each head is
    Dropout(0.2) -> Dense(256, relu) -> Dense(n, softmax), with n = 4, 3, 3, 2
    for the type, polarity, tense and certainty heads respectively.
    """

    def __init__(self):
        super().__init__()
        self.bert_layer = bert
        self.type_out = self._build_head(4)
        self.polar_out = self._build_head(3)
        self.tense_out = self._build_head(3)
        self.certain_out = self._build_head(2)

    @staticmethod
    def _build_head(num_classes):
        """Build one classification head ending in a `num_classes`-way softmax."""
        return tf.keras.Sequential([
            Dropout(0.2),
            Dense(256, activation='relu'),
            Dense(num_classes, activation='softmax'),
        ])

    def call(self, inputs):
        """Run the encoder once and feed its output to every head.

        Returns:
            A tuple ``(type, polarity, tense, certainty)`` of softmax outputs,
            e.g. shaped like [[1, 0, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0]] per sample.
        """
        # Element [1] of the encoder output is used as the sentence representation
        # (presumably the pooled output - confirm against the transformers output type).
        pooled = self.bert_layer(inputs)[1]
        return (
            self.type_out(pooled),
            self.polar_out(pooled),
            self.tense_out(pooled),
            self.certain_out(pooled),
        )


def focal_loss(gamma=2., alpha=0.25):
    """Build a focal-loss function for one-hot targets.

    Args:
        gamma: Exponent applied to the modulating factor of both terms.
        alpha: Weight of the positive-class term; negatives are weighted by ``1 - alpha``.

    Returns:
        A callable ``(y_true, y_pred) -> scalar`` that sums (not averages) the
        loss over every element of the batch.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Clip so neither log() below sees exactly 0.
        probs = tf.clip_by_value(y_pred, 1e-8, 1-1e-8)

        # Where the target is 1 keep the probability, else use 1 (contributes zero loss).
        pt_1 = tf.where(tf.equal(y_true, 1), probs, tf.ones_like(probs))
        # Where the target is 0 keep the probability, else use 0 (contributes zero loss).
        pt_0 = tf.where(tf.equal(y_true, 0), probs, tf.zeros_like(probs))

        positive_term = tf.reduce_sum(alpha * tf.pow(1. - pt_1, gamma) * tf.math.log(pt_1))
        negative_term = tf.reduce_sum((1 - alpha) * tf.pow(pt_0, gamma) * tf.math.log(1. - pt_0))
        return -positive_term - negative_term
    return focal_loss_fixed


def asymmetric_loss(theta=0.5, gamma_neg=4, gamma_pos=1):
    """Build a loss that focuses differently on positive and negative targets.

    Args:
        theta: Weight of elements whose target is 1; others are weighted ``1 - theta``.
        gamma_neg: Focusing exponent for elements whose target is not 1.
        gamma_pos: Focusing exponent for elements whose target is 1.

    Returns:
        A callable ``(y_true, y_pred) -> scalar`` that sums the loss over every
        element of the batch.
    """
    def asymmetric_loss_fixed(y_true, y_pred):
        # Clip so log() never sees exactly 0.
        probs = tf.clip_by_value(y_pred, 1e-8, 1 - 1e-8)
        is_positive = tf.equal(y_true, 1)
        ones = tf.ones_like(y_true)

        # Probability assigned to the correct outcome of each element.
        p_t = tf.where(is_positive, probs, 1 - probs)
        theta_t = tf.where(is_positive, theta * ones, (1 - theta) * ones)
        gamma_t = tf.where(is_positive, gamma_pos * ones, gamma_neg * ones)

        weighted = theta_t * tf.pow(1. - p_t, gamma_t) * tf.math.log(p_t)
        return -tf.reduce_sum(weighted)
    return asymmetric_loss_fixed


model = CustomModel()

# One asymmetric loss instance per output head (type, polarity, tense, certainty).
# An earlier variant of this cell compiled with focal_loss(gamma=2., alpha=0.25)
# on all four heads instead.
head_losses = [
    asymmetric_loss(theta=0.5, gamma_neg=4, gamma_pos=1)
    for _ in range(4)
]
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=head_losses,
)

In [39]:
# Train the model
# `data` is the (ids, mask, type-ids) tuple from build_data and `label` the list of four
# one-hot target arrays, in the same order as the model's four outputs.
history = model.fit(data, label, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [44]:
# One more epoch on the same model object and the same training data.
model.fit(data, label, epochs=1, batch_size=32)



<keras.callbacks.History at 0x7f9be9947f90>

In [45]:
# A further single epoch on the same model object and the same training data.
model.fit(data, label, epochs=1, batch_size=32)



<keras.callbacks.History at 0x7f9bea73bf10>

## Submission

In [46]:
# Apply the same character filter that the training sentences went through, so the model
# is evaluated on text in the form it was trained on (df_test itself is left untouched).
test_doc = df_test['문장'].apply(lambda x: re.sub(r'[^A-Za-z0-9가-힣\s]', '', x))
x_test_ids, x_test_msk, x_test_typ = build_data(test_doc, MAX_LEN)
x_test = [x_test_ids, x_test_msk, x_test_typ]

# NOTE(review): these index -> label tables are not used by the visible decoding code,
# which relies on the fitted encoders' inverse_transform instead. Their ordering may not
# match the encoders' category order - confirm before using them.
ans_type = {0: '사실형', 1: '추론형', 2: '대화형', 3: '예측형'}
ans_polar = {0: '긍정', 1: '부정', 2: '미정'}
ans_tense = {0: '과거', 1: '현재', 2: '미래'}
ans_certain = {0: '확실', 1: '불확실'}

100%|██████████| 7090/7090 [00:01<00:00, 5177.64it/s]


In [47]:
# Run inference on the test inputs; the result is unpacked below as one array per output
# head, in the order (type, polarity, tense, certainty).
prediction = model.predict(x_test)



In [48]:
# Decode each head with a single inverse_transform call over all rows instead of four
# calls per row; the encoder performs the same per-row decoding either way.
encoders = (type_ohe, polar_ohe, tense_ohe, certain_ohe)
decoded = [
    encoder.inverse_transform(head_pred)[:, 0]
    for encoder, head_pred in zip(encoders, prediction)
]

# Join the four decoded labels of every row into the "type-polarity-tense-certainty" string.
submit = [f"{tp}-{pl}-{tn}-{ct}" for tp, pl, tn, ct in zip(*decoded)]

# Build the submission as a new frame: `df_submit = df_test` would only alias the test
# frame, so adding the label column there would silently modify df_test as well.
df_submit = df_test.assign(label=submit).drop(['문장'], axis=1)
df_submit.head()

# df_submit.to_csv('submit_FocalLoss.csv', index=False)

Unnamed: 0,ID,label
0,TEST_0000,사실형-긍정-현재-확실
1,TEST_0001,사실형-긍정-현재-확실
2,TEST_0002,사실형-긍정-과거-확실
3,TEST_0003,사실형-긍정-현재-확실
4,TEST_0004,사실형-긍정-과거-확실


In [49]:
# Write the submission file without the row index.
# NOTE(review): the file name says "epochs8", but the visible fit calls add up to
# 5 + 1 + 1 = 7 epochs and the execution counts jump from 39 to 44 - confirm how many
# epochs the saved predictions actually correspond to.
df_submit.to_csv('submit_AsymmetricLoss_epochs8.csv', index=False)