# 소설 작가 분류 AI 경진대회

## 0. Info

* Type : Text Classification
* URL : https://dacon.io/competitions/official/235670/data/
* score
    * train (acc) : 0.7082
    * val (acc) : 0.7396
    * test (lb) : 0.49011


## 1. Setting

In [None]:
!pip install -q transformers

In [66]:
import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import tensorflow_addons as tfa

from transformers import DistilBertTokenizer, DistilBertConfig, TFDistilBertModel

In [2]:
# Seed every random number generator used in this notebook (Python, NumPy, TensorFlow)
# so that the train/validation split and batch shuffling are repeatable.
SEED = 1

random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [42]:
# Number of rows per mini-batch; read by Dataloader and passed to model.fit.
BATCH_SIZE = 32

# Folder that holds `data.zip` (presumably a mounted shared Google Drive - confirm).
BASE_DIR = (
    'drive/Shared drives/Yoon/Project/Doing/'
    'Data Science Competition/Dacon/Writer Classification'
)

## 2. Data

In [None]:
# Locate the dataset archive under BASE_DIR and extract it into the local `data` directory.
# The path is quoted in the shell command because BASE_DIR contains spaces.
data_path = os.path.join(BASE_DIR, 'data.zip')
!unzip "{data_path}" -d "data"

In [10]:
# Load the labelled training set; the first CSV column becomes the DataFrame index.
train_data = pd.read_csv('data/train.csv', index_col=0)
# Preview the first rows (columns: text, author).
train_data.head()

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [60]:
# Randomly hold out 10% of the rows for validation; the remaining 90% stay in training.
# NOTE: `train_data` is rebound to the reduced training portion, so re-running this cell
# without reloading the CSV will split the already-reduced frame again.
split_idx = int(len(train_data) * 0.1)
random_idx = np.random.permutation(len(train_data))

# The first `split_idx` shuffled positions go to validation, the rest to training.
val_idx, train_idx = np.split(random_idx, [split_idx])

# Both selections are evaluated against the full frame before either name is rebound.
val_data, train_data = train_data.iloc[val_idx], train_data.iloc[train_idx]

In [72]:
class Dataloader(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that tokenizes the ``text`` column of a DataFrame batch by batch.

    Args:
        data: DataFrame with a ``text`` column and, unless ``mode == 'test'``,
            an ``author`` column holding integer class labels.
        tokenizer: Hugging Face tokenizer used to encode every text.
        mode: ``'train'`` reshuffles the row order at the end of every epoch;
            any other value keeps the original row order. ``'test'``
            additionally yields ``None`` instead of labels.
        batch_size: Rows per batch. Falls back to the notebook-level
            ``BATCH_SIZE`` when omitted.
        max_length: Token length every sample is padded or truncated to. It
            must match the sequence length of the model's input layers.
    """

    def __init__(self, data, tokenizer, mode, batch_size=None, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.mode = mode
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.max_length = max_length

        # Build the initial row order before the first batch is requested.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(len(self.data) / self.batch_size))

    def on_epoch_end(self):
        """Refresh the row order: shuffled for training, sequential otherwise."""
        if self.mode == 'train':
            self.indices = np.random.permutation(len(self.data))
        else:
            self.indices = np.arange(len(self.data))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``((input_ids, attention_mask, token_type_ids), labels)``.

        Each input array is int32 with shape ``(rows_in_batch, max_length)``.
        ``labels`` is an int32 array of ``author`` values, or ``None`` in
        ``'test'`` mode.
        """
        start = self.batch_size * idx
        batch_idx = self.indices[start:start + self.batch_size]
        batch_data = self.data.iloc[batch_idx]

        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
        # `truncation=True` makes the truncation of long texts explicit instead of
        # relying on the tokenizer's warning-emitting default.
        encoded = [
            self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_token_type_ids=True,
            )
            for text in batch_data['text']
        ]

        input_ids = np.array([e['input_ids'] for e in encoded]).astype(np.int32)
        input_masks = np.array([e['attention_mask'] for e in encoded]).astype(np.int32)
        input_seg = np.array([e['token_type_ids'] for e in encoded]).astype(np.int32)

        if self.mode == 'test':
            # No labels are available at inference time.
            y = None
        else:
            y = batch_data['author'].values.astype(np.int32)
        # NOTE(review): three input arrays are yielded, while build_model() declares
        # only two inputs (ids and mask) - confirm the extra array is handled as intended.
        return (input_ids, input_masks, input_seg), y


In [62]:
# Pretrained uncased DistilBERT tokenizer, shared by every loader.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Training batches are reshuffled each epoch; validation batches keep row order.
train_loader = Dataloader(data=train_data, tokenizer=tokenizer, mode='train')
val_loader = Dataloader(data=val_data, tokenizer=tokenizer, mode='val')

In [63]:
# Pull the first training batch and display the shape of each input array and of the labels.
x, y = train_loader[0]
tuple(part.shape for part in x) + (y.shape,)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


((32, 128), (32, 128), (32, 128), (32,))

## 3. Model

In [100]:
def build_model(max_length=128, num_classes=5):
    """Build and compile a DistilBERT-based text classifier with a frozen encoder.

    The pretrained ``distilbert-base-uncased`` encoder is frozen; only the head
    (BatchNormalization -> Dense(192, relu) -> Dropout -> Dense softmax) applied
    to the first-token embedding is trained.

    Args:
        max_length: Token sequence length expected by both input layers.
        num_classes: Number of output classes of the softmax layer.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[input_ids, attention_mask]`` and
        returning class probabilities of shape ``(batch, num_classes)``.
    """
    config = DistilBertConfig(dropout=0.2)
    config.output_hidden_states = False
    transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)
    # Freeze the encoder by reference rather than by its position in `model.layers`,
    # so the freeze keeps working if layers are added or reordered.
    transformer_model.trainable = False

    input_ids_in = tf.keras.layers.Input(shape=(max_length,), name='input_token', dtype='int32')
    input_masks_in = tf.keras.layers.Input(shape=(max_length,), name='masked_token', dtype='int32')

    # Element 0 of the encoder output is the last hidden state: (batch, max_length, hidden).
    embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
    # Use the embedding at position 0 (the [CLS] token) as the sequence representation.
    cls_token = embedding_layer[:, 0, :]
    X = tf.keras.layers.BatchNormalization()(cls_token)
    X = tf.keras.layers.Dense(192, activation='relu')(X)
    X = tf.keras.layers.Dropout(0.2)(X)
    X = tf.keras.layers.Dense(num_classes, activation='softmax')(X)
    model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

    # Labels are integer class ids, hence the sparse variant of cross-entropy.
    model.compile(
        loss='sparse_categorical_crossentropy',
        metrics=['acc'],
        optimizer='adam'
    )
    return model

In [101]:
# Instantiate and compile the classifier; this loads the pretrained
# 'distilbert-base-uncased' weights inside build_model().
model = build_model()

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'activation_13', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [102]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_token (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
masked_token (InputLayer)       [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_model_1 (TFDisti ((None, 128, 768),)  66362880    input_token[0][0]                
                                                                 masked_token[0][0]               
__________________________________________________________________________________________________
tf_op_layer_strided_slice_1 (Te [(None, 768)]        0           tf_distil_bert_model_1

## 4. Train

In [103]:
# Train for 10 epochs, evaluating on the validation loader after each epoch.
# `batch_size` is deliberately not passed: both loaders are keras Sequences that
# already yield complete batches, and Keras documents that the argument must not be
# specified for such inputs.
# The returned History is kept so per-epoch loss/accuracy can be inspected later.
history = model.fit(
    train_loader,
    validation_data=val_loader,
    epochs=10,
)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fbd6f214be0>

## 5. Test

In [104]:
# Load the unlabelled test set; no index column is set, so `index` stays a regular column.
test_data = pd.read_csv('data/test_x.csv')
# Preview the first rows.
test_data.head()

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...


In [105]:
# Load the submission template; its probability columns are overwritten with predictions later.
submission = pd.read_csv('data/sample_submission.csv')
# Preview the template layout.
submission.head()

Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0
2,2,0,0,0,0,0
3,3,0,0,0,0,0
4,4,0,0,0,0,0


In [106]:
# Run inference over the test set in its original row order ('test' mode yields no labels).
test_loader = Dataloader(data=test_data, tokenizer=tokenizer, mode='test')
test_pred = model.predict(test_loader)

# Write the predicted probabilities into the template's class columns '0'..'4'.
submission[[str(label) for label in range(5)]] = test_pred



In [107]:
# Save the predictions without the pandas row index, then preview the result.
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,index,0,1,2,3,4
0,0,0.009136,0.676123,0.286625,0.027784,0.000333
1,1,0.172457,0.433576,0.024461,0.339175,0.030331
2,2,0.88326,0.06269,0.000735,0.022364,0.030951
3,3,0.008899,0.0028,0.944982,0.041356,0.001962
4,4,0.905861,0.000539,0.07642,0.009088,0.008091
