In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The data-loading cell reads its CSV files from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#base
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings(action='ignore')
import seaborn as sns

#preprocess
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re
from collections import  Counter
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

#wordcloud
from wordcloud import WordCloud, STOPWORDS

#stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')
stop=set(stopwords.words('english'))

#embedding
import gensim
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#modeling
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, Input, BatchNormalization, GlobalAveragePooling1D, AveragePooling1D, Bidirectional, LSTM, SimpleRNN
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import plot_model, to_categorical

# Show every column when displaying a DataFrame (no column truncation)
pd.set_option('display.max_columns', None)
# Allow up to 200 characters per column before cell text is truncated
pd.set_option('max_colwidth', 200)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Detect GPUs and, when at least one exists, make only the first one visible to TensorFlow.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print('No GPU detected')
else:
    try:
        # Restrict TensorFlow to the first physical GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as err:
        # Raised when visibility is changed after the GPUs were already initialized
        print(err)

No GPU detected


In [4]:
# Load the train/test CSVs from the mounted Drive; the first CSV column is used as the DataFrame index.
train = pd.read_csv('/content/drive/MyDrive/clean_train.csv',index_col=0)
test = pd.read_csv('/content/drive/MyDrive/clean_test.csv',index_col=0)
# Preview the first rows (rendered as the cell output)
train.head()

Unnamed: 0_level_0,clean_text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,he was almost choking there was so much so much he wanted to say but strange exclamations were all that came from his lips the pole gazed fixedly at him at the bundle of notes in his hand looked a...,3
1,your sister asked for it i suppose,2
2,she was engaged one day as she walked in perusing janes last letter and dwelling on some passages which proved that jane had not written in spirits when instead of being again surprised by mr odin...,1
3,the captain was in the porch keeping himself carefully out of the way of a treacherous shot should any be intended he turned and spoke to us doctors watch on the lookout dr odin take the north sid...,4
4,have mercy gentlemen odin flung up his hands dont write that anyway have some shame here ive torn my heart asunder before you and you seize the opportunity and are fingering the wounds in both hal...,3


In [5]:
# Extract the text and label columns as NumPy arrays for the tokenizer / model.
#
# Missing texts are replaced with '' before conversion: building a string array from a
# column that contains NaN silently turns every NaN into the literal string 'nan', which
# the tokenizer would then treat as a real word.
# NOTE(review): the notebook's own outputs (54,835 non-null text lengths vs. 54,879 padded
# rows) suggest clean_text does contain missing values — confirm against the CSV.
X_train = train['clean_text'].fillna('').to_numpy(dtype=str)
X_test = test['clean_text'].fillna('').to_numpy(dtype=str)
Y_train = train['author'].to_numpy()

In [6]:
# Summary statistics of the character length of each training text
train['clean_text'].str.len().describe()

count    54835.000000
mean       218.017744
std        268.786526
min          9.000000
25%         58.000000
50%        111.000000
75%        254.000000
max       2442.000000
Name: clean_text, dtype: float64

## 파라미터 설정 정리 (Summary of parameter settings)

| # | vocab_size (v) | max_length (m) | embedding_dim (e) | batch (b) | params (p) | model | accuracy (a) |
|---|---|---|---|---|---|---|---|
| 1 | 2000 | 150 | 200 | 100 | 434,000 | 1D-RNN | 0.26 |
| 2 | 2000 | 150 | 200 | 100 | 534,200 | 3D-LSTM | 0.27 |
| 3 | 2000 | 150 | 200 | 100 | 733,950 | 3D-BiLSTM+cv | 0.97 |
| 4 | 50000 | 256 | 300 | 128 | - | 1D-CNN | 0.71 |
| 5 | 20000 | 200 | 64 | 256 | 1,282,245 | 1D-CNN +1D | 0.75 |
| 6 | 50000 | 256 | 20 | 16 | 9,538,260 | average | 0.83 |
| 7 | 50000 | 150 | 100 | 64 | - | 2D-CNN + 1D | 0.78 |
| 8 | 20000 | 500 | 64 | 512 | - | 2D-CNN+1D+cv | loss: 0.9 |
| 9 | 20000 | 500 | 64 | 512 | 1,445,509 | 2D-BiLSTM + cv | 0.73 |



In [7]:
# Hyperparameters for tokenization / embedding
vocab_size, embedding_dim, max_length = 20000, 300, 300
padding_type = 'post'

# Fit the tokenizer on the training texts only
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Convert the texts to sequences of word ids ...
train_sequences, test_sequences = [
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
]

# ... then pad every sequence at the end ('post') to exactly max_length ids
# (longer sequences are cut using pad_sequences' default truncation side).
train_padded, test_padded = [
    pad_sequences(seqs, maxlen=max_length, padding=padding_type)
    for seqs in (train_sequences, test_sequences)
]

In [8]:
# Sanity check: (number of training texts, max_length)
print(train_padded.shape)

(54879, 300)


In [9]:
from sklearn.model_selection import StratifiedKFold

# Cross-validation configuration
n_fold = 3    # number of stratified folds
n_class = 5   # number of target classes (column count of the prediction arrays)
seed = 42     # shared random seed

# Seed NumPy and TensorFlow too, not only the splitter: otherwise weight initialisation
# and batch shuffling differ on every run and the CV scores cannot be reproduced.
# (Some GPU ops may still be nondeterministic.)
np.random.seed(seed)
tf.random.set_seed(seed)

# Stratified split: each fold keeps the class distribution of Y_train.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [10]:
def modeling(embed_dim=None, hidden_units=64, num_classes=None):
    """Build and compile the averaged-embedding baseline classifier.

    Architecture: Embedding -> GlobalAveragePooling1D -> BatchNormalization
    -> Dense(relu) -> Dense(softmax).

    Args:
        embed_dim: Size of each word-embedding vector. Defaults to the
            notebook-level ``embedding_dim`` so the model follows the config
            cell instead of duplicating its value as a literal.
        hidden_units: Width of the hidden dense layer.
        num_classes: Number of softmax outputs. Defaults to the notebook-level
            ``n_class`` so the output width always matches the one-hot labels.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy reported under the name ``'acc'``).
    """
    if embed_dim is None:
        embed_dim = embedding_dim
    if num_classes is None:
        num_classes = n_class

    model = Sequential()
    model.add(Embedding(vocab_size, embed_dim, input_length=max_length))
    # Average the word vectors over the sequence axis -> one vector per text
    model.add(GlobalAveragePooling1D())
    model.add(BatchNormalization())
    model.add(Dense(hidden_units, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    return model

# model summary
# Model.summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
modeling().summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 300)          6000000   
                                                                 
 global_average_pooling1d (G  (None, 300)              0         
 lobalAveragePooling1D)                                          
                                                                 
 batch_normalization (BatchN  (None, 300)              1200      
 ormalization)                                                   
                                                                 
 dense (Dense)               (None, 64)                19264     
                                                                 
 dense_1 (Dense)             (None, 5)                 325       
                                                                 
Total params: 6,020,789
Trainable params: 6,020,189
Non-

In [11]:
# Out-of-fold predictions for every training row, and fold-averaged test predictions.
p_val = np.zeros((train_padded.shape[0], n_class))
p_tst = np.zeros((test_padded.shape[0], n_class))

# One-hot encode the labels once (loop-invariant) with an explicit width, so the target
# shape always matches the n_class columns of p_val regardless of which labels a fold holds.
y_onehot = to_categorical(Y_train, num_classes=n_class)

for i, (i_trn, i_val) in enumerate(cv.split(train_padded, Y_train), 1):
    print(f'training model for CV #{i}')

    # Drop Keras' global state left over from the previous fold's model so memory
    # does not keep growing while models are created in a loop.
    tf.keras.backend.clear_session()

    # Stop once the validation loss has not improved by at least 0.001 for 5 epochs,
    # then roll back to the best weights seen.
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=5,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)

    clf = modeling()
    clf.fit(train_padded[i_trn],
            y_onehot[i_trn],
            validation_data=(train_padded[i_val], y_onehot[i_val]),
            epochs=20,
            batch_size=64,
            callbacks=[es])

    # Each training row is predicted by the one fold model that did not train on it.
    p_val[i_val, :] = clf.predict(train_padded[i_val])
    # Average the test predictions over the folds.
    p_tst += clf.predict(test_padded) / n_fold

training model for CV #1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 10: early stopping
training model for CV #2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 8: early stopping
training model for CV #3
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 9: early stopping


In [12]:
from sklearn.metrics import accuracy_score, log_loss

# Score the cross-validated predictions stored in p_val.
# Accuracy: the predicted class of each row is the column with the highest probability.
print(f'Accuracy (CV): {accuracy_score(Y_train, np.argmax(p_val, axis=1)) * 100:8.4f}%')
# Log loss: compares the predicted probabilities with the one-hot labels built by pd.get_dummies.
print(f'Log Loss (CV): {log_loss(pd.get_dummies(Y_train), p_val):8.4f}')

Accuracy (CV):  69.1394%
Log Loss (CV):   0.9026


In [13]:
def BiLSTM(embed_dim=100, num_classes=None):
    """Build and compile the stacked bidirectional-LSTM classifier.

    Architecture: Embedding -> BiLSTM(64, sequences) -> BiLSTM(64, sequences)
    -> BiLSTM(32) -> Dense(softmax).

    Args:
        embed_dim: Size of each word-embedding vector.
        num_classes: Number of softmax outputs. Defaults to the notebook-level
            ``n_class`` so the output width always matches the one-hot labels.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy reported under the name ``'acc'``).
    """
    if num_classes is None:
        num_classes = n_class

    model = Sequential()
    model.add(Embedding(vocab_size, embed_dim, input_length=max_length))
    # The first two recurrent layers return the full sequence so the next LSTM can consume it;
    # the last one returns only its final state for classification.
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(32)))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    return model

# model summary
# Model.summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
BiLSTM().summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 300, 100)          2000000   
                                                                 
 bidirectional (Bidirectiona  (None, 300, 128)         84480     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 300, 128)         98816     
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense_8 (Dense)             (None, 5)                 325       
                                                      

In [None]:
# Out-of-fold predictions for every training row, and fold-averaged test predictions.
p_val = np.zeros((train_padded.shape[0], n_class))
p_tst = np.zeros((test_padded.shape[0], n_class))

# One-hot encode the labels once (loop-invariant) with an explicit width, so the target
# shape always matches the n_class columns of p_val regardless of which labels a fold holds.
y_onehot = to_categorical(Y_train, num_classes=n_class)

for i, (i_trn, i_val) in enumerate(cv.split(train_padded, Y_train), 1):
    print(f'training model for CV #{i}')

    # Drop Keras' global state left over from the previous fold's model so memory
    # does not keep growing while models are created in a loop.
    tf.keras.backend.clear_session()

    # Early stopping must watch a *validation* metric. Monitoring the training accuracy
    # ('acc') keeps training while the model overfits and makes restore_best_weights pick
    # the most-overfit epoch. Use the same val_loss criterion as the baseline CV loop.
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=5,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)

    clf = BiLSTM()
    clf.fit(train_padded[i_trn],
            y_onehot[i_trn],
            validation_data=(train_padded[i_val], y_onehot[i_val]),
            epochs=20,
            batch_size=64,
            callbacks=[es])

    # Each training row is predicted by the one fold model that did not train on it.
    p_val[i_val, :] = clf.predict(train_padded[i_val])
    # Average the test predictions over the folds.
    p_tst += clf.predict(test_padded) / n_fold

training model for CV #1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
 29/572 [>.............................] - ETA: 15:23 - loss: 0.1664 - acc: 0.9477

In [None]:
# Score the cross-validated predictions stored in p_val.
# Relies on accuracy_score / log_loss imported in the earlier metrics cell.
# Accuracy: the predicted class of each row is the column with the highest probability.
print(f'Accuracy (CV): {accuracy_score(Y_train, np.argmax(p_val, axis=1)) * 100:8.4f}%')
# Log loss: compares the predicted probabilities with the one-hot labels built by pd.get_dummies.
print(f'Log Loss (CV): {log_loss(pd.get_dummies(Y_train), p_val):8.4f}')

In [None]:
# Plot the training curves of the most recently trained model.
#
# No cell assigns a `history` variable, so the curves are read from `clf`, the model fitted
# in the last CV fold: Keras attaches the History of the latest fit() call to the model.
# Note that this shows the last fold only, not an average over folds.
hist = clf.history.history

# The models are compiled with metrics=['acc'], so the log keys are 'acc' / 'val_acc';
# fall back to 'accuracy' in case the metric is ever compiled under that name.
acc_key = 'acc' if 'acc' in hist else 'accuracy'

plt.figure(figsize=(12, 4))

# Left panel: training vs. validation loss
plt.subplot(1, 2, 1)
plt.title('loss of Bidirectional LSTM (model3) ', fontsize= 15)
plt.plot(hist['loss'], 'b-', label='loss')
plt.plot(hist['val_loss'], 'r--', label='val_loss')
plt.xlabel('Epoch')
plt.legend()

# Right panel: training vs. validation accuracy
plt.subplot(1, 2, 2)
plt.title('accuracy of Bidirectional LSTM (model3) ', fontsize= 15)
plt.plot(hist[acc_key], 'g-', label='accuracy')
plt.plot(hist['val_' + acc_key], 'k--', label='val_accuracy')
plt.xlabel('Epoch')
plt.legend()

# plt.show must be called; the bare attribute reference was a no-op.
plt.show()