In [18]:
import glob
import pandas as pd
from tensorflow import keras
import numpy as np
import os 
from sklearn.model_selection import StratifiedKFold
import matplotlib.pylab as plt
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.layers import TimeDistributed, Conv2D, Conv2DTranspose, MaxPooling2D, AveragePooling2D, BatchNormalization, concatenate, Input, ConvLSTM2D, Reshape, Conv3D, Flatten, LSTM, GRU, Dense,Dropout, Add
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Bidirectional, Conv1D, MaxPooling1D
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.models import Sequential, load_model
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
import re 

In [2]:
# Read the three UTF-8 CSV files used by this notebook: the training data
# (its 'text' and 'author' columns are used below), the test inputs, and the
# sample submission frame that is filled in and saved at the end.
train, test, ss = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (
        './storage/writer/train.csv',
        './storage/writer/test_x.csv',
        './storage/writer/sample_submission.csv',
    )
)

In [3]:
# Function that strips punctuation and other symbols
_NON_ALNUM_OR_SPACE = re.compile(r'[^A-Za-z0-9 ]')


def alpha_num(text):
    """Return ``text`` keeping only ASCII letters, ASCII digits and spaces.

    Every other character (punctuation, newlines, tabs, non-ASCII letters) is
    deleted rather than replaced, so the text on either side of it is joined
    together: ``"don't"`` becomes ``"dont"``.
    """
    return _NON_ALNUM_OR_SPACE.sub('', text)

# Strip punctuation from the training text. The preprocessing cell further
# down lowercases both frames and runs alpha_num again on train and test.
# NOTE(review): test['text'] does not get this extra pre-lowercasing pass.
train['text'] = train['text'].map(alpha_num)

In [4]:
# Stop words
# NOTE(review): entries containing an apostrophe ("he'd", "it's", ...) can only
# match text that still has its punctuation. In this notebook alpha_num runs
# before remove_stopwords, so those entries never match there.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

# Frozen snapshot of the list above, so each word costs one hash lookup instead
# of a scan over the whole list. Rebuild it if `stopwords` is edited later.
_STOPWORD_SET = frozenset(stopwords)


# Function that removes stop words
def remove_stopwords(text):
    """Drop stop words from ``text`` and re-join the rest with single spaces.

    Args:
        text: String to filter. It is split on runs of whitespace.

    Returns:
        The surviving words joined by one space each, in their original order
        and original casing. Matching against the stop-word list is
        case-insensitive. An empty or all-whitespace input gives ``""``.
    """
    # str.split() with no argument never yields tokens with surrounding
    # whitespace, so no per-token strip() is needed.
    return " ".join(
        word for word in text.split() if word.lower() not in _STOPWORD_SET
    )


In [5]:
# Preprocessing: lowercase, strip punctuation, then drop stop words.
# The same three-step pipeline is applied to the train and test text columns.
for frame in (train, test):
    frame['text'] = (
        frame['text']
        .str.lower()
        .apply(alpha_num)
        .apply(remove_stopwords)
    )

In [6]:
# Materialise the cleaned text columns and the author labels as NumPy arrays.
X_train = np.array(train['text'].tolist())
X_test = np.array(test['text'].tolist())
y_train = np.array(train['author'].tolist())

In [7]:
# Display the array shapes. X_train and y_train both come from `train`, so
# their lengths must agree (one label per training text).
X_train.shape, y_train.shape, X_test.shape

((54879,), (54879,), (19617,))

In [8]:
# Text-vectorisation and embedding settings shared by the cells below.
vocab_size = 20000     # passed to Tokenizer(num_words=...) and as the Embedding input size
embedding_dim = 16     # Embedding output size (length of each word vector)
max_length = 500       # pad_sequences maxlen and Embedding input_length
padding_type = 'post'  # `padding` argument of pad_sequences

In [9]:
# Build the vocabulary from the training texts only; the test texts are
# encoded later with this same fitted tokenizer.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# The tokenizer's word -> integer index mapping.
word_idx = tokenizer.word_index

In [10]:
# Encode each training text as a list of word indices, then bring every
# sequence to max_length using the configured padding side.
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(
    train_sequences,
    maxlen=max_length,
    padding=padding_type,
)

In [11]:
# Encode the test texts with the tokenizer fitted on the training texts and
# pad them with the same settings as the training sequences.
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(
    test_sequences,
    maxlen=max_length,
    padding=padding_type,
)

In [12]:
# Display the padded matrix shapes; both were padded with maxlen=max_length.
train_padded.shape, test_padded.shape

((54879, 500), (19617, 500))

### Construct the models and train

In [13]:
def bidirectional_gru():
    """Build and compile a stacked-GRU classifier over padded token sequences.

    Layers, in order: Embedding(vocab_size, embedding_dim) with
    input_length=max_length, a bidirectional GRU(150) that returns the full
    sequence, Dropout(0.2), GRU(100), Dense(32, ReLU, L2 factor 0.01) and a
    5-unit softmax output.

    Returns:
        The Sequential model, compiled with the 'adam' optimizer,
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    layer_stack = [
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        Bidirectional(GRU(150, return_sequences=True)),
        Dropout(0.2),
        GRU(100),
        Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
        Dense(5, activation='softmax'),
    ]
    model = Sequential(layer_stack)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [16]:
def simple_lstm():
    """Build and compile the Conv1D + LSTM classifier used by the K-fold loop.

    Despite the name this is not a bare LSTM. Layers, in order:
    Embedding(vocab_size, embedding_dim) with input_length=max_length,
    Dropout(0.5), two Conv1D layers (128 filters, kernel size 5, stride 3,
    'valid' padding, ReLU), MaxPooling1D(pool_size=4), LSTM(55) and a
    5-unit softmax output.

    Returns:
        The Sequential model, compiled with the 'adam' optimizer,
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    layer_stack = [
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        Dropout(0.5),
        Conv1D(128, 5, padding='valid', activation='relu', strides=3),
        Conv1D(128, 5, padding='valid', activation='relu', strides=3),
        MaxPooling1D(pool_size=4),
        LSTM(55),
        Dense(5, activation='softmax'),
    ]
    model = Sequential(layer_stack)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [19]:
# Stratified 5-fold training: one freshly built model per fold, each validated
# on that fold's held-out split. Checkpoints are written per fold so that the
# prediction cells can reload one model per fold.
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 777) 
fold_histories = []  # History of every fold; `history` alone only keeps the last fold's
for idx, (train_idx,val_idx) in enumerate(kfold.split(train_padded, y_train)):
    
    print("... Validating on Fold {} ...".format(idx+1))
    
    # Release the Keras global state left by the previous fold's model, so
    # memory does not keep growing as new models are created in this loop.
    tf.keras.backend.clear_session()
    
    # split data into train and validation sets 
    cur_x_train, cur_x_val = train_padded[train_idx], train_padded[val_idx] 
    cur_y_train, cur_y_val = y_train[train_idx], y_train[val_idx] 
    
    # Make sure the fold's checkpoint directory exists; saving an .h5 file
    # into a missing directory fails on a fresh run.
    fold_dir = './storage/writer_trainfiles/kfold' + str(idx+1)
    os.makedirs(fold_dir, exist_ok = True)
    
    # build model, define callbacks and train  
    # ModelCheckpoint fills {epoch} and {val_loss} in when it saves.
    model_path = fold_dir + '/epoch_{epoch:03d}_val_{val_loss:.3f}.h5'
    model = simple_lstm() 
    # Multiply the learning rate by 0.8 after 3 epochs without val_loss improvement.
    learning_rate_reduction = ReduceLROnPlateau(monitor = 'val_loss', patience = 3, verbose = 1, factor = 0.8)
    # Save a checkpoint only when val_loss improves on the best value so far.
    checkpoint = ModelCheckpoint(filepath = model_path, monitor = 'val_loss', verbose = 1, save_best_only = True)
    # Stop the fold after 10 epochs without val_loss improvement.
    early_stopping = EarlyStopping(monitor = 'val_loss', patience = 10) 
    history = model.fit(cur_x_train,
                        cur_y_train,
                        validation_data = (cur_x_val,cur_y_val),
                        shuffle = True,
                        epochs = 250,
                        verbose = 1,
                        callbacks = [learning_rate_reduction, checkpoint, early_stopping]) 
    fold_histories.append(history)

... Validating on Fold 1 ...
Train on 43903 samples, validate on 10976 samples
Epoch 1/250
Epoch 00001: val_loss improved from inf to 1.21595, saving model to ./storage/writer_trainfiles/kfold1/epoch_001_val_1.216.h5
Epoch 2/250
Epoch 00002: val_loss improved from 1.21595 to 0.95228, saving model to ./storage/writer_trainfiles/kfold1/epoch_002_val_0.952.h5
Epoch 3/250
Epoch 00003: val_loss improved from 0.95228 to 0.87902, saving model to ./storage/writer_trainfiles/kfold1/epoch_003_val_0.879.h5
Epoch 4/250
Epoch 00004: val_loss improved from 0.87902 to 0.83140, saving model to ./storage/writer_trainfiles/kfold1/epoch_004_val_0.831.h5
Epoch 5/250
Epoch 00005: val_loss improved from 0.83140 to 0.80716, saving model to ./storage/writer_trainfiles/kfold1/epoch_005_val_0.807.h5
Epoch 6/250
Epoch 00006: val_loss improved from 0.80716 to 0.79786, saving model to ./storage/writer_trainfiles/kfold1/epoch_006_val_0.798.h5
Epoch 7/250
Epoch 00007: val_loss improved from 0.79786 to 0.77939, savin

### Make predictions

In [20]:
# Reload one saved checkpoint per fold.
# NOTE(review): these file names embed the epoch and val_loss of one particular
# training run; after a re-run of the training loop they must be updated.
checkpoint_paths = (
    './storage/writer_trainfiles/kfold1/epoch_011_val_0.771.h5',
    './storage/writer_trainfiles/kfold2/epoch_013_val_0.767.h5',
    './storage/writer_trainfiles/kfold3/epoch_015_val_0.760.h5',
    './storage/writer_trainfiles/kfold4/epoch_007_val_0.801.h5',
    './storage/writer_trainfiles/kfold5/epoch_011_val_0.754.h5',
)
model1, model2, model3, model4, model5 = (
    load_model(path) for path in checkpoint_paths
)

In [23]:
# Class-probability predictions of each fold model on the padded test set.
# `predict` is used instead of `Sequential.predict_proba`: the latter is
# deprecated and absent from newer TensorFlow releases, and for these
# softmax-output models it returns the same array as `predict`.
pred1, pred2, pred3, pred4, pred5 = (
    fold_model.predict(test_padded)
    for fold_model in (model1, model2, model3, model4, model5)
)

In [24]:
# Ensemble: unweighted mean of the five fold predictions.
# sum() starts from pred1 and adds pred2..pred5 from left to right.
pred_avg = sum((pred2, pred3, pred4, pred5), pred1) / 5.0

In [26]:
# Write the averaged predictions into the five class columns of the
# submission frame, then preview the first rows.
class_columns = ['0', '1', '2', '3', '4']
ss[class_columns] = pred_avg
ss.head()

Unnamed: 0,index,0,1,2,3,4
0,0,0.00441,0.074918,0.51092,0.407275,0.002477
1,1,0.183829,0.719099,0.032944,0.042305,0.021822
2,2,0.984772,0.012736,0.000375,0.000495,0.001623
3,3,0.00293,0.00171,0.89406,0.003775,0.097524
4,4,0.512074,0.11339,0.130675,0.123092,0.12077


In [27]:
# Save the filled-in submission as a UTF-8 CSV without the DataFrame index.
submission_path = './storage/initial_submit.csv'
ss.to_csv(submission_path, encoding='utf-8', index=False)