In [1]:
import glob
import pandas as pd
from tensorflow import keras
import numpy as np
import os 
from sklearn.model_selection import StratifiedKFold
import matplotlib.pylab as plt
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.layers import TimeDistributed, Conv2D, Conv2DTranspose, MaxPooling2D, AveragePooling2D, BatchNormalization, concatenate, Input, ConvLSTM2D, Reshape, Conv3D, Flatten, LSTM, GRU, Dense,Dropout, Add
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Bidirectional, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.models import Sequential, load_model
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
import re 

import nltk # for stopwords 
from nltk.corpus import stopwords
import gensim # for Word2Vec embeddings 
from sentencepiece import SentencePieceTrainer,SentencePieceProcessor
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
# Read the three competition CSVs in one pass: the training frame, the test
# frame and the sample-submission frame (kept under the short name `ss`).
train, test, ss = (
    pd.read_csv(csv_path, encoding = 'utf-8')
    for csv_path in (
        './storage/writer/train.csv',
        './storage/writer/test_x.csv',
        './storage/writer/sample_submission.csv',
    )
)

In [3]:
# Function that removes punctuation / symbols
def alpha_num(text):
    """Return `text` with everything except ASCII letters, digits and spaces removed."""
    disallowed = re.compile(r'[^A-Za-z0-9 ]')
    return disallowed.sub('', text)


In [4]:
# Peek at the first five rows of the raw training frame
train.head(n = 5)

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [5]:
# Function that removes stopwords
# we believe this is not necessary
stopwords_list = stopwords.words('english')
# Hashed copy of the list so each membership test is O(1) instead of a linear scan.
# Built once here; rebuild it if stopwords_list is modified afterwards.
_stopwords_set = frozenset(stopwords_list)

def remove_stopwords(text):
    """Drop English stopwords from a whitespace-separated string.

    Parameters
    ----------
    text : str
        Input sentence; it is split on runs of whitespace.

    Returns
    -------
    str
        The remaining words, in their original casing and order, joined by
        single spaces. The stopword comparison itself is case-insensitive.
    """
    # str.split() already yields tokens without surrounding whitespace, so no
    # extra strip() is required before the lookup.
    return " ".join(word for word in text.split() if word.lower() not in _stopwords_set)


In [6]:
# preprocessing: lowercase every text, then keep only letters, digits and spaces
for frame in (train, test):
    frame['text'] = frame['text'].str.lower().apply(alpha_num)

In [7]:
# Pull the cleaned texts and the author labels out of the frames as NumPy arrays
X_train = np.array(train['text'].tolist())
X_test = np.array(test['text'].tolist())
y_train = np.array(train['author'].tolist())

In [8]:
# Sanity check: shapes of the train texts, train labels and test texts
tuple(arr.shape for arr in (X_train, y_train, X_test))

((54879,), (54879,), (19617,))

### Train Word2Vec model on our data 

In [None]:
# Word2Vec consumes an iterable of token lists, so split every (already
# lower-cased and cleaned) training sentence on whitespace first.
texts = [sentence.split() for sentence in X_train]
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim >= 4.0
# renamed them to `vector_size` and `epochs` - adjust to the installed version.
word_model = gensim.models.Word2Vec(texts, size = 300, min_count = 1, iter = 10)

In the code snippet above, `size` is the dimensionality of the Word2Vec vectors (300 here). `min_count` is the minimum number of times a word must occur in the corpus to be added to the vocabulary (1 here, so every word is kept). `iter` is the number of epochs the Word2Vec model trains for while learning the semantic correlations (10 in our case).

In [None]:
# One row per Word2Vec vocabulary entry plus one extra row that stays all-zero.
# NOTE(review): rows follow Word2Vec's own vector order, starting at row 0;
# this presumably does not line up with the Keras Tokenizer word ids - verify
# before passing this matrix to an Embedding layer.
w2v_vectors = word_model.wv.vectors
embedding_matrix = np.zeros((len(word_model.wv.vocab) + 1, 300))
embedding_matrix[:len(w2v_vectors)] = w2v_vectors

### Tokenize our text 

In [9]:
# Fit the word -> integer-id vocabulary on the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts = X_train)
word_idx = tokenizer.word_index

In [10]:
# Settings shared by the tokenisation / model cells
padding_type = 'post'            # used for both padding and truncating in pad_sequences
embedding_dim = 16               # output width of the trainable Embedding layer
features = 1 + len(word_idx)     # vocabulary size plus one

In [12]:
# Encode every training text as a list of integer word ids
train_sequences = tokenizer.texts_to_sequences(texts = X_train)

In [13]:
# Sequence-length cut-off: mean + 2 standard deviations of the per-text token counts
num_tokens = np.asarray([len(tokens) for tokens in train_sequences])
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())

# Pad / truncate every sequence to exactly max_tokens ids (side given by padding_type)
train_padded = pad_sequences(
    train_sequences,
    maxlen = max_tokens,
    padding = padding_type,
    truncating = padding_type,
)

In [14]:
# Encode and pad the test texts with the tokenizer and length fitted on the training data
test_sequences = tokenizer.texts_to_sequences(texts = X_test)
test_padded = pad_sequences(
    test_sequences,
    maxlen = max_tokens,
    padding = padding_type,
    truncating = padding_type,
)

In [15]:
# Both padded matrices should share the same second dimension (max_tokens)
tuple(padded.shape for padded in (train_padded, test_padded))

((54879, 143), (19617, 143))

### Try loading Glove embeddings

In [None]:
# Parse the GloVe text file into {word: float32 vector}.
embeddings_index = dict()
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps decoding independent of the platform default.
with open('./storage/glove.6B/glove.6B.300d.txt', encoding = 'utf-8') as f:
    for line in f:
        # Each line is: <word> <coef_1> ... <coef_n>, whitespace separated
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

In [None]:
# Row i holds the GloVe vector of the word whose tokenizer id is i; words
# missing from GloVe (and the unused row 0) stay all-zero.
vocabs = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocabs, 300))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector


In [None]:
# Expect (vocabulary size + 1, embedding width)
np.shape(embedding_matrix)

### construct model and train

In [None]:
def bidirectional_gru():
    """Build and compile a stacked GRU classifier over a trainable embedding.

    Layers: Embedding -> Bidirectional GRU(150, full sequence) -> Dropout(0.2)
    -> GRU(100) -> Dense(32, relu, L2 0.01) -> Dense(5, softmax).

    The vocabulary size and sequence length come from the notebook-level
    `features` and `max_tokens` (the same globals `bidirectional_lstm` uses),
    together with `embedding_dim`.

    Returns
    -------
    A compiled Sequential model (sparse categorical cross-entropy, Adam,
    accuracy metric).
    """
    model = Sequential()
    # `features` / `max_tokens` replace the previously undefined
    # `vocab_size` / `max_length` names.
    model.add(Embedding(features, embedding_dim, input_length = max_tokens))
    model.add(Bidirectional(GRU(150, return_sequences = True)))
    model.add(Dropout(0.2))
    model.add(GRU(100))
    model.add(Dense(32, activation = 'relu', kernel_regularizer = regularizers.l2(0.01)))
    model.add(Dense(5, activation = 'softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [None]:
def simple_lstm():
    """Build and compile a Conv1D + LSTM classifier on pre-trained embeddings.

    Layers: Embedding (initialised from the notebook-level `embedding_matrix`,
    still trainable) -> Dropout(0.5) -> two Conv1D(128, kernel 5, valid, relu)
    -> MaxPooling1D(4) -> LSTM(55) -> Dense(5, softmax).

    The embedding dimensions are read from `embedding_matrix.shape` and the
    input length from `max_tokens`, so the layer always matches the weight
    matrix and the padded sequences instead of relying on hard-coded sizes.

    Returns
    -------
    A compiled Sequential model (sparse categorical cross-entropy, Adam,
    accuracy metric).
    """
    vocab_rows, vector_dim = embedding_matrix.shape
    model = Sequential()
    model.add(Embedding(vocab_rows, vector_dim, weights = [embedding_matrix], input_length = max_tokens, trainable = True))
    model.add(Dropout(0.5))
    model.add(Conv1D(128, 5, padding = 'valid', activation = 'relu'))
    model.add(Conv1D(128, 5, padding = 'valid', activation = 'relu'))
    model.add(MaxPooling1D(pool_size = 4))
    model.add(LSTM(55))
    model.add(Dense(5, activation = 'softmax'))
    model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [16]:
def bidirectional_lstm():
    """Build and compile the two-layer bidirectional LSTM classifier.

    Layers: Embedding(features, embedding_dim, length max_tokens)
    -> Bidirectional LSTM(16, full sequence, dropout / recurrent dropout 0.1)
    -> Bidirectional LSTM(8, last state only) -> Dropout(0.1)
    -> Dense(5, softmax).

    Reads the notebook-level `features`, `embedding_dim` and `max_tokens`.

    Returns
    -------
    A compiled Sequential model (sparse categorical cross-entropy, Adam with
    learning rate 0.01, accuracy metric).
    """
    model = Sequential()
    model.add(Embedding(input_dim = features, output_dim = embedding_dim, input_length = max_tokens))
    model.add(Bidirectional(LSTM(16, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1)))
    model.add(Bidirectional(LSTM(8, return_sequences = False)))
    model.add(Dropout(0.1))
    model.add(Dense(5, activation = 'softmax'))
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(loss = 'sparse_categorical_crossentropy', optimizer = Adam(learning_rate = 0.01), metrics = ['accuracy'])
    return model

In [17]:
# conduct KFold Ensemble: train one fresh model per stratified fold and
# checkpoint every epoch that improves the validation loss
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 777) 
for idx, (train_idx,val_idx) in enumerate(kfold.split(train_padded, y_train)):
    
    print("... Validating on Fold {} ...".format(idx+1))
    
    # split data into train and validation sets 
    cur_x_train, cur_x_val = train_padded[train_idx], train_padded[val_idx] 
    cur_y_train, cur_y_val = y_train[train_idx], y_train[val_idx] 
    
    # make sure the per-fold checkpoint directory exists before the
    # ModelCheckpoint callback tries to write into it
    fold_dir = './storage/writer_trainfiles2/kfold' + str(idx+1)
    os.makedirs(fold_dir, exist_ok = True)
    
    # build model, define callbacks and train  
    # ({epoch} / {val_loss} in the file name are filled in by ModelCheckpoint)
    model_path = fold_dir + '/epoch_{epoch:03d}_val_{val_loss:.3f}.h5'
    model = bidirectional_lstm() 
    learning_rate_reduction = ReduceLROnPlateau(monitor = 'val_loss', patience = 1, verbose = 1, factor = 0.8)
    checkpoint = ModelCheckpoint(filepath = model_path, monitor = 'val_loss', verbose = 1, save_best_only = True)
    early_stopping = EarlyStopping(monitor = 'val_loss', patience = 3) 
    history = model.fit(cur_x_train,
                        cur_y_train,
                        validation_data = (cur_x_val,cur_y_val),
                        shuffle = True,
                        batch_size = 256, 
                        epochs = 250,
                        verbose = 1,
                        callbacks = [learning_rate_reduction, checkpoint, early_stopping]) 

... Validating on Fold 1 ...
Train on 43903 samples, validate on 10976 samples
Epoch 1/250
Epoch 00001: val_loss improved from inf to 0.95022, saving model to ./storage/writer_trainfiles2/kfold1/epoch_001_val_0.950.h5
Epoch 2/250
Epoch 00002: val_loss improved from 0.95022 to 0.78328, saving model to ./storage/writer_trainfiles2/kfold1/epoch_002_val_0.783.h5
Epoch 3/250
Epoch 00003: val_loss improved from 0.78328 to 0.76300, saving model to ./storage/writer_trainfiles2/kfold1/epoch_003_val_0.763.h5
Epoch 4/250
Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.007999999821186066.

Epoch 00004: val_loss did not improve from 0.76300
Epoch 5/250
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.006399999558925629.

Epoch 00005: val_loss did not improve from 0.76300
Epoch 6/250
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0051199994981288915.

Epoch 00006: val_loss did not improve from 0.76300
... Validating on Fold 2 ...
Train on 43903 samples, validate on 1

### Make predictions

In [18]:
# Checkpoints chosen for the ensemble.
# NOTE(review): these file names are hand-picked from earlier training runs;
# update them whenever the K-fold training cell is re-run.
checkpoint_paths = (
    './storage/writer_trainfiles2/kfold1/epoch_004_val_0.752.h5',
    './storage/writer_trainfiles2/kfold1/epoch_003_val_0.763.h5',
    './storage/writer_trainfiles2/kfold2/epoch_003_val_0.733.h5',
    './storage/writer_trainfiles2/kfold3/epoch_003_val_0.737.h5',
    './storage/writer_trainfiles2/kfold4/epoch_003_val_0.770.h5',
    './storage/writer_trainfiles2/kfold5/epoch_002_val_0.777.h5',
)
model1, model2, model3, model4, model5, model6 = (
    load_model(checkpoint_path) for checkpoint_path in checkpoint_paths
)

In [20]:
# Class-probability predictions on the test set for each ensemble member.
# Model.predict already returns the softmax probabilities; the Sequential-only
# predict_proba alias is deprecated and was removed from later tf.keras releases.
# model1 is not part of the ensemble, so its prediction stays disabled.
#pred1 = model1.predict(test_padded)
pred2 = model2.predict(test_padded)
pred3 = model3.predict(test_padded)
pred4 = model4.predict(test_padded)
pred5 = model5.predict(test_padded)
pred6 = model6.predict(test_padded)

In [21]:
# Unweighted average of the five members' class probabilities
# (summed left to right, then divided by the member count)
ensemble_preds = (pred2, pred3, pred4, pred5, pred6)
pred_sum = ensemble_preds[0]
for member_pred in ensemble_preds[1:]:
    pred_sum = pred_sum + member_pred
pred_avg = pred_sum / 5.0

In [22]:
# Write the averaged probabilities into the five class columns of the submission frame
class_columns = ['0','1','2','3','4']
ss[class_columns] = pred_avg
ss.head(n = 5)

Unnamed: 0,index,0,1,2,3,4
0,0,0.068966,0.530573,0.205267,0.121267,0.073928
1,1,0.170795,0.509856,0.077767,0.062893,0.178689
2,2,0.747569,0.216619,0.018814,0.010077,0.006922
3,3,0.004944,0.001761,0.968447,0.003451,0.021397
4,4,0.408807,0.347093,0.024943,0.114185,0.104972


In [23]:
# Persist the submission without the DataFrame index column
ss.to_csv(path_or_buf = './storage/bidirectional_lstm.csv', encoding = 'utf-8', index = False)