In [1]:
import glob
import pandas as pd
from tensorflow import keras
import numpy as np
import os 
from sklearn.model_selection import StratifiedKFold
import matplotlib.pylab as plt
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.layers import TimeDistributed, Conv2D, Conv2DTranspose, MaxPooling2D, AveragePooling2D, BatchNormalization, concatenate, Input, ConvLSTM2D, Reshape, Conv3D, Flatten, LSTM, GRU, Dense,Dropout, Add
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Bidirectional, Conv1D, MaxPooling1D, GlobalMaxPooling1D, GlobalMaxPool1D
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.models import Sequential, load_model
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
import re 

import nltk # for stopwords 
from nltk.corpus import stopwords
import gensim # for Word2Vec embeddings 
from gensim.models import KeyedVectors
from sentencepiece import SentencePieceTrainer,SentencePieceProcessor
from sklearn.feature_extraction.text import CountVectorizer
from bs4 import BeautifulSoup
import googletrans 
from googletrans import Translator



In [2]:
# Read the three competition tables in one pass: the training table, the test
# table, and the sample-submission template that is filled in at the end.
train, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        './storage/writer/train.csv',
        './storage/writer/test_x.csv',
        './storage/writer/sample_submission.csv',
    )
)

In [3]:
# Model input is the 'text' column; the target is the 'author' column.
x_train, y_train = train['text'], train['author']
x_test = test['text']

In [4]:
# Convert the pandas Series to NumPy arrays so they can be indexed
# positionally with the integer index arrays produced by the fold splitter.
x_train, y_train, x_test = (
    np.asarray(column) for column in (x_train, y_train, x_test)
)

In [5]:
# Sanity check: shapes of the training inputs, training labels and test inputs.
tuple(arr.shape for arr in (x_train, y_train, x_test))

((54879,), (54879,), (19617,))

In [6]:
# Pre-computed augmentation texts loaded from disk.
# NOTE(review): presumably back-translated versions of the training texts,
# row-aligned with x_train -- confirm against the script that produced the file.
back_translated = np.load('./storage/sample_back_translate.npy')

# The K-fold loop indexes this array with the same train indices as x_train,
# so a length mismatch would either raise IndexError or silently pair texts
# with the wrong labels. Fail early instead.
assert len(back_translated) == len(x_train), (
    "back_translated has {} rows but x_train has {}".format(
        len(back_translated), len(x_train)
    )
)

back_translated.shape

(54879,)

In [7]:
def build_model():
    """Build and compile the text classifier used for every fold.

    Architecture: Embedding -> GlobalAveragePooling1D -> Dense(5, softmax).

    The embedding size is not passed in: `vocab_size`, `embedding_dim` and
    `maxlength` are read from module-level globals, so they must be defined
    before this function is called.

    Returns:
        A compiled `tf.keras.Sequential` model trained with sparse categorical
        cross-entropy (integer class labels), the Adam optimizer and an
        accuracy metric.
    """
    classifier = tf.keras.Sequential()
    classifier.add(
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlength)
    )
    # Average the token embeddings over the sequence axis.
    classifier.add(tf.keras.layers.GlobalAveragePooling1D())
    # Five output units, one per class.
    classifier.add(tf.keras.layers.Dense(5, activation='softmax'))

    classifier.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [8]:
# Conduct the K-fold ensemble: train one model per stratified fold and save
# its checkpoints plus the matching padded test set for later prediction.

# Tokenizer / model hyperparameters. They do not change between folds, so they
# are defined once here. They must stay module-level names because
# build_model() reads vocab_size, embedding_dim and maxlength as globals.
vocab_size = 25000
maxlength = 256
embedding_dim = 20

kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 777)
for idx, (train_idx, val_idx) in enumerate(kfold.split(x_train, y_train)):
    fold = idx + 1  # 1-based fold number used in log messages and file names
    print("... Iteration {} ...".format(fold))

    print("... Preprocessing Data ... ")

    cur_x_train, cur_x_val = x_train[train_idx], x_train[val_idx]
    cur_y_train, cur_y_val = y_train[train_idx], y_train[val_idx]

    # Augment the training split only: append the back_translated entries at
    # the same indices and duplicate the labels in matching order. The
    # validation split is left untouched.
    cur_x_train = np.concatenate([cur_x_train, back_translated[train_idx]])
    cur_y_train = np.concatenate([cur_y_train, cur_y_train])

    # Fit the tokenizer on this fold's (augmented) training split only, then
    # reuse it for the validation and test texts so all three share one vocabulary.
    tokenizer = Tokenizer(num_words = vocab_size, lower = True)
    tokenizer.fit_on_texts(cur_x_train)

    train_sequences = tokenizer.texts_to_sequences(cur_x_train)
    train_padded = pad_sequences(train_sequences, padding='post', maxlen=maxlength)

    val_sequences = tokenizer.texts_to_sequences(cur_x_val)
    val_padded = pad_sequences(val_sequences, padding='post', maxlen=maxlength)

    # The test set is encoded with this fold's tokenizer and saved, because
    # each fold's model must later be fed inputs built from its own vocabulary.
    test_sequences = tokenizer.texts_to_sequences(x_test)
    test_padded = pad_sequences(test_sequences, padding='post', maxlen=maxlength)
    np.save('./storage/test_padded_fold' + str(fold) + '.npy', test_padded)

    print(train_padded.shape, cur_y_train.shape)
    print(val_padded.shape, cur_y_val.shape)

    print("... Training Model by Validating on Fold {} ...".format(fold))

    # Create the per-fold checkpoint directory up front so that saving the
    # .h5 file does not depend on the callback creating it.
    fold_dir = './storage/writer_train_10/kfold' + str(fold)
    os.makedirs(fold_dir, exist_ok=True)
    model_path = fold_dir + '/epoch_{epoch:03d}_val_{val_loss:.3f}.h5'

    # Build the model, define the callbacks and train.
    model = build_model()
    learning_rate_reduction = ReduceLROnPlateau(monitor = 'val_loss', patience = 1, verbose = 1, factor = 0.75)
    # save_best_only: a checkpoint is written only when val_loss improves.
    checkpoint = ModelCheckpoint(filepath = model_path, monitor = 'val_loss', verbose = 1, save_best_only = True)
    early_stopping = EarlyStopping(monitor = 'val_loss', patience = 10)
    history = model.fit(train_padded,
                        cur_y_train,
                        validation_data = (val_padded, cur_y_val),
                        shuffle = True,
                        batch_size = 256,
                        epochs = 250,
                        verbose = 1,
                        callbacks = [learning_rate_reduction, checkpoint, early_stopping])

... Iteration 1 ...
... Preprocessing Data ... 
(98782, 256) (98782,)
(5488, 256) (5488,)
... Training Model by Validating on Fold 1 ...
Train on 98782 samples, validate on 5488 samples
Epoch 1/250
Epoch 00001: val_loss improved from inf to 1.53766, saving model to ./storage/writer_train_10/kfold1/epoch_001_val_1.538.h5
Epoch 2/250
Epoch 00002: val_loss improved from 1.53766 to 1.46865, saving model to ./storage/writer_train_10/kfold1/epoch_002_val_1.469.h5
Epoch 3/250
Epoch 00003: val_loss improved from 1.46865 to 1.37641, saving model to ./storage/writer_train_10/kfold1/epoch_003_val_1.376.h5
Epoch 4/250
Epoch 00004: val_loss improved from 1.37641 to 1.28243, saving model to ./storage/writer_train_10/kfold1/epoch_004_val_1.282.h5
Epoch 5/250
Epoch 00005: val_loss improved from 1.28243 to 1.19476, saving model to ./storage/writer_train_10/kfold1/epoch_005_val_1.195.h5
Epoch 6/250
Epoch 00006: val_loss improved from 1.19476 to 1.11860, saving model to ./storage/writer_train_10/kfold1/e

### Make predictions

In [10]:
# One saved checkpoint per fold, listed in fold order (fold 1 .. fold 10).
# NOTE(review): these file names embed an epoch number and a validation loss,
# so they look specific to one training run -- re-running training will most
# likely produce different names and this list must then be updated.
checkpoint_paths = [
    './storage/writer_train_10/kfold1/epoch_053_val_0.577.h5',
    './storage/writer_train_10/kfold2/epoch_046_val_0.593.h5',
    './storage/writer_train_10/kfold3/epoch_046_val_0.594.h5',
    './storage/writer_train_10/kfold4/epoch_048_val_0.608.h5',
    './storage/writer_train_10/kfold5/epoch_057_val_0.574.h5',
    './storage/writer_train_10/kfold6/epoch_056_val_0.571.h5',
    './storage/writer_train_10/kfold7/epoch_051_val_0.620.h5',
    './storage/writer_train_10/kfold8/epoch_057_val_0.586.h5',
    './storage/writer_train_10/kfold9/epoch_053_val_0.581.h5',
    './storage/writer_train_10/kfold10/epoch_049_val_0.585.h5',
]

# Load the models in order and bind them to the per-fold names used below.
(model1, model2, model3, model4, model5,
 model6, model7, model8, model9, model10) = [
    load_model(path) for path in checkpoint_paths
]

In [11]:
# Padded test sequences saved per fold, listed in fold order so that each
# array lines up with the model of the same number.
test_padded_paths = [
    './storage/test_padded_fold1.npy',
    './storage/test_padded_fold2.npy',
    './storage/test_padded_fold3.npy',
    './storage/test_padded_fold4.npy',
    './storage/test_padded_fold5.npy',
    './storage/test_padded_fold6.npy',
    './storage/test_padded_fold7.npy',
    './storage/test_padded_fold8.npy',
    './storage/test_padded_fold9.npy',
    './storage/test_padded_fold10.npy',
]

(test_padded1, test_padded2, test_padded3, test_padded4, test_padded5,
 test_padded6, test_padded7, test_padded8, test_padded9, test_padded10) = [
    np.load(path) for path in test_padded_paths
]

In [12]:
# Class-probability predictions of each fold's model on its own padded test set.
# `predict` is used instead of `predict_proba`: the latter is a deprecated
# Sequential-only alias that newer TensorFlow releases no longer provide.
# For these models, which end in a softmax layer, `predict` already returns
# the class probabilities.
pred1 = model1.predict(test_padded1)
pred2 = model2.predict(test_padded2)
pred3 = model3.predict(test_padded3)
pred4 = model4.predict(test_padded4)
pred5 = model5.predict(test_padded5)
pred6 = model6.predict(test_padded6)
pred7 = model7.predict(test_padded7)
pred8 = model8.predict(test_padded8)
pred9 = model9.predict(test_padded9)
pred10 = model10.predict(test_padded10)

In [13]:
# Ensemble: unweighted mean of the ten folds' predictions, accumulated in
# fold order starting from the first fold's array.
fold_predictions = [pred1, pred2, pred3, pred4, pred5,
                    pred6, pred7, pred8, pred9, pred10]
pred_avg = sum(fold_predictions[1:], fold_predictions[0]) / 10.0

# Write the averaged values into the submission's five class columns.
ss[['0','1','2','3','4']] = pred_avg
ss.head()

Unnamed: 0,index,0,1,2,3,4
0,0,0.0005025932,0.9215701,0.071175,0.006643879,0.000108
1,1,0.1035134,0.2883886,0.12417,0.03474397,0.449185
2,2,0.9987903,0.0004544915,0.00025,7.052136e-06,0.000498
3,3,1.482904e-07,1.043416e-16,0.999989,4.1557e-17,1.1e-05
4,4,0.9481353,0.02229575,0.00105,0.02760762,0.000911


In [14]:
# Save the filled-in submission table, leaving out the DataFrame index.
submission_path = './storage/MarianNMT_augmented.csv'
ss.to_csv(submission_path, index=False)