In [1]:
import glob
import pandas as pd
from tensorflow import keras
import numpy as np
import os 
from sklearn.model_selection import StratifiedKFold
import matplotlib.pylab as plt
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.layers import TimeDistributed, Conv2D, Conv2DTranspose, MaxPooling2D, AveragePooling2D, BatchNormalization, concatenate, Input, ConvLSTM2D, Reshape, Conv3D, Flatten, LSTM, GRU, Dense,Dropout, Add
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Bidirectional, Conv1D, MaxPooling1D, GlobalMaxPooling1D, GlobalMaxPool1D
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.models import Sequential, load_model
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
import re 

import nltk # for stopwords 
from nltk.corpus import stopwords
import gensim # for Word2Vec embeddings 
from gensim.models import KeyedVectors
from sentencepiece import SentencePieceTrainer,SentencePieceProcessor
from sklearn.feature_extraction.text import CountVectorizer
from bs4 import BeautifulSoup
import googletrans 
from googletrans import Translator



In [2]:
# Load the training table, the test table and the sample-submission table
# from the local storage directory (read in that order).
train, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        './storage/writer/train.csv',
        './storage/writer/test_x.csv',
        './storage/writer/sample_submission.csv',
    )
)

In [3]:
# Pick the text column (model input) and the author column (target) from the
# training table, and the text column from the test table.
x_train, y_train = train['text'], train['author']
x_test = test['text']

In [4]:
# Convert the selected columns to NumPy arrays; later cells index them with
# integer positions and integer index arrays.
x_train, y_train, x_test = (
    np.asarray(column) for column in (x_train, y_train, x_test)
)

In [5]:
# Sanity check: shapes of the train texts, train labels and test texts.
tuple(arr.shape for arr in (x_train, y_train, x_test))

((54879,), (54879,), (19617,))

In [8]:
# Data augmentation by back-translation: send the first 30 training texts
# through English -> French -> English and keep the paraphrased English text.
# NOTE(review): every translate() call goes through googletrans, presumably a
# network request — confirm rate limits before raising the sample count.
translator = Translator()
backtrans_xtrain = []
for sample_idx in range(30):
    french_text = translator.translate(x_train[sample_idx], dest = 'fr').text
    english_again = translator.translate(french_text, dest = 'en').text
    backtrans_xtrain.append(english_again)

In [9]:
# Append the back-translated texts (and their labels) to the training data.
# NOTE: this cell is not idempotent — re-running it appends the augmented
# samples again; re-run the notebook from the data-loading cell instead.
backtrans_xtrain = np.asarray(backtrans_xtrain)

x_train = np.concatenate([x_train, backtrans_xtrain])
# Back-translated sample k was produced from x_train[k], so it reuses label k.
# Slice by the actual number of augmented samples rather than a hard-coded 30,
# so texts and labels stay aligned if the augmentation count is changed.
# NOTE(review): the augmented copies are added before the K-fold split, so a
# paraphrase and its source text can end up in different folds — confirm that
# this does not inflate the validation score.
y_train = np.concatenate([y_train, y_train[:len(backtrans_xtrain)]])

In [10]:
# Sanity check: texts and labels must still have the same length.
tuple(arr.shape for arr in (x_train, y_train))

((54909,), (54909,))

In [11]:
# Text-model hyper-parameters shared by the tokenizer, the padding step and
# the embedding layer.
maxlength = 256       # number of token ids per padded sequence
embedding_dim = 20    # size of each learned word vector
vocab_size = 20000    # word-count cap passed to the tokenizer and embedding

# Build the word index from the (augmented) training texts only; all text is
# lower-cased before tokenisation.
tokenizer = Tokenizer(lower = True, num_words = vocab_size)
tokenizer.fit_on_texts(x_train)

In [12]:
# Map every text to a list of integer word ids with the fitted tokenizer.
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

# Bring every sequence to exactly `maxlength` ids; padding is added at the end.
pad_kwargs = dict(maxlen=maxlength, padding='post')
train_padded = pad_sequences(train_sequences, **pad_kwargs)
test_padded = pad_sequences(test_sequences, **pad_kwargs)


In [13]:
def build_model(num_classes=5):
    """Build and compile the averaged-embedding text classifier.

    The network embeds each token id, averages the embeddings over the
    sequence axis and maps the result to class probabilities with a softmax
    layer. It reads `vocab_size`, `embedding_dim` and `maxlength` from the
    notebook's global configuration.

    Args:
        num_classes: Width of the softmax output layer. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        A compiled `tf.keras.Sequential` model using sparse categorical
        cross-entropy loss, the Adam optimizer and an accuracy metric.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlength),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(num_classes, activation='softmax'),
    ])
    # The "sparse" loss takes integer class ids as targets, not one-hot rows.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [14]:
# conduct KFold Ensemble: train one fresh model per stratified fold and save
# its best checkpoints so the fold models can be averaged at prediction time
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 777)
for idx, (train_idx, val_idx) in enumerate(kfold.split(train_padded, y_train)):

    print("... Validating on Fold {} ...".format(idx+1))

    # split data into train and validation sets
    cur_x_train, cur_x_val = train_padded[train_idx], train_padded[val_idx]
    cur_y_train, cur_y_val = y_train[train_idx], y_train[val_idx]

    # Each fold checkpoints into its own directory. Create it up front:
    # ModelCheckpoint is not guaranteed to create missing directories, and the
    # .h5 save fails if the target directory does not exist.
    fold_dir = './storage/writer_trainfiles2/kfold' + str(idx+1)
    os.makedirs(fold_dir, exist_ok = True)
    # {epoch} and {val_loss} are filled in by ModelCheckpoint at save time.
    model_path = fold_dir + '/epoch_{epoch:03d}_val_{val_loss:.3f}.h5'

    # build model, define callbacks and train
    model = build_model()
    # Halve the learning rate as soon as val_loss fails to improve for 1 epoch.
    learning_rate_reduction = ReduceLROnPlateau(monitor = 'val_loss', patience = 1, verbose = 1, factor = 0.5)
    # Only write a checkpoint when val_loss improves on the best seen so far.
    checkpoint = ModelCheckpoint(filepath = model_path, monitor = 'val_loss', verbose = 1, save_best_only = True)
    # Stop the fold after 10 epochs without a val_loss improvement.
    early_stopping = EarlyStopping(monitor = 'val_loss', patience = 10)
    history = model.fit(cur_x_train,
                        cur_y_train,
                        validation_data = (cur_x_val,cur_y_val),
                        shuffle = True,
                        batch_size = 256,
                        epochs = 250,
                        verbose = 1,
                        callbacks = [learning_rate_reduction, checkpoint, early_stopping])

... Validating on Fold 1 ...
Train on 43927 samples, validate on 10982 samples
Epoch 1/250
Epoch 00001: val_loss improved from inf to 1.55549, saving model to ./storage/writer_trainfiles2/kfold1/epoch_001_val_1.555.h5
Epoch 2/250
Epoch 00002: val_loss improved from 1.55549 to 1.53573, saving model to ./storage/writer_trainfiles2/kfold1/epoch_002_val_1.536.h5
Epoch 3/250
Epoch 00003: val_loss improved from 1.53573 to 1.50547, saving model to ./storage/writer_trainfiles2/kfold1/epoch_003_val_1.505.h5
Epoch 4/250
Epoch 00004: val_loss improved from 1.50547 to 1.46523, saving model to ./storage/writer_trainfiles2/kfold1/epoch_004_val_1.465.h5
Epoch 5/250
Epoch 00005: val_loss improved from 1.46523 to 1.41862, saving model to ./storage/writer_trainfiles2/kfold1/epoch_005_val_1.419.h5
Epoch 6/250
Epoch 00006: val_loss improved from 1.41862 to 1.36958, saving model to ./storage/writer_trainfiles2/kfold1/epoch_006_val_1.370.h5
Epoch 7/250
Epoch 00007: val_loss improved from 1.36958 to 1.32042,

### Make predictions

In [16]:
# Checkpoint files picked for each of the five folds (one model per fold).
# NOTE(review): these file names are hard-coded from one training run and
# must be updated whenever the folds are retrained.
checkpoint_paths = [
    './storage/writer_trainfiles2/kfold1/epoch_093_val_0.606.h5',
    './storage/writer_trainfiles2/kfold2/epoch_090_val_0.580.h5',
    './storage/writer_trainfiles2/kfold3/epoch_096_val_0.591.h5',
    './storage/writer_trainfiles2/kfold4/epoch_094_val_0.599.h5',
    './storage/writer_trainfiles2/kfold5/epoch_086_val_0.601.h5',
]
model1, model2, model3, model4, model5 = (
    load_model(path) for path in checkpoint_paths
)

In [17]:
# Predict class probabilities for the padded test set with each fold's model.
# `Sequential.predict_proba` is deprecated and has been removed from newer
# TensorFlow releases; `predict` returns the same softmax outputs and is
# available on every version.
pred1 = model1.predict(test_padded)
pred2 = model2.predict(test_padded)
pred3 = model3.predict(test_padded)
pred4 = model4.predict(test_padded)
pred5 = model5.predict(test_padded)

In [19]:
# Ensemble the folds with an unweighted mean of their predicted probabilities
# and write the result into the class columns of the submission frame.
fold_preds = (pred1, pred2, pred3, pred4, pred5)
pred_avg = sum(fold_preds) / 5.0
class_columns = ['0','1','2','3','4']
ss[class_columns] = pred_avg
ss.head()


Unnamed: 0,index,0,1,2,3,4
0,0,0.00014658,0.8781241,0.117624,0.004070309,3.6e-05
1,1,0.2121087,0.5559049,0.043457,0.03319281,0.155336
2,2,0.9841048,0.008670364,0.002677,3.614671e-05,0.004512
3,3,5.452764e-07,5.446355e-15,0.999574,5.604425e-16,0.000425
4,4,0.9909714,0.005796687,0.000245,0.002569548,0.000417


In [None]:
# Write the averaged predictions to a CSV file. The previous target was the
# directory './storage/' itself, which cannot be opened as a file.
# index=False keeps the pandas row index out of the file so its columns match
# the frame loaded from sample_submission.csv.
# NOTE(review): the file name is a placeholder — rename as needed.
ss.to_csv('./storage/writer_submission.csv', index = False)