In [1]:
import glob
import pandas as pd
from tensorflow import keras
import numpy as np
import os 
from sklearn.model_selection import StratifiedKFold
import matplotlib.pylab as plt
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.layers import TimeDistributed, Conv2D, Conv2DTranspose, MaxPooling2D, AveragePooling2D, BatchNormalization, concatenate, Input, ConvLSTM2D, Reshape, Conv3D, Flatten, LSTM, GRU, Dense,Dropout, Add
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Bidirectional, Conv1D, MaxPooling1D, GlobalMaxPooling1D, GlobalMaxPool1D
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.models import Sequential, load_model
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
import re 

import nltk # for stopwords 
from nltk.corpus import stopwords
import gensim # for Word2Vec embeddings 
from sklearn.feature_extraction.text import CountVectorizer

from konlpy.tag import Mecab 


We load the raw training and test sets together with the sample submission file.

In [2]:
# Read the three competition tables in one pass: training set, test set and
# the sample submission that is filled in at the end of the notebook.
train, test, submission = map(
    pd.read_csv,
    (
        './storage/fintech_nlp/news_train.csv',
        './storage/fintech_nlp/news_test.csv',
        './storage/fintech_nlp/sample_submission.csv',
    ),
)

We load the preprocessed data for training and testing

In [3]:
# Load the label vector and the title/content arrays written by an earlier
# preprocessing step (not shown in this notebook).
y_train, train_title, train_content, test_content, test_title = map(
    np.load,
    (
        './storage/fintech_nlp/y_train.npy',
        './storage/fintech_nlp/train_title_padded_x.npy',
        './storage/fintech_nlp/train_content_padded_x.npy',
        './storage/fintech_nlp/test_content_padded.npy',
        './storage/fintech_nlp/test_title_padded.npy',
    ),
)

# Display the four feature-array shapes as a quick sanity check.
tuple(arr.shape for arr in (train_title, train_content, test_content, test_title))

((118745, 23), (118745, 61), (142565, 61), (142565, 23))

In [4]:
# Sizes of the two Embedding lookup tables used by build_model().
# NOTE(review): presumably the tokenizer vocabulary sizes from preprocessing - confirm.
vocab_title, vocab_content = 9197, 41573

# Width of the embedding vectors for each branch.
embedding_vec_title, embedding_vec_content = 16, 64

# Sequence lengths; these equal the second dimension of the loaded title and
# content arrays, (118745, 23) and (118745, 61).
title_length, content_length = 23, 61

In [5]:
def build_model():
    """Build and compile the two-branch (title + content) bidirectional-LSTM classifier.

    The architecture is driven by the notebook-level settings ``title_length``,
    ``content_length``, ``vocab_title``, ``vocab_content``,
    ``embedding_vec_title`` and ``embedding_vec_content``.

    Returns:
        A compiled ``Model`` that takes ``[title_ids, content_ids]`` and outputs
        a single sigmoid value per sample. It is compiled with binary
        cross-entropy loss, the Adam optimizer and an accuracy metric.
    """
    # ``(title_length)`` is just a parenthesised int, not a tuple. Keras documents
    # ``shape`` as a tuple and some Keras versions reject a bare int, so pass
    # explicit one-element tuples.
    input_title = Input(shape=(title_length,))
    input_content = Input(shape=(content_length,))

    # Title branch: embedding -> BiLSTM; only the final output is kept.
    emb_title = Embedding(vocab_title, embedding_vec_title)(input_title)
    lstm_title = Bidirectional(LSTM(128, return_sequences=False))(emb_title)

    # Content branch: embedding -> BiLSTM over all timesteps -> max over time,
    # followed by a small dense head with light dropout.
    emb_text = Embedding(vocab_content, embedding_vec_content)(input_content)
    lstm_text = Bidirectional(LSTM(128, return_sequences=True))(emb_text)
    max_pool_text = GlobalMaxPool1D()(lstm_text)
    dropout_1_text = Dropout(0.1)(max_pool_text)
    dense_1_text = Dense(64, activation='relu')(dropout_1_text)
    dropout_2_text = Dropout(0.1)(dense_1_text)

    # Merge both branches along the feature axis and map to one probability.
    out = concatenate([lstm_title, dropout_2_text], axis=-1)
    output = Dense(1, activation='sigmoid')(out)

    model = Model(inputs=[input_title, input_content], outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


Let us try a k-fold ensemble instead. 

K = 5 seems to be a good balance: each model trains on 80% of the data and validates on the remaining 20%.

In [8]:
def k_fold(k, files):
    """Split ``files`` into ``k`` consecutive slices.

    Every slice holds ``len(files) // k`` items except the last one, which
    also absorbs whatever remainder is left at the end of ``files``.

    Args:
        k: Number of folds to produce.
        files: Any sliceable sequence (list, NumPy array, ...).

    Returns:
        A list of ``k`` slices of ``files``, in their original order.
    """
    fold_size = len(files) // k
    # Starting offset of every fold.
    starts = [idx * fold_size for idx in range(k)]
    return [
        files[start:] if idx == k - 1 else files[start:start + fold_size]
        for idx, start in enumerate(starts)
    ]

# Shuffle the three aligned arrays together once, with a fixed seed, so that
# fold membership is the same on every kernel restart.
train_title, train_content, y_train = shuffle(train_title, train_content, y_train, random_state=42)
K = 5
train_title_folds = k_fold(K, train_title)
train_content_folds = k_fold(K, train_content)
train_y_folds = k_fold(K, y_train)

# conduct training
for t in range(K):
    print("*** Validating on Fold {} ***".format(t+1))

    # Drop Keras' global state left by the previous fold's model so that
    # memory does not accumulate across the K trainings.
    keras.backend.clear_session()

    # Fold t is held out for validation ...
    cur_val_title = train_title_folds[t]
    cur_val_content = train_content_folds[t]
    cur_val_y = train_y_folds[t]

    # ... and the remaining K-1 folds are joined into the training set.
    # np.concatenate replaces the original row-by-row Python append loops.
    cur_train_title = np.concatenate(train_title_folds[:t] + train_title_folds[t+1:])
    cur_train_content = np.concatenate(train_content_folds[:t] + train_content_folds[t+1:])
    cur_train_y = np.concatenate(train_y_folds[:t] + train_y_folds[t+1:])

    # Make sure the per-fold checkpoint directory exists before the first save.
    fold_dir = './storage/fintech_nlp_folds/kfold' + str(t+1)
    os.makedirs(fold_dir, exist_ok=True)
    model_path = fold_dir + '/epoch_{epoch:03d}_val_{val_loss:.3f}.h5'

    # Halve the learning rate after 2 epochs without val_loss improvement,
    # checkpoint only on improvement, and stop after 9 epochs without improvement.
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=2, verbose=1, factor=0.5)
    checkpoint = ModelCheckpoint(filepath=model_path, monitor='val_loss', verbose=1, save_best_only=True)
    early_stopping = EarlyStopping(monitor='val_loss', patience=9)

    model = build_model()
    history = model.fit(x=[cur_train_title, cur_train_content],
                        y=cur_train_y,
                        batch_size=128,
                        epochs=200,
                        validation_data=([cur_val_title, cur_val_content], cur_val_y),
                        verbose=1,
                        callbacks=[learning_rate_reduction, checkpoint, early_stopping])
    

*** Validating on Fold 1 ***
Train on 94996 samples, validate on 23749 samples
Epoch 1/200
Epoch 00001: val_loss improved from inf to 0.03222, saving model to ./storage/fintech_nlp_folds/kfold1/epoch_001_val_0.032.h5
Epoch 2/200
Epoch 00002: val_loss did not improve from 0.03222
Epoch 3/200
Epoch 00003: val_loss improved from 0.03222 to 0.02703, saving model to ./storage/fintech_nlp_folds/kfold1/epoch_003_val_0.027.h5
Epoch 4/200
Epoch 00004: val_loss did not improve from 0.02703
Epoch 5/200
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 00005: val_loss did not improve from 0.02703
Epoch 6/200
Epoch 00006: val_loss did not improve from 0.02703
Epoch 7/200
Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 00007: val_loss did not improve from 0.02703
Epoch 8/200
Epoch 00008: val_loss did not improve from 0.02703
Epoch 9/200
Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 00

We make predictions with the five trained fold models, average them, and generate the submission file.

In [9]:
def _best_checkpoint(fold_dir):
    """Return the path of the latest-epoch checkpoint stored in ``fold_dir``.

    Checkpoints are named ``epoch_{epoch:03d}_val_{val_loss:.3f}.h5`` and are
    only written when ``val_loss`` improves, so within one training run the
    file with the highest epoch number is the best model. The zero-padded
    epoch makes a plain lexicographic sort sufficient.

    NOTE(review): checkpoints left over from an earlier run in the same
    directory are not distinguished from the current run's files.

    Raises:
        FileNotFoundError: If ``fold_dir`` contains no matching checkpoint.
    """
    candidates = sorted(glob.glob(os.path.join(fold_dir, 'epoch_*.h5')))
    if not candidates:
        raise FileNotFoundError('No checkpoint found in ' + fold_dir)
    return candidates[-1]


# Resolve the checkpoint names at run time instead of hard-coding the
# epoch/val_loss of one particular run, which changes from run to run.
model1 = load_model(_best_checkpoint('./storage/fintech_nlp_folds/kfold1'))
model2 = load_model(_best_checkpoint('./storage/fintech_nlp_folds/kfold2'))
model3 = load_model(_best_checkpoint('./storage/fintech_nlp_folds/kfold3'))
model4 = load_model(_best_checkpoint('./storage/fintech_nlp_folds/kfold4'))
model5 = load_model(_best_checkpoint('./storage/fintech_nlp_folds/kfold5'))

In [10]:
# Fold-1 model: scores for the test set (title input first, content second).
pred1 = model1.predict(x=[test_title, test_content])

In [11]:
# Fold-2 model: scores for the test set (title input first, content second).
pred2 = model2.predict(x=[test_title, test_content])

In [12]:
# Fold-3 model: scores for the test set (title input first, content second).
pred3 = model3.predict(x=[test_title, test_content])

In [13]:
# Fold-4 model: scores for the test set (title input first, content second).
pred4 = model4.predict(x=[test_title, test_content])

In [14]:
# Fold-5 model: scores for the test set (title input first, content second).
pred5 = model5.predict(x=[test_title, test_content])

In [15]:
# Ensemble by simple averaging: add the five fold predictions (starting from
# pred1, in fold order) and divide by the number of folds.
predictions = sum([pred2, pred3, pred4, pred5], pred1) / 5.0

In [16]:
# Threshold the averaged scores at 0.5 (strictly greater -> 1, otherwise 0)
# and flatten the result to a 1-D label vector.
class_pred = (predictions > 0.5).astype(int).reshape(-1)

In [17]:
# Inspect the hard 0/1 labels (NumPy abbreviates long arrays in the repr).
class_pred

array([0, 0, 0, ..., 1, 1, 1])

In [18]:
# Inspect the averaged scores that the labels above were thresholded from.
predictions

array([[3.00649852e-01],
       [1.08420845e-05],
       [2.54368788e-04],
       ...,
       [9.99999225e-01],
       [9.99964595e-01],
       [9.99964595e-01]], dtype=float32)

In [19]:
# Put the predicted labels into the submission's `info` column and preview
# the first rows.
submission = submission.assign(info=class_pred)
submission.head(n=5)

Unnamed: 0,id,info
0,NEWS00237_1,0
1,NEWS00237_2,0
2,NEWS00237_3,0
3,NEWS00237_4,0
4,NEWS00237_5,0


In [20]:
# Write the final submission; the DataFrame index is not written as a column.
submission.to_csv(path_or_buf='./storage/bidirectional_lstm_5fold.csv', index=False)