In [1]:
import glob
import pandas as pd
from tensorflow import keras
import numpy as np
import os 
from sklearn.model_selection import StratifiedKFold
import matplotlib.pylab as plt
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.layers import TimeDistributed, Conv2D, Conv2DTranspose, MaxPooling2D, AveragePooling2D, BatchNormalization, concatenate, Input, ConvLSTM2D, Reshape, Conv3D, Flatten, LSTM, GRU, Dense,Dropout, Add
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Bidirectional, Conv1D, MaxPooling1D, GlobalMaxPooling1D, GlobalMaxPool1D
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.models import Sequential, load_model
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
import re 

import nltk # for stopwords 
from nltk.corpus import stopwords
import gensim # for Word2Vec embeddings 
from sklearn.feature_extraction.text import CountVectorizer

from konlpy.tag import Mecab 


We load the raw training and test data, along with the sample submission file.

In [2]:
# Read the three competition CSVs in a single unpacking: the training set,
# the test set, and the sample submission that is filled in and exported
# at the end of the notebook.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './storage/fintech_nlp/news_train.csv',
        './storage/fintech_nlp/news_test.csv',
        './storage/fintech_nlp/sample_submission.csv',
    )
)

We load the preprocessed (padded) title and content arrays for training and testing, together with the training labels.

In [3]:
# Load the saved label vector and the padded title/content arrays for both
# splits (these .npy files are produced outside this notebook).
y_train, train_title, train_content, test_content, test_title = (
    np.load(npy_path)
    for npy_path in (
        './storage/fintech_nlp/y_train.npy',
        './storage/fintech_nlp/train_title_padded_x.npy',
        './storage/fintech_nlp/train_content_padded_x.npy',
        './storage/fintech_nlp/test_content_padded.npy',
        './storage/fintech_nlp/test_title_padded.npy',
    )
)

# Sanity check: (n_samples, sequence_length) of every input array.
tuple(arr.shape for arr in (train_title, train_content, test_content, test_title))

((118745, 23), (118745, 61), (142565, 61), (142565, 23))

In [4]:
# --- Title branch -----------------------------------------------------------
title_length = 23            # second dimension of the padded title arrays
vocab_title = 9197           # input_dim of the title Embedding layer
embedding_vec_title = 16     # output_dim of the title Embedding layer

# --- Content branch ---------------------------------------------------------
content_length = 61          # second dimension of the padded content arrays
vocab_content = 41573        # input_dim of the content Embedding layer
embedding_vec_content = 64   # output_dim of the content Embedding layer

In [5]:
def build_model():
    """Build and compile the two-input (title + content) binary classifier.

    Architecture:
      * Title branch:   Embedding -> Bidirectional LSTM(128) returning only
        the final output.
      * Content branch: Embedding -> Bidirectional LSTM(128) returning the
        full sequence -> global max pooling over time -> Dropout(0.1)
        -> Dense(64, relu) -> Dropout(0.1).
      * Both branches are concatenated on the feature axis and fed to a
        single sigmoid unit.

    Reads the module-level constants ``title_length``, ``content_length``,
    ``vocab_title``, ``vocab_content``, ``embedding_vec_title`` and
    ``embedding_vec_content``.

    Returns:
        A compiled ``Model`` that takes ``[title_ids, content_ids]`` and
        outputs one value in (0, 1) per sample. It is compiled with
        binary cross-entropy loss, the Adam optimizer and accuracy metric.
    """
    # NOTE: `(n)` is just the integer n, not a tuple. The trailing comma
    # makes an explicit 1-tuple shape, which every Keras version accepts
    # (newer releases reject a bare int as `shape`).
    input_title = Input(shape=(title_length,))
    input_content = Input(shape=(content_length,))

    # Title branch: only the final BiLSTM output is kept.
    emb_title = Embedding(vocab_title, embedding_vec_title)(input_title)
    lstm_title = Bidirectional(LSTM(128, return_sequences=False))(emb_title)

    # Content branch: keep every timestep so max pooling can pick the
    # strongest activation per feature across the whole sequence.
    emb_text = Embedding(vocab_content, embedding_vec_content)(input_content)
    lstm_text = Bidirectional(LSTM(128, return_sequences=True))(emb_text)
    max_pool_text = GlobalMaxPool1D()(lstm_text)
    dropout_1_text = Dropout(0.1)(max_pool_text)
    dense_1_text = Dense(64, activation='relu')(dropout_1_text)
    dropout_2_text = Dropout(0.1)(dense_1_text)

    # Merge both representations and predict a single probability.
    out = concatenate([lstm_title, dropout_2_text], axis=-1)
    output = Dense(1, activation='sigmoid')(out)

    model = Model(inputs=[input_title, input_content], outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Instantiate the compiled network and print its layer-by-layer summary
# (output shapes and parameter counts).
model = build_model() 
model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 61)]         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 61, 64)       2660672     input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 61, 256)      197632      embedding_1[0][0]                
__________________________________________________________________________________________________
global_max_pooling1d (GlobalMax (None, 256)          0           bidirectional_1[0][0]            
______________________________________________________________________________________________

In [8]:
# Checkpoint file-name template; Keras fills in {epoch} and {val_loss}
# each time a checkpoint is written.
model_path = './storage/fintech_nlp_first_submission/epoch_{epoch:03d}_val_{val_loss:.3f}.h5'

# Make sure the checkpoint directory exists before training starts. Not every
# Keras version creates it, and a missing directory would otherwise only
# surface as an error when the first checkpoint is saved, after a full epoch.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Halve the learning rate after 2 epochs without val_loss improvement.
learning_rate_reduction = ReduceLROnPlateau(monitor = 'val_loss', patience = 2, verbose = 1, factor = 0.5)
# Write a checkpoint only when val_loss improves on the best value so far.
checkpoint = ModelCheckpoint(filepath = model_path, monitor = 'val_loss', verbose = 1, save_best_only = True)
# Stop training after 10 epochs without val_loss improvement.
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 10) 

# 200 is only an upper bound on the epoch count; early stopping is expected
# to end training much sooner. 20% of the training data is held out for
# validation.
history = model.fit(x=[train_title,train_content],
                    y=y_train,
                    batch_size=128,
                    epochs=200,
                    verbose=1,
                    validation_split = 0.2, 
                    callbacks = [learning_rate_reduction,checkpoint,early_stopping])

Train on 94996 samples, validate on 23749 samples
Epoch 1/200
Epoch 00001: val_loss improved from inf to 0.08159, saving model to ./storage/fintech_nlp_first_submission/epoch_001_val_0.082.h5
Epoch 2/200
Epoch 00002: val_loss improved from 0.08159 to 0.06970, saving model to ./storage/fintech_nlp_first_submission/epoch_002_val_0.070.h5
Epoch 3/200
Epoch 00003: val_loss did not improve from 0.06970
Epoch 4/200
Epoch 00004: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.

Epoch 00004: val_loss did not improve from 0.06970
Epoch 5/200
Epoch 00005: val_loss did not improve from 0.06970
Epoch 6/200
Epoch 00006: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.

Epoch 00006: val_loss did not improve from 0.06970
Epoch 7/200
Epoch 00007: val_loss did not improve from 0.06970
Epoch 8/200
Epoch 00008: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.

Epoch 00008: val_loss did not improve from 0.06970
Epoch 9/200
Epoch 00009: val_loss did not

We make predictions with the trained model and generate the submission file.

In [5]:
# Load the checkpoint whose file name records the lowest validation loss
# instead of hard-coding a file name. The previously hard-coded
# 'epoch_002_val_0.028.h5' does not match any checkpoint written by the
# training run logged above (its best one is 'epoch_002_val_0.070.h5'), so
# the notebook could not be re-run from a fresh kernel.
# NOTE: checkpoints left in this directory by earlier runs are considered too.
checkpoint_paths = glob.glob('./storage/fintech_nlp_first_submission/epoch_*_val_*.h5')
if not checkpoint_paths:
    raise FileNotFoundError(
        'No checkpoints found in ./storage/fintech_nlp_first_submission/; '
        'run the training cell first.'
    )


def _checkpoint_rank(path):
    """Return the sort key ``(val_loss, -epoch)`` parsed from a checkpoint name.

    File names follow 'epoch_<NNN>_val_<loss>.h5'. Names that do not match
    (e.g. a non-numeric loss) rank last. On equal rounded loss, the later
    epoch wins, because a checkpoint is only saved when val_loss improved.
    """
    match = re.search(r'epoch_(\d+)_val_(\d+(?:\.\d+)?)\.h5$', path)
    if match is None:
        return (float('inf'), 0)
    return (float(match.group(2)), -int(match.group(1)))


best_model_path = min(checkpoint_paths, key=_checkpoint_rank)
print('Loading checkpoint:', best_model_path)
best_model = load_model(best_model_path)

In [19]:
# Input order must match the model's inputs: title first, then content
# (assuming the loaded checkpoint was produced by build_model).
test_inputs = [test_title, test_content]
predictions = best_model.predict(test_inputs)

In [25]:
# Threshold the sigmoid outputs at 0.5 and flatten (n, 1) -> (n,) to obtain
# hard 0/1 class labels.
class_pred = (predictions > 0.5).astype(int).ravel()

In [26]:
# Inspect the flattened 0/1 labels (NumPy abbreviates the repr of long arrays).
class_pred

array([0, 0, 0, ..., 1, 1, 1])

In [27]:
# Inspect the raw sigmoid outputs, one value per test row, shape (n, 1).
predictions

array([[3.5989198e-01],
       [1.3560057e-04],
       [6.8873167e-05],
       ...,
       [9.9999988e-01],
       [9.9999404e-01],
       [9.9999404e-01]], dtype=float32)

In [28]:
# Preview the submission frame before its `info` column is overwritten
# with the hard class labels.
submission.head()

Unnamed: 0,id,info
0,NEWS00237_1,0.359892
1,NEWS00237_2,0.000136
2,NEWS00237_3,6.9e-05
3,NEWS00237_4,2.3e-05
4,NEWS00237_5,0.000229


In [29]:
# Replace the `info` column with the thresholded 0/1 labels, then preview
# the first rows to confirm the change.
submission = submission.assign(info=class_pred)
submission.head()

Unnamed: 0,id,info
0,NEWS00237_1,0
1,NEWS00237_2,0
2,NEWS00237_3,0
3,NEWS00237_4,0
4,NEWS00237_5,0


In [30]:
# Write the final submission; index=False keeps the row index out of the CSV.
submission_path = './storage/bidirectional_lstm_first_submission_classes.csv'
submission.to_csv(submission_path, index=False)