In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import keras
import pickle

from keras.models import Model
from keras.layers import Input, Dense, Embedding, Concatenate, Flatten, Dropout, SpatialDropout1D, BatchNormalization, Conv1D, MaxPooling1D, Maximum 
from keras.layers import TimeDistributed, ZeroPadding1D
from keras.optimizers import Adam

from sklearn import preprocessing
from sklearn.metrics import precision_score, recall_score, f1_score

from utils import *

Using TensorFlow backend.


# Data Preparation

In [2]:
# Cross-validation configuration: ten folds per corpus, one held out for testing.
fold_list = [1,2,3,4,5,6,7,8,9,10]
input_list = ['article','encyclopedia','news','novel']
n_gram = 21
n_gram2 = int((n_gram-1)/2)

fold = 10
if fold not in fold_list:
    raise ValueError('fold {} is not in fold_list'.format(fold))
# Build a new list instead of aliasing fold_list and calling .remove() on it:
# the alias mutated fold_list, so re-running this cell raised ValueError.
train_fold = [number for number in fold_list if number != fold]

columns = ['char','type','target']


def _read_fold(input_folder, number):
    """Read one cleaned fold CSV, keeping only the columns used downstream."""
    path = '../cleaned_data/df_best_{}_{}.csv'.format(input_folder, number)
    # usecols does not guarantee column order, so select explicitly.
    return pd.read_csv(path, usecols=columns)[columns]


# Collect the frames first and concatenate once; appending to a DataFrame
# inside the loop copies all accumulated rows on every iteration.
train_frames = []
test_frames = []
for input_folder in input_list:
    for number in train_fold:
        train_frames.append(_read_fold(input_folder, number))
    test_frames.append(_read_fold(input_folder, fold))

# ignore_index=True is equivalent to the original reset_index(drop=True).
df_train = pd.concat(train_frames, ignore_index=True)
# NOTE(review): the original never reset df_test's index, so the per-file
# indices are kept here as well; confirm whether add_padding relies on it.
df_test = pd.concat(test_frames)

In [3]:
# Pad both frames via the utils helper (presumably so that every character
# has a full n_gram-wide context; see utils.add_padding to confirm).
df_train = add_padding(df_train, n_gram)
df_test = add_padding(df_test, n_gram)

# Character vocabulary: every character seen in training plus the catch-all
# 'other' token used for characters that only occur in the test fold.
listed_char = list(df_train['char'].unique())
listed_char.append('other')
# all_char is the same list object as listed_char, so listed_char also
# contains 'other' when it is pickled in the next cell.
all_char = listed_char

char_le = preprocessing.LabelEncoder()
char_le.fit(all_char)

# The twelve character-type labels are fixed up front.
type_le = preprocessing.LabelEncoder()
type_le.fit(['c','n','v','w','t','s','d','q','p','s_e','b_e','o'])

LabelEncoder()

In [4]:
# Persist the fitted encoders and the character vocabulary so that the
# same encoding can be reloaded alongside the saved weights.
encoder_bundle = [char_le, type_le, listed_char]
with open('../weight/object.pk', 'wb') as handle:
    pickle.dump(encoder_bundle, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
# Encode the training columns as integer ids.
# NOTE: this overwrites the string columns in place, so the cell cannot be
# re-run without first re-running the cells that build df_train / df_test.
df_train['char'] = char_le.transform(df_train['char'])
df_train['type'] = type_le.transform(df_train['type'])

# Characters that occur in the test fold but were never seen in training
# are mapped to the 'other' token the encoder was fitted with.
to_be_replaced = list(set(df_test['char'].unique()) - set(listed_char))

if len(to_be_replaced) != 0:
    # Replace only within the 'char' column. The original called
    # DataFrame.replace on the whole frame, which would also rewrite any
    # 'type' or 'target' cell that happened to equal an unseen character.
    df_test['char'] = df_test['char'].replace(to_replace=to_be_replaced, value='other')

df_test['char'] = char_le.transform(df_test['char'])
df_test['type'] = type_le.transform(df_test['type'])

In [6]:
# Expand each encoded frame into its n-gram form via the utils helper
# (presumably one row per character with its neighbours as extra columns;
# see utils.create_n_gram_df to confirm). Train is processed before test.
df_n_gram_train, df_n_gram_test = (
    create_n_gram_df(frame, number=n_gram) for frame in (df_train, df_test)
)

In [7]:
# Column names of the n-gram frame, in the order fed to the network:
# the n_gram2 '<name>1..' columns, then the n_gram2 '<name>-1..' columns,
# then the centre column itself.
char_row = ['char'+str(i+1) for i in range(n_gram2)] + ['char-'+str(i+1) for i in range(n_gram2)] + ['char']
type_row = ['type'+str(i+1) for i in range(n_gram2)] + ['type-'+str(i+1) for i in range(n_gram2)] + ['type']

# .values replaces DataFrame.as_matrix(), which was deprecated in
# pandas 0.23 and removed in 1.0; both return the same ndarray.
x_train1 = df_n_gram_train[char_row].values
x_train2 = df_n_gram_train[type_row].values
y_train = df_n_gram_train['target']

x_test1 = df_n_gram_test[char_row].values
x_test2 = df_n_gram_test[type_row].values
y_test = df_n_gram_test['target']

# Model Architecture

In [8]:
# Size of the character vocabulary (including the 'other' token); used as
# the input dimension of the character embedding below.
no_char = len(all_char)

def get_convo_nn(no_word = 200, n_gram=21):
    """Build and compile the two-input character CNN.

    The model takes two integer sequences of length ``n_gram`` (character
    ids and character-type ids). The character embedding is passed through
    eleven parallel 'valid' 1-D convolutions with kernel sizes 2..12; each
    branch is projected to 5 features per time step, zero-padded on the
    right back to length ``n_gram``, and the branches are merged with an
    element-wise maximum. The result is concatenated with both embeddings,
    batch-normalised, flattened and fed through a 100-unit ReLU layer to a
    single sigmoid output.

    Parameters
    ----------
    no_word : int
        Number of convolution filters for kernel sizes 2-8. Kernel sizes
        9-11 use ``no_word - 50`` and kernel size 12 uses ``no_word - 100``.
    n_gram : int
        Length of each input window. It must be at least 12, the widest
        kernel, for the 'valid' convolutions to produce any output.

    Returns
    -------
    keras.models.Model
        Model compiled with Adam, binary cross-entropy loss and accuracy.

    Notes
    -----
    Reads the module-level ``no_char`` for the character vocabulary size.
    """
    input1 = Input(shape=(n_gram,))
    input2 = Input(shape=(n_gram,))

    a = Embedding(no_char, 32, input_length=n_gram)(input1)
    a = SpatialDropout1D(0.2)(a)

    # (kernel_size, filters) for every convolution branch, in the same
    # order the layers were originally created so that layer ordering (and
    # therefore saved-weight ordering) is unchanged.
    branch_specs = (
        [(kernel_size, no_word) for kernel_size in range(2, 9)]
        + [(kernel_size, no_word - 50) for kernel_size in range(9, 12)]
        + [(12, no_word - 100)]
    )

    branches = []
    for kernel_size, filters in branch_specs:
        branch = Conv1D(filters, kernel_size, strides=1, padding="valid", activation='relu')(a)
        # The wrapped Dense infers its input size when built, so the old
        # input_shape=(n_gram, no_word) hint (wrong for the narrower
        # branches) is dropped.
        branch = TimeDistributed(Dense(5))(branch)
        # A 'valid' convolution shortens the sequence by kernel_size - 1;
        # pad on the right so all branches have length n_gram again.
        branch = ZeroPadding1D(padding=(0, kernel_size - 1))(branch)
        branches.append(branch)

    a_sum = Maximum()(branches)

    # 12 matches the twelve character-type labels the type encoder is fitted on.
    b = Embedding(12, 12, input_length=n_gram)(input2)
    b = SpatialDropout1D(0.2)(b)

    x = Concatenate(axis=-1)([a, a_sum, b])
    x = BatchNormalization()(x)

    x = Flatten()(x)
    x = Dense(100, activation='relu')(x)

    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[input1,input2], outputs=out)
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['acc'])
    return model

# Convolutional Neural Network

In [9]:
# Pass the configured window length explicitly so the model's input width
# always matches the n-gram matrices, instead of relying on the function's
# default happening to equal n_gram.
model = get_convo_nn(n_gram=n_gram)

In [10]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 21)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 21, 32)        5696                                         
____________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDrop (None, 21, 32)        0                                            
____________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, 20, 200)       13000                                        
___________________________________________________________________________________________

In [11]:
# Training stage 1: 10 epochs at batch size 256 (no validation data is passed).
model.fit([x_train1,x_train2], y_train, epochs=10, batch_size=256, verbose = 2)

Epoch 1/10
1364s - loss: 0.0420 - acc: 0.9852
Epoch 2/10
1373s - loss: 0.0286 - acc: 0.9904
Epoch 3/10
1379s - loss: 0.0254 - acc: 0.9916
Epoch 4/10
1378s - loss: 0.0236 - acc: 0.9922
Epoch 5/10
1377s - loss: 0.0224 - acc: 0.9926
Epoch 6/10
1377s - loss: 0.0216 - acc: 0.9929
Epoch 7/10
1377s - loss: 0.0208 - acc: 0.9931
Epoch 8/10
1377s - loss: 0.0202 - acc: 0.9933
Epoch 9/10
1375s - loss: 0.0197 - acc: 0.9935
Epoch 10/10
1375s - loss: 0.0193 - acc: 0.9936


<keras.callbacks.History at 0x7f0ad24f14a8>

In [12]:
# Training stage 2: continue fitting the same model for 3 epochs at batch size 512.
model.fit([x_train1,x_train2], y_train, epochs=3, batch_size=512, verbose = 2)

Epoch 1/3
1066s - loss: 0.0180 - acc: 0.9941
Epoch 2/3
1063s - loss: 0.0176 - acc: 0.9942
Epoch 3/3
1064s - loss: 0.0174 - acc: 0.9942


<keras.callbacks.History at 0x7f0ad20ac198>

In [13]:
# Training stage 3: continue fitting the same model for 3 epochs at batch size 2048.
model.fit([x_train1,x_train2], y_train, epochs=3, batch_size=2048, verbose = 2)

Epoch 1/3
885s - loss: 0.0161 - acc: 0.9947
Epoch 2/3
883s - loss: 0.0159 - acc: 0.9948
Epoch 3/3
883s - loss: 0.0157 - acc: 0.9948


<keras.callbacks.History at 0x7f0af8eeaac8>

In [14]:
# Training stage 4: continue fitting the same model for 3 epochs at batch size 4096.
model.fit([x_train1,x_train2], y_train, epochs=3, batch_size=4096, verbose = 2)

Epoch 1/3
850s - loss: 0.0152 - acc: 0.9950
Epoch 2/3
848s - loss: 0.0150 - acc: 0.9950
Epoch 3/3
848s - loss: 0.0149 - acc: 0.9950


<keras.callbacks.History at 0x7f0ad1f89400>

In [15]:
# Training stage 5: continue fitting the same model for 3 epochs at batch size 8192.
model.fit([x_train1,x_train2], y_train, epochs=3, batch_size=8192, verbose = 2)

Epoch 1/3
846s - loss: 0.0146 - acc: 0.9951
Epoch 2/3
848s - loss: 0.0145 - acc: 0.9952
Epoch 3/3
848s - loss: 0.0144 - acc: 0.9952


<keras.callbacks.History at 0x7f0ad2003860>

In [16]:
# Predict sigmoid probabilities for the held-out fold; the model has a single
# output unit, so the result has one column.
y_predict = model.predict([x_test1,x_test2])
# Threshold at 0.5 in one vectorised step instead of a per-row Python loop;
# the result is a 1-D integer array of 0/1 labels.
y_predict = (y_predict[:, 0] > 0.5).astype(int)

In [17]:
# Cast the targets to int once, then report each metric on the held-out fold.
y_true = y_test.astype(int)
for label, metric in (
    ('f1 score: ', f1_score),
    ('precision score: ', precision_score),
    ('recall score: ', recall_score),
):
    print(label, metric(y_true, y_predict))

f1 score:  0.98843661988
precision score:  0.985713702867
recall score:  0.991174622033


In [18]:
# Save only the trained weights (not the architecture); loading them back
# requires rebuilding the model with get_convo_nn first.
model.save_weights("../weight/best_cnn3.h5")