In [None]:
#https://arxiv.org/pdf/1404.2188.pdf
#A Convolutional Neural Network for Modelling Sentences


sentences를 다룰 때, cnn을 사용하면 거리가 먼 words 끼리의 sentences가 pooling 될 일이없음.
하지만 k-max pooling을 통해, k개의 active한 값을 뽑아내면서 기존 pooling의 한계 극복

i번째 layer의 k = max(ktop, (L-i)*s/L) #L 네트워크 깊이, 문장길이 s, ktop 최상단 층 convolutional layer




In [1]:
import glob
import pandas as pd
import re
import os
import numpy as np

In [3]:
glob.glob("../../../downloads/glove.6B/*")

['../../../downloads/glove.6B\\glove.6B.100d.txt',
 '../../../downloads/glove.6B\\glove.6B.200d.txt',
 '../../../downloads/glove.6B\\glove.6B.300d.txt',
 '../../../downloads/glove.6B\\glove.6B.50d.txt']

In [4]:
raw_data = pd.read_csv("../../../downloads/toxic/train.csv")

In [5]:
raw_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
raw_data['comment_text'] = raw_data['comment_text'].apply(lambda x : x.replace("\n"," "))

In [7]:
raw_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,""" More I can't make any real suggestions on im...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
labels = raw_data.iloc[:20000,2:]
labels['pure'] = 0

In [7]:
def pure_calcul(x):
    if sum(x) == 0:
        x['pure'] = 1
    return x

In [8]:
labels = labels.apply(lambda x : pure_calcul(x),axis=1)

In [9]:
labels_sum = sum(labels.apply(lambda x : sum(x)))

In [10]:
labels_weights = (labels.apply(lambda x : sum(x))/labels_sum).apply(lambda x : 1/x)

In [11]:
train_labels = labels.iloc[:10000,:]
test_labels = labels.iloc[10000:,:]

In [12]:
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [13]:
train_labels = (to_categorical(train_labels))

In [14]:
test_labels = to_categorical(test_labels)

In [15]:
texts = ' '.join(raw_data.iloc[:10000,:]['comment_text'].values)

In [16]:
p = re.compile(r"[\(\[].*?[\)\]]")
pp = re.compile("[^a-zA-Z ]")


In [17]:
texts = re.sub(p,"",texts)
texts = re.sub(pp,"",texts)

In [18]:
texts = texts.replace("  "," ").replace("  "," ").replace("  "," ")

In [90]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [20]:
maxlen = 100
max_words = 10000

#texts가 존재해야함.


tokenizer = Tokenizer()
tokenizer.fit_on_texts([texts])
tokenizer.num_words = max_words
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(raw_data['comment_text'][:10000].values)


data = pad_sequences(sequences, maxlen = maxlen)



In [21]:
embeddings_index = {}
glove_dir = "../../../downloads/glove.6B/glove.6B.300d.txt"
with open(os.path.join(glove_dir),encoding='utf-8') as fp:
    for line in fp:
        values = line.split()
        word = values[0]
        params = np.asarray(values[1:],dtype = 'float32')
        embeddings_index[word] = params


In [33]:
len(embeddings_index.keys())

400000

In [102]:
len(word_index)

36593

In [22]:
embedding_dim = 300

matrix_size = min(len(word_index),max_words)

embedding_matrix = np.zeros((len(word_index)+1,embedding_dim))

for word, i in word_index.items(): #
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None and i <= matrix_size:
        embedding_matrix[i] = embedding_vector

In [23]:
from keras.layers import Embedding, Flatten,Dense,Input,Conv1D,Flatten,MaxPooling1D,Concatenate
from keras.models import Model

In [172]:
input_layer = Input(shape = (maxlen,))
embedding_layer = Embedding(len(word_index) + 1,embedding_dim,weights =[embedding_matrix],\
                           input_length = maxlen, trainable = False)(input_layer)
x1 = Conv1D(128,1, activation = 'relu')(embedding_layer)
x2 = Conv1D(128,2, activation = 'relu')(embedding_layer)
x3 = Conv1D(128,3, activation = 'relu')(embedding_layer)
x4 = Conv1D(128,4, activation = 'relu')(embedding_layer)
x5 = Conv1D(128,5, activation = 'relu')(embedding_layer)

x1 = MaxPooling1D(2)(x1)
x2 = MaxPooling1D(2)(x2)
x3 = MaxPooling1D(2)(x3)
x4 = MaxPooling1D(2)(x4)
x5 = MaxPooling1D(2)(x5)

x1 = Conv1D(128,1,activation = 'relu')(x1)
x2 = Conv1D(128,2,activation = 'relu')(x2)
x3 = Conv1D(128,3,activation = 'relu')(x3)
x4 = Conv1D(128,4,activation = 'relu')(x4)
x5 = Conv1D(128,5,activation = 'relu')(x5)

x1 = MaxPooling1D(2)(x1)
x2 = MaxPooling1D(2)(x2)
x3 = MaxPooling1D(2)(x3)
x4 = MaxPooling1D(2)(x4)
x5 = MaxPooling1D(2)(x5)

x1 = Conv1D(128,1,activation = 'relu')(x1)
x2 = Conv1D(128,2,activation = 'relu')(x2)
x3 = Conv1D(128,3,activation = 'relu')(x3)
x4 = Conv1D(128,4,activation = 'relu')(x4)
x5 = Conv1D(128,5,activation = 'relu')(x5)

x1 = Flatten()(x1)
x2 = Flatten()(x2)
x3 = Flatten()(x3)
x4 = Flatten()(x4)
x5 = Flatten()(x5)

x = Concatenate()([x1,x2,x3,x4,x5])

x = Dense(128,activation = 'relu')(x)
#x = Dense(len(labels.columns),activation = 'softmax')(x)

toxic = Dense(2,activation = 'softmax',name = 'toxic')(x)
severe_toxic = Dense(2,activation = 'softmax',name = 'severe_toxic')(x)
obscene = Dense(2,activation = 'softmax',name = 'obscene')(x)
threat = Dense(2,activation = 'softmax',name = 'threat')(x)
insult = Dense(2,activation = 'softmax',name = 'insult')(x)
identity_hate = Dense(2,activation = 'softmax',name = 'identity_hate')(x)
pure = Dense(2,activation = 'softmax',name = 'pure')(x)

model = Model(input_layer, [toxic,severe_toxic,obscene,threat,insult,identity_hate,pure])
model.compile(loss = 'binary_crossentropy',loss_weights = labels_weights.to_dict(), optimizer = 'adam', metrics = ['accuracy'])

{'toxic': 11.533127889060092,
 'severe_toxic': 101.14864864864865,
 'obscene': 20.84958217270195,
 'threat': 335.14925373134326,
 'insult': 21.993143976493634,
 'identity_hate': 124.06077348066299,
 'pure': 1.2516722408026757}

In [144]:
raw_data.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [115]:
data = np.array(data)

In [191]:
train_labels.shape

(10000, 7, 2)

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [None]:
model.fit(data,[train_labels[:,0,:],train_labels[:,1,:],train_labels[:,2,:],train_labels[:,3,:],train_labels[:,4,:]\
               ,train_labels[:,5,:],train_labels[:,6,:]])

toxic             971
severe_toxic      101
obscene           527
threat             33
insult            494
identity_hate      84
pure             8970
dtype: int64

In [193]:
test_sequences = tokenizer.texts_to_sequences(raw_data['comment_text'][10000:20000].values)


test_data = pad_sequences(test_sequences, maxlen = maxlen)


In [194]:
test_data = np.array(test_data)

In [195]:
predict_data = model.predict(test_data)

In [93]:
from sklearn.metrics import accuracy_score

In [264]:
raw_data.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [265]:
toxic = np.argmax(np.array(predict_data)[0],axis=1)
toxic = to_categorical(toxic)
severe_toxic = np.argmax(np.array(predict_data)[1],axis=1)
severe_toxic = to_categorical(severe_toxic)
obscene = np.argmax(np.array(predict_data)[2],axis=1)
obscene = to_categorical(obscene)
threat = np.argmax(np.array(predict_data)[3],axis=1)
threat = to_categorical(threat)
insult = np.argmax(np.array(predict_data)[4],axis=1)
insult = to_categorical(insult)
identity_hate = np.argmax(np.array(predict_data)[5],axis=1)
identity_hate = to_categorical(identity_hate)
pure = np.argmax(np.array(predict_data)[6],axis=1)
pure = to_categorical(pure)

In [284]:
raw_data.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [282]:
to_categorical(test_labels['toxic'])a

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [312]:
plus = np.zeros((10000,1))
a1=(accuracy_score(toxic,to_categorical(test_labels['toxic'])))
a2=(accuracy_score(severe_toxic,to_categorical(test_labels['severe_toxic'])))
a3=(accuracy_score(obscene,to_categorical(test_labels['obscene'])))
a4=(accuracy_score(np.concatenate([threat,plus],axis=1),to_categorical(test_labels['threat'])))
a5=(accuracy_score(insult,to_categorical(test_labels['insult'])))
a6=(accuracy_score(np.concatenate([identity_hate,plus],axis=1),to_categorical(test_labels['identity_hate'])))
a7=(accuracy_score(pure,to_categorical(test_labels['pure'])))

In [314]:
result = np.mean([a1,a2,a3,a4,a5,a6,a7])
result

0.9671285714285712

In [None]:
#위의 측정방식은 ROC커브를 그려봐야 확실한 측정이 될 수 있다. loss_weights을 조정했음에도 한쪽에 몰빵한 신경망도 나옴
#단순히 얼마 대충올랐나 보려고 하는거라 측정방식에 디테일을 쏟지못함

36593

In [316]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 100, 300)     10978200    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_91 (Conv1D)              (None, 100, 128)     38528       embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_92 (Conv1D)              (None, 99, 128)      76928       embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_93 

In [28]:
from keras.engine.topology import Layer
from keras import backend as K
from keras.utils import conv_utils

In [24]:
#wide pooling은 의미 없는 것 같음.
#왜냐면 맨앞 맨뒤의 정보만 개별적으로 하나 넣어주는 효과밖에 안될듯 하지만 구현은 해봄

In [132]:
class WidePooling1D(Layer):
    def __init__(self,pool_size,padding='valid',**kwargs):
        self.pool_size = pool_size
        super(WidePooling1D,self).__init__(**kwargs)

    def call(self,inputs):
        print(K.shape(inputs))
        print(K.shape)
        empty_place = K.zeros((,self.pool_size,inputs.shape[2]))
        #empty_place = K.expand_dims(empty_place,axis=0)
        inputs = K.concatenate([empty_place,inputs,empty_place],axis=1)
        
        return inputs
    def compute_output_shape(self, input_shape):
        return (input_shape[0],input_shape[1]+(2*self.pool_size),input_shape[2])

In [133]:
input_layer = Input(shape = (maxlen,))
embedding_layer = Embedding(len(word_index) + 1,embedding_dim,weights =[embedding_matrix],\
                           input_length = maxlen, trainable = False)(input_layer)
x1 = Conv1D(128,1, activation = 'relu')(embedding_layer)
x2 = WidePooling1D(pool_size = 3)(x1)
x3 = MaxPooling1D(pool_size = 3,strides=1,name='test')(x2)
x4 = Conv1D(128,1,activation = 'relu')(x3)
x5 = MaxPooling1D(pool_size = 2,strides=1)(x4)
x6 = Conv1D(128,1,activation = 'relu')(x5)
x7 = Flatten()(x6)
x = Dense(128,activation = 'relu')(x7)
#x = Dense(len(labels.columns),activation = 'softmax')(x)
x = Dense(2,activation = 'softmax')(x)
model = Model(input_layer, x)
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

Tensor("wide_pooling1d_14/Shape:0", shape=(3,), dtype=int32)


In [136]:
model.layers[1].o

<keras.layers.embeddings.Embedding at 0x1f2cb5f2e48>

In [130]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_15 (InputLayer)        (None, 100)               0         
_________________________________________________________________
embedding_15 (Embedding)     (None, 100, 300)          10978200  
_________________________________________________________________
conv1d_27 (Conv1D)           (None, 100, 128)          38528     
_________________________________________________________________
wide_pooling1d_13 (WidePooli (None, 106, 128)          0         
_________________________________________________________________
test (MaxPooling1D)          (None, 104, 128)          0         
_________________________________________________________________
conv1d_28 (Conv1D)           (None, 104, 128)          16512     
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 103, 128)          0         
__________

In [131]:
model.fit(data,[train_labels[:,0,:]],batch_size = 64)

Epoch 1/1

InvalidArgumentError: Incompatible shapes: [64,2] vs. [16,2]
	 [[{{node training_4/Adam/gradients/loss_6/dense_14_loss/logistic_loss/mul_grad/BroadcastGradientArgs}} = BroadcastGradientArgs[T=DT_INT32, _class=["loc:@train...ad/Reshape"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](training_4/Adam/gradients/loss_6/dense_14_loss/logistic_loss/mul_grad/Shape, training_4/Adam/gradients/loss_6/dense_14_loss/logistic_loss/mul_grad/Shape_1)]]

In [100]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 100, 300)          10978200  
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 100, 128)          38528     
_________________________________________________________________
wide_pooling1d_3 (WidePoolin (None, 106, 128)          0         
_________________________________________________________________
test (MaxPooling1D)          (None, 104, 128)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 104, 128)          16512     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 103, 128)          0         
__________

In [48]:
model.layers[4].output

<tf.Tensor 'max_pooling1d_3/Squeeze:0' shape=(1, 104, 128) dtype=float32>

In [54]:
model.get_layer("test").output

<tf.Tensor 'test/Squeeze:0' shape=(1, 104, 128) dtype=float32>

In [39]:
l_test = K.function([input_layer],[model.layers[4].output])

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,  608,   75,    1,  132,
        114,  175,   28,  581, 4060, 1144,   79,  297,   52,   50, 6427,
         15,   64, 4364,  143,    7, 3571,   34,  118, 1169, 8834, 2703,
          5,   48,  248,    1,  457,   32,    1,   56,   27,  140, 3800,
         91])

In [62]:
want_to_know = l_test([data[:1]])[0]

In [65]:
want_to_know.shape

(1, 104, 128)

In [88]:
want_to_know[0][100]

array([0.62054145, 0.07677186, 0.43080768, 0.42186567, 0.00875566,
       0.        , 0.4790943 , 0.09477723, 0.7386753 , 0.68470424,
       0.6888691 , 1.0725573 , 0.2181289 , 0.08093468, 0.7286806 ,
       0.12981373, 0.        , 0.12720165, 0.2544409 , 0.32327625,
       0.        , 1.0854882 , 0.05779046, 0.6074182 , 0.680281  ,
       0.12557766, 0.3812653 , 0.00566218, 0.71193725, 0.8322672 ,
       0.91000146, 0.12353578, 0.40618366, 0.        , 0.        ,
       0.46752194, 0.51196903, 0.        , 0.6799803 , 0.7380056 ,
       0.93234026, 0.14133693, 0.22339752, 1.1934353 , 0.5128488 ,
       1.05489   , 0.19207083, 0.        , 0.4545156 , 0.79230857,
       0.        , 0.8980906 , 0.25274253, 0.10073692, 0.        ,
       0.        , 0.40694895, 0.09699701, 0.        , 0.50496227,
       0.40818778, 0.        , 0.36311767, 0.7864307 , 0.2881524 ,
       0.        , 0.18948539, 0.22644764, 0.        , 0.        ,
       0.59630674, 0.4087972 , 0.12323816, 0.3392834 , 0.79905

In [31]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          10978200  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 100, 128)          38528     
_________________________________________________________________
wide_pooling1d_1 (WidePoolin (None, 106, 128)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 104, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 104, 128)          16512     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 103, 128)          0         
__________

In [320]:
class KmaxPooling1D(Layer):
    def __init__(self,pool_size,padding='valid',**kwargs):
        self.pool_size = pool_size
        super(KmaxPooling1D,self).__init__(**kwargs)

    def call(self,inputs):
        inputs = np.sort(inputs,axis=1)
        
        #empty_place = K.zeros((1,self.pool_size,inputs.shape[2]))
        #inputs = K.concatenate([empty_place,inputs,empty_place],axis=1)
        
        return inputs
    def compute_output_shape(self, input_shape):
        return  #(input_shape[0],input_shape[1]+(2*self.pool_size),input_shape[2])

In [32]:
np.sort([1,2,3])

array([1, 2, 3])

In [36]:
np.sort(np.array([[2,1,6],[0,7,4],[5,3,2]]),axis=1)

array([[1, 2, 6],
       [0, 4, 7],
       [2, 3, 5]])

In [None]:
K