In [162]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import utils, preprocessing

# This code was tested with TensorFlow v1.4
print("You have TensorFlow version", tf.__version__)

You have TensorFlow version 1.6.0


In [163]:
data = pd.read_csv("ognl_032_imdb_deco.csv")
token_data = '20180813_ognl_symbol_token.pickle'
encode_data = '20180813_ognl_symbol_encode.pickle'
model_data = '20180813_ognl_symbol_model.h5'

In [164]:
data.head()

Unnamed: 0,post,tags
0,"Seeing as how I am a big fan of both ""Fall"" an...",normal
1,/struts_chat/board/board.action?method:#_membe...,attack
2,You could get into the nitty gritty of this fi...,normal
3,I am easily pleased. I like bad films. I like ...,normal
4,/mt/blog/2017/07/s2-048.html,normal


In [165]:
data['post'] = data['post'].replace('[^!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n]', '', regex=True)
data['post'] = data['post'].replace('[\ ]*',' ', regex=True)
data['post'] = data['post'].replace(',','%2c', regex=True)
data['post'] = data['post'].replace('<','%3c', regex=True)
data['post'] = data['post'].replace('>','%3e', regex=True)
data.head()

Unnamed: 0,post,tags
0,""" "" "" "" %2c "" "" . . %2c %2c . . . . %2c .",normal
1,/ _ / / . ? : # _ = @ . @ _ _ %2c @ . . @ ( )...,attack
2,%2c %2c . . . . %2c %2c %2c %2c . %2c .,normal
3,. . . %3c / %3e %3c / %3e . %2c %2c . %2c %2c...,normal
4,/ / / / / - .,normal


In [166]:
tag_num = data['tags'].nunique()
data['tags'].value_counts()

normal    4684
attack     314
Name: tags, dtype: int64

In [167]:
max_words  = 100
tokenizer = text.Tokenizer(num_words=max_words, char_level=False, filters='')

In [168]:
max_len = 50
tokenizer.fit_on_texts(data['post'])
sequences = tokenizer.texts_to_sequences(data['post'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data_sec = pad_sequences(sequences, maxlen=max_len)

Found 30 unique tokens.


In [169]:
print(tokenizer.word_index)

{'.': 1, '%2c': 2, '/': 3, '-': 4, '%3e': 5, '%3c': 6, '"': 7, ')': 8, '(': 9, '!': 10, '_': 11, '?': 12, '=': 13, '@': 14, '&': 15, ':': 16, '*': 17, '%': 18, ';': 19, '#': 20, ']': 21, '[': 22, '$': 23, '+': 24, '`': 25, '~': 26, '}': 27, '{': 28, '^': 29, '\n': 30}


In [170]:
import pickle
# save the token data if you want to
with open(token_data, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [171]:
# Split data into train and test
train_size = int(len(data_sec) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data_sec) - train_size))

Train size: 3998
Test size: 1000


In [172]:
x_train = data_sec[:train_size]
x_test = data_sec[train_size:]

test_posts_doc = data['post'][train_size:]

In [173]:
train_tags = data['tags'][:train_size]
test_tags = data['tags'][train_size:]

# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

In [174]:
# save the encoder if you need
with open(encode_data, 'wb') as handle:
    pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [175]:
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)
print(y_train.shape)

(3998, 2)


In [185]:
from keras.layers import LSTM
# Build the model
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
#model.add(Flatten())
model.add(LSTM(32))
model.add(Dense(tag_num, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 50, 128)           12800     
_________________________________________________________________
lstm_8 (LSTM)                (None, 32)                20608     
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 66        
Total params: 33,474
Trainable params: 33,474
Non-trainable params: 0
_________________________________________________________________


In [177]:
# This model trains very quickly and 2 epochs are already more than enough
# Training for more epochs will likely lead to overfitting on this dataset
# You can try tweaking these hyperparamaters when using this model with your own data
batch_size = 32
epochs = 5

In [178]:
# Inspect the dimenstions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (3998, 50)
x_test shape: (1000, 50)
y_train shape: (3998, 2)
y_test shape: (1000, 2)


In [179]:
# model.fit trains the model# model. 
# The validation_split param tells Keras what % of our training data should be used in the validation set
# You can see the validation loss decreasing slowly when you run this
# Because val_loss is no longer decreasing we stop training to prevent overfitting
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_test, y_test))

Train on 3998 samples, validate on 1000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [180]:
# Evaluate the accuracy of our trained model
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.001624663178808987
Test accuracy: 1.0


In [181]:
#save model if you want to
model.save(model_data)

In [182]:
# Here's how to generate a prediction on individual examples
text_labels = encoder.classes_ 
for i in range(10):
    print(np.array([x_test[i]]))
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print(test_posts_doc.iloc[i][:50], "...")
    print(prediction)
    print('Actual label:' + test_tags.iloc[i])
    print('Predicted label: ' + predicted_label + "\n")

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 3 1 1 1 1 1 9
  8 1 1 1 6 3 5 6 3 5 1 4 2 1]]
 %2c / . . . . . ( ) . . . %3c / %3e %3c / %3e . - ...
[[9.3454350e-04 9.9955016e-01]]
Actual label:normal
Predicted label: normal

[[ 0  0  0  0  0  0  0  0  2  7  7  1  2 23  1  2  3  7  7  2  9  8  1  2
   2 12  2  2  9  2  2  8  1  2  1  2  4  2  1  2  4  1  2  1 12  2 12 12
  12 12]]
 %2c " " . %2c $ . %2c / " " %2c ( ) . %2c %2c ? % ...
[[0.00148627 0.9994141 ]]
Actual label:normal
Predicted label: normal

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 3 3 1]]
 / / .  ...
[[5.5009150e-04 9.9969745e-01]]
Actual label:normal
Predicted label: normal

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0 20  1  9 23  2  8  2  2  1  2  2  1  3  1 12
   2 10]]
 # . ( $ %2c ) %2c %2c . %2c %2c . / . ? %2c !  ...
[[0.00160181 0.9994134 ]]
Actual label:normal
Predicted label: normal

[[

In [183]:
request = ''' % { ( # = ' / - ' ) . ( # = @ . @ _ _ ) . ( # _ ? ( # _ = # ) : ( ( # = # [ ' . . . . ' ] ) . ( # = # . ( @ . . . . @ ) ) . ( # . ( ) . ( ) ) . ( # . ( ) . ( ) ) . ( # . ( # ) ) ) ) . ( # = ) . ( # = ( @ . . @ ( ' . ' ) . ( ) . ( ' ' ) ) ) . ( # = ( # ? { ' . ' %2c ' / ' %2c # %2C # ( ) ( ) / / / ( ( ( ( ( ( ( C%2C} : { ' / / ' %2c ' - ' %2c # } ) ) . ( # =  . . ( # ) ) . ( # . ( ) ) . ( # = # . ( ) ) . ( # = ( @ . . . @ ( ) . ( ) ) ) . ( @ . . . . @ ( # . %2c ( ( ( ( ( ( ( @ @ @ @ @ @ # # ( ) ) ) %2c # ) ) . ( # . ( ) ) } 
'''

request = [request]

req_mat = tokenizer.texts_to_sequences(request)
data_sec = pad_sequences(req_mat, maxlen=max_len)
prediction = model.predict(np.array(data_sec))
print(np.array(data_sec))
predicted_label = encoder.classes_[np.argmax(prediction)]
print(prediction)
print(predicted_label)

[[ 8  1  9  8  8  8  1  9 14  1  1  1  1 14  9 20  1  2  9  9  9  9  9  9
   9 14 14 14 14 14 14 20 20  9  8  8  8  2 20  8  8  1  9 20  1  9  8  8
  27 30]]
[[0.853747   0.12233201]]
attack


In [184]:
print(tokenizer.word_index)

{'.': 1, '%2c': 2, '/': 3, '-': 4, '%3e': 5, '%3c': 6, '"': 7, ')': 8, '(': 9, '!': 10, '_': 11, '?': 12, '=': 13, '@': 14, '&': 15, ':': 16, '*': 17, '%': 18, ';': 19, '#': 20, ']': 21, '[': 22, '$': 23, '+': 24, '`': 25, '~': 26, '}': 27, '{': 28, '^': 29, '\n': 30}
